diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh index a701bfe15e0..e6a2527643a 100755 --- a/ci/test_wheel_cudf.sh +++ b/ci/test_wheel_cudf.sh @@ -34,22 +34,7 @@ RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"} RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" - -rapids-logger "pytest pylibcudf" -pushd python/pylibcudf/pylibcudf/tests -python -m pytest \ - --cache-clear \ - --numprocesses=8 \ - --dist=worksteal \ - . -popd - rapids-logger "pytest cudf" pushd python/cudf/cudf/tests -python -m pytest \ - --cache-clear \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . +python -m pytest --cache-clear . popd diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 99b759e2166..893047d0379 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -1,5 +1,3 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - # If libcudf was installed as a wheel, we must request it to load the library symbols. # Otherwise, we assume that the library was installed in a system path that ld can find. try: @@ -10,147 +8,5 @@ libcudf.load_library() del libcudf -# _setup_numba _must be called before numba.cuda is imported, because -# it sets the numba config variable responsible for enabling -# Minor Version Compatibility. Setting it after importing numba.cuda has no effect. -from cudf.utils._numba import _setup_numba -from cudf.utils.gpu_utils import validate_setup - -_setup_numba() -validate_setup() - -import cupy -from numba import config as numba_config, cuda - -import rmm -from rmm.allocators.cupy import rmm_cupy_allocator -from rmm.allocators.numba import RMMNumbaManager - -from cudf import api, core, datasets, testing -from cudf._version import __git_commit__, __version__ -from cudf.api.extensions import ( - register_dataframe_accessor, - register_index_accessor, - register_series_accessor, -) -from cudf.api.types import dtype -from cudf.core.algorithms import factorize, unique -from cudf.core.cut import cut -from cudf.core.dataframe import DataFrame, from_dataframe, from_pandas, merge -from cudf.core.dtypes import ( - CategoricalDtype, - Decimal32Dtype, - Decimal64Dtype, - Decimal128Dtype, - IntervalDtype, - ListDtype, - StructDtype, -) -from cudf.core.groupby import Grouper, NamedAgg -from cudf.core.index import ( - BaseIndex, - CategoricalIndex, - DatetimeIndex, - Index, - IntervalIndex, - RangeIndex, - TimedeltaIndex, - interval_range, -) -from cudf.core.missing import NA, NaT -from cudf.core.multiindex import MultiIndex -from cudf.core.reshape import ( - concat, - crosstab, - get_dummies, - melt, - pivot, - pivot_table, - unstack, -) -from cudf.core.scalar import Scalar -from cudf.core.series import Series, isclose -from cudf.core.tools.datetimes import DateOffset, date_range, to_datetime -from cudf.core.tools.numeric import to_numeric -from cudf.io import ( - from_dlpack, - read_avro, - read_csv, - read_feather, - read_hdf, - read_json, - read_orc, - read_parquet, - read_text, -) -from cudf.options import ( - describe_option, - get_option, - option_context, - set_option, -) -from cudf.utils.utils import clear_cache - -cuda.set_memory_manager(RMMNumbaManager) -cupy.cuda.set_allocator(rmm_cupy_allocator) - - -rmm.register_reinitialize_hook(clear_cache) - - -__all__ = [ - "BaseIndex", - "CategoricalDtype", - "CategoricalIndex", - "DataFrame", - "DateOffset", - "DatetimeIndex", - "Decimal32Dtype", - "Decimal64Dtype", - "Decimal128Dtype", - "Grouper", - 
"Index", - "IntervalDtype", - "IntervalIndex", - "ListDtype", - "MultiIndex", - "NA", - "NaT", - "RangeIndex", - "Scalar", - "Series", - "StructDtype", - "TimedeltaIndex", - "api", - "concat", - "crosstab", - "cut", - "date_range", - "describe_option", - "factorize", - "from_dataframe", - "from_dlpack", - "from_pandas", - "get_dummies", - "get_option", - "interval_range", - "isclose", - "melt", - "merge", - "option_context", - "pivot", - "pivot_table", - "read_avro", - "read_csv", - "read_feather", - "read_hdf", - "read_json", - "read_orc", - "read_parquet", - "read_text", - "set_option", - "testing", - "to_datetime", - "to_numeric", - "unstack", -] +from ptxcompiler.patch import safe_get_versions +from cudf._lib import strings_udf diff --git a/python/cudf/cudf/_fuzz_testing/__init__.py b/python/cudf/cudf/_fuzz_testing/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py deleted file mode 100644 index d9974037daa..00000000000 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. - -import copy -import io -import logging -import random - -import numpy as np - -import cudf -from cudf._fuzz_testing.io import IOFuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - _generate_rand_meta, - pandas_to_avro, - pyarrow_to_pandas, -) -from cudf.testing import dataset_generator as dg - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class AvroReader(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - self._df = None - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - - {"category"} - # No unsigned support in avro: - # https://avro.apache.org/docs/current/spec.html - - cudf.utils.dtypes.UNSIGNED_TYPES - # TODO: Remove DATETIME_TYPES once - # following bug is fixed: - # https://github.com/rapidsai/cudf/issues/6482 - - cudf.utils.dtypes.DATETIME_TYPES - # TODO: Remove DURATION_TYPES once - # following bug is fixed: - # https://github.com/rapidsai/cudf/issues/6604 - - cudf.utils.dtypes.TIMEDELTA_TYPES - ) - - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) - self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_cols"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - self._df = df - logging.info(f"Shape of DataFrame generated: {table.shape}") - - file_obj = io.BytesIO() - pandas_to_avro(df, file_io_obj=file_obj) - file_obj.seek(0) - buf = file_obj.read() - self._current_buffer = copy.copy(buf) - return (df, buf) - - def write_data(self, file_name): - if self._current_buffer is not None: - with open(file_name + "_crash.avro", 
"wb") as crash_dataset: - crash_dataset.write(self._current_buffer) - - def set_rand_params(self, params): - params_dict = {} - for param, values in params.items(): - if values == ALL_POSSIBLE_VALUES: - if param == "columns": - col_size = self._rand(len(self._df.columns)) - params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) - ) - elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( - [None, self._rand(len(self._df))] - ) - else: - params_dict[param] = np.random.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py deleted file mode 100644 index 67211a1c4bf..00000000000 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import logging -import random - -import numpy as np - -import cudf -from cudf._fuzz_testing.io import IOFuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - _generate_rand_meta, - pyarrow_to_pandas, -) -from cudf.testing import dataset_generator as dg -from cudf.utils.dtypes import pandas_dtypes_to_np_dtypes - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class CSVReader(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - seed = random.randint(0, 2**32 - 1) - random.seed(seed) - dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_columns"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - - logging.info(f"Shape of DataFrame generated: {df.shape}") - self._current_buffer = df - return df.to_csv() - - def write_data(self, file_name): - if self._current_buffer is not None: - self._current_buffer.to_csv(file_name + "_crash.csv") - - def set_rand_params(self, params): - params_dict = {} - for param, values in params.items(): - if values == ALL_POSSIBLE_VALUES: - if param == "usecols": - col_size = self._rand(len(self._df.columns)) - col_val = np.random.choice( - [ - None, - np.unique( - np.random.choice(self._df.columns, col_size) - ), - ] - ) - params_dict[param] = ( - col_val if col_val is None else list(col_val) - ) - elif param == "dtype": - dtype_val = np.random.choice( - [None, self._df.dtypes.to_dict()] - ) - if dtype_val is not None: - dtype_val = { - col_name: "category" - if isinstance(dtype, cudf.CategoricalDtype) - else pandas_dtypes_to_np_dtypes[dtype] - for col_name, dtype in dtype_val.items() - } - params_dict[param] = dtype_val - elif param == "header": - header_val = np.random.choice( - ["infer", np.random.randint(low=0, 
high=len(self._df))] - ) - params_dict[param] = header_val - elif param == "skiprows": - params_dict[param] = np.random.randint( - low=0, high=len(self._df) - ) - elif param == "skipfooter": - params_dict[param] = np.random.randint( - low=0, high=len(self._df) - ) - elif param == "nrows": - nrows_val = np.random.choice( - [None, np.random.randint(low=0, high=len(self._df))] - ) - params_dict[param] = nrows_val - else: - params_dict[param] = np.random.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) - - -class CSVWriter(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - seed = random.randint(0, 2**32 - 1) - random.seed(seed) - dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_columns"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - - logging.info(f"Shape of DataFrame generated: {df.shape}") - self._current_buffer = df - return df - - def write_data(self, file_name): - if self._current_buffer is not None: - self._current_buffer.to_csv(file_name + "_crash.csv") - - def set_rand_params(self, params): - params_dict = {} - for param, values in params.items(): - if values == ALL_POSSIBLE_VALUES: - if param == "columns": - col_size = self._rand(len(self._current_buffer.columns)) - params_dict[param] = list( - np.unique( - np.random.choice( - self._current_buffer.columns, col_size - ) - ) - ) - elif param == "chunksize": - params_dict[param] = np.random.choice( - [ - None, - np.random.randint( - low=1, high=max(1, len(self._current_buffer)) - ), - ] - ) - else: - params_dict[param] = np.random.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/fuzzer.py b/python/cudf/cudf/_fuzz_testing/fuzzer.py deleted file mode 100644 index ee1b2c1f1c4..00000000000 --- a/python/cudf/cudf/_fuzz_testing/fuzzer.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import datetime -import json -import logging -import os -import sys -import traceback - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class Fuzzer: - def __init__( - self, - target, - data_handler_class, - dirs=None, - crash_reports_dir=None, - regression=False, - max_rows_size=100_000, - max_cols_size=1000, - runs=-1, - max_string_length=None, - params=None, - write_data_on_failure=True, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - self._target = target - self._dirs = [] if dirs is None else dirs - self._crash_dir = crash_reports_dir - self._data_handler = data_handler_class( - dirs=self._dirs, - max_rows=max_rows_size, - max_columns=max_cols_size, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - self._total_executions = 0 - self._regression = regression - self._start_time = None - self.runs = runs - self.params = params - self.write_data_on_failure = write_data_on_failure - - def log_stats(self): - end_time = datetime.datetime.now() - total_time_taken = end_time - self._start_time - - logging.info(f"Run-Time elapsed (hh:mm:ss.ms) {total_time_taken}") - - def write_crash(self, error): - error_file_name = str(datetime.datetime.now()) - if self._crash_dir: - crash_path = os.path.join( - self._crash_dir, - error_file_name + "_crash.json", - ) - crash_log_path = os.path.join( - self._crash_dir, - error_file_name + "_crash.log", - ) - else: - crash_path = error_file_name + "_crash.json" - crash_log_path = error_file_name + "_crash.log" - - with open(crash_path, "w") as f: - json.dump( - self._data_handler.current_params, f, sort_keys=True, indent=4 - ) - - logging.info(f"Crash params was written to {crash_path}") - - with open(crash_log_path, "w") as f: - f.write(str(error)) - logging.info(f"Crash exception was written to {crash_log_path}") - - if self.write_data_on_failure: - self._data_handler.write_data(error_file_name) - - def start(self): - while True: - logging.info(f"Running test {self._total_executions}") - file_name = self._data_handler.generate_input() - try: - self._start_time = datetime.datetime.now() - if self.params is None: - self._target(file_name) - else: - self._data_handler.set_rand_params(self.params) - kwargs = self._data_handler._current_params["test_kwargs"] - logging.info(f"Parameters passed: {str(kwargs)}") - self._target(file_name, **kwargs) - except KeyboardInterrupt: - logging.info( - f"Keyboard Interrupt encountered, stopping after " - f"{self.runs} runs." - ) - sys.exit(0) - except Exception as e: - logging.exception(e) - self.write_crash(traceback.format_exc()) - self.log_stats() - if self.runs != -1 and self._total_executions >= self.runs: - logging.info(f"Completed {self.runs}, stopping now.") - break - - self._total_executions += 1 diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py deleted file mode 100644 index ffb7171a855..00000000000 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
- -import copy -import json -import logging -import os -import random -import sys - -import numpy as np - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class IOFuzz: - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - max_structs_nesting_depth=None, - max_struct_null_frequency=None, - max_struct_types_at_each_level=None, - ): - dirs = [] if dirs is None else dirs - self._inputs = [] - self._max_rows = max_rows - self._max_columns = max_columns - self._max_string_length = max_string_length - self._max_lists_length = max_lists_length - self._max_lists_nesting_depth = max_lists_nesting_depth - self._max_structs_nesting_depth = max_structs_nesting_depth - self._max_struct_null_frequency = max_struct_null_frequency - self._max_struct_types_at_each_level = max_struct_types_at_each_level - - for i, path in enumerate(dirs): - if i == 0 and not os.path.exists(path): - raise FileNotFoundError(f"No {path} exists") - - if os.path.isfile(path) and path.endswith("_crash.json"): - self._load_params(path) - else: - for i in os.listdir(path): - file_name = os.path.join(path, i) - if os.path.isfile(file_name) and file_name.endswith( - "_crash.json" - ): - self._load_params(file_name) - self._regression = bool(self._inputs) - self._idx = 0 - self._current_params = {} - self._current_buffer = None - - def _load_params(self, path): - with open(path) as f: - params = json.load(f) - self._inputs.append(params) - - @staticmethod - def _rand(n): - return random.randrange(0, n + 1) - - def generate_input(self): - raise NotImplementedError("Must be implemented by inherited class") - - @property - def current_params(self): - return self._current_params - - def get_next_regression_params(self): - if self._idx >= len(self._inputs): - logging.info( - "Reached the end of all crash.json files to run..Exiting.." - ) - sys.exit(0) - param = self._inputs[self._idx] - dtypes_meta = param["dtypes_meta"] - num_rows = param["num_rows"] - num_cols = param["num_columns"] - seed = param["seed"] - random.seed(seed) - self._idx += 1 - self._current_params = copy.copy(param) - return dtypes_meta, num_rows, num_cols, seed - - def set_rand_params(self, params): - params_dict = { - param: np.random.choice(values) for param, values in params.items() - } - self._current_params["test_kwargs"] = self.process_kwargs( - params_dict=params_dict - ) - - def process_kwargs(self, params_dict): - return { - key: bool(value) - if isinstance(value, np.bool_) - else str(value) - if isinstance(value, np.dtype) - else value - for key, value in params_dict.items() - } diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py deleted file mode 100644 index e987529c8ba..00000000000 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import logging -import random -from collections import abc - -import numpy as np - -import cudf -from cudf._fuzz_testing.io import IOFuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - _generate_rand_meta, - pyarrow_to_pandas, -) -from cudf.testing import dataset_generator as dg -from cudf.utils.dtypes import pandas_dtypes_to_np_dtypes - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -def _get_dtype_param_value(dtype_val): - if dtype_val is not None and isinstance(dtype_val, abc.Mapping): - processed_dtypes = {} - for col_name, dtype in dtype_val.items(): - if isinstance(dtype, cudf.CategoricalDtype): - processed_dtypes[col_name] = "category" - else: - processed_dtypes[col_name] = str( - pandas_dtypes_to_np_dtypes.get(dtype, dtype) - ) - return processed_dtypes - return dtype_val - - -class JSONReader(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - seed = random.randint(0, 2**32 - 1) - random.seed(seed) - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - # https://github.com/pandas-dev/pandas/issues/20599 - - {"uint64"} - # TODO: Remove DATETIME_TYPES after this is fixed: - # https://github.com/rapidsai/cudf/issues/6586 - - set(cudf.utils.dtypes.DATETIME_TYPES) - ) - # TODO: Uncomment following after following - # issue is fixed: - # https://github.com/rapidsai/cudf/issues/7086 - # dtypes_list.extend(["list"]) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_columns"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - self._current_buffer = df - logging.info(f"Shape of DataFrame generated: {df.shape}") - - return df.to_json(orient="records", lines=True) - - def write_data(self, file_name): - if self._current_buffer is not None: - self._current_buffer.to_json( - file_name + "_crash_json.json", orient="records", lines=True - ) - - def set_rand_params(self, params): - params_dict = {} - for param, values in params.items(): - if param == "dtype" and values == ALL_POSSIBLE_VALUES: - dtype_val = np.random.choice( - [True, self._current_buffer.dtypes.to_dict()] - ) - params_dict[param] = _get_dtype_param_value(dtype_val) - else: - params_dict[param] = np.random.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) - - -class JSONWriter(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - ) - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - seed = 
random.randint(0, 2**32 - 1) - random.seed(seed) - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - # https://github.com/pandas-dev/pandas/issues/20599 - - {"uint64"} - # TODO: Remove DATETIME_TYPES after this is fixed: - # https://github.com/rapidsai/cudf/issues/6586 - - set(cudf.utils.dtypes.DATETIME_TYPES) - ) - # TODO: Uncomment following after following - # issue is fixed: - # https://github.com/rapidsai/cudf/issues/7086 - # dtypes_list.extend(["list"]) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_columns"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - - logging.info(f"Shape of DataFrame generated: {df.shape}") - self._current_buffer = df - return df - - def write_data(self, file_name): - if self._current_buffer is not None: - self._current_buffer.to_json( - file_name + "_crash_json.json", lines=True, orient="records" - ) - - def set_rand_params(self, params): - params_dict = {} - for param, values in params.items(): - if param == "dtype" and values == ALL_POSSIBLE_VALUES: - dtype_val = np.random.choice( - [True, self._current_buffer.dtypes.to_dict()] - ) - params_dict[param] = _get_dtype_param_value(dtype_val) - else: - params_dict[param] = np.random.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/main.py b/python/cudf/cudf/_fuzz_testing/main.py deleted file mode 100644 index 54e49b63e41..00000000000 --- a/python/cudf/cudf/_fuzz_testing/main.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. - -from cudf._fuzz_testing import fuzzer - - -class PythonFuzz: - def __init__(self, func, params=None, data_handle=None, **kwargs): - self.function = func - self.data_handler_class = data_handle - self.fuzz_worker = fuzzer.Fuzzer( - target=self.function, - data_handler_class=self.data_handler_class, - dirs=kwargs.get("dir", None), - crash_reports_dir=kwargs.get("crash_reports_dir", None), - regression=kwargs.get("regression", False), - max_rows_size=kwargs.get("max_rows_size", 100_000), - max_cols_size=kwargs.get("max_cols_size", 1000), - runs=kwargs.get("runs", -1), - max_string_length=kwargs.get("max_string_length", None), - params=params, - write_data_on_failure=kwargs.get("write_data_on_failure", True), - max_lists_length=kwargs.get("max_lists_length", None), - max_lists_nesting_depth=kwargs.get( - "max_lists_nesting_depth", None - ), - ) - - def __call__(self, *args, **kwargs): - self.fuzz_worker.start() - - -# wrap PythonFuzz to allow for deferred calling -def pythonfuzz(function=None, data_handle=None, params=None, **kwargs): - if function: - return PythonFuzz(function, params, **kwargs) - else: - - def wrapper(function): - return PythonFuzz(function, params, data_handle, **kwargs) - - return wrapper - - -if __name__ == "__main__": - PythonFuzz(None) diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py deleted file mode 100644 index ecddc72fa85..00000000000 --- a/python/cudf/cudf/_fuzz_testing/orc.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
- -import copy -import io -import logging -import random - -import numpy as np -import pyarrow as pa - -import cudf -from cudf._fuzz_testing.io import IOFuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - _generate_rand_meta, - pyarrow_to_pandas, -) -from cudf.testing import dataset_generator as dg - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class OrcReader(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - self._df = None - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - - {"category"} - # Following dtypes are not supported by orc - # https://orc.apache.org/specification/ORCv0/ - - cudf.utils.dtypes.TIMEDELTA_TYPES - - cudf.utils.dtypes.UNSIGNED_TYPES - - {"datetime64[ns]"} - ) - - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) - - self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_cols"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - logging.info(f"Shape of DataFrame generated: {table.shape}") - self._df = df - file_obj = io.BytesIO() - pa.orc.write_table(table, file_obj, stripe_size=self._rand(len(df))) - file_obj.seek(0) - buf = file_obj.read() - self._current_buffer = copy.copy(buf) - return (df, buf) - - def write_data(self, file_name): - if self._current_buffer is not None: - with open(file_name + "_crash.orc", "wb") as crash_dataset: - crash_dataset.write(self._current_buffer) - - def set_rand_params(self, params): - params_dict = {} - for param, values in params.items(): - if values == ALL_POSSIBLE_VALUES: - if param == "columns": - col_size = self._rand(len(self._df.columns)) - params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) - ) - elif param == "stripes": - f = io.BytesIO(self._current_buffer) - orcFile = pa.orc.ORCFile(f) - stripes = list(range(orcFile.nstripes)) - params_dict[param] = np.random.choice( - [ - None, - list( - map( - int, - np.unique( - np.random.choice( - stripes, orcFile.nstripes - ) - ), - ) - ), - ] - ) - elif param == "use_index": - params_dict[param] = np.random.choice([True, False]) - elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( - [None, self._rand(len(self._df))] - ) - else: - if not isinstance(values, list): - raise TypeError("values must be of type list") - params_dict[param] = np.random.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) - - -class OrcWriter(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - 
max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - self._df = None - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - # TODO: Remove "bool" from below - # list after following issue is fixed: - # https://github.com/rapidsai/cudf/issues/6763 - - {"category", "bool"} - # Following dtypes are not supported by orc - # https://orc.apache.org/specification/ORCv0/ - - cudf.utils.dtypes.TIMEDELTA_TYPES - - cudf.utils.dtypes.UNSIGNED_TYPES - # TODO: Remove `DATETIME_TYPES` once - # following bug is fixed: - # https://github.com/rapidsai/cudf/issues/7355 - - cudf.utils.dtypes.DATETIME_TYPES - ) - - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) - self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_cols"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - logging.info(f"Shape of DataFrame generated: {table.shape}") - self._df = df - return df - - def write_data(self, file_name): - # Due to the lack of really fast reference writer we are dumping - # the dataframe to a parquet file - if self._df is not None: - self._df.to_parquet(file_name + "_crash.parquet") diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py deleted file mode 100644 index 2d934e4816d..00000000000 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
- -import logging -import random - -import numpy as np - -import cudf -from cudf._fuzz_testing.io import IOFuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - _generate_rand_meta, - pyarrow_to_pandas, -) -from cudf.testing import dataset_generator as dg - -logging.basicConfig( - format="%(asctime)s %(levelname)-8s %(message)s", - level=logging.INFO, - datefmt="%Y-%m-%d %H:%M:%S", -) - - -class ParquetReader(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - self._df = None - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - - {"category", "datetime64[ns]"} - - cudf.utils.dtypes.TIMEDELTA_TYPES - # TODO: Remove uint32 below after this bug is fixed - # https://github.com/pandas-dev/pandas/issues/37327 - - {"uint32"} - | {"list", "decimal64"} - ) - - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) - self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_cols"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - logging.info(f"Shape of DataFrame generated: {table.shape}") - - # TODO: Change this to write into - # a BytesIO object once below issue is fixed - # https://issues.apache.org/jira/browse/ARROW-10123 - - # file = io.BytesIO() - - df.to_parquet("temp_file") - # file.seek(0) - # self._current_buffer = copy.copy(file.read()) - # return self._current_buffer - self._df = df - return "temp_file" - - def write_data(self, file_name): - if self._current_buffer is not None: - with open(file_name + "_crash.parquet", "wb") as crash_dataset: - crash_dataset.write(self._current_buffer) - - def set_rand_params(self, params): - params_dict = {} - for param, values in params.items(): - if param == "columns" and values == ALL_POSSIBLE_VALUES: - col_size = self._rand(len(self._df.columns)) - params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) - ) - else: - params_dict[param] = np.random.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) - - -class ParquetWriter(IOFuzz): - def __init__( - self, - dirs=None, - max_rows=100_000, - max_columns=1000, - max_string_length=None, - max_lists_length=None, - max_lists_nesting_depth=None, - ): - super().__init__( - dirs=dirs, - max_rows=max_rows, - max_columns=max_columns, - max_string_length=max_string_length, - max_lists_length=max_lists_length, - max_lists_nesting_depth=max_lists_nesting_depth, - ) - - def generate_input(self): - if self._regression: - ( - dtypes_meta, - num_rows, - num_cols, - seed, - ) = self.get_next_regression_params() - else: - seed = random.randint(0, 2**32 - 1) - random.seed(seed) - dtypes_list = list( - cudf.utils.dtypes.ALL_TYPES - - {"category", "timedelta64[ns]", "datetime64[ns]"} - # TODO: Remove uint32 below after this bug is fixed - # 
https://github.com/pandas-dev/pandas/issues/37327 - - {"uint32"} - | {"list", "decimal64"} - ) - dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list - ) - self._current_params["dtypes_meta"] = dtypes_meta - self._current_params["seed"] = seed - self._current_params["num_rows"] = num_rows - self._current_params["num_columns"] = num_cols - logging.info( - f"Generating DataFrame with rows: {num_rows} " - f"and columns: {num_cols}" - ) - - table = dg.rand_dataframe(dtypes_meta, num_rows, seed) - df = pyarrow_to_pandas(table) - - logging.info(f"Shape of DataFrame generated: {df.shape}") - self._current_buffer = df - return df - - def write_data(self, file_name): - if self._current_buffer is not None: - self._current_buffer.to_parquet(file_name + "_crash.parquet") diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_avro.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_avro.py deleted file mode 100644 index 5a90aec5828..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_avro.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -import sys - -import cudf -from cudf._fuzz_testing.avro import AvroReader -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - compare_dataframe, - run_test, -) - - -@pythonfuzz( - data_handle=AvroReader, - params={ - "columns": ALL_POSSIBLE_VALUES, - "skiprows": ALL_POSSIBLE_VALUES, - "num_rows": ALL_POSSIBLE_VALUES, - }, -) -def avro_reader_test(input_tuple, columns, skiprows, num_rows): - pdf, parquet_buffer = input_tuple - expected_pdf = pdf[skiprows:] - if num_rows is not None: - expected_pdf = expected_pdf.head(num_rows) - if skiprows is not None or num_rows is not None: - expected_pdf = expected_pdf.reset_index(drop=True) - - gdf = cudf.read_avro( - parquet_buffer, columns=columns, skiprows=skiprows, num_rows=num_rows - ) - compare_dataframe(expected_pdf, gdf) - - -if __name__ == "__main__": - run_test(globals(), sys.argv) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py deleted file mode 100644 index d90f3ea1aca..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import sys -from io import StringIO - -import pandas as pd - -import cudf -from cudf._fuzz_testing.csv import CSVReader, CSVWriter -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - compare_content, - run_test, -) -from cudf.testing import assert_eq - - -@pythonfuzz(data_handle=CSVReader) -def csv_reader_test(csv_buffer): - pdf = pd.read_csv(StringIO(csv_buffer)) - gdf = cudf.read_csv(StringIO(csv_buffer)) - - assert_eq(gdf, pdf) - - -@pythonfuzz(data_handle=CSVWriter) -def csv_writer_test(pdf): - gdf = cudf.from_pandas(pdf) - - pd_buffer = pdf.to_csv() - gd_buffer = gdf.to_csv() - - compare_content(pd_buffer, gd_buffer) - actual = cudf.read_csv(StringIO(gd_buffer)) - expected = pd.read_csv(StringIO(pd_buffer)) - assert_eq(actual, expected) - - -@pythonfuzz( - data_handle=CSVWriter, - params={ - "sep": list([",", "|", "\t", "\r", "~"]), - "header": [True, False], - "na_rep": [ - "", - "", - "NA", - "_NA_", - "__", - "<<<<>>>>>", - "--<>--", - "-+><+-", - ], - "columns": ALL_POSSIBLE_VALUES, - "index": [True, False], - "lineterminator": ["\n", "\r", "\r\n"], - "chunksize": ALL_POSSIBLE_VALUES, - }, -) -def csv_writer_test_params( - pdf, sep, header, na_rep, columns, index, lineterminator, chunksize -): - gdf = cudf.from_pandas(pdf) - - pd_buffer = pdf.to_csv( - sep=sep, - header=header, - na_rep=na_rep, - columns=columns, - index=index, - lineterminator=lineterminator, - chunksize=chunksize, - ) - gd_buffer = gdf.to_csv( - sep=sep, - header=header, - na_rep=na_rep, - columns=columns, - index=index, - lineterminator=lineterminator, - chunksize=chunksize, - ) - - # TODO: Uncomment once this issue is fixed - # https://github.com/rapidsai/cudf/issues/6418 - # compare_content(pd_buffer, gd_buffer) - - actual = cudf.read_csv( - StringIO(gd_buffer), - delimiter=sep, - na_values=na_rep, - lineterminator=lineterminator, - ) - expected = pd.read_csv( - StringIO(pd_buffer), - delimiter=sep, - na_values=na_rep, - lineterminator=lineterminator, - ) - if not header: - # TODO: Remove renaming columns once the following bug is fixed: - # https://github.com/rapidsai/cudf/issues/6418 - actual.columns = expected.columns - - assert_eq(actual, expected) - - -@pythonfuzz( - data_handle=CSVReader, - params={ - "dtype": ALL_POSSIBLE_VALUES, - "usecols": ALL_POSSIBLE_VALUES, - "header": ALL_POSSIBLE_VALUES, - "skiprows": ALL_POSSIBLE_VALUES, - "skipfooter": ALL_POSSIBLE_VALUES, - "nrows": ALL_POSSIBLE_VALUES, - }, -) -def csv_reader_test_params(csv_buffer, dtype, header, skiprows): - pdf = pd.read_csv( - StringIO(csv_buffer), dtype=dtype, header=header, skiprows=skiprows - ) - gdf = cudf.read_csv( - StringIO(csv_buffer), dtype=dtype, header=header, skiprows=skiprows - ) - - assert_eq(gdf, pdf) - - -if __name__ == "__main__": - run_test(globals(), sys.argv) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py deleted file mode 100644 index 69e9437be93..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import io -import sys - -import pandas as pd - -import cudf -from cudf._fuzz_testing.json import JSONReader, JSONWriter -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test -from cudf.testing import assert_eq - - -@pythonfuzz(data_handle=JSONReader) -def json_reader_test(json_buffer): - pdf = pd.read_json(io.StringIO(json_buffer), orient="records", lines=True) - # Difference in behaviour with pandas - # cudf reads column as strings only. - pdf.columns = pdf.columns.astype("str") - gdf = cudf.read_json(io.StringIO(json_buffer), engine="cudf", lines=True) - - assert_eq(gdf, pdf) - - -@pythonfuzz(data_handle=JSONReader, params={"dtype": ALL_POSSIBLE_VALUES}) -def json_reader_test_params(json_buffer, dtype): - pdf = pd.read_json(json_buffer, dtype=dtype, orient="records", lines=True) - pdf.columns = pdf.columns.astype("str") - - gdf = cudf.read_json(json_buffer, dtype=dtype, engine="cudf", lines=True) - - assert_eq(gdf, pdf) - - -@pythonfuzz(data_handle=JSONWriter) -def json_writer_test(pdf): - gdf = cudf.from_pandas(pdf) - - pdf_buffer = pdf.to_json(lines=True, orient="records") - gdf_buffer = gdf.to_json(lines=True, orient="records") - - # TODO: Uncomment once this is fixed: - # https://github.com/rapidsai/cudf/issues/6429 - # compare_content(pdf_buffer, gdf_buffer) - - actual = cudf.read_json( - gdf_buffer, engine="cudf", lines=True, orient="records" - ) - expected = pd.read_json(pdf_buffer, lines=True, orient="records") - expected.columns = expected.columns.astype("str") - assert_eq(actual, expected) - - -@pythonfuzz( - data_handle=JSONWriter, - params={ - "compression": ["gzip", "bz2", "zip", "xz", None], - "dtype": ALL_POSSIBLE_VALUES, - }, -) -def json_writer_test_params(pdf, compression, dtype): - gdf = cudf.from_pandas(pdf) - - pdf_buffer = pdf.to_json( - lines=True, orient="records", compression=compression - ) - gdf_buffer = gdf.to_json( - lines=True, orient="records", compression=compression - ) - - # TODO: Uncomment once this is fixed: - # https://github.com/rapidsai/cudf/issues/6429 - # compare_content(pdf_buffer, gdf_buffer) - - actual = cudf.read_json( - io.StringIO(gdf_buffer), - engine="cudf", - lines=True, - orient="records", - dtype=dtype, - ) - expected = pd.read_json( - io.StringIO(pdf_buffer), lines=True, orient="records", dtype=dtype - ) - - # Difference in behaviour with pandas - # cudf reads column as strings only. - expected.columns = expected.columns.astype("str") - assert_eq(actual, expected) - - -if __name__ == "__main__": - run_test(globals(), sys.argv) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py deleted file mode 100644 index 977038d1fcb..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_orc.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
- -import io -import sys - -import cudf -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.orc import OrcReader, OrcWriter -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - compare_dataframe, - orc_to_pandas, - run_test, -) - - -@pythonfuzz( - data_handle=OrcReader, - params={ - "columns": ALL_POSSIBLE_VALUES, - "skiprows": ALL_POSSIBLE_VALUES, - "num_rows": ALL_POSSIBLE_VALUES, - "use_index": ALL_POSSIBLE_VALUES, - }, -) -def orc_reader_test(input_tuple, columns, skiprows, num_rows, use_index): - pdf, file_buffer = input_tuple - expected_pdf = pdf.iloc[skiprows:] - if num_rows is not None: - expected_pdf = expected_pdf.head(num_rows) - if skiprows is not None or num_rows is not None: - expected_pdf.reset_index(drop=True, inplace=True) - if columns is not None and len(columns) > 0: - # ORC reader picks columns if only - # there are any elements in `columns` - expected_pdf = expected_pdf[columns] - if use_index is False: - expected_pdf.reset_index(drop=True, inplace=True) - - gdf = cudf.read_orc( - io.BytesIO(file_buffer), - columns=columns, - skiprows=skiprows, - num_rows=num_rows, - use_index=use_index, - ) - - compare_dataframe(expected_pdf, gdf) - - -@pythonfuzz( - data_handle=OrcReader, - params={"columns": ALL_POSSIBLE_VALUES, "stripes": ALL_POSSIBLE_VALUES}, -) -def orc_reader_stripes_test(input_tuple, columns, stripes): - _, file_buffer = input_tuple - expected_pdf = orc_to_pandas( - file_io_obj=io.BytesIO(file_buffer), stripes=stripes - ) - - if columns is not None and len(columns) > 0: - # ORC reader picks columns if only - # there are any elements in `columns` - expected_pdf = expected_pdf[columns] - - gdf = cudf.read_orc( - io.BytesIO(file_buffer), columns=columns, stripes=stripes - ) - - compare_dataframe(expected_pdf, gdf) - - -@pythonfuzz( - data_handle=OrcWriter, - params={ - "compression": [None, "snappy"], - "enable_statistics": ["NONE", "STRIPE", "ROWGROUP"], - }, -) -def orc_writer_test(pdf, compression, enable_statistics): - file_to_strore = io.BytesIO() - - gdf = cudf.from_pandas(pdf) - - gdf.to_orc( - file_to_strore, - compression=compression, - enable_statistics=enable_statistics, - ) - file_to_strore.seek(0) - - actual_df = cudf.read_orc(file_to_strore) - - compare_dataframe(pdf, actual_df) - - -if __name__ == "__main__": - run_test(globals(), sys.argv) diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py deleted file mode 100644 index 3d070576a12..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
- -import sys - -import numpy as np -import pandas as pd - -import cudf -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.parquet import ParquetReader, ParquetWriter -from cudf._fuzz_testing.utils import ( - ALL_POSSIBLE_VALUES, - compare_dataframe, - run_test, -) - - -@pythonfuzz(data_handle=ParquetReader) -def parquet_reader_test(parquet_buffer): - pdf = pd.read_parquet(parquet_buffer) - gdf = cudf.read_parquet(parquet_buffer) - - compare_dataframe(gdf, pdf) - - -@pythonfuzz( - data_handle=ParquetReader, - params={ - "columns": ALL_POSSIBLE_VALUES, - "use_pandas_metadata": [True, False], - }, -) -def parquet_reader_columns(parquet_buffer, columns, use_pandas_metadata): - pdf = pd.read_parquet( - parquet_buffer, - columns=columns, - use_pandas_metadata=use_pandas_metadata, - ) - - gdf = cudf.read_parquet( - parquet_buffer, - columns=columns, - use_pandas_metadata=use_pandas_metadata, - ) - - compare_dataframe(gdf, pdf) - - -@pythonfuzz(data_handle=ParquetWriter) -def parquet_writer_test(pdf): - pd_file_name = "cpu_pdf.parquet" - gd_file_name = "gpu_pdf.parquet" - - gdf = cudf.from_pandas(pdf) - - pdf.to_parquet(pd_file_name) - gdf.to_parquet(gd_file_name) - - actual = cudf.read_parquet(gd_file_name) - expected = pd.read_parquet(pd_file_name) - compare_dataframe(actual, expected) - - actual = cudf.read_parquet(pd_file_name) - expected = pd.read_parquet(gd_file_name) - compare_dataframe(actual, expected) - - -@pythonfuzz( - data_handle=ParquetWriter, - params={ - "row_group_size": np.random.random_integers(1, 10000, 100), - "compression": ["snappy", None], - }, -) -def parquet_writer_test_rowgroup_index_compression( - pdf, compression, row_group_size -): - pd_file_name = "cpu_pdf.parquet" - gd_file_name = "gpu_pdf.parquet" - - gdf = cudf.from_pandas(pdf) - - pdf.to_parquet( - pd_file_name, - compression=compression, - row_group_size=row_group_size, - ) - gdf.to_parquet( - gd_file_name, - compression=compression, - row_group_size=row_group_size, - ) - - actual = cudf.read_parquet(gd_file_name) - expected = pd.read_parquet(pd_file_name) - compare_dataframe(actual, expected) - - actual = cudf.read_parquet(pd_file_name) - expected = pd.read_parquet(gd_file_name) - compare_dataframe(actual, expected, nullable=False) - - -if __name__ == "__main__": - run_test(globals(), sys.argv) diff --git a/python/cudf/cudf/_fuzz_testing/tests/readme.md b/python/cudf/cudf/_fuzz_testing/tests/readme.md deleted file mode 100644 index f9ef1119a21..00000000000 --- a/python/cudf/cudf/_fuzz_testing/tests/readme.md +++ /dev/null @@ -1,100 +0,0 @@ -# Fuzz Tests - -This directory contains all the Fuzz tests for cudf library. - - -## Steps to write a fuzz test - -1. Add a Data Handler class which actually generates the necessary random data according to your requirements. This class should be added in `cudf/cudf/testing/`. A sample data handler class is: `CSVWriter`: https://github.com/rapidsai/cudf/blob/branch-0.16/python/cudf/cudf/testing/csv.py -2. Data Handlers are registered by the `pythonfuzz` decorator. At runtime, the Fuzzer will continuously run registered fuzz tests. - -```python -from cudf.testing.csv import CSVWriter - -@pythonfuzz(data_handle=CSVWriter) -def csv_writer_test(data_from_generate_input): - ... - ... - ... - -if __name__ == "__main__": - ... - ... - -``` -## Steps to run fuzz tests - -1. 
To run a fuzz test, for example a test(method) is in `write_csv.py`: - -```bash -python write_csv.py your_function_name -``` - -To run a basic csv write test in `write_csv.py`: -```bash -python write_csv.py csv_writer_test -``` - -## Tips to run specific crash file/files - -Using the `pythonfuzz` decorator pass in `regression=True` with `dirs` having list of directories -```python -@pythonfuzz(data_handle=CSVWriter, regression=True, dir=["/cudf/python/cudf/cudf/_fuzz_testing"]) -``` - - -## Tips to run for varying parameter combinations - -In the `pythonfuzz` decorator you can pass in the function parameters you would like to pass to the -fuzz-test being written via `params` as a dictionary. The values in dictionary are sampled randomly -and passed to the `your_custom_fuzz_test`. - -If a parameter value depends the kind of input generated by the `data_handle`(in this case `CSVReader`), -then you can assign `ALL_POSSIBLE_VALUES` constant to it. This constant is used as an identifier by the -`data_handle` to generate random parameter values for that specific parameter purely based on data. -To perform this customization `set_rand_params` should be implemented as shown in the below example. -```python -from cudf._fuzz_testing.main import pythonfuzz -from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES -@pythonfuzz( - data_handle=CSVWriter, - params={ - "columns": ALL_POSSIBLE_VALUES, - "is_folder": [True, False, None], - "chunksize": ALL_POSSIBLE_VALUES, - }, -) -def your_custom_fuzz_test(data_from_data_handle, dtype, is_folder, header): - ... - ... - ... -``` - -A sample implementation of `set_rand_params` in a `data_handle` class: -``` -def set_rand_params(self, params): - params_dict = {} - for param, values in params.items(): - if values == ALL_POSSIBLE_VALUES: - if param == "columns": - col_size = self._rand(len(self._current_buffer.columns)) - params_dict[param] = list( - np.unique( - np.random.choice( - self._current_buffer.columns, col_size - ) - ) - ) - elif param == "chunksize": - params_dict[param] = np.random.choice( - [ - None, - np.random.randint( - low=1, high=max(1, len(self._current_buffer)) - ), - ] - ) - else: - params_dict[param] = np.random.choice(values) - self._current_params["test_kwargs"] = self.process_kwargs(params_dict) -``` diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py deleted file mode 100644 index 8ce92e1c0f6..00000000000 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import random - -import fastavro -import numpy as np -import pandas as pd -import pyarrow as pa - -import cudf -from cudf.testing import assert_eq -from cudf.utils.dtypes import ( - pandas_dtypes_to_np_dtypes, - pyarrow_dtypes_to_pandas_dtypes, -) - -ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES" - -_PANDAS_TO_AVRO_SCHEMA_MAP = { - cudf.dtype("int8"): "int", - pd.Int8Dtype(): ["int", "null"], - pd.Int16Dtype(): ["int", "null"], - pd.Int32Dtype(): ["int", "null"], - pd.Int64Dtype(): ["long", "null"], - pd.Float32Dtype(): ["float", "null"], - pd.Float64Dtype(): ["double", "null"], - pd.BooleanDtype(): ["boolean", "null"], - pd.StringDtype(): ["string", "null"], - cudf.dtype("bool_"): "boolean", - cudf.dtype("int16"): "int", - cudf.dtype("int32"): "int", - cudf.dtype("int64"): "long", - cudf.dtype("O"): "string", - cudf.dtype("str"): "string", - cudf.dtype("float32"): "float", - cudf.dtype("float64"): "double", - cudf.dtype("") - -include(${rapids-cmake-dir}/export/find_package_root.cmake) -include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) -target_link_libraries(interop PUBLIC nanoarrow) - -add_subdirectory(io) -add_subdirectory(nvtext) -add_subdirectory(strings) +target_link_libraries(column PUBLIC cudf_strings_udf) diff --git a/python/cudf/cudf/_lib/__init__.pxd b/python/cudf/cudf/_lib/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 918edb6d3f1..e69de29bb2d 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,45 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -import numpy as np - -from . import ( - avro, - binaryop, - concat, - copying, - csv, - datetime, - filling, - groupby, - hash, - interop, - join, - json, - labeling, - merge, - null_mask, - nvtext, - orc, - parquet, - partitioning, - quantiles, - reduce, - replace, - reshape, - rolling, - round, - search, - sort, - stream_compaction, - string_casting, - strings, - strings_udf, - text, - timezone, - transpose, - unary, -) - -MAX_COLUMN_SIZE = np.iinfo(np.int32).max -MAX_COLUMN_SIZE_STR = "INT32_MAX" -MAX_STRING_COLUMN_BYTES = np.iinfo(np.int32).max -MAX_STRING_COLUMN_BYTES_STR = "INT32_MAX" diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx deleted file mode 100644 index 3c96b90f0a1..00000000000 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import pandas as pd -from numba.np import numpy_support - -import pylibcudf - -import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES -from cudf.utils import cudautils - -_agg_name_map = { - "COUNT_VALID": "COUNT", - "COUNT_ALL": "SIZE", - "VARIANCE": "VAR", - "NTH_ELEMENT": "NTH", - "COLLECT_LIST": "COLLECT", - "COLLECT_SET": "UNIQUE", -} - - -class Aggregation: - def __init__(self, agg): - self.c_obj = agg - - @property - def kind(self): - name = self.c_obj.kind().name - return _agg_name_map.get(name, name) - - @classmethod - def sum(cls): - return cls(pylibcudf.aggregation.sum()) - - @classmethod - def min(cls): - return cls(pylibcudf.aggregation.min()) - - @classmethod - def max(cls): - return cls(pylibcudf.aggregation.max()) - - @classmethod - def idxmin(cls): - return cls(pylibcudf.aggregation.argmin()) - - @classmethod - def idxmax(cls): - return cls(pylibcudf.aggregation.argmax()) - - @classmethod - def mean(cls): - return cls(pylibcudf.aggregation.mean()) - - @classmethod - def count(cls, dropna=True): - return cls(pylibcudf.aggregation.count( - pylibcudf.types.NullPolicy.EXCLUDE - if dropna else pylibcudf.types.NullPolicy.INCLUDE - )) - - @classmethod - def ewma(cls, com=1.0, adjust=True): - return cls(pylibcudf.aggregation.ewma( - com, - pylibcudf.aggregation.EWMHistory.INFINITE - if adjust else pylibcudf.aggregation.EWMHistory.FINITE - )) - - @classmethod - def size(cls): - return cls(pylibcudf.aggregation.count(pylibcudf.types.NullPolicy.INCLUDE)) - - @classmethod - def collect(cls): - return cls( - pylibcudf.aggregation.collect_list(pylibcudf.types.NullPolicy.INCLUDE) - ) - - @classmethod - def nunique(cls, dropna=True): - return cls(pylibcudf.aggregation.nunique( - pylibcudf.types.NullPolicy.EXCLUDE - if dropna else pylibcudf.types.NullPolicy.INCLUDE - )) - - @classmethod - def nth(cls, size): - return cls(pylibcudf.aggregation.nth_element(size)) - - @classmethod - def product(cls): - return cls(pylibcudf.aggregation.product()) - prod = product - - @classmethod - def sum_of_squares(cls): - return cls(pylibcudf.aggregation.sum_of_squares()) - - @classmethod - def var(cls, ddof=1): - return cls(pylibcudf.aggregation.variance(ddof)) - - @classmethod - def std(cls, ddof=1): - return cls(pylibcudf.aggregation.std(ddof)) - - @classmethod - def median(cls): - return cls(pylibcudf.aggregation.median()) - - @classmethod - def quantile(cls, q=0.5, interpolation="linear"): - if not pd.api.types.is_list_like(q): - q = [q] - - return cls(pylibcudf.aggregation.quantile( - q, pylibcudf.types.Interpolation[interpolation.upper()] - )) - - @classmethod - def unique(cls): - return cls(pylibcudf.aggregation.collect_set( - pylibcudf.types.NullPolicy.INCLUDE, - pylibcudf.types.NullEquality.EQUAL, - pylibcudf.types.NanEquality.ALL_EQUAL, - - )) - - @classmethod - def first(cls): - return cls( - pylibcudf.aggregation.nth_element(0, pylibcudf.types.NullPolicy.EXCLUDE) - ) - - @classmethod - def last(cls): - return cls( - pylibcudf.aggregation.nth_element(-1, pylibcudf.types.NullPolicy.EXCLUDE) - ) - - @classmethod - def corr(cls, method, min_periods): - return cls(pylibcudf.aggregation.correlation( - pylibcudf.aggregation.CorrelationType[method.upper()], - min_periods - - )) - - @classmethod - def cov(cls, min_periods, ddof=1): - return cls(pylibcudf.aggregation.covariance( - min_periods, - ddof - )) - - # scan aggregations - @classmethod - def cumcount(cls): - return cls.count(False) - - cumsum = sum - cummin = min - cummax = max - cumprod = product - - @classmethod - def 
rank(cls, method, ascending, na_option, pct): - return cls(pylibcudf.aggregation.rank( - pylibcudf.aggregation.RankMethod[method.upper()], - (pylibcudf.types.Order.ASCENDING if ascending else - pylibcudf.types.Order.DESCENDING), - (pylibcudf.types.NullPolicy.EXCLUDE if na_option == "keep" else - pylibcudf.types.NullPolicy.INCLUDE), - (pylibcudf.types.NullOrder.BEFORE - if (na_option == "top") == ascending else - pylibcudf.types.NullOrder.AFTER), - (pylibcudf.aggregation.RankPercentage.ZERO_NORMALIZED - if pct else - pylibcudf.aggregation.RankPercentage.NONE) - - )) - - # Reduce aggregations - @classmethod - def any(cls): - return cls(pylibcudf.aggregation.any()) - - @classmethod - def all(cls): - return cls(pylibcudf.aggregation.all()) - - # Rolling aggregations - @classmethod - def from_udf(cls, op, *args, **kwargs): - # Handling UDF type - nb_type = numpy_support.from_dtype(kwargs['dtype']) - type_signature = (nb_type[:],) - ptx_code, output_dtype = cudautils.compile_udf(op, type_signature) - output_np_dtype = cudf.dtype(output_dtype) - if output_np_dtype not in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: - raise TypeError(f"Result of window function has unsupported dtype {op[1]}") - - return cls( - pylibcudf.aggregation.udf( - ptx_code, - pylibcudf.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[output_np_dtype]), - ) - ) - - -def make_aggregation(op, kwargs=None): - r""" - Parameters - ---------- - op : str or callable - If callable, must meet one of the following requirements: - - * Is of the form lambda x: x.agg(*args, **kwargs), where - `agg` is the name of a supported aggregation. Used to - to specify aggregations that take arguments, e.g., - `lambda x: x.quantile(0.5)`. - * Is a user defined aggregation function that operates on - group values. In this case, the output dtype must be - specified in the `kwargs` dictionary. - \*\*kwargs : dict, optional - Any keyword arguments to be passed to the op. - - Returns - ------- - Aggregation - """ - if kwargs is None: - kwargs = {} - - if isinstance(op, str): - return getattr(Aggregation, op)(**kwargs) - elif callable(op): - if op is list: - return Aggregation.collect() - elif "dtype" in kwargs: - return Aggregation.from_udf(op, **kwargs) - else: - return op(Aggregation) - raise TypeError(f"Unknown aggregation {op}") diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx deleted file mode 100644 index b1759635a36..00000000000 --- a/python/cudf/cudf/_lib/avro.pyx +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf._lib.utils cimport data_from_pylibcudf_io - -import pylibcudf as plc -from pylibcudf.io.types import SourceInfo - - -cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1): - """ - Cython function to call libcudf read_avro, see `read_avro`. 
- - See Also - -------- - cudf.io.avro.read_avro - """ - - num_rows = -1 if num_rows is None else num_rows - skip_rows = 0 if skip_rows is None else skip_rows - - if not isinstance(num_rows, int) or num_rows < -1: - raise TypeError("num_rows must be an int >= -1") - if not isinstance(skip_rows, int) or skip_rows < 0: - raise TypeError("skip_rows must be an int >= 0") - - return data_from_pylibcudf_io( - plc.io.avro.read_avro( - SourceInfo([datasource]), - columns, - skip_rows, - num_rows - ) - ) diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx deleted file mode 100644 index e2547476849..00000000000 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import pylibcudf - -from cudf._lib.scalar import as_device_scalar -from cudf.core.buffer import acquire_spill_lock - -# Map pandas operation names to pylibcudf operation names. -_op_map = { - "TRUEDIV": "TRUE_DIV", - "FLOORDIV": "FLOOR_DIV", - "MOD": "PYMOD", - "EQ": "EQUAL", - "NE": "NOT_EQUAL", - "LT": "LESS", - "GT": "GREATER", - "LE": "LESS_EQUAL", - "GE": "GREATER_EQUAL", - "AND": "BITWISE_AND", - "OR": "BITWISE_OR", - "XOR": "BITWISE_XOR", - "L_AND": "LOGICAL_AND", - "L_OR": "LOGICAL_OR", -} - - -@acquire_spill_lock() -def binaryop(lhs, rhs, op, dtype): - """ - Dispatches a binary op call to the appropriate libcudf function: - """ - # TODO: Shouldn't have to keep special-casing. We need to define a separate - # pipeline for libcudf binops that don't map to Python binops. - if op not in {"INT_POW", "NULL_EQUALS", "NULL_NOT_EQUALS"}: - op = op[2:-2] - op = op.upper() - op = _op_map.get(op, op) - - return Column.from_pylibcudf( - # Check if the dtype args are desirable here. - pylibcudf.binaryop.binary_operation( - lhs.to_pylibcudf(mode="read") if isinstance(lhs, Column) - else ( - as_device_scalar( - lhs, dtype=rhs.dtype if lhs is None else None - ) - ).c_value, - rhs.to_pylibcudf(mode="read") if isinstance(rhs, Column) - else ( - as_device_scalar( - rhs, dtype=lhs.dtype if rhs is None else None - ) - ).c_value, - pylibcudf.binaryop.BinaryOperator[op], - dtype_to_pylibcudf_type(dtype), - ) - ) diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd deleted file mode 100644 index 8ceea4920e2..00000000000 --- a/python/cudf/cudf/_lib/column.pxd +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from typing import Literal - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport ( - column_view, - mutable_column_view, -) -from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer - - -cdef class Column: - cdef public: - cdef int _offset - cdef int _size - cdef object _dtype - cdef object _base_children - cdef object _base_data - cdef object _base_mask - cdef object _children - cdef object _data - cdef object _mask - cdef object _null_count - cdef object _distinct_count - - cdef column_view _view(self, size_type null_count) except * - cdef column_view view(self) except * - cdef mutable_column_view mutable_view(self) except * - cpdef to_pylibcudf(self, mode: Literal["read", "write"]) - - @staticmethod - cdef Column from_unique_ptr( - unique_ptr[column] c_col, bint data_ptr_exposed=* - ) - - @staticmethod - cdef Column from_column_view(column_view, object) - - cdef size_type compute_null_count(self) except? 0 diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi deleted file mode 100644 index bb38488eefb..00000000000 --- a/python/cudf/cudf/_lib/column.pyi +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -from __future__ import annotations - -from typing_extensions import Self - -from cudf._typing import Dtype, DtypeObj, ScalarLike -from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase - -class Column: - _data: Buffer | None - _mask: Buffer | None - _base_data: Buffer | None - _base_mask: Buffer | None - _dtype: DtypeObj - _size: int - _offset: int - _null_count: int - _children: tuple[ColumnBase, ...] - _base_children: tuple[ColumnBase, ...] - _distinct_count: dict[bool, int] - - def __init__( - self, - data: Buffer | None, - size: int, - dtype: Dtype, - mask: Buffer | None = None, - offset: int | None = None, - null_count: int | None = None, - children: tuple[ColumnBase, ...] = (), - ) -> None: ... - @property - def base_size(self) -> int: ... - @property - def dtype(self) -> DtypeObj: ... - @property - def size(self) -> int: ... - @property - def base_data(self) -> Buffer | None: ... - @property - def data(self) -> Buffer | None: ... - @property - def data_ptr(self) -> int: ... - def set_base_data(self, value: Buffer) -> None: ... - @property - def nullable(self) -> bool: ... - def has_nulls(self, include_nan: bool = False) -> bool: ... - @property - def base_mask(self) -> Buffer | None: ... - @property - def mask(self) -> Buffer | None: ... - @property - def mask_ptr(self) -> int: ... - def set_base_mask(self, value: Buffer | None) -> None: ... - def set_mask(self, value: ColumnBase | Buffer | None) -> Self: ... - @property - def null_count(self) -> int: ... - @property - def offset(self) -> int: ... - @property - def base_children(self) -> tuple[ColumnBase, ...]: ... - @property - def children(self) -> tuple[ColumnBase, ...]: ... - def set_base_children(self, value: tuple[ColumnBase, ...]) -> None: ... - def _mimic_inplace( - self, other_col: ColumnBase, inplace=False - ) -> Self | None: ... - - # TODO: The val parameter should be Scalar, not ScalarLike - @staticmethod - def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ... 
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 99e4c21df8a..fc1ee369480 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,838 +1,6 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +from cudf._lib.scalar import g +from rmm._lib.device_buffer cimport device_buffer -from typing import Literal - -import cupy as cp -import numpy as np -import pandas as pd - -import pylibcudf -import rmm - -import cudf -import cudf._lib as libcudf -from cudf.core.buffer import ( - Buffer, - ExposureTrackedBuffer, - SpillableBuffer, - acquire_spill_lock, - as_buffer, - cuda_array_interface_wrapper, -) -from cudf.utils.dtypes import _get_base_dtype - -from cpython.buffer cimport PyObject_CheckBuffer -from libc.stdint cimport uintptr_t -from libcpp.memory cimport make_unique, unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from rmm._lib.device_buffer cimport DeviceBuffer - -from cudf._lib.types cimport ( - dtype_from_column_view, - dtype_to_data_type, - dtype_to_pylibcudf_type, -) - -from cudf._lib.null_mask import bitmask_allocation_size_bytes -from cudf._lib.types import dtype_from_pylibcudf_column - -cimport pylibcudf.libcudf.copying as cpp_copying -cimport pylibcudf.libcudf.types as libcudf_types -cimport pylibcudf.libcudf.unary as libcudf_unary -from pylibcudf.libcudf.column.column cimport column, column_contents -from pylibcudf.libcudf.column.column_factories cimport ( - make_column_from_scalar as cpp_make_column_from_scalar, - make_numeric_column, -) -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count -from pylibcudf.libcudf.scalar.scalar cimport scalar - -from cudf._lib.scalar cimport DeviceScalar - - -cdef get_element(column_view col_view, size_type index): - - cdef unique_ptr[scalar] c_output - with nogil: - c_output = move( - cpp_copying.get_element(col_view, index) - ) - - return DeviceScalar.from_unique_ptr( - move(c_output), dtype=dtype_from_column_view(col_view) - ) - - -cdef class Column: - """ - A Column stores columnar data in device memory. - A Column may be composed of: - - * A *data* Buffer - * One or more (optional) *children* Columns - * An (optional) *mask* Buffer representing the nullmask - - The *dtype* indicates the Column's element type. 
- """ - def __init__( - self, - object data, - int size, - object dtype, - object mask=None, - int offset=0, - object null_count=None, - tuple children=() - ): - if size < 0: - raise ValueError("size must be >=0") - self._size = size - self._distinct_count = {} - self._dtype = dtype - self._offset = offset - self._null_count = null_count - self.set_base_children(children) - self.set_base_data(data) - self.set_base_mask(mask) - - @property - def base_size(self): - return int(self.base_data.size / self.dtype.itemsize) - - @property - def dtype(self): - return self._dtype - - @property - def size(self): - return self._size - - @property - def base_data(self): - return self._base_data - - @property - def data(self): - if self.base_data is None: - return None - if self._data is None: - start = self.offset * self.dtype.itemsize - end = start + self.size * self.dtype.itemsize - self._data = self.base_data[start:end] - return self._data - - @property - def data_ptr(self): - if self.data is None: - return 0 - else: - return self.data.get_ptr(mode="write") - - def set_base_data(self, value): - if value is not None and not isinstance(value, Buffer): - raise TypeError( - "Expected a Buffer or None for data, " - f"got {type(value).__name__}" - ) - - self._data = None - self._base_data = value - - @property - def nullable(self): - return self.base_mask is not None - - def has_nulls(self, include_nan=False): - return int(self.null_count) != 0 - - @property - def base_mask(self): - return self._base_mask - - @property - def mask(self): - if self._mask is None: - if self.base_mask is None or self.offset == 0: - self._mask = self.base_mask - else: - self._mask = libcudf.null_mask.copy_bitmask(self) - return self._mask - - @property - def mask_ptr(self): - if self.mask is None: - return 0 - else: - return self.mask.get_ptr(mode="write") - - def set_base_mask(self, value): - """ - Replaces the base mask buffer of the column inplace. This does not - modify size or offset in any way, so the passed mask is expected to be - compatible with the current offset. - """ - if value is not None and not isinstance(value, Buffer): - raise TypeError( - "Expected a Buffer or None for mask, " - f"got {type(value).__name__}" - ) - - if value is not None: - # bitmask size must be relative to offset = 0 data. - required_size = bitmask_allocation_size_bytes(self.base_size) - if value.size < required_size: - error_msg = ( - "The Buffer for mask is smaller than expected, " - f"got {value.size} bytes, expected {required_size} bytes." - ) - if self.offset > 0 or self.size < self.base_size: - error_msg += ( - "\n\nNote: The mask is expected to be sized according " - "to the base allocation as opposed to the offsetted or" - " sized allocation." - ) - raise ValueError(error_msg) - - self._mask = None - self._children = None - self._base_mask = value - self._clear_cache() - - def _clear_cache(self): - self._distinct_count = {} - attrs = ("memory_usage", "is_monotonic_increasing", "is_monotonic_decreasing") - for attr in attrs: - try: - delattr(self, attr) - except AttributeError: - # attr was not called yet, so ignore. - pass - self._null_count = None - - def set_mask(self, value): - """ - Replaces the mask buffer of the column and returns a new column. This - will zero the column offset, compute a new mask buffer if necessary, - and compute new data Buffers zero-copy that use pointer arithmetic to - properly adjust the pointer. 
- """ - mask_size = bitmask_allocation_size_bytes(self.size) - required_num_bytes = -(-self.size // 8) # ceiling divide - error_msg = ( - "The value for mask is smaller than expected, got {} bytes, " - "expected " + str(required_num_bytes) + " bytes." - ) - if value is None: - mask = None - elif hasattr(value, "__cuda_array_interface__"): - if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): - if isinstance(value, Column): - value = value.data_array_view(mode="write") - value = cp.asarray(value).view('|u1') - mask = as_buffer(value) - if mask.size < required_num_bytes: - raise ValueError(error_msg.format(str(value.size))) - if mask.size < mask_size: - dbuf = rmm.DeviceBuffer(size=mask_size) - dbuf.copy_from_device(value) - mask = as_buffer(dbuf) - elif hasattr(value, "__array_interface__"): - value = np.asarray(value).view("u1")[:mask_size] - if value.size < required_num_bytes: - raise ValueError(error_msg.format(str(value.size))) - dbuf = rmm.DeviceBuffer(size=mask_size) - dbuf.copy_from_host(value) - mask = as_buffer(dbuf) - elif PyObject_CheckBuffer(value): - value = np.asarray(value).view("u1")[:mask_size] - if value.size < required_num_bytes: - raise ValueError(error_msg.format(str(value.size))) - dbuf = rmm.DeviceBuffer(size=mask_size) - dbuf.copy_from_host(value) - mask = as_buffer(dbuf) - else: - raise TypeError( - "Expected a Buffer object or None for mask, " - f"got {type(value).__name__}" - ) - - return cudf.core.column.build_column( - data=self.data, - dtype=self.dtype, - mask=mask, - size=self.size, - offset=0, - children=self.children - ) - - @property - def null_count(self): - if self._null_count is None: - self._null_count = self.compute_null_count() - return self._null_count - - @property - def offset(self): - return self._offset - - @property - def base_children(self): - return self._base_children - - @property - def children(self): - if (self.offset == 0) and (self.size == self.base_size): - self._children = self.base_children - if self._children is None: - if self.base_children == (): - self._children = () - else: - children = Column.from_unique_ptr( - move(make_unique[column](self.view())) - ).base_children - dtypes = [ - base_child.dtype for base_child in self.base_children - ] - self._children = tuple( - child._with_type_metadata(dtype) for child, dtype in zip( - children, dtypes - ) - ) - return self._children - - def set_base_children(self, value): - if not isinstance(value, tuple): - raise TypeError("Expected a tuple of Columns for children, got " + - type(value).__name__) - - for child in value: - if not isinstance(child, Column): - raise TypeError( - "Expected each of children to be a Column, got " + - type(child).__name__ - ) - - self._children = None - self._base_children = value - - def _mimic_inplace(self, other_col, inplace=False): - """ - Given another column, update the attributes of this column to mimic an - inplace operation. This does not modify the memory of Buffers, but - instead replaces the Buffers and other attributes underneath the column - object with the Buffers and attributes from the other column. - """ - if inplace: - self._offset = other_col.offset - self._size = other_col.size - self._dtype = other_col._dtype - self.set_base_data(other_col.base_data) - self.set_base_children(other_col.base_children) - self.set_base_mask(other_col.base_mask) - else: - return other_col - - cdef libcudf_types.size_type compute_null_count(self) except? 
0: - with acquire_spill_lock(): - if not self.nullable: - return 0 - return cpp_null_count( - ( - self.base_mask.get_ptr(mode="read") - ), - self.offset, - self.offset + self.size - ) - - cdef mutable_column_view mutable_view(self) except *: - if isinstance(self.dtype, cudf.CategoricalDtype): - col = self.base_children[0] - data_dtype = col.dtype - elif isinstance(self.dtype, pd.DatetimeTZDtype): - col = self - data_dtype = _get_base_dtype(col.dtype) - else: - col = self - data_dtype = col.dtype - - cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype) - cdef libcudf_types.size_type offset = self.offset - cdef vector[mutable_column_view] children - cdef void* data - - if col.base_data is None: - data = NULL - else: - data = ( - col.base_data.get_ptr(mode="write") - ) - - cdef Column child_column - if col.base_children: - for child_column in col.base_children: - children.push_back(child_column.mutable_view()) - - cdef libcudf_types.bitmask_type* mask - if self.nullable: - mask = ( - self.base_mask.get_ptr(mode="write") - ) - else: - mask = NULL - - null_count = self._null_count - - if null_count is None: - null_count = 0 - cdef libcudf_types.size_type c_null_count = null_count - - self._mask = None - self._null_count = None - self._children = None - self._data = None - - return mutable_column_view( - dtype, - self.size, - data, - mask, - c_null_count, - offset, - children) - - cdef column_view view(self) except *: - null_count = self.null_count - if null_count is None: - null_count = 0 - cdef libcudf_types.size_type c_null_count = null_count - return self._view(c_null_count) - - cdef column_view _view(self, libcudf_types.size_type null_count) except *: - if isinstance(self.dtype, cudf.CategoricalDtype): - col = self.base_children[0] - data_dtype = col.dtype - elif isinstance(self.dtype, pd.DatetimeTZDtype): - col = self - data_dtype = _get_base_dtype(col.dtype) - else: - col = self - data_dtype = col.dtype - - cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype) - cdef libcudf_types.size_type offset = self.offset - cdef vector[column_view] children - cdef void* data - - if col.base_data is None: - data = NULL - else: - data = (col.base_data.get_ptr(mode="read")) - - cdef Column child_column - if col.base_children: - for child_column in col.base_children: - children.push_back(child_column.view()) - - cdef libcudf_types.bitmask_type* mask - if self.nullable: - mask = ( - self.base_mask.get_ptr(mode="read") - ) - else: - mask = NULL - - cdef libcudf_types.size_type c_null_count = null_count - - return column_view( - dtype, - self.size, - data, - mask, - c_null_count, - offset, - children) - - # TODO: Consider whether this function should support some sort of `copy` - # parameter. Not urgent until this functionality is moved up to the Frame - # layer and made public. This function will also need to mark the - # underlying buffers as exposed before this function can itself be exposed - # publicly. User requests to convert to pylibcudf must assume that the - # data may be modified afterwards. - cpdef to_pylibcudf(self, mode: Literal["read", "write"]): - """Convert this Column to a pylibcudf.Column. - - This function will generate a pylibcudf Column pointing to the same - data, mask, and children as this one. - - Parameters - ---------- - mode : str - Supported values are {"read", "write"} If "write", the data pointed - to may be modified by the caller. If "read", the data pointed to - must not be modified by the caller. 
Failure to fulfill this - contract will cause incorrect behavior. - - Returns - ------- - pylibcudf.Column - A new pylibcudf.Column referencing the same data. - """ - - # TODO: Categoricals will need to be treated differently eventually. - # There is no 1-1 correspondence between cudf and libcudf for - # categoricals because cudf supports ordered and unordered categoricals - # while libcudf supports only unordered categoricals (see - # https://github.com/rapidsai/cudf/pull/8567). - if isinstance(self.dtype, cudf.CategoricalDtype): - col = self.base_children[0] - else: - col = self - - dtype = dtype_to_pylibcudf_type(col.dtype) - - data = None - if col.base_data is not None: - cai = cuda_array_interface_wrapper( - ptr=col.base_data.get_ptr(mode=mode), - size=col.base_data.size, - owner=col.base_data, - ) - data = pylibcudf.gpumemoryview(cai) - - mask = None - if self.nullable: - # TODO: Are we intentionally use self's mask instead of col's? - # Where is the mask stored for categoricals? - cai = cuda_array_interface_wrapper( - ptr=self.base_mask.get_ptr(mode=mode), - size=self.base_mask.size, - owner=self.base_mask, - ) - mask = pylibcudf.gpumemoryview(cai) - - cdef Column child_column - children = [] - if col.base_children: - for child_column in col.base_children: - children.append(child_column.to_pylibcudf(mode=mode)) - - return pylibcudf.Column( - dtype, - self.size, - data, - mask, - self.null_count, - self.offset, - children, - ) - - @staticmethod - cdef Column from_unique_ptr( - unique_ptr[column] c_col, bint data_ptr_exposed=False - ): - """Create a Column from a column - - Typically, this is called on the result of a libcudf operation. - If the data of the libcudf result has been exposed, set - `data_ptr_exposed=True` to expose the memory of the returned Column - as well. - """ - cdef column_view view = c_col.get()[0].view() - cdef libcudf_types.type_id tid = view.type().id() - cdef libcudf_types.data_type c_dtype - cdef size_type length = view.size() - cdef libcudf_types.mask_state mask_state - if tid == libcudf_types.type_id.TIMESTAMP_DAYS: - c_dtype = libcudf_types.data_type( - libcudf_types.type_id.TIMESTAMP_SECONDS - ) - with nogil: - c_col = move(libcudf_unary.cast(view, c_dtype)) - elif tid == libcudf_types.type_id.EMPTY: - c_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8) - mask_state = libcudf_types.mask_state.ALL_NULL - with nogil: - c_col = move(make_numeric_column(c_dtype, length, mask_state)) - - size = c_col.get()[0].size() - dtype = dtype_from_column_view(c_col.get()[0].view()) - null_count = c_col.get()[0].null_count() - - # After call to release(), c_col is unusable - cdef column_contents contents = move(c_col.get()[0].release()) - - data = as_buffer( - DeviceBuffer.c_from_unique_ptr(move(contents.data)), - exposed=data_ptr_exposed - ) - - if null_count > 0: - mask = as_buffer( - DeviceBuffer.c_from_unique_ptr(move(contents.null_mask)), - exposed=data_ptr_exposed - ) - else: - mask = None - - cdef vector[unique_ptr[column]] c_children = move(contents.children) - children = [] - if c_children.size() != 0: - # Because of a bug in Cython, we cannot set the optional - # `data_ptr_exposed` argument within a comprehension. 
- for i in range(c_children.size()): - child = Column.from_unique_ptr( - move(c_children[i]), - data_ptr_exposed=data_ptr_exposed - ) - children.append(child) - - return cudf.core.column.build_column( - data, - dtype=dtype, - mask=mask, - size=size, - null_count=null_count, - children=tuple(children) - ) - - @staticmethod - def from_pylibcudf( - col, bint data_ptr_exposed=False - ): - """Create a Column from a pylibcudf.Column. - - This function will generate a Column pointing to the provided pylibcudf - Column. It will directly access the data and mask buffers of the - pylibcudf Column, so the newly created object is not tied to the - lifetime of the original pylibcudf.Column. - - Parameters - ---------- - col : pylibcudf.Column - The object to copy. - data_ptr_exposed : bool - Whether the data buffer is exposed. - - Returns - ------- - pylibcudf.Column - A new pylibcudf.Column referencing the same data. - """ - if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS: - col = pylibcudf.unary.cast( - col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS) - ) - elif col.type().id() == pylibcudf.TypeId.EMPTY: - new_dtype = pylibcudf.DataType(pylibcudf.TypeId.INT8) - - col = pylibcudf.column_factories.make_numeric_column( - new_dtype, - col.size(), - pylibcudf.column_factories.MaskState.ALL_NULL - ) - - dtype = dtype_from_pylibcudf_column(col) - - return cudf.core.column.build_column( - data=as_buffer( - col.data().obj, exposed=data_ptr_exposed - ) if col.data() is not None else None, - dtype=dtype, - size=col.size(), - mask=as_buffer( - col.null_mask().obj, exposed=data_ptr_exposed - ) if col.null_mask() is not None else None, - offset=col.offset(), - null_count=col.null_count(), - children=tuple([ - Column.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed) - for child in col.children() - ]) - ) - - @staticmethod - cdef Column from_column_view(column_view cv, object owner): - """ - Given a ``cudf::column_view``, constructs a ``cudf.Column`` from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. If ``owner`` is a ``cudf.Column``, we reach inside of it and - make the owner of each newly created ``Buffer`` the respective - ``Buffer`` from the ``owner`` ``cudf.Column``. - If ``owner`` is ``None``, we allocate new memory for the resulting - ``cudf.Column``. 
- """ - column_owner = isinstance(owner, Column) - mask_owner = owner - if column_owner and isinstance(owner.dtype, cudf.CategoricalDtype): - owner = owner.base_children[0] - - size = cv.size() - offset = cv.offset() - dtype = dtype_from_column_view(cv) - dtype_itemsize = getattr(dtype, "itemsize", 1) - - data_ptr = (cv.head[void]()) - data = None - base_size = size + offset - data_owner = owner - - if column_owner: - data_owner = owner.base_data - mask_owner = mask_owner.base_mask - base_size = owner.base_size - base_nbytes = base_size * dtype_itemsize - # special case for string column - is_string_column = (cv.type().id() == libcudf_types.type_id.STRING) - if is_string_column: - # get the size from offset child column (device to host copy) - offsets_column_index = 0 - offset_child_column = cv.child(offsets_column_index) - if offset_child_column.size() == 0: - base_nbytes = 0 - else: - chars_size = get_element( - offset_child_column, offset_child_column.size()-1).value - base_nbytes = chars_size - - if data_ptr: - if data_owner is None: - buffer_size = ( - base_nbytes - if is_string_column - else ((size + offset) * dtype_itemsize) - ) - data = as_buffer( - rmm.DeviceBuffer(ptr=data_ptr, - size=buffer_size) - ) - elif ( - column_owner and - isinstance(data_owner, ExposureTrackedBuffer) - ): - data = as_buffer( - data=data_ptr, - size=base_nbytes, - owner=data_owner, - exposed=False, - ) - elif ( - # This is an optimization of the most common case where - # from_column_view creates a "view" that is identical to - # the owner. - column_owner and - isinstance(data_owner, SpillableBuffer) and - # We check that `data_owner` is spill locked (not spillable) - # and that it points to the same memory as `data_ptr`. - not data_owner.spillable and - data_owner.memory_info() == (data_ptr, base_nbytes, "gpu") - ): - data = data_owner - else: - # At this point we don't know the relationship between data_ptr - # and data_owner thus we mark both of them exposed. - # TODO: try to discover their relationship and create a - # SpillableBufferSlice instead. - data = as_buffer( - data=data_ptr, - size=base_nbytes, - owner=data_owner, - exposed=True, - ) - if isinstance(data_owner, ExposureTrackedBuffer): - # accessing the pointer marks it exposed permanently. - data_owner.mark_exposed() - elif isinstance(data_owner, SpillableBuffer): - if data_owner.is_spilled: - raise ValueError( - f"{data_owner} is spilled, which invalidates " - f"the exposed data_ptr ({hex(data_ptr)})" - ) - # accessing the pointer marks it exposed permanently. - data_owner.mark_exposed() - else: - data = as_buffer( - rmm.DeviceBuffer(ptr=data_ptr, size=0) - ) - - mask = None - mask_ptr = (cv.null_mask()) - if mask_ptr: - if mask_owner is None: - if column_owner: - # if we reached here, it means `owner` is a `Column` - # that does not have a null mask, but `cv` thinks it - # should have a null mask. This can happen in the - # following sequence of events: - # - # 1) `cv` is constructed as a view into a - # `cudf::column` that is nullable (i.e., it has - # a null mask), but contains no nulls. - # 2) `owner`, a `Column`, is constructed from the - # same `cudf::column`. Because `cudf::column` - # is memory owning, `owner` takes ownership of - # the memory owned by the - # `cudf::column`. Because the column has a null - # count of 0, it may choose to discard the null - # mask. - # 3) Now, `cv` points to a discarded null mask. 
- # - # TL;DR: we should not include a null mask in the - # result: - mask = None - else: - mask = as_buffer( - rmm.DeviceBuffer( - ptr=mask_ptr, - size=bitmask_allocation_size_bytes(base_size) - ) - ) - else: - mask = as_buffer( - data=mask_ptr, - size=bitmask_allocation_size_bytes(base_size), - owner=mask_owner, - exposed=True - ) - - if cv.has_nulls(): - null_count = cv.null_count() - else: - null_count = 0 - - children = [] - for child_index in range(cv.num_children()): - child_owner = owner - if column_owner: - child_owner = owner.base_children[child_index] - children.append( - Column.from_column_view( - cv.child(child_index), - child_owner - ) - ) - children = tuple(children) - - result = cudf.core.column.build_column( - data=data, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=tuple(children) - ) - - return result - - @staticmethod - def from_scalar(py_val, size_type size): - cdef DeviceScalar val = py_val.device_value - cdef const scalar* c_val = val.get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move(cpp_make_column_from_scalar(c_val[0], size)) - return Column.from_unique_ptr(move(c_result)) +def f(): + pass diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx deleted file mode 100644 index e6c2d136f0d..00000000000 --- a/python/cudf/cudf/_lib/concat.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport data_from_pylibcudf_table - -import pylibcudf - -from cudf.core.buffer import acquire_spill_lock - - -@acquire_spill_lock() -def concat_columns(object columns): - return Column.from_pylibcudf( - pylibcudf.concatenate.concatenate( - [col.to_pylibcudf(mode="read") for col in columns] - ) - ) - - -@acquire_spill_lock() -def concat_tables(object tables, bool ignore_index=False): - plc_tables = [] - for table in tables: - cols = table._columns - if not ignore_index: - cols = table._index._columns + cols - plc_tables.append(pylibcudf.Table([c.to_pylibcudf(mode="read") for c in cols])) - - return data_from_pylibcudf_table( - pylibcudf.concatenate.concatenate(plc_tables), - column_names=tables[0]._column_names, - index_names=None if ignore_index else tables[0]._index_names - ) diff --git a/python/cudf/cudf/_lib/copying.pxd b/python/cudf/cudf/_lib/copying.pxd deleted file mode 100644 index 14c7d2066d8..00000000000 --- a/python/cudf/cudf/_lib/copying.pxd +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from pylibcudf.libcudf.contiguous_split cimport packed_columns - - -cdef class _CPackedColumns: - cdef packed_columns c_obj - cdef object column_names - cdef object column_dtypes - cdef object index_names diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx deleted file mode 100644 index 49714091f46..00000000000 --- a/python/cudf/cudf/_lib/copying.pyx +++ /dev/null @@ -1,538 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import pickle - -from libc.stdint cimport uint8_t, uintptr_t -from libcpp cimport bool -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from rmm._lib.device_buffer cimport DeviceBuffer - -import pylibcudf - -import cudf -from cudf.core.buffer import Buffer, acquire_spill_lock, as_buffer - -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_table - -from cudf._lib.reduce import minmax -from cudf.core.abc import Serializable - -from libcpp.memory cimport make_unique - -cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.gather cimport ( - segmented_gather as cpp_segmented_gather, -) -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from pylibcudf.libcudf.scalar.scalar cimport scalar -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_table_view - -# workaround for https://github.com/cython/cython/issues/3885 -ctypedef const scalar constscalar - - -def _gather_map_is_valid( - gather_map: "cudf.core.column.ColumnBase", - nrows: int, - check_bounds: bool, - nullify: bool, -) -> bool: - """Returns true if gather map is valid. - - A gather map is valid if empty or all indices are within the range - ``[-nrows, nrows)``, except when ``nullify`` is specified. - """ - if not check_bounds or nullify or len(gather_map) == 0: - return True - gm_min, gm_max = minmax(gather_map) - return gm_min >= -nrows and gm_max < nrows - - -@acquire_spill_lock() -def copy_column(Column input_column): - """ - Deep copies a column - - Parameters - ---------- - input_columns : column to be copied - - Returns - ------- - Deep copied column - """ - cdef unique_ptr[column] c_result - cdef column_view input_column_view = input_column.view() - with nogil: - c_result = move(make_unique[column](input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def _copy_range_in_place(Column input_column, - Column target_column, - size_type input_begin, - size_type input_end, - size_type target_begin): - pylibcudf.copying.copy_range( - input_column.to_pylibcudf(mode="write"), - target_column.to_pylibcudf(mode="write"), - input_begin, - input_end, - target_begin - ) - - -def _copy_range(Column input_column, - Column target_column, - size_type input_begin, - size_type input_end, - size_type target_begin): - return Column.from_pylibcudf( - pylibcudf.copying.copy_range( - input_column.to_pylibcudf(mode="read"), - target_column.to_pylibcudf(mode="read"), - input_begin, - input_end, - target_begin - ) - ) - - -@acquire_spill_lock() -def copy_range(Column source_column, - Column target_column, - size_type source_begin, - size_type source_end, - size_type target_begin, - size_type target_end, - bool inplace): - """ - Copy a contiguous range from a source to a target column - - Notes - ----- - Expects the source and target ranges to have been sanitised to be - in-range for the source and target column respectively. For - example via ``slice.indices``. 
- """ - - msg = "Source and target ranges must be same length" - assert source_end - source_begin == target_end - target_begin, msg - if target_end >= target_begin and inplace: - # FIXME: Are we allowed to do this when inplace=False? - return target_column - - if inplace: - _copy_range_in_place(source_column, target_column, - source_begin, source_end, target_begin) - else: - return _copy_range(source_column, target_column, - source_begin, source_end, target_begin) - - -@acquire_spill_lock() -def gather( - list columns, - Column gather_map, - bool nullify=False -): - tbl = pylibcudf.copying.gather( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in columns]), - gather_map.to_pylibcudf(mode="read"), - pylibcudf.copying.OutOfBoundsPolicy.NULLIFY if nullify - else pylibcudf.copying.OutOfBoundsPolicy.DONT_CHECK - ) - return columns_from_pylibcudf_table(tbl) - - -@acquire_spill_lock() -def scatter(list sources, Column scatter_map, list target_columns, - bool bounds_check=True): - """ - Scattering source into target as per the scatter map. - `source` can be a list of scalars, or a list of columns. The number of - items in `sources` must equal the number of `target_columns` to scatter. - """ - # TODO: Only single column scatter is used, we should explore multi-column - # scatter for frames for performance increase. - - if len(sources) != len(target_columns): - raise ValueError("Mismatched number of source and target columns.") - - if len(sources) == 0: - return [] - - if bounds_check: - n_rows = len(target_columns[0]) - if not ( - (scatter_map >= -n_rows).all() - and (scatter_map < n_rows).all() - ): - raise IndexError( - f"index out of bounds for column of size {n_rows}" - ) - - tbl = pylibcudf.copying.scatter( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in sources]) - if isinstance(sources[0], Column) - else [( as_device_scalar(slr)).c_value for slr in sources], - scatter_map.to_pylibcudf(mode="read"), - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in target_columns]), - ) - - return columns_from_pylibcudf_table(tbl) - - -@acquire_spill_lock() -def column_empty_like(Column input_column): - return Column.from_pylibcudf( - pylibcudf.copying.empty_like( - input_column.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def column_allocate_like(Column input_column, size=None): - return Column.from_pylibcudf( - pylibcudf.copying.allocate_like( - input_column.to_pylibcudf(mode="read"), - size, - ) - ) - - -@acquire_spill_lock() -def columns_empty_like(list input_columns): - return columns_from_pylibcudf_table( - pylibcudf.copying.empty_like( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]) - ) - ) - - -@acquire_spill_lock() -def column_slice(Column input_column, object indices): - return [ - Column.from_pylibcudf(c) - for c in pylibcudf.copying.slice( - input_column.to_pylibcudf(mode="read"), - list(indices), - ) - ] - - -@acquire_spill_lock() -def columns_slice(list input_columns, object indices): - return [ - columns_from_pylibcudf_table(tbl) - for tbl in pylibcudf.copying.slice( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]), - list(indices), - ) - ] - - -@acquire_spill_lock() -def column_split(Column input_column, object splits): - return [ - Column.from_pylibcudf(c) - for c in pylibcudf.copying.split( - input_column.to_pylibcudf(mode="read"), - list(splits), - ) - ] - - -@acquire_spill_lock() -def columns_split(list input_columns, object splits): - return [ - columns_from_pylibcudf_table(tbl) - for 
tbl in pylibcudf.copying.split( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]), - list(splits), - ) - ] - - -@acquire_spill_lock() -def copy_if_else(object lhs, object rhs, Column boolean_mask): - return Column.from_pylibcudf( - pylibcudf.copying.copy_if_else( - lhs.to_pylibcudf(mode="read") if isinstance(lhs, Column) - else ( as_device_scalar(lhs)).c_value, - rhs.to_pylibcudf(mode="read") if isinstance(rhs, Column) - else ( as_device_scalar(rhs)).c_value, - boolean_mask.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def boolean_mask_scatter(list input_, list target_columns, - Column boolean_mask): - """Copy the target columns, replacing masked rows with input data. - - The ``input_`` data can be a list of columns or as a list of scalars. - A list of input columns will be used to replace corresponding rows in the - target columns for which the boolean mask is ``True``. For the nth ``True`` - in the boolean mask, the nth row in ``input_`` is used to replace. A list - of input scalars will replace all rows in the target columns for which the - boolean mask is ``True``. - """ - if len(input_) != len(target_columns): - raise ValueError("Mismatched number of input and target columns.") - - if len(input_) == 0: - return [] - - tbl = pylibcudf.copying.boolean_mask_scatter( - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_]) - if isinstance(input_[0], Column) - else [( as_device_scalar(i)).c_value for i in input_], - pylibcudf.Table([col.to_pylibcudf(mode="read") for col in target_columns]), - boolean_mask.to_pylibcudf(mode="read"), - ) - - return columns_from_pylibcudf_table(tbl) - - -@acquire_spill_lock() -def shift(Column input, int offset, object fill_value=None): - cdef DeviceScalar fill - - if isinstance(fill_value, DeviceScalar): - fill = fill_value - else: - fill = as_device_scalar(fill_value, input.dtype) - - col = pylibcudf.copying.shift( - input.to_pylibcudf(mode="read"), - offset, - fill.c_value, - ) - return Column.from_pylibcudf(col) - - -@acquire_spill_lock() -def get_element(Column input_column, size_type index): - return DeviceScalar.from_pylibcudf( - pylibcudf.copying.get_element( - input_column.to_pylibcudf(mode="read"), - index, - ), - dtype=input_column.dtype, - ) - - -@acquire_spill_lock() -def segmented_gather(Column source_column, Column gather_map): - cdef shared_ptr[lists_column_view] source_LCV = ( - make_shared[lists_column_view](source_column.view()) - ) - cdef shared_ptr[lists_column_view] gather_map_LCV = ( - make_shared[lists_column_view](gather_map.view()) - ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_segmented_gather( - source_LCV.get()[0], gather_map_LCV.get()[0]) - ) - - result = Column.from_unique_ptr(move(c_result)) - return result - - -cdef class _CPackedColumns: - - @staticmethod - def from_py_table(input_table, keep_index=True): - """ - Construct a ``PackedColumns`` object from a ``cudf.DataFrame``. 
- """ - import cudf.core.dtypes - - cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns) - - if keep_index and ( - not isinstance(input_table.index, cudf.RangeIndex) - or input_table.index.start != 0 - or input_table.index.stop != len(input_table) - or input_table.index.step != 1 - ): - input_table_view = table_view_from_table(input_table) - p.index_names = input_table._index_names - else: - input_table_view = table_view_from_table( - input_table, ignore_index=True) - - p.column_names = input_table._column_names - p.column_dtypes = {} - for name, col in input_table._column_labels_and_values: - if isinstance(col.dtype, cudf.core.dtypes._BaseDtype): - p.column_dtypes[name] = col.dtype - - p.c_obj = move(cpp_contiguous_split.pack(input_table_view)) - - return p - - @property - def gpu_data_ptr(self): - return int(self.c_obj.gpu_data.get()[0].data()) - - @property - def gpu_data_size(self): - return int(self.c_obj.gpu_data.get()[0].size()) - - def serialize(self): - header = {} - frames = [] - - gpu_data = as_buffer( - data=self.gpu_data_ptr, - size=self.gpu_data_size, - owner=self, - exposed=True - ) - data_header, data_frames = gpu_data.serialize() - header["data"] = data_header - frames.extend(data_frames) - - header["column-names"] = self.column_names - header["index-names"] = self.index_names - if self.c_obj.metadata.get()[0].data() != NULL: - header["metadata"] = list( - - self.c_obj.metadata.get()[0].data() - ) - - column_dtypes = {} - for name, dtype in self.column_dtypes.items(): - dtype_header, dtype_frames = dtype.serialize() - column_dtypes[name] = ( - dtype_header, - (len(frames), len(frames) + len(dtype_frames)), - ) - frames.extend(dtype_frames) - header["column-dtypes"] = column_dtypes - - return header, frames - - @staticmethod - def deserialize(header, frames): - cdef _CPackedColumns p = _CPackedColumns.__new__(_CPackedColumns) - - gpu_data = Buffer.deserialize(header["data"], frames) - - dbuf = DeviceBuffer( - ptr=gpu_data.get_ptr(mode="write"), - size=gpu_data.nbytes - ) - - cdef cpp_contiguous_split.packed_columns data - data.metadata = move( - make_unique[vector[uint8_t]]( - move(header.get("metadata", [])) - ) - ) - data.gpu_data = move(dbuf.c_obj) - - p.c_obj = move(data) - p.column_names = header["column-names"] - p.index_names = header["index-names"] - - column_dtypes = {} - for name, dtype in header["column-dtypes"].items(): - dtype_header, (start, stop) = dtype - column_dtypes[name] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize(dtype_header, frames[start:stop]) - p.column_dtypes = column_dtypes - - return p - - def unpack(self): - output_table = cudf.DataFrame._from_data(*data_from_table_view( - cpp_contiguous_split.unpack(self.c_obj), - self, - self.column_names, - self.index_names - )) - - for name, dtype in self.column_dtypes.items(): - output_table._data[name] = ( - output_table._data[name]._with_type_metadata(dtype) - ) - - return output_table - - -class PackedColumns(Serializable): - """ - A packed representation of a Frame, with all columns residing - in a single GPU memory buffer. 
- """ - - def __init__(self, data): - self._data = data - - def __reduce__(self): - return self.deserialize, self.serialize() - - @property - def __cuda_array_interface__(self): - return { - "data": (self._data.gpu_data_ptr, False), - "shape": (self._data.gpu_data_size,), - "strides": None, - "typestr": "|u1", - "version": 0 - } - - def serialize(self): - header, frames = self._data.serialize() - header["type-serialized"] = pickle.dumps(type(self)) - - return header, frames - - @classmethod - def deserialize(cls, header, frames): - return cls(_CPackedColumns.deserialize(header, frames)) - - @classmethod - def from_py_table(cls, input_table, keep_index=True): - return cls(_CPackedColumns.from_py_table(input_table, keep_index)) - - def unpack(self): - return self._data.unpack() - - -def pack(input_table, keep_index=True): - """ - Pack the columns of a cudf Frame into a single GPU memory buffer. - """ - return PackedColumns.from_py_table(input_table, keep_index) - - -def unpack(packed): - """ - Unpack the results of packing a cudf Frame returning a new - cudf Frame in the process. - """ - return packed.unpack() diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx deleted file mode 100644 index 9ad96f610b3..00000000000 --- a/python/cudf/cudf/_lib/csv.pyx +++ /dev/null @@ -1,429 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector - -cimport pylibcudf.libcudf.types as libcudf_types - -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import errno -import os -from collections import abc -from io import BytesIO, StringIO - -import numpy as np -import pandas as pd - -import cudf -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from pylibcudf.libcudf.io.csv cimport ( - csv_writer_options, - write_csv as cpp_write_csv, -) -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport compression_type, sink_info -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.io.utils cimport make_sink_info -from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table - -import pylibcudf as plc - -from cudf.api.types import is_hashable - -from pylibcudf.types cimport DataType - -CSV_HEX_TYPE_MAP = { - "hex": np.dtype("int64"), - "hex64": np.dtype("int64"), - "hex32": np.dtype("int32") -} - - -def validate_args( - object delimiter, - object sep, - bool delim_whitespace, - object decimal, - object thousands, - object nrows, - int skipfooter, - object byte_range, - int skiprows -): - if delim_whitespace: - if delimiter is not None: - raise ValueError("cannot set both delimiter and delim_whitespace") - if sep != ',': - raise ValueError("cannot set both sep and delim_whitespace") - - # Alias sep -> delimiter. 
- actual_delimiter = delimiter if delimiter else sep - - if decimal == actual_delimiter: - raise ValueError("decimal cannot be the same as delimiter") - - if thousands == actual_delimiter: - raise ValueError("thousands cannot be the same as delimiter") - - if nrows is not None and skipfooter != 0: - raise ValueError("cannot use both nrows and skipfooter parameters") - - if byte_range is not None: - if skipfooter != 0 or skiprows != 0 or nrows is not None: - raise ValueError("""cannot manually limit rows to be read when - using the byte range parameter""") - - -def read_csv( - object datasource, - object lineterminator="\n", - object quotechar='"', - int quoting=0, - bool doublequote=True, - object header="infer", - bool mangle_dupe_cols=True, - object usecols=None, - object sep=",", - object delimiter=None, - bool delim_whitespace=False, - bool skipinitialspace=False, - object names=None, - object dtype=None, - int skipfooter=0, - int skiprows=0, - bool dayfirst=False, - object compression="infer", - object thousands=None, - object decimal=".", - object true_values=None, - object false_values=None, - object nrows=None, - object byte_range=None, - bool skip_blank_lines=True, - object parse_dates=None, - object comment=None, - object na_values=None, - bool keep_default_na=True, - bool na_filter=True, - object prefix=None, - object index_col=None, -): - """ - Cython function to call into libcudf API, see `read_csv`. - - See Also - -------- - cudf.read_csv - """ - - if not isinstance(datasource, (BytesIO, StringIO, bytes)): - if not os.path.isfile(datasource): - raise FileNotFoundError( - errno.ENOENT, os.strerror(errno.ENOENT), datasource - ) - - if isinstance(datasource, StringIO): - datasource = datasource.read().encode() - elif isinstance(datasource, str) and not os.path.isfile(datasource): - datasource = datasource.encode() - - validate_args(delimiter, sep, delim_whitespace, decimal, thousands, - nrows, skipfooter, byte_range, skiprows) - - # Alias sep -> delimiter. 
- if delimiter is None: - delimiter = sep - - delimiter = str(delimiter) - - if byte_range is None: - byte_range = (0, 0) - - if compression is None: - c_compression = compression_type.NONE - else: - compression_map = { - "infer": compression_type.AUTO, - "gzip": compression_type.GZIP, - "bz2": compression_type.BZIP2, - "zip": compression_type.ZIP, - } - c_compression = compression_map[compression] - - # We need this later when setting index cols - orig_header = header - - if names is not None: - # explicitly mentioned name, so don't check header - if header is None or header == 'infer': - header = -1 - else: - header = header - names = list(names) - else: - if header is None: - header = -1 - elif header == 'infer': - header = 0 - - hex_cols = [] - - new_dtypes = [] - if dtype is not None: - if isinstance(dtype, abc.Mapping): - new_dtypes = dict() - for k, v in dtype.items(): - col_type = v - if is_hashable(v) and v in CSV_HEX_TYPE_MAP: - col_type = CSV_HEX_TYPE_MAP[v] - hex_cols.append(str(k)) - - new_dtypes[k] = _get_plc_data_type_from_dtype( - cudf.dtype(col_type) - ) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP: - dtype = CSV_HEX_TYPE_MAP[dtype] - hex_cols.append(0) - - new_dtypes.append( - _get_plc_data_type_from_dtype(dtype) - ) - elif isinstance(dtype, abc.Collection): - for index, col_dtype in enumerate(dtype): - if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP: - col_dtype = CSV_HEX_TYPE_MAP[col_dtype] - hex_cols.append(index) - - new_dtypes.append( - _get_plc_data_type_from_dtype(col_dtype) - ) - else: - raise ValueError( - "dtype should be a scalar/str/list-like/dict-like" - ) - - lineterminator = str(lineterminator) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io( - plc.io.csv.read_csv( - plc.io.SourceInfo([datasource]), - lineterminator=lineterminator, - quotechar = quotechar, - quoting = quoting, - doublequote = doublequote, - header = header, - mangle_dupe_cols = mangle_dupe_cols, - usecols = usecols, - delimiter = delimiter, - delim_whitespace = delim_whitespace, - skipinitialspace = skipinitialspace, - col_names = names, - dtypes = new_dtypes, - skipfooter = skipfooter, - skiprows = skiprows, - dayfirst = dayfirst, - compression = c_compression, - thousands = thousands, - decimal = decimal, - true_values = true_values, - false_values = false_values, - nrows = nrows if nrows is not None else -1, - byte_range_offset = byte_range[0], - byte_range_size = byte_range[1], - skip_blank_lines = skip_blank_lines, - parse_dates = parse_dates, - parse_hex = hex_cols, - comment = comment, - na_values = na_values, - keep_default_na = keep_default_na, - na_filter = na_filter, - prefix = prefix, - ) - ) - ) - - if dtype is not None: - if isinstance(dtype, abc.Mapping): - for k, v in dtype.items(): - if isinstance(cudf.dtype(v), cudf.CategoricalDtype): - df._data[str(k)] = df._data[str(k)].astype(v) - elif ( - cudf.api.types.is_scalar(dtype) or - isinstance(dtype, ( - np.dtype, pd.api.extensions.ExtensionDtype, type - )) - ): - if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype): - df = df.astype(dtype) - elif isinstance(dtype, abc.Collection): - for index, col_dtype in enumerate(dtype): - if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype): - col_name = df._column_names[index] - df._data[col_name] = df._data[col_name].astype(col_dtype) - - if names is not None and len(names) and isinstance(names[0], int): - 
df.columns = [int(x) for x in df._data] - elif names is None and header == -1 and cudf.get_option("mode.pandas_compatible"): - df.columns = [int(x) for x in df._column_names] - - # Set index if the index_col parameter is passed - if index_col is not None and index_col is not False: - if isinstance(index_col, int): - index_col_name = df._data.get_labels_by_index(index_col)[0] - df = df.set_index(index_col_name) - if isinstance(index_col_name, str) and \ - names is None and orig_header == "infer": - if index_col_name.startswith("Unnamed:"): - # TODO: Try to upstream it to libcudf - # csv reader in future - df._index.name = None - elif names is None: - df._index.name = index_col - else: - df = df.set_index(index_col) - - return df - - -@acquire_spill_lock() -def write_csv( - table, - object path_or_buf=None, - object sep=",", - object na_rep="", - bool header=True, - object lineterminator="\n", - int rows_per_chunk=8, - bool index=True, -): - """ - Cython function to call into libcudf API, see `write_csv`. - - See Also - -------- - cudf.to_csv - """ - cdef table_view input_table_view = table_view_from_table( - table, not index - ) - cdef bool include_header_c = header - cdef char delim_c = ord(sep) - cdef string line_term_c = lineterminator.encode() - cdef string na_c = na_rep.encode() - cdef int rows_per_chunk_c = rows_per_chunk - cdef vector[string] col_names - cdef string true_value_c = 'True'.encode() - cdef string false_value_c = 'False'.encode() - cdef unique_ptr[data_sink] data_sink_c - cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - - if header is True: - all_names = columns_apply_na_rep(table._column_names, na_rep) - if index is True: - all_names = table._index.names + all_names - - if len(all_names) > 0: - col_names.reserve(len(all_names)) - if len(all_names) == 1: - if all_names[0] in (None, ''): - col_names.push_back('""'.encode()) - else: - col_names.push_back( - str(all_names[0]).encode() - ) - else: - for idx, col_name in enumerate(all_names): - if col_name is None: - col_names.push_back(''.encode()) - else: - col_names.push_back( - str(col_name).encode() - ) - - cdef csv_writer_options options = move( - csv_writer_options.builder(sink_info_c, input_table_view) - .names(col_names) - .na_rep(na_c) - .include_header(include_header_c) - .rows_per_chunk(rows_per_chunk_c) - .line_terminator(line_term_c) - .inter_column_delimiter(delim_c) - .true_value(true_value_c) - .false_value(false_value_c) - .build() - ) - - try: - with nogil: - cpp_write_csv(options) - except OverflowError: - raise OverflowError( - f"Writing CSV file with chunksize={rows_per_chunk} failed. " - "Consider providing a smaller chunksize argument." 
- ) - - -cdef DataType _get_plc_data_type_from_dtype(object dtype) except *: - # TODO: Remove this work-around Dictionary types - # in libcudf are fully mapped to categorical columns: - # https://github.com/rapidsai/cudf/issues/3960 - if isinstance(dtype, cudf.CategoricalDtype): - dtype = dtype.categories.dtype - elif dtype == "category": - dtype = "str" - - if isinstance(dtype, str): - if str(dtype) == "date32": - return DataType( - libcudf_types.type_id.TIMESTAMP_DAYS - ) - elif str(dtype) in ("date", "date64"): - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp": - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp[us]": - return DataType( - libcudf_types.type_id.TIMESTAMP_MICROSECONDS - ) - elif str(dtype) == "timestamp[s]": - return DataType( - libcudf_types.type_id.TIMESTAMP_SECONDS - ) - elif str(dtype) == "timestamp[ms]": - return DataType( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - elif str(dtype) == "timestamp[ns]": - return DataType( - libcudf_types.type_id.TIMESTAMP_NANOSECONDS - ) - - dtype = cudf.dtype(dtype) - return dtype_to_pylibcudf_type(dtype) - - -def columns_apply_na_rep(column_names, na_rep): - return tuple( - na_rep if pd.isnull(col_name) - else col_name - for col_name in column_names - ) diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx deleted file mode 100644 index bc5e085ec39..00000000000 --- a/python/cudf/cudf/_lib/datetime.pyx +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import warnings - -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -cimport pylibcudf.libcudf.datetime as libcudf_datetime -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.filling cimport calendrical_month_sequence -from pylibcudf.libcudf.scalar.scalar cimport scalar -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf as plc - - -@acquire_spill_lock() -def add_months(Column col, Column months): - # months must be int16 dtype - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef column_view months_view = months.view() - - with nogil: - c_result = move( - libcudf_datetime.add_calendrical_months( - col_view, - months_view - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def extract_datetime_component(Column col, object field): - result = Column.from_pylibcudf( - plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) - ) - - if field == "weekday": - # Pandas counts Monday-Sunday as 0-6 - # while libcudf counts Monday-Sunday as 1-7 - result = result - result.dtype.type(1) - - return result - - -cdef libcudf_datetime.rounding_frequency _get_rounding_frequency(object freq): - cdef libcudf_datetime.rounding_frequency freq_val - - # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Timedelta.resolution_string.html - old_to_new_freq_map = { - "H": "h", - "N": "ns", - "T": "min", - "L": "ms", - "U": "us", - "S": "s", - } - if freq in old_to_new_freq_map: - warnings.warn( - f"FutureWarning: {freq} is deprecated and will be " - "removed in a future version, please use " - f"{old_to_new_freq_map[freq]} instead.", - FutureWarning - ) - freq = 
old_to_new_freq_map.get(freq) - if freq == "D": - freq_val = libcudf_datetime.rounding_frequency.DAY - elif freq == "h": - freq_val = libcudf_datetime.rounding_frequency.HOUR - elif freq == "min": - freq_val = libcudf_datetime.rounding_frequency.MINUTE - elif freq == "s": - freq_val = libcudf_datetime.rounding_frequency.SECOND - elif freq == "ms": - freq_val = libcudf_datetime.rounding_frequency.MILLISECOND - elif freq == "us": - freq_val = libcudf_datetime.rounding_frequency.MICROSECOND - elif freq == "ns": - freq_val = libcudf_datetime.rounding_frequency.NANOSECOND - else: - raise ValueError(f"Invalid resolution: '{freq}'") - return freq_val - - -@acquire_spill_lock() -def ceil_datetime(Column col, object freq): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef libcudf_datetime.rounding_frequency freq_val = \ - _get_rounding_frequency(freq) - - with nogil: - c_result = move(libcudf_datetime.ceil_datetimes(col_view, freq_val)) - - result = Column.from_unique_ptr(move(c_result)) - return result - - -@acquire_spill_lock() -def floor_datetime(Column col, object freq): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef libcudf_datetime.rounding_frequency freq_val = \ - _get_rounding_frequency(freq) - - with nogil: - c_result = move(libcudf_datetime.floor_datetimes(col_view, freq_val)) - - result = Column.from_unique_ptr(move(c_result)) - return result - - -@acquire_spill_lock() -def round_datetime(Column col, object freq): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - cdef libcudf_datetime.rounding_frequency freq_val = \ - _get_rounding_frequency(freq) - - with nogil: - c_result = move(libcudf_datetime.round_datetimes(col_view, freq_val)) - - result = Column.from_unique_ptr(move(c_result)) - return result - - -@acquire_spill_lock() -def is_leap_year(Column col): - """Returns a boolean indicator whether the year of the date is a leap year - """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.is_leap_year(col_view)) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def date_range(DeviceScalar start, size_type n, offset): - cdef unique_ptr[column] c_result - cdef size_type months = ( - offset.kwds.get("years", 0) * 12 - + offset.kwds.get("months", 0) - ) - - cdef const scalar* c_start = start.get_raw_ptr() - with nogil: - c_result = move(calendrical_month_sequence( - n, - c_start[0], - months - )) - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def extract_quarter(Column col): - """ - Returns a column which contains the corresponding quarter of the year - for every timestamp inside the input column. 
- """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.extract_quarter(col_view)) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def days_in_month(Column col): - """Extracts the number of days in the month of the date - """ - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.days_in_month(col_view)) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def last_day_of_month(Column col): - cdef unique_ptr[column] c_result - cdef column_view col_view = col.view() - - with nogil: - c_result = move(libcudf_datetime.last_day_of_month(col_view)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/filling.pyx b/python/cudf/cudf/_lib/filling.pyx deleted file mode 100644 index b2f4c620144..00000000000 --- a/python/cudf/cudf/_lib/filling.pyx +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - -from cudf._lib.scalar import as_device_scalar - - -@acquire_spill_lock() -def fill_in_place(Column destination, int begin, int end, DeviceScalar value): - pylibcudf.filling.fill_in_place( - destination.to_pylibcudf(mode='write'), - begin, - end, - ( as_device_scalar(value, dtype=destination.dtype)).c_value - ) - - -@acquire_spill_lock() -def fill(Column destination, int begin, int end, DeviceScalar value): - return Column.from_pylibcudf( - pylibcudf.filling.fill( - destination.to_pylibcudf(mode='read'), - begin, - end, - ( as_device_scalar(value)).c_value - ) - ) - - -@acquire_spill_lock() -def repeat(list inp, object count): - ctbl = pylibcudf.Table([col.to_pylibcudf(mode="read") for col in inp]) - if isinstance(count, Column): - count = count.to_pylibcudf(mode="read") - return columns_from_pylibcudf_table( - pylibcudf.filling.repeat( - ctbl, - count - ) - ) - - -@acquire_spill_lock() -def sequence(int size, DeviceScalar init, DeviceScalar step): - return Column.from_pylibcudf( - pylibcudf.filling.sequence( - size, - ( as_device_scalar(init)).c_value, - ( as_device_scalar(step)).c_value - ) - ) diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx deleted file mode 100644 index c199ed96d4f..00000000000 --- a/python/cudf/cudf/_lib/groupby.pyx +++ /dev/null @@ -1,288 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from functools import singledispatch - -from pandas.errors import DataError - -from cudf.api.types import _is_categorical_dtype, is_string_dtype -from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import ( - CategoricalDtype, - DecimalDtype, - IntervalDtype, - ListDtype, - StructDtype, -) - -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport columns_from_pylibcudf_table - -from cudf._lib.scalar import as_device_scalar - -from pylibcudf.libcudf.replace cimport replace_policy -from pylibcudf.libcudf.scalar.scalar cimport scalar - -import pylibcudf - -from cudf._lib.aggregation import make_aggregation - -# The sets below define the possible aggregations that can be performed on -# different dtypes. These strings must be elements of the AggregationKind enum. 
-# The libcudf infrastructure exists for "COLLECT" support on -# categoricals, but the dtype support in python does not. -_CATEGORICAL_AGGS = {"COUNT", "NUNIQUE", "SIZE", "UNIQUE"} -_STRING_AGGS = { - "COLLECT", - "COUNT", - "MAX", - "MIN", - "NTH", - "NUNIQUE", - "SIZE", - "UNIQUE", -} -_LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = {"COLLECT", "CORRELATION", "COVARIANCE"} -_INTERVAL_AGGS = {"COLLECT"} -_DECIMAL_AGGS = { - "ARGMIN", - "ARGMAX", - "COLLECT", - "COUNT", - "MAX", - "MIN", - "NTH", - "NUNIQUE", - "SUM", -} -# workaround for https://github.com/cython/cython/issues/3885 -ctypedef const scalar constscalar - - -@singledispatch -def get_valid_aggregation(dtype): - if is_string_dtype(dtype): - return _STRING_AGGS - return "ALL" - - -@get_valid_aggregation.register -def _(dtype: ListDtype): - return _LIST_AGGS - - -@get_valid_aggregation.register -def _(dtype: CategoricalDtype): - return _CATEGORICAL_AGGS - - -@get_valid_aggregation.register -def _(dtype: ListDtype): - return _LIST_AGGS - - -@get_valid_aggregation.register -def _(dtype: StructDtype): - return _STRUCT_AGGS - - -@get_valid_aggregation.register -def _(dtype: IntervalDtype): - return _INTERVAL_AGGS - - -@get_valid_aggregation.register -def _(dtype: DecimalDtype): - return _DECIMAL_AGGS - - -cdef class GroupBy: - cdef dict __dict__ - - def __init__(self, keys, dropna=True): - with acquire_spill_lock() as spill_lock: - self._groupby = pylibcudf.groupby.GroupBy( - pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in keys]), - pylibcudf.types.NullPolicy.EXCLUDE if dropna - else pylibcudf.types.NullPolicy.INCLUDE - ) - - # We spill lock the columns while this GroupBy instance is alive. - self._spill_lock = spill_lock - - def groups(self, list values): - """ - Perform a sort groupby, using the keys used to construct the Groupby as the key - columns and ``values`` as the value columns. - - Parameters - ---------- - values: list of Columns - The value columns - - Returns - ------- - offsets: list of integers - Integer offsets such that offsets[i+1] - offsets[i] - represents the size of group `i`. 
- grouped_keys: list of Columns - The grouped key columns - grouped_values: list of Columns - The grouped value columns - """ - offsets, grouped_keys, grouped_values = self._groupby.get_groups( - pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]) - if values else None - ) - - return ( - offsets, - columns_from_pylibcudf_table(grouped_keys), - ( - columns_from_pylibcudf_table(grouped_values) - if grouped_values is not None else [] - ), - ) - - def aggregate(self, values, aggregations): - """ - Parameters - ---------- - values : Frame - aggregations - A dict mapping column names in `Frame` to a list of aggregations - to perform on that column - - Each aggregation may be specified as: - - a string (e.g., "max") - - a lambda/function - - Returns - ------- - Frame of aggregated values - """ - included_aggregations = [] - column_included = [] - requests = [] - for i, (col, aggs) in enumerate(zip(values, aggregations)): - valid_aggregations = get_valid_aggregation(col.dtype) - included_aggregations_i = [] - col_aggregations = [] - for agg in aggs: - str_agg = str(agg) - if ( - is_string_dtype(col) - and agg not in _STRING_AGGS - and - ( - str_agg in {"cumsum", "cummin", "cummax"} - or not ( - any(a in str_agg for a in { - "count", - "max", - "min", - "first", - "last", - "nunique", - "unique", - "nth" - }) - or (agg is list) - ) - ) - ): - raise TypeError( - f"function is not supported for this dtype: {agg}" - ) - elif ( - _is_categorical_dtype(col) - and agg not in _CATEGORICAL_AGGS - and ( - str_agg in {"cumsum", "cummin", "cummax"} - or - not ( - any(a in str_agg for a in {"count", "max", "min", "unique"}) - ) - ) - ): - raise TypeError( - f"{col.dtype} type does not support {agg} operations" - ) - - agg_obj = make_aggregation(agg) - if valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations: - included_aggregations_i.append((agg, agg_obj.kind)) - col_aggregations.append(agg_obj.c_obj) - included_aggregations.append(included_aggregations_i) - if col_aggregations: - requests.append(pylibcudf.groupby.GroupByRequest( - col.to_pylibcudf(mode="read"), col_aggregations - )) - column_included.append(i) - - if not requests and any(len(v) > 0 for v in aggregations): - raise DataError("All requested aggregations are unsupported.") - - keys, results = self._groupby.scan(requests) if \ - _is_all_scan_aggregate(aggregations) else self._groupby.aggregate(requests) - - result_columns = [[] for _ in range(len(values))] - for i, result in zip(column_included, results): - result_columns[i] = columns_from_pylibcudf_table(result) - - return result_columns, columns_from_pylibcudf_table(keys), included_aggregations - - def shift(self, list values, int periods, list fill_values): - keys, shifts = self._groupby.shift( - pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]), - [periods] * len(values), - [ - ( as_device_scalar(val, dtype=col.dtype)).c_value - for val, col in zip(fill_values, values) - ], - ) - - return columns_from_pylibcudf_table(shifts), columns_from_pylibcudf_table(keys) - - def replace_nulls(self, list values, object method): - # TODO: This is using an enum (replace_policy) that has not been exposed in - # pylibcudf yet. We'll want to fix that import once it is in pylibcudf. 
- _, replaced = self._groupby.replace_nulls( - pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]), - [ - replace_policy.PRECEDING - if method == 'ffill' else replace_policy.FOLLOWING - ] * len(values), - ) - - return columns_from_pylibcudf_table(replaced) - - -_GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax", "cumprod", "rank"} - - -def _is_all_scan_aggregate(all_aggs): - """ - Returns true if all are scan aggregations. - Raises - ------ - NotImplementedError - If both reduction aggregations and scan aggregations are present. - """ - - def get_name(agg): - return agg.__name__ if callable(agg) else agg - - all_scan = all( - get_name(agg_name) in _GROUPBY_SCANS for aggs in all_aggs - for agg_name in aggs - ) - any_scan = any( - get_name(agg_name) in _GROUPBY_SCANS for aggs in all_aggs - for agg_name in aggs - ) - - if not all_scan and any_scan: - raise NotImplementedError( - "Cannot perform both aggregation and scan in one operation" - ) - return all_scan and any_scan diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx deleted file mode 100644 index 9b7ab0888d2..00000000000 --- a/python/cudf/cudf/_lib/hash.pyx +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.hash cimport ( - md5, - murmurhash3_x86_32, - sha1, - sha224, - sha256, - sha384, - sha512, - xxhash_64, -) -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport table_view_from_columns - -import pylibcudf as plc - - -@acquire_spill_lock() -def hash_partition(list source_columns, list columns_to_hash, - int num_partitions): - plc_table, offsets = plc.partitioning.hash_partition( - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), - columns_to_hash, - num_partitions - ) - return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets - - -@acquire_spill_lock() -def hash(list source_columns, str method, int seed=0): - cdef table_view c_source_view = table_view_from_columns(source_columns) - cdef unique_ptr[column] c_result - if method == "murmur3": - with nogil: - c_result = move(murmurhash3_x86_32(c_source_view, seed)) - elif method == "md5": - with nogil: - c_result = move(md5(c_source_view)) - elif method == "sha1": - with nogil: - c_result = move(sha1(c_source_view)) - elif method == "sha224": - with nogil: - c_result = move(sha224(c_source_view)) - elif method == "sha256": - with nogil: - c_result = move(sha256(c_source_view)) - elif method == "sha384": - with nogil: - c_result = move(sha384(c_source_view)) - elif method == "sha512": - with nogil: - c_result = move(sha512(c_source_view)) - elif method == "xxhash64": - with nogil: - c_result = move(xxhash_64(c_source_view, seed)) - else: - raise ValueError(f"Unsupported hash function: {method}") - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx deleted file mode 100644 index 1dc586bb257..00000000000 --- a/python/cudf/cudf/_lib/interop.pyx +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cpython cimport pycapsule -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -import pylibcudf - -from pylibcudf.libcudf.interop cimport ( - DLManagedTensor, - from_dlpack as cpp_from_dlpack, - to_dlpack as cpp_to_dlpack, -) -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - columns_from_unique_ptr, - table_view_from_columns, -) - -from cudf.core.buffer import acquire_spill_lock -from cudf.core.dtypes import ListDtype, StructDtype - - -def from_dlpack(dlpack_capsule): - """ - Converts a DLPack Tensor PyCapsule into a list of columns. - - DLPack Tensor PyCapsule is expected to have the name "dltensor". - """ - cdef DLManagedTensor* dlpack_tensor = pycapsule.\ - PyCapsule_GetPointer(dlpack_capsule, 'dltensor') - pycapsule.PyCapsule_SetName(dlpack_capsule, 'used_dltensor') - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_from_dlpack(dlpack_tensor) - ) - - res = columns_from_unique_ptr(move(c_result)) - dlpack_tensor.deleter(dlpack_tensor) - return res - - -def to_dlpack(list source_columns): - """ - Converts a list of columns into a DLPack Tensor PyCapsule. - - DLPack Tensor PyCapsule will have the name "dltensor". - """ - if any(column.null_count for column in source_columns): - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." - ) - - cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_columns(source_columns) - - with nogil: - dlpack_tensor = cpp_to_dlpack( - source_table_view - ) - - return pycapsule.PyCapsule_New( - dlpack_tensor, - 'dltensor', - dlmanaged_tensor_pycapsule_deleter - ) - - -cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: - cdef DLManagedTensor* dlpack_tensor = 0 - try: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'used_dltensor') - return # we do not call a used capsule's deleter - except Exception: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'dltensor') - dlpack_tensor.deleter(dlpack_tensor) - - -def gather_metadata(object cols_dtypes): - """ - Generates a ColumnMetadata vector for each column. - - Parameters - ---------- - cols_dtypes : iterable - An iterable of ``(column_name, dtype)`` pairs. 
- """ - cpp_metadata = [] - if cols_dtypes is not None: - for idx, (col_name, col_dtype) in enumerate(cols_dtypes): - cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name)) - if isinstance(col_dtype, (ListDtype, StructDtype)): - _set_col_children_metadata(col_dtype, cpp_metadata[idx]) - else: - raise TypeError( - "An iterable of (column_name, dtype) pairs is required to " - "construct column_metadata" - ) - return cpp_metadata - - -def _set_col_children_metadata(dtype, col_meta): - if isinstance(dtype, StructDtype): - for name, value in dtype.fields.items(): - element_metadata = pylibcudf.interop.ColumnMetadata(name) - _set_col_children_metadata(value, element_metadata) - col_meta.children_meta.append(element_metadata) - elif isinstance(dtype, ListDtype): - # Offsets - child 0 - col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) - - # Element column - child 1 - element_metadata = pylibcudf.interop.ColumnMetadata() - _set_col_children_metadata(dtype.element_type, element_metadata) - col_meta.children_meta.append(element_metadata) - else: - col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata()) - - -@acquire_spill_lock() -def to_arrow(list source_columns, object column_dtypes): - """Convert a list of columns from - cudf Frame to a PyArrow Table. - - Parameters - ---------- - source_columns : a list of columns to convert - column_dtypes : Iterable of ``(column_name, column_dtype)`` pairs - - Returns - ------- - pyarrow table - """ - cpp_metadata = gather_metadata(column_dtypes) - return pylibcudf.interop.to_arrow( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), - cpp_metadata, - ) - - -@acquire_spill_lock() -def from_arrow(object input_table): - """Convert from PyArrow Table to a list of columns. - - Parameters - ---------- - input_table : PyArrow table - - Returns - ------- - A list of columns to construct Frame object - """ - return columns_from_pylibcudf_table( - pylibcudf.interop.from_arrow(input_table) - ) diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt deleted file mode 100644 index e7408cf2852..00000000000 --- a/python/cudf/cudf/_lib/io/CMakeLists.txt +++ /dev/null @@ -1,21 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -set(cython_sources utils.pyx) -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX io_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/io/__init__.pxd b/python/cudf/cudf/_lib/io/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/io/__init__.py b/python/cudf/cudf/_lib/io/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd deleted file mode 100644 index 76a6e32fde0..00000000000 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.vector cimport vector - -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport ( - column_name_info, - sink_info, - source_info, -) - -from cudf._lib.column cimport Column - - -cdef source_info make_source_info(list src) except* -cdef sink_info make_sinks_info( - list src, vector[unique_ptr[data_sink]] & data) except* -cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except* -cdef add_df_col_struct_names( - df, - child_names_dict -) -cdef update_col_struct_field_names( - Column col, - child_names -) -cdef update_struct_field_names( - table, - vector[column_name_info]& schema_info) -cdef Column update_column_struct_field_names( - Column col, - column_name_info& info -) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx deleted file mode 100644 index 564daefbae2..00000000000 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cpython.buffer cimport PyBUF_READ -from cpython.memoryview cimport PyMemoryView_FromMemory -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from pylibcudf.io.datasource cimport Datasource -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.datasource cimport datasource -from pylibcudf.libcudf.io.types cimport ( - column_name_info, - host_buffer, - sink_info, - source_info, -) - -from cudf._lib.column cimport Column - -import codecs -import errno -import io -import os - -from cudf.core.dtypes import StructDtype - - -# Converts the Python source input to libcudf IO source_info -# with the appropriate type and source values -cdef source_info make_source_info(list src) except*: - if not src: - raise ValueError("Need to pass at least one source") - - cdef const unsigned char[::1] c_buffer - cdef vector[host_buffer] c_host_buffers - cdef vector[string] c_files - cdef Datasource csrc - cdef vector[datasource*] c_datasources - empty_buffer = False - if isinstance(src[0], bytes): - empty_buffer = True - for buffer in src: - if (len(buffer) > 0): - c_buffer = buffer - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - empty_buffer = False - elif isinstance(src[0], io.BytesIO): - for bio in src: - c_buffer = bio.getbuffer() # check if empty? - c_host_buffers.push_back(host_buffer(&c_buffer[0], - c_buffer.shape[0])) - # Otherwise src is expected to be a numeric fd, string path, or PathLike. 
- # TODO (ptaylor): Might need to update this check if accepted input types - # change when UCX and/or cuStreamz support is added. - elif isinstance(src[0], Datasource): - for csrc in src: - c_datasources.push_back(csrc.get_datasource()) - return source_info(c_datasources) - elif isinstance(src[0], (int, float, complex, basestring, os.PathLike)): - # If source is a file, return source_info where type=FILEPATH - if not all(os.path.isfile(file) for file in src): - raise FileNotFoundError(errno.ENOENT, - os.strerror(errno.ENOENT), - src) - - files = [ str(elem).encode() for elem in src] - c_files = files - return source_info(c_files) - else: - raise TypeError("Unrecognized input type: {}".format(type(src[0]))) - - if empty_buffer is True: - c_host_buffers.push_back(host_buffer(NULL, 0)) - - return source_info(c_host_buffers) - -# Converts the Python sink input to libcudf IO sink_info. -cdef sink_info make_sinks_info( - list src, vector[unique_ptr[data_sink]] & sink -) except*: - cdef vector[data_sink *] data_sinks - cdef vector[string] paths - if isinstance(src[0], io.StringIO): - data_sinks.reserve(len(src)) - for s in src: - sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s))) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], io.TextIOBase): - data_sinks.reserve(len(src)) - for s in src: - # Files opened in text mode expect writes to be str rather than - # bytes, which requires conversion from utf-8. If the underlying - # buffer is utf-8, we can bypass this conversion by writing - # directly to it. - if codecs.lookup(s.encoding).name not in {"utf-8", "ascii"}: - raise NotImplementedError(f"Unsupported encoding {s.encoding}") - sink.push_back( - unique_ptr[data_sink](new iobase_data_sink(s.buffer)) - ) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], io.IOBase): - data_sinks.reserve(len(src)) - for s in src: - sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s))) - data_sinks.push_back(sink.back().get()) - return sink_info(data_sinks) - elif isinstance(src[0], (basestring, os.PathLike)): - paths.reserve(len(src)) - for s in src: - paths.push_back( os.path.expanduser(s).encode()) - return sink_info(move(paths)) - else: - raise TypeError("Unrecognized input type: {}".format(type(src))) - - -cdef sink_info make_sink_info(src, unique_ptr[data_sink] & sink) except*: - cdef vector[unique_ptr[data_sink]] datasinks - cdef sink_info info = make_sinks_info([src], datasinks) - if not datasinks.empty(): - sink.swap(datasinks[0]) - return info - - -# Adapts a python io.IOBase object as a libcudf IO data_sink. 
This lets you -# write from cudf to any python file-like object (File/BytesIO/SocketIO etc) -cdef cppclass iobase_data_sink(data_sink): - object buf - - iobase_data_sink(object buf_): - this.buf = buf_ - - void host_write(const void * data, size_t size) with gil: - if isinstance(buf, io.StringIO): - buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ) - .tobytes().decode()) - else: - buf.write(PyMemoryView_FromMemory(data, size, PyBUF_READ)) - - void flush() with gil: - buf.flush() - - size_t bytes_written() with gil: - return buf.tell() - - -cdef add_df_col_struct_names(df, child_names_dict): - for name, child_names in child_names_dict.items(): - col = df._data[name] - - df._data[name] = update_col_struct_field_names(col, child_names) - - -cdef update_col_struct_field_names(Column col, child_names): - if col.children: - children = list(col.children) - for i, (child, names) in enumerate(zip(children, child_names.values())): - children[i] = update_col_struct_field_names( - child, - names - ) - col.set_base_children(tuple(children)) - - if isinstance(col.dtype, StructDtype): - col = col._rename_fields( - child_names.keys() - ) - - return col - - -cdef update_struct_field_names( - table, - vector[column_name_info]& schema_info -): - # Deprecated, remove in favor of add_col_struct_names - # when a reader is ported to pylibcudf - for i, (name, col) in enumerate(table._column_labels_and_values): - table._data[name] = update_column_struct_field_names( - col, schema_info[i] - ) - - -cdef Column update_column_struct_field_names( - Column col, - column_name_info& info -): - cdef vector[string] field_names - - if col.children: - children = list(col.children) - for i, child in enumerate(children): - children[i] = update_column_struct_field_names( - child, - info.children[i] - ) - col.set_base_children(tuple(children)) - - if isinstance(col.dtype, StructDtype): - field_names.reserve(len(col.base_children)) - for i in range(info.children.size()): - field_names.push_back(info.children[i].name) - col = col._rename_fields( - field_names - ) - - return col diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx deleted file mode 100644 index 2559358c21f..00000000000 --- a/python/cudf/cudf/_lib/join.pyx +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf - -# The functions below return the *gathermaps* that represent -# the join result when joining on the keys `lhs` and `rhs`. 
- - -@acquire_spill_lock() -def join(list lhs, list rhs, how=None): - if how == "outer": - how = "full" - if (join_func := getattr(pylibcudf.join, f"{how}_join", None)) is None: - raise ValueError(f"Invalid join type {how}") - - left_rows, right_rows = join_func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in lhs]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in rhs]), - pylibcudf.types.NullEquality.EQUAL - ) - return Column.from_pylibcudf(left_rows), Column.from_pylibcudf(right_rows) - - -@acquire_spill_lock() -def semi_join(list lhs, list rhs, how=None): - if ( - join_func := getattr( - pylibcudf.join, f"{how.replace('left', 'left_')}_join", None - ) - ) is None: - raise ValueError(f"Invalid join type {how}") - - return Column.from_pylibcudf( - join_func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in lhs]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in rhs]), - pylibcudf.types.NullEquality.EQUAL - ) - ), None diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx deleted file mode 100644 index 9bbbcf60dcf..00000000000 --- a/python/cudf/cudf/_lib/json.pyx +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import io -import os -from collections import abc - -import cudf -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -cimport pylibcudf.libcudf.io.types as cudf_io_types -from pylibcudf.io.types cimport compression_type -from pylibcudf.libcudf.io.json cimport json_recovery_mode_t -from pylibcudf.libcudf.io.types cimport compression_type -from pylibcudf.libcudf.types cimport data_type, type_id -from pylibcudf.types cimport DataType - -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport add_df_col_struct_names -from cudf._lib.types cimport dtype_to_data_type -from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io - -import pylibcudf as plc - - -cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): - if on_bad_lines.lower() == "error": - return json_recovery_mode_t.FAIL - elif on_bad_lines.lower() == "recover": - return json_recovery_mode_t.RECOVER_WITH_NULL - else: - raise TypeError(f"Invalid parameter for {on_bad_lines=}") - - -cpdef read_json(object filepaths_or_buffers, - object dtype, - bool lines, - object compression, - object byte_range, - bool keep_quotes, - bool mixed_types_as_string, - bool prune_columns, - object on_bad_lines): - """ - Cython function to call into libcudf API, see `read_json`. 
- - See Also - -------- - cudf.io.json.read_json - cudf.io.json.to_json - """ - - # If input data is a JSON string (or StringIO), hold a reference to - # the encoded memoryview externally to ensure the encoded buffer - # isn't destroyed before calling libcudf `read_json()` - - for idx in range(len(filepaths_or_buffers)): - if isinstance(filepaths_or_buffers[idx], io.StringIO): - filepaths_or_buffers[idx] = \ - filepaths_or_buffers[idx].read().encode() - elif isinstance(filepaths_or_buffers[idx], str) and \ - not os.path.isfile(filepaths_or_buffers[idx]): - filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode() - - # Setup arguments - cdef cudf_io_types.compression_type c_compression - - if compression is not None: - if compression == 'gzip': - c_compression = cudf_io_types.compression_type.GZIP - elif compression == 'bz2': - c_compression = cudf_io_types.compression_type.BZIP2 - elif compression == 'zip': - c_compression = cudf_io_types.compression_type.ZIP - else: - c_compression = cudf_io_types.compression_type.AUTO - else: - c_compression = cudf_io_types.compression_type.NONE - - processed_dtypes = None - - if dtype is False: - raise ValueError("False value is unsupported for `dtype`") - elif dtype is not True: - processed_dtypes = [] - if isinstance(dtype, abc.Mapping): - for k, v in dtype.items(): - # Make sure keys are string - k = str(k) - lib_type, child_types = _get_cudf_schema_element_from_dtype(v) - processed_dtypes.append((k, lib_type, child_types)) - elif isinstance(dtype, abc.Collection): - for col_dtype in dtype: - processed_dtypes.append( - # Ignore child columns since we cannot specify their dtypes - # when passing a list - _get_cudf_schema_element_from_dtype(col_dtype)[0] - ) - else: - raise TypeError("`dtype` must be 'list like' or 'dict'") - - if cudf.get_option("io.json.low_memory") and lines: - res_cols, res_col_names, res_child_names = plc.io.json.chunked_read_json( - plc.io.SourceInfo(filepaths_or_buffers), - processed_dtypes, - c_compression, - keep_quotes = keep_quotes, - mixed_types_as_string = mixed_types_as_string, - prune_columns = prune_columns, - recovery_mode = _get_json_recovery_mode(on_bad_lines) - ) - df = cudf.DataFrame._from_data( - *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in res_cols], - column_names=res_col_names, - index_names=None - ) - ) - add_df_col_struct_names(df, res_child_names) - return df - else: - table_w_meta = plc.io.json.read_json( - plc.io.SourceInfo(filepaths_or_buffers), - processed_dtypes, - c_compression, - lines, - byte_range_offset = byte_range[0] if byte_range is not None else 0, - byte_range_size = byte_range[1] if byte_range is not None else 0, - keep_quotes = keep_quotes, - mixed_types_as_string = mixed_types_as_string, - prune_columns = prune_columns, - recovery_mode = _get_json_recovery_mode(on_bad_lines) - ) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io( - table_w_meta - ) - ) - - # Post-processing to add in struct column names - add_df_col_struct_names(df, table_w_meta.child_names) - return df - - -@acquire_spill_lock() -def write_json( - table, - object path_or_buf=None, - object na_rep="null", - bool include_nulls=True, - bool lines=False, - bool index=False, - int rows_per_chunk=1024*64, # 64K rows -): - """ - Cython function to call into libcudf API, see `write_json`. 
- - See Also - -------- - cudf.to_json - """ - cdef list colnames = [] - - for name in table._column_names: - colnames.append((name, _dtype_to_names_list(table[name]._column))) - - try: - plc.io.json.write_json( - plc.io.SinkInfo([path_or_buf]), - plc.io.TableWithMetadata( - plc.Table([ - c.to_pylibcudf(mode="read") for c in table._columns - ]), - colnames - ), - na_rep, - include_nulls, - lines, - rows_per_chunk, - true_value="true", - false_value="false" - ) - except OverflowError: - raise OverflowError( - f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. " - "Consider providing a smaller rows_per_chunk argument." - ) - - -cdef _get_cudf_schema_element_from_dtype(object dtype) except *: - dtype = cudf.dtype(dtype) - if isinstance(dtype, cudf.CategoricalDtype): - raise NotImplementedError( - "CategoricalDtype as dtype is not yet " - "supported in JSON reader" - ) - - lib_type = DataType.from_libcudf(dtype_to_data_type(dtype)) - child_types = [] - - if isinstance(dtype, cudf.StructDtype): - for name, child_type in dtype.fields.items(): - child_lib_type, grandchild_types = \ - _get_cudf_schema_element_from_dtype(child_type) - child_types.append((name, child_lib_type, grandchild_types)) - elif isinstance(dtype, cudf.ListDtype): - child_lib_type, grandchild_types = \ - _get_cudf_schema_element_from_dtype(dtype.element_type) - - child_types = [ - ("offsets", DataType.from_libcudf(data_type(type_id.INT32)), []), - ("element", child_lib_type, grandchild_types) - ] - - return lib_type, child_types - - -cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: - dtype = cudf.dtype(dtype) - if isinstance(dtype, cudf.CategoricalDtype): - raise NotImplementedError( - "CategoricalDtype as dtype is not yet " - "supported in JSON reader" - ) - return dtype_to_data_type(dtype) - - -def _dtype_to_names_list(col): - if isinstance(col.dtype, cudf.StructDtype): - return [(name, _dtype_to_names_list(child)) - for name, child in zip(col.dtype.fields, col.children)] - elif isinstance(col.dtype, cudf.ListDtype): - return [("", _dtype_to_names_list(child)) - for child in col.children] - return [] diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx deleted file mode 100644 index 3966cce8981..00000000000 --- a/python/cudf/cudf/_lib/labeling.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp cimport bool as cbool - -import pylibcudf as plc - -from cudf._lib.column cimport Column -from cudf.core.buffer import acquire_spill_lock - - -# Note that the parameter input shadows a Python built-in in the local scope, -# but I'm not too concerned about that since there's no use-case for actual -# input in this context. -@acquire_spill_lock() -def label_bins(Column input, Column left_edges, cbool left_inclusive, - Column right_edges, cbool right_inclusive): - plc_column = plc.labeling.label_bins( - input.to_pylibcudf(mode="read"), - left_edges.to_pylibcudf(mode="read"), - left_inclusive, - right_edges.to_pylibcudf(mode="read"), - right_inclusive - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx deleted file mode 100644 index 7e8710bedb6..00000000000 --- a/python/cudf/cudf/_lib/lists.pyx +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from pylibcudf.libcudf.types cimport null_order, size_type - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - -from pylibcudf cimport Scalar - - -@acquire_spill_lock() -def count_elements(Column col): - return Column.from_pylibcudf( - pylibcudf.lists.count_elements( - col.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def explode_outer(list source_columns, int explode_column_idx): - return columns_from_pylibcudf_table( - pylibcudf.lists.explode_outer( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), - explode_column_idx, - ) - ) - - -@acquire_spill_lock() -def distinct(Column col, bool nulls_equal, bool nans_all_equal): - return Column.from_pylibcudf( - pylibcudf.lists.distinct( - col.to_pylibcudf(mode="read"), - nulls_equal, - nans_all_equal, - ) - ) - - -@acquire_spill_lock() -def sort_lists(Column col, bool ascending, str na_position): - return Column.from_pylibcudf( - pylibcudf.lists.sort_lists( - col.to_pylibcudf(mode="read"), - ascending, - null_order.BEFORE if na_position == "first" else null_order.AFTER, - False, - ) - ) - - -@acquire_spill_lock() -def extract_element_scalar(Column col, size_type index): - return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( - col.to_pylibcudf(mode="read"), - index, - ) - ) - - -@acquire_spill_lock() -def extract_element_column(Column col, Column index): - return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( - col.to_pylibcudf(mode="read"), - index.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def contains_scalar(Column col, py_search_key): - return Column.from_pylibcudf( - pylibcudf.lists.contains( - col.to_pylibcudf(mode="read"), - py_search_key.device_value.c_value, - ) - ) - - -@acquire_spill_lock() -def index_of_scalar(Column col, object py_search_key): - return Column.from_pylibcudf( - pylibcudf.lists.index_of( - col.to_pylibcudf(mode="read"), - py_search_key.device_value.c_value, - True, - ) - ) - - -@acquire_spill_lock() -def index_of_column(Column col, Column search_keys): - return Column.from_pylibcudf( - pylibcudf.lists.index_of( - col.to_pylibcudf(mode="read"), - search_keys.to_pylibcudf(mode="read"), - True, - ) - ) - - -@acquire_spill_lock() -def concatenate_rows(list source_columns): - return Column.from_pylibcudf( - pylibcudf.lists.concatenate_rows( - pylibcudf.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]) - ) - ) - - -@acquire_spill_lock() -def concatenate_list_elements(Column input_column, dropna=False): - return Column.from_pylibcudf( - pylibcudf.lists.concatenate_list_elements( - input_column.to_pylibcudf(mode="read"), - dropna, - ) - ) diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx deleted file mode 100644 index 9372acdab44..00000000000 --- a/python/cudf/cudf/_lib/merge.pyx +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -def merge_sorted( - list input_columns, - list key_columns_indices, - bool ascending=True, - str na_position="last", -): - """Merge multiple lists of lexicographically sorted columns into one list - of sorted columns. `input_columns` is a list of lists of columns to be - merged. 
- """ - c_input_tables = [ - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ) for source_columns in input_columns - ] - - num_keys = len(key_columns_indices) - - column_order = ( - pylibcudf.types.Order.ASCENDING if ascending - else pylibcudf.types.Order.DESCENDING - ) - - if not ascending: - na_position = "last" if na_position == "first" else "first" - null_precedence = ( - pylibcudf.types.NullOrder.BEFORE if na_position == "first" - else pylibcudf.types.NullOrder.AFTER - ) - - return columns_from_pylibcudf_table( - pylibcudf.merge.merge( - c_input_tables, - key_columns_indices, - [column_order] * num_keys, - [null_precedence] * num_keys, - ) - ) diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx deleted file mode 100644 index d54e8e66281..00000000000 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf -from pylibcudf.null_mask import MaskState - -from cudf.core.buffer import acquire_spill_lock, as_buffer - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def copy_bitmask(Column col): - """ - Copies column's validity mask buffer into a new buffer, shifting by the - offset if nonzero - """ - if col.base_mask is None: - return None - - rmm_db = pylibcudf.null_mask.copy_bitmask(col.to_pylibcudf(mode="read")) - buf = as_buffer(rmm_db) - return buf - - -def bitmask_allocation_size_bytes(num_bits): - """ - Given a size, calculates the number of bytes that should be allocated for a - column validity mask - """ - return pylibcudf.null_mask.bitmask_allocation_size_bytes(num_bits) - - -def create_null_mask(size, state=MaskState.UNINITIALIZED): - """ - Given a size and a mask state, allocate a mask that can properly represent - the given size with the given mask state - - Parameters - ---------- - size : int - Number of elements the mask needs to be able to represent - state : ``MaskState``, default ``MaskState.UNINITIALIZED`` - State the null mask should be created in - """ - rmm_db = pylibcudf.null_mask.create_null_mask(size, state) - buf = as_buffer(rmm_db) - return buf - - -@acquire_spill_lock() -def bitmask_and(list columns): - rmm_db, other = pylibcudf.null_mask.bitmask_and( - [col.to_pylibcudf(mode="read") for col in columns] - ) - buf = as_buffer(rmm_db) - return buf, other - - -@acquire_spill_lock() -def bitmask_or(list columns): - rmm_db, other = pylibcudf.null_mask.bitmask_or( - [col.to_pylibcudf(mode="read") for col in columns] - ) - buf = as_buffer(rmm_db) - return buf, other diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt deleted file mode 100644 index 22ec5d472f2..00000000000 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -set(cython_sources - byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx - ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx -) -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/nvtext/__init__.pxd b/python/cudf/cudf/_lib/nvtext/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/nvtext/__init__.py b/python/cudf/cudf/_lib/nvtext/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx deleted file mode 100644 index 0d768e24f39..00000000000 --- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - - -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.byte_pair_encode cimport ( - bpe_merge_pairs as cpp_bpe_merge_pairs, - byte_pair_encoding as cpp_byte_pair_encoding, - load_merge_pairs as cpp_load_merge_pairs, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - - -cdef class BPEMergePairs: - cdef unique_ptr[cpp_bpe_merge_pairs] c_obj - - def __cinit__(self, Column merge_pairs): - cdef column_view c_pairs = merge_pairs.view() - with nogil: - self.c_obj = move(cpp_load_merge_pairs(c_pairs)) - - -@acquire_spill_lock() -def byte_pair_encoding( - Column strings, - BPEMergePairs merge_pairs, - object separator -): - cdef column_view c_strings = strings.view() - cdef DeviceScalar d_separator = separator.device_value - cdef const string_scalar* c_separator = d_separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_byte_pair_encoding( - c_strings, - merge_pairs.c_obj.get()[0], - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx deleted file mode 100644 index e3c2273345a..00000000000 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.edit_distance cimport ( - edit_distance as cpp_edit_distance, - edit_distance_matrix as cpp_edit_distance_matrix, -) - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def edit_distance(Column strings, Column targets): - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance(c_strings, c_targets)) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def edit_distance_matrix(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance_matrix(c_strings)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx deleted file mode 100644 index 6591b527eec..00000000000 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, - hash_character_ngrams as cpp_hash_character_ngrams, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - - -@acquire_spill_lock() -def generate_ngrams(Column strings, int ngrams, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - c_ngrams, - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def generate_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def hash_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx deleted file mode 100644 index 0ebf7c281e3..00000000000 --- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.jaccard cimport ( - jaccard_index as cpp_jaccard_index, -) -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def jaccard_index(Column input1, Column input2, int width): - cdef column_view c_input1 = input1.view() - cdef column_view c_input2 = input2.view() - cdef size_type c_width = width - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_jaccard_index( - c_input1, - c_input2, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx deleted file mode 100644 index 59cb8d51440..00000000000 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.minhash cimport ( - minhash as cpp_minhash, - minhash64 as cpp_minhash64, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, -) -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def minhash(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash( - c_strings, - c_seeds, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def minhash64(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash64( - c_strings, - c_seeds, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def word_minhash(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def word_minhash64(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash64( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx deleted file mode 100644 index dec4f037d98..00000000000 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( - ngrams_tokenize as cpp_ngrams_tokenize, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - - -@acquire_spill_lock() -def ngrams_tokenize( - Column strings, - int ngrams, - object py_delimiter, - object py_separator -): - - cdef DeviceScalar delimiter = py_delimiter.device_value - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_ngrams_tokenize( - c_strings, - c_ngrams, - c_delimiter[0], - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx deleted file mode 100644 index 5e86a9ce959..00000000000 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def normalize_spaces(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_normalize_spaces(c_strings)) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def normalize_characters(Column strings, bool do_lower=True): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_normalize_characters(c_strings, do_lower)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx deleted file mode 100644 index 61ae3da5782..00000000000 --- a/python/cudf/cudf/_lib/nvtext/replace.pyx +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
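Editor's note: the normalize bindings removed above collapse whitespace (normalize_spaces) and lower-case/pad characters ahead of subword tokenization (normalize_characters). A hedged sketch via the public string accessor, assuming Series.str.normalize_spaces and Series.str.normalize_characters stay the user-facing entry points; the commented outputs are approximate:

import cudf

s = cudf.Series(["  hello \t world ", "Don't STOP"])
print(s.str.normalize_spaces())
# expected: ["hello world", "Don't STOP"]   (runs of whitespace collapsed)
print(s.str.normalize_characters(do_lower=True))
# expected (approximately): ["hello world", "don ' t stop"]   (lower-cased, punctuation padded)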
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.replace cimport ( - filter_tokens as cpp_filter_tokens, - replace_tokens as cpp_replace_tokens, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - - -@acquire_spill_lock() -def replace_tokens(Column strings, - Column targets, - Column replacements, - object py_delimiter): - """ - The `targets` tokens are searched for within each `strings` - in the Column and replaced with the corresponding `replacements` - if found. Tokens are identified by the `py_delimiter` character - provided. - """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef column_view c_replacements = replacements.view() - - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_replace_tokens( - c_strings, - c_targets, - c_replacements, - c_delimiter[0], - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def filter_tokens(Column strings, - size_type min_token_length, - object py_replacement, - object py_delimiter): - """ - Tokens smaller than `min_token_length` are removed from `strings` - in the Column and optionally replaced with the corresponding - `py_replacement` string. Tokens are identified by the `py_delimiter` - character provided. - """ - - cdef DeviceScalar replacement = py_replacement.device_value - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_repl = replacement\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_filter_tokens( - c_strings, - min_token_length, - c_repl[0], - c_delimiter[0], - ) - ) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx deleted file mode 100644 index 5bf25562fed..00000000000 --- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
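Editor's note: the replace_tokens/filter_tokens docstrings above describe whole-token matching, which is easy to misread as substring replacement. A small hedged example via the public API (assuming Series.str.replace_tokens remains the entry point):

import cudf

s = cudf.Series(["this is me", "theme music"])
# Only whole tokens (split on the delimiter) are replaced, so "theme" is left
# alone even though it contains "me" as a substring.
print(s.str.replace_tokens(["is", "me"], ["was", "you"], delimiter=" "))
# expected: ["this was you", "theme music"]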
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from enum import IntEnum - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.stemmer cimport ( - is_letter as cpp_is_letter, - letter_type, - porter_stemmer_measure as cpp_porter_stemmer_measure, - underlying_type_t_letter_type, -) -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - - -class LetterType(IntEnum): - CONSONANT = letter_type.CONSONANT - VOWEL = letter_type.VOWEL - - -@acquire_spill_lock() -def porter_stemmer_measure(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_porter_stemmer_measure(c_strings)) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def is_letter(Column strings, - object ltype, - size_type index): - cdef column_view c_strings = strings.view() - cdef letter_type c_ltype = ( - ltype - ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, index)) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def is_letter_multi(Column strings, - object ltype, - Column indices): - cdef column_view c_strings = strings.view() - cdef column_view c_indices = indices.view() - cdef letter_type c_ltype = ( - ltype - ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx deleted file mode 100644 index ee442ece5c6..00000000000 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
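Editor's note: the stemmer binding removed above exposes Porter's "measure" m, the number of vowel-to-consonant transitions in a word's [C](VC)^m[V] decomposition. The classic examples from Porter's paper, cross-checked with a small pure-Python sketch (illustrative only; it ignores Porter's special handling of "y"):

import re

def porter_measure(word, vowels="aeiou"):
    # Map the word to a consonant/vowel pattern and count "VC" runs.
    pattern = "".join("V" if ch in vowels else "C" for ch in word.lower())
    return len(re.findall("VC", pattern))

print(porter_measure("tree"))      # 0  (like "tr", "ee", "by")
print(porter_measure("trouble"))   # 1  (like "oats", "trees", "ivy")
print(porter_measure("troubles"))  # 2  (like "private", "oaten")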
- -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.subword_tokenize cimport ( - hashed_vocabulary as cpp_hashed_vocabulary, - load_vocabulary_file as cpp_load_vocabulary_file, - move as tr_move, - subword_tokenize as cpp_subword_tokenize, - tokenizer_result as cpp_tokenizer_result, -) - -from cudf._lib.column cimport Column - - -cdef class Hashed_Vocabulary: - cdef unique_ptr[cpp_hashed_vocabulary] c_obj - - def __cinit__(self, hash_file): - cdef string c_hash_file = str(hash_file).encode() - with nogil: - self.c_obj = move(cpp_load_vocabulary_file(c_hash_file)) - - -@acquire_spill_lock() -def subword_tokenize_inmem_hash( - Column strings, - Hashed_Vocabulary hashed_vocabulary, - uint32_t max_sequence_length=64, - uint32_t stride=48, - bool do_lower=True, - bool do_truncate=False, -): - """ - Subword tokenizes text series by using the pre-loaded hashed vocabulary - """ - cdef column_view c_strings = strings.view() - cdef cpp_tokenizer_result c_result - with nogil: - c_result = tr_move( - cpp_subword_tokenize( - c_strings, - hashed_vocabulary.c_obj.get()[0], - max_sequence_length, - stride, - do_lower, - do_truncate, - ) - ) - # return the 3 tensor components - tokens = Column.from_unique_ptr(move(c_result.tensor_token_ids)) - masks = Column.from_unique_ptr(move(c_result.tensor_attention_mask)) - metadata = Column.from_unique_ptr(move(c_result.tensor_metadata)) - return tokens, masks, metadata diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx deleted file mode 100644 index a7e63f1e9ae..00000000000 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
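Editor's note: the subword_tokenize binding removed above loads a hashed vocabulary once and returns three flat columns (token ids, attention mask, metadata). A hedged sketch of how this wrapper was driven before its removal; "vocab_hash.txt" is a hypothetical path, and reaching through Series._column is an internal convention rather than public API:

import cudf
from cudf._lib.nvtext.subword_tokenize import (
    Hashed_Vocabulary,
    subword_tokenize_inmem_hash,
)

s = cudf.Series(["The quick brown fox", "jumped over the lazy dog"])
vocab = Hashed_Vocabulary("vocab_hash.txt")   # hypothetical pre-built hash file
tokens, masks, metadata = subword_tokenize_inmem_hash(
    s._column,
    vocab,
    max_sequence_length=64,
    stride=48,
    do_lower=True,
    do_truncate=False,
)
# tokens/masks are later reshaped to (rows, max_sequence_length) tensors by the caller.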
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.tokenize cimport ( - character_tokenize as cpp_character_tokenize, - count_tokens as cpp_count_tokens, - detokenize as cpp_detokenize, - load_vocabulary as cpp_load_vocabulary, - tokenize as cpp_tokenize, - tokenize_vocabulary as cpp_tokenize_vocabulary, - tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - - -@acquire_spill_lock() -def _tokenize_scalar(Column strings, object py_delimiter): - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize( - c_strings, - c_delimiter[0], - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def _tokenize_column(Column strings, Column delimiters): - cdef column_view c_strings = strings.view() - cdef column_view c_delimiters = delimiters.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize( - c_strings, - c_delimiters - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def _count_tokens_scalar(Column strings, object py_delimiter): - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_count_tokens( - c_strings, - c_delimiter[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def _count_tokens_column(Column strings, Column delimiters): - cdef column_view c_strings = strings.view() - cdef column_view c_delimiters = delimiters.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_count_tokens( - c_strings, - c_delimiters - ) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def character_tokenize(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_character_tokenize(c_strings) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def detokenize(Column strings, Column indices, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef column_view c_indices = indices.view() - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_detokenize(c_strings, c_indices, c_separator[0]) - ) - - return Column.from_unique_ptr(move(c_result)) - - -cdef class TokenizeVocabulary: - cdef unique_ptr[cpp_tokenize_vocabulary] c_obj - - def __cinit__(self, Column vocab): - cdef column_view c_vocab = vocab.view() - with nogil: - self.c_obj = move(cpp_load_vocabulary(c_vocab)) - - -@acquire_spill_lock() -def tokenize_with_vocabulary(Column strings, - TokenizeVocabulary vocabulary, - object py_delimiter, - size_type default_id): - - cdef 
DeviceScalar delimiter = py_delimiter.device_value - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_tokenize_with_vocabulary( - c_strings, - vocabulary.c_obj.get()[0], - c_delimiter[0], - default_id - ) - ) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx deleted file mode 100644 index f88c48ce989..00000000000 --- a/python/cudf/cudf/_lib/orc.pyx +++ /dev/null @@ -1,468 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int64_t -from libcpp cimport bool, int -from libcpp.map cimport map -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from collections import OrderedDict - -try: - import ujson as json -except ImportError: - import json - -cimport pylibcudf.libcudf.io.types as cudf_io_types -cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view -from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.orc cimport ( - chunked_orc_writer_options, - orc_chunked_writer, - orc_writer_options, - write_orc as libcudf_write_orc, -) -from pylibcudf.libcudf.io.types cimport ( - column_in_metadata, - compression_type, - sink_info, - table_input_metadata, -) -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport make_sink_info, update_col_struct_field_names -from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table - -import pylibcudf as plc - -import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES -from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from cudf.core.buffer import acquire_spill_lock - - -# TODO: Consider inlining this function since it seems to only be used in one place. -cpdef read_parsed_orc_statistics(filepath_or_buffer): - """ - Cython function to call into libcudf API, see `read_parsed_orc_statistics`. - - See Also - -------- - cudf.io.orc.read_orc_statistics - """ - - parsed = ( - plc.io.orc.read_parsed_orc_statistics( - plc.io.SourceInfo([filepath_or_buffer]) - ) - ) - - return parsed.column_names, parsed.file_stats, parsed.stripes_stats - - -cpdef read_orc(object filepaths_or_buffers, - object columns=None, - object stripes=None, - object skip_rows=None, - object num_rows=None, - bool use_index=True, - object timestamp_type=None): - """ - Cython function to call into libcudf API, see `read_orc`. - - See Also - -------- - cudf.read_orc - - Notes - ----- - Currently this function only considers the metadata of the first file in the list of - filepaths_or_buffers. 
- """ - - if columns is not None: - columns = [str(col) for col in columns] - - tbl_w_meta = plc.io.orc.read_orc( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - stripes, - get_skiprows_arg(skip_rows), - get_num_rows_arg(num_rows), - use_index, - plc.types.DataType( - SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[ - cudf.dtype(timestamp_type) - ] - ) - ) - - names = tbl_w_meta.column_names(include_children=False) - - actual_index_names, col_names, is_range_index, reset_index_name, \ - range_idx = _get_index_from_metadata(tbl_w_meta.per_file_user_data, - names, - skip_rows, - num_rows) - - if columns is not None and (isinstance(columns, list) and len(columns) == 0): - # When `columns=[]`, index needs to be - # established, but not the columns. - nrows = tbl_w_meta.tbl.num_rows() - return {}, cudf.RangeIndex(nrows) - - data, index = data_from_pylibcudf_io( - tbl_w_meta, - col_names if columns is None else names, - actual_index_names - ) - - if is_range_index: - index = range_idx - elif reset_index_name: - index.names = [None] * len(index.names) - - child_name_values = tbl_w_meta.child_names.values() - - data = { - name: update_col_struct_field_names( - col, child_names - ) - for (name, col), child_names in zip(data.items(), child_name_values) - } - - return data, index - - -cdef compression_type _get_comp_type(object compression): - if compression is None or compression is False: - return compression_type.NONE - - compression = str(compression).upper() - if compression == "SNAPPY": - return compression_type.SNAPPY - elif compression == "ZLIB": - return compression_type.ZLIB - elif compression == "ZSTD": - return compression_type.ZSTD - elif compression == "LZ4": - return compression_type.LZ4 - else: - raise ValueError(f"Unsupported `compression` type {compression}") - -cdef tuple _get_index_from_metadata( - vector[map[string, string]] user_data, - object names, - object skip_rows, - object num_rows): - - meta = None - index_col = None - is_range_index = False - reset_index_name = False - range_idx = None - - if user_data.size() > 0: - json_str = user_data[0][b'pandas'].decode('utf-8') - if json_str != "": - meta = json.loads(json_str) - if 'index_columns' in meta and len(meta['index_columns']) > 0: - index_col = meta['index_columns'] - if isinstance(index_col[0], dict) and \ - index_col[0]['kind'] == 'range': - is_range_index = True - else: - index_col_names = OrderedDict() - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = \ - c['name'] or c['field_name'] - if c['name'] is None: - reset_index_name = True - - actual_index_names = None - if index_col is not None and len(index_col) > 0: - if is_range_index: - range_index_meta = index_col[0] - range_idx = cudf.RangeIndex( - start=range_index_meta['start'], - stop=range_index_meta['stop'], - step=range_index_meta['step'], - name=range_index_meta['name'] - ) - if skip_rows is not None: - range_idx = range_idx[skip_rows:] - if num_rows is not None: - range_idx = range_idx[:num_rows] - else: - actual_index_names = list(index_col_names.values()) - names = names[len(actual_index_names):] - - return ( - actual_index_names, - names, - is_range_index, - reset_index_name, - range_idx - ) - -cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics): - """ - Convert ORC statistics terms to CUDF convention: - - ORC "STRIPE" == CUDF "ROWGROUP" - - ORC "ROWGROUP" == CUDF "PAGE" - """ - statistics = str(statistics).upper() - if statistics == "NONE": - return 
cudf_io_types.statistics_freq.STATISTICS_NONE - elif statistics == "STRIPE": - return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP - elif statistics == "ROWGROUP": - return cudf_io_types.statistics_freq.STATISTICS_PAGE - else: - raise ValueError(f"Unsupported `statistics_freq` type {statistics}") - - -@acquire_spill_lock() -def write_orc( - table, - object path_or_buf, - object compression="snappy", - object statistics="ROWGROUP", - object stripe_size_bytes=None, - object stripe_size_rows=None, - object row_index_stride=None, - object cols_as_map_type=None, - object index=None -): - """ - Cython function to call into libcudf API, see `cudf::io::write_orc`. - - See Also - -------- - cudf.read_orc - """ - cdef compression_type compression_ = _get_comp_type(compression) - cdef unique_ptr[data_sink] data_sink_c - cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - cdef table_input_metadata tbl_meta - cdef map[string, string] user_data - user_data[str.encode("pandas")] = str.encode(generate_pandas_metadata( - table, index) - ) - - if index is True or ( - index is None and not isinstance(table._index, cudf.RangeIndex) - ): - tv = table_view_from_table(table) - tbl_meta = table_input_metadata(tv) - for level, idx_name in enumerate(table._index.names): - tbl_meta.column_metadata[level].set_name( - str.encode( - _index_level_name(idx_name, level, table._column_names) - ) - ) - num_index_cols_meta = len(table._index.names) - else: - tv = table_view_from_table(table, ignore_index=True) - tbl_meta = table_input_metadata(tv) - num_index_cols_meta = 0 - - if cols_as_map_type is not None: - cols_as_map_type = set(cols_as_map_type) - - for i, name in enumerate(table._column_names, num_index_cols_meta): - tbl_meta.column_metadata[i].set_name(name.encode()) - _set_col_children_metadata( - table[name]._column, - tbl_meta.column_metadata[i], - (cols_as_map_type is not None) - and (name in cols_as_map_type), - ) - - cdef orc_writer_options c_orc_writer_options = move( - orc_writer_options.builder( - sink_info_c, tv - ).metadata(tbl_meta) - .key_value_metadata(move(user_data)) - .compression(compression_) - .enable_statistics(_get_orc_stat_freq(statistics)) - .build() - ) - if stripe_size_bytes is not None: - c_orc_writer_options.set_stripe_size_bytes(stripe_size_bytes) - if stripe_size_rows is not None: - c_orc_writer_options.set_stripe_size_rows(stripe_size_rows) - if row_index_stride is not None: - c_orc_writer_options.set_row_index_stride(row_index_stride) - - with nogil: - libcudf_write_orc(c_orc_writer_options) - - -cdef int64_t get_skiprows_arg(object arg) except*: - arg = 0 if arg is None else arg - if not isinstance(arg, int) or arg < 0: - raise TypeError("skiprows must be an int >= 0") - return arg - -cdef int64_t get_num_rows_arg(object arg) except*: - arg = -1 if arg is None else arg - if not isinstance(arg, int) or arg < -1: - raise TypeError("num_rows must be an int >= -1") - return arg - - -cdef class ORCWriter: - """ - ORCWriter lets you incrementally write out an ORC file from a series - of cudf tables - - See Also - -------- - cudf.io.orc.to_orc - """ - cdef bool initialized - cdef unique_ptr[orc_chunked_writer] writer - cdef sink_info sink - cdef unique_ptr[data_sink] _data_sink - cdef cudf_io_types.statistics_freq stat_freq - cdef compression_type comp_type - cdef object index - cdef table_input_metadata tbl_meta - cdef object cols_as_map_type - cdef object stripe_size_bytes - cdef object stripe_size_rows - cdef object row_index_stride - - def __cinit__(self, - 
object path, - object index=None, - object compression="snappy", - object statistics="ROWGROUP", - object cols_as_map_type=None, - object stripe_size_bytes=None, - object stripe_size_rows=None, - object row_index_stride=None): - - self.sink = make_sink_info(path, self._data_sink) - self.stat_freq = _get_orc_stat_freq(statistics) - self.comp_type = _get_comp_type(compression) - self.index = index - self.cols_as_map_type = cols_as_map_type \ - if cols_as_map_type is None else set(cols_as_map_type) - self.stripe_size_bytes = stripe_size_bytes - self.stripe_size_rows = stripe_size_rows - self.row_index_stride = row_index_stride - self.initialized = False - - def write_table(self, table): - """ Writes a single table to the file """ - if not self.initialized: - self._initialize_chunked_state(table) - - keep_index = self.index is not False and ( - table._index.name is not None or - isinstance(table._index, cudf.core.multiindex.MultiIndex) - ) - tv = table_view_from_table(table, not keep_index) - - with nogil: - self.writer.get()[0].write(tv) - - def close(self): - if not self.initialized: - return - - with nogil: - self.writer.get()[0].close() - - def __dealloc__(self): - self.close() - - def _initialize_chunked_state(self, table): - """ - Prepares all the values required to build the - chunked_orc_writer_options and creates a writer""" - cdef table_view tv - - num_index_cols_meta = 0 - self.tbl_meta = table_input_metadata( - table_view_from_table(table, ignore_index=True), - ) - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) - for level, idx_name in enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name( - (str.encode(idx_name)) - ) - num_index_cols_meta = len(table._index.names) - else: - if table._index.name is not None: - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) - self.tbl_meta.column_metadata[0].set_name( - str.encode(table._index.name) - ) - num_index_cols_meta = 1 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name.encode()) - _set_col_children_metadata( - table[name]._column, - self.tbl_meta.column_metadata[i], - (self.cols_as_map_type is not None) - and (name in self.cols_as_map_type), - ) - - cdef map[string, string] user_data - pandas_metadata = generate_pandas_metadata(table, self.index) - user_data[str.encode("pandas")] = str.encode(pandas_metadata) - - cdef chunked_orc_writer_options c_opts = move( - chunked_orc_writer_options.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(move(user_data)) - .compression(self.comp_type) - .enable_statistics(self.stat_freq) - .build() - ) - if self.stripe_size_bytes is not None: - c_opts.set_stripe_size_bytes(self.stripe_size_bytes) - if self.stripe_size_rows is not None: - c_opts.set_stripe_size_rows(self.stripe_size_rows) - if self.row_index_stride is not None: - c_opts.set_row_index_stride(self.row_index_stride) - - with nogil: - self.writer.reset(new orc_chunked_writer(c_opts)) - - self.initialized = True - -cdef _set_col_children_metadata(Column col, - column_in_metadata& col_meta, - list_column_as_map=False): - if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - col_meta.child(i).set_name(name.encode()) - _set_col_children_metadata( - child_col, col_meta.child(i), list_column_as_map - ) - elif 
isinstance(col.dtype, cudf.ListDtype): - if list_column_as_map: - col_meta.set_list_column_as_map() - _set_col_children_metadata( - col.children[cpp_lists_column_view.child_column_index], - col_meta.child(cpp_lists_column_view.child_column_index), - list_column_as_map - ) - else: - return diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx deleted file mode 100644 index fa2690c7f21..00000000000 --- a/python/cudf/cudf/_lib/parquet.pyx +++ /dev/null @@ -1,956 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import io - -import pyarrow as pa - -import cudf -from cudf.core.buffer import acquire_spill_lock - -try: - import ujson as json -except ImportError: - import json - -import numpy as np - -from cython.operator cimport dereference - -from cudf.api.types import is_list_like - -from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io - -from cudf._lib.utils import _index_level_name, generate_pandas_metadata - -from libc.stdint cimport int64_t, uint8_t -from libcpp cimport bool -from libcpp.map cimport map -from libcpp.memory cimport make_unique, unique_ptr -from libcpp.string cimport string -from libcpp.unordered_map cimport unordered_map -from libcpp.utility cimport move -from libcpp.vector cimport vector - -cimport pylibcudf.libcudf.io.data_sink as cudf_io_data_sink -cimport pylibcudf.libcudf.io.types as cudf_io_types -from pylibcudf.expressions cimport Expression -from pylibcudf.io.parquet cimport ChunkedParquetReader -from pylibcudf.libcudf.io.parquet cimport ( - chunked_parquet_writer_options, - merge_row_group_metadata as parquet_merge_metadata, - parquet_chunked_writer as cpp_parquet_chunked_writer, - parquet_writer_options, - write_parquet as parquet_writer, -) -from pylibcudf.libcudf.io.parquet_metadata cimport ( - parquet_metadata, - read_parquet_metadata as parquet_metadata_reader, -) -from pylibcudf.libcudf.io.types cimport ( - column_in_metadata, - table_input_metadata, -) -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - add_df_col_struct_names, - make_sinks_info, - make_source_info, -) -from cudf._lib.utils cimport table_view_from_table - -import pylibcudf as plc - -from pylibcudf cimport Table - -from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT - - -cdef class BufferArrayFromVector: - cdef Py_ssize_t length - cdef unique_ptr[vector[uint8_t]] in_vec - - # these two things declare part of the buffer interface - cdef Py_ssize_t shape[1] - cdef Py_ssize_t strides[1] - - @staticmethod - cdef BufferArrayFromVector from_unique_ptr( - unique_ptr[vector[uint8_t]] in_vec - ): - cdef BufferArrayFromVector buf = BufferArrayFromVector() - buf.in_vec = move(in_vec) - buf.length = dereference(buf.in_vec).size() - return buf - - def __getbuffer__(self, Py_buffer *buffer, int flags): - cdef Py_ssize_t itemsize = sizeof(uint8_t) - - self.shape[0] = self.length - self.strides[0] = 1 - - buffer.buf = dereference(self.in_vec).data() - - buffer.format = NULL # byte - buffer.internal = NULL - buffer.itemsize = itemsize - buffer.len = self.length * itemsize # product(shape) * itemsize - buffer.ndim = 1 - buffer.obj = self - buffer.readonly = 0 - buffer.shape = self.shape - buffer.strides = self.strides - buffer.suboffsets = NULL - - def __releasebuffer__(self, Py_buffer *buffer): - pass - - -def _parse_metadata(meta): - file_is_range_index = False - file_index_cols = None - file_column_dtype = None - 
- if 'index_columns' in meta and len(meta['index_columns']) > 0: - file_index_cols = meta['index_columns'] - - if isinstance(file_index_cols[0], dict) and \ - file_index_cols[0]['kind'] == 'range': - file_is_range_index = True - if 'column_indexes' in meta and len(meta['column_indexes']) == 1: - file_column_dtype = meta['column_indexes'][0]["numpy_type"] - return file_is_range_index, file_index_cols, file_column_dtype - - -cdef object _process_metadata(object df, - list names, - dict child_names, - list per_file_user_data, - object row_groups, - object filepaths_or_buffers, - bool allow_range_index, - bool use_pandas_metadata, - size_type nrows=-1, - int64_t skip_rows=0, - ): - - add_df_col_struct_names(df, child_names) - index_col = None - is_range_index = True - column_index_type = None - index_col_names = None - meta = None - for single_file in per_file_user_data: - if b'pandas' not in single_file: - continue - json_str = single_file[b'pandas'].decode('utf-8') - meta = json.loads(json_str) - file_is_range_index, index_col, column_index_type = _parse_metadata(meta) - is_range_index &= file_is_range_index - - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] - - if meta is not None: - # Book keep each column metadata as the order - # of `meta["columns"]` and `column_names` are not - # guaranteed to be deterministic and same always. - meta_data_per_column = { - col_meta['name']: col_meta for col_meta in meta["columns"] - } - - # update the decimal precision of each column - for col in names: - if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): - df._data[col].dtype.precision = ( - meta_data_per_column[col]["metadata"]["precision"] - ) - - # Set the index column - if index_col is not None and len(index_col) > 0: - if is_range_index: - if not allow_range_index: - return df - - if len(per_file_user_data) > 1: - range_index_meta = { - "kind": "range", - "name": None, - "start": 0, - "stop": len(df), - "step": 1 - } - else: - range_index_meta = index_col[0] - - if row_groups is not None: - per_file_metadata = [ - pa.parquet.read_metadata( - # Pyarrow cannot read directly from bytes - io.BytesIO(s) if isinstance(s, bytes) else s - ) for s in filepaths_or_buffers - ] - - filtered_idx = [] - for i, file_meta in enumerate(per_file_metadata): - row_groups_i = [] - start = 0 - for row_group in range(file_meta.num_row_groups): - stop = start + file_meta.row_group(row_group).num_rows - row_groups_i.append((start, stop)) - start = stop - - for rg in row_groups[i]: - filtered_idx.append( - cudf.RangeIndex( - start=row_groups_i[rg][0], - stop=row_groups_i[rg][1], - step=range_index_meta['step'] - ) - ) - - if len(filtered_idx) > 0: - idx = cudf.concat(filtered_idx) - else: - idx = cudf.Index._from_column(cudf.core.column.column_empty(0)) - else: - start = range_index_meta["start"] + skip_rows - stop = range_index_meta["stop"] - if nrows != -1: - stop = start + nrows - idx = cudf.RangeIndex( - start=start, - stop=stop, - step=range_index_meta['step'], - name=range_index_meta['name'] - ) - - df._index = idx - elif set(index_col).issubset(names): - index_data = df[index_col] - actual_index_names = iter(index_col_names.values()) - if index_data._num_columns == 1: - idx = cudf.Index._from_column( - index_data._columns[0], - name=next(actual_index_names) - ) - else: - idx = cudf.MultiIndex.from_frame( - 
index_data, - names=list(actual_index_names) - ) - df.drop(columns=index_col, inplace=True) - df._index = idx - else: - if use_pandas_metadata: - df.index.names = index_col - - if df._num_columns == 0 and column_index_type is not None: - df._data.label_dtype = cudf.dtype(column_index_type) - - return df - - -def read_parquet_chunked( - filepaths_or_buffers, - columns=None, - row_groups=None, - use_pandas_metadata=True, - size_t chunk_read_limit=0, - size_t pass_read_limit=1024000000, - size_type nrows=-1, - int64_t skip_rows=0, - allow_mismatched_pq_schemas=False -): - # Note: If this function ever takes accepts filters - # allow_range_index needs to be False when a filter is passed - # (see read_parquet) - allow_range_index = columns is not None and len(columns) != 0 - - reader = ChunkedParquetReader( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - row_groups, - use_pandas_metadata=use_pandas_metadata, - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - skip_rows=skip_rows, - nrows=nrows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, - ) - - tbl_w_meta = reader.read_chunk() - column_names = tbl_w_meta.column_names(include_children=False) - child_names = tbl_w_meta.child_names - per_file_user_data = tbl_w_meta.per_file_user_data - concatenated_columns = tbl_w_meta.tbl.columns() - - # save memory - del tbl_w_meta - - cdef Table tbl - while reader.has_next(): - tbl = reader.read_chunk().tbl - - for i in range(tbl.num_columns()): - concatenated_columns[i] = plc.concatenate.concatenate( - [concatenated_columns[i], tbl._columns[i]] - ) - # Drop residual columns to save memory - tbl._columns[i] = None - - df = cudf.DataFrame._from_data( - *_data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in concatenated_columns], - column_names=column_names, - index_names=None - ) - ) - df = _process_metadata(df, column_names, child_names, - per_file_user_data, row_groups, - filepaths_or_buffers, - allow_range_index, use_pandas_metadata, - nrows=nrows, skip_rows=skip_rows) - return df - - -cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, - use_pandas_metadata=True, - Expression filters=None, - size_type nrows=-1, - int64_t skip_rows=0, - allow_mismatched_pq_schemas=False): - """ - Cython function to call into libcudf API, see `read_parquet`. - - filters, if not None, should be an Expression that evaluates to a - boolean predicate as a function of columns being read. - - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - - allow_range_index = True - if columns is not None and len(columns) == 0 or filters: - allow_range_index = False - - # Read Parquet - - tbl_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo(filepaths_or_buffers), - columns, - row_groups, - filters, - convert_strings_to_categories = False, - use_pandas_metadata = use_pandas_metadata, - skip_rows = skip_rows, - nrows = nrows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, - ) - - df = cudf.DataFrame._from_data( - *data_from_pylibcudf_io(tbl_w_meta) - ) - - df = _process_metadata(df, tbl_w_meta.column_names(include_children=False), - tbl_w_meta.child_names, tbl_w_meta.per_file_user_data, - row_groups, filepaths_or_buffers, - allow_range_index, use_pandas_metadata, - nrows=nrows, skip_rows=skip_rows) - return df - -cpdef read_parquet_metadata(filepaths_or_buffers): - """ - Cython function to call into libcudf API, see `read_parquet_metadata`. 
- - See Also - -------- - cudf.io.parquet.read_parquet - cudf.io.parquet.to_parquet - """ - cdef cudf_io_types.source_info source = make_source_info(filepaths_or_buffers) - - args = move(source) - - cdef parquet_metadata c_result - - # Read Parquet metadata - with nogil: - c_result = move(parquet_metadata_reader(args)) - - # access and return results - num_rows = c_result.num_rows() - num_rowgroups = c_result.num_rowgroups() - - # extract row group metadata and sanitize keys - row_group_metadata = [{k.decode(): v for k, v in metadata} - for metadata in c_result.rowgroup_metadata()] - - # read all column names including index column, if any - col_names = [info.name().decode() for info in c_result.schema().root().children()] - - # access the Parquet file_footer to find the index - index_col = None - cdef unordered_map[string, string] file_footer = c_result.metadata() - - # get index column name(s) - index_col_names = None - json_str = file_footer[b'pandas'].decode('utf-8') - meta = None - if json_str != "": - meta = json.loads(json_str) - file_is_range_index, index_col, _ = _parse_metadata(meta) - if not file_is_range_index and index_col is not None \ - and index_col_names is None: - index_col_names = {} - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = c['name'] - - # remove the index column from the list of column names - # only if index_col_names is not None - if index_col_names is not None: - col_names = [name for name in col_names if name not in index_col_names] - - # num_columns = length of list(col_names) - num_columns = len(col_names) - - # return the metadata - return num_rows, num_rowgroups, col_names, num_columns, row_group_metadata - - -@acquire_spill_lock() -def write_parquet( - table, - object filepaths_or_buffers, - object index=None, - object compression="snappy", - object statistics="ROWGROUP", - object metadata_file_path=None, - object int96_timestamps=False, - object row_group_size_bytes=None, - object row_group_size_rows=None, - object max_page_size_bytes=None, - object max_page_size_rows=None, - object max_dictionary_size=None, - object partitions_info=None, - object force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - object skip_compression=None, - object column_encoding=None, - object column_type_length=None, - object output_as_binary=None, - write_arrow_schema=False, -): - """ - Cython function to call into libcudf API, see `write_parquet`. 
- - See Also - -------- - cudf.io.parquet.write_parquet - """ - - # Create the write options - cdef table_input_metadata tbl_meta - - cdef vector[map[string, string]] user_data - cdef table_view tv - cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] _data_sinks - cdef cudf_io_types.sink_info sink = make_sinks_info( - filepaths_or_buffers, _data_sinks - ) - - if index is True or ( - index is None and not isinstance(table._index, cudf.RangeIndex) - ): - tv = table_view_from_table(table) - tbl_meta = table_input_metadata(tv) - for level, idx_name in enumerate(table._index.names): - tbl_meta.column_metadata[level].set_name( - str.encode( - _index_level_name(idx_name, level, table._column_names) - ) - ) - num_index_cols_meta = len(table._index.names) - else: - tv = table_view_from_table(table, ignore_index=True) - tbl_meta = table_input_metadata(tv) - num_index_cols_meta = 0 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - if not isinstance(name, str): - if cudf.get_option("mode.pandas_compatible"): - tbl_meta.column_metadata[i].set_name(str(name).encode()) - else: - raise ValueError( - "Writing a Parquet file requires string column names" - ) - else: - tbl_meta.column_metadata[i].set_name(name.encode()) - - _set_col_metadata( - table[name]._column, - tbl_meta.column_metadata[i], - force_nullable_schema, - None, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - - cdef map[string, string] tmp_user_data - if partitions_info is not None: - for start_row, num_row in partitions_info: - partitioned_df = table.iloc[start_row: start_row + num_row].copy( - deep=False - ) - pandas_metadata = generate_pandas_metadata(partitioned_df, index) - tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) - user_data.push_back(tmp_user_data) - tmp_user_data.clear() - else: - pandas_metadata = generate_pandas_metadata(table, index) - tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) - user_data.push_back(tmp_user_data) - - if header_version not in ("1.0", "2.0"): - raise ValueError( - f"Invalid parquet header version: {header_version}. 
" - "Valid values are '1.0' and '2.0'" - ) - - cdef cudf_io_types.dictionary_policy dict_policy = ( - cudf_io_types.dictionary_policy.ADAPTIVE - if use_dictionary - else cudf_io_types.dictionary_policy.NEVER - ) - - cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) - cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) - - cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef vector[string] c_column_chunks_file_paths - cdef bool _int96_timestamps = int96_timestamps - cdef vector[cudf_io_types.partition_info] partitions - - # Perform write - cdef parquet_writer_options args = move( - parquet_writer_options.builder(sink, tv) - .metadata(tbl_meta) - .key_value_metadata(move(user_data)) - .compression(comp_type) - .stats_level(stat_freq) - .int96_timestamps(_int96_timestamps) - .write_v2_headers(header_version == "2.0") - .dictionary_policy(dict_policy) - .utc_timestamps(False) - .write_arrow_schema(write_arrow_schema) - .build() - ) - if partitions_info is not None: - partitions.reserve(len(partitions_info)) - for part in partitions_info: - partitions.push_back( - cudf_io_types.partition_info(part[0], part[1]) - ) - args.set_partitions(move(partitions)) - if metadata_file_path is not None: - if is_list_like(metadata_file_path): - for path in metadata_file_path: - c_column_chunks_file_paths.push_back(str.encode(path)) - else: - c_column_chunks_file_paths.push_back( - str.encode(metadata_file_path) - ) - args.set_column_chunks_file_paths(move(c_column_chunks_file_paths)) - if row_group_size_bytes is not None: - args.set_row_group_size_bytes(row_group_size_bytes) - if row_group_size_rows is not None: - args.set_row_group_size_rows(row_group_size_rows) - if max_page_size_bytes is not None: - args.set_max_page_size_bytes(max_page_size_bytes) - if max_page_size_rows is not None: - args.set_max_page_size_rows(max_page_size_rows) - if max_dictionary_size is not None: - args.set_max_dictionary_size(max_dictionary_size) - - with nogil: - out_metadata_c = move(parquet_writer(args)) - - if metadata_file_path is not None: - out_metadata_py = BufferArrayFromVector.from_unique_ptr( - move(out_metadata_c) - ) - return np.asarray(out_metadata_py) - else: - return None - - -cdef class ParquetWriter: - """ - ParquetWriter lets you incrementally write out a Parquet file from a series - of cudf tables - - Parameters - ---------- - filepath_or_buffer : str, io.IOBase, os.PathLike, or list - File path or buffer to write to. The argument may also correspond - to a list of file paths or buffers. - index : bool or None, default None - If ``True``, include a dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. If ``None``, - index(es) other than RangeIndex will be saved as columns. - compression : {'snappy', None}, default 'snappy' - Name of the compression to use. Use ``None`` for no compression. - statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' - Level at which column statistics should be included in file. - row_group_size_bytes: int, default ``uint64 max`` - Maximum size of each stripe of the output. - By default, a virtually infinite size equal to ``uint64 max`` will be used. - row_group_size_rows: int, default 1000000 - Maximum number of rows of each stripe of the output. - By default, 1000000 (10^6 rows) will be used. - max_page_size_bytes: int, default 524288 - Maximum uncompressed size of each page of the output. - By default, 524288 (512KB) will be used. 
- max_page_size_rows: int, default 20000 - Maximum number of rows of each page of the output. - By default, 20000 will be used. - max_dictionary_size: int, default 1048576 - Maximum size of the dictionary page for each output column chunk. Dictionary - encoding for column chunks that exceeds this limit will be disabled. - By default, 1048576 (1MB) will be used. - use_dictionary : bool, default True - If ``True``, enable dictionary encoding for Parquet page data - subject to ``max_dictionary_size`` constraints. - If ``False``, disable dictionary encoding for Parquet page data. - store_schema : bool, default False - If ``True``, enable computing and writing arrow schema to Parquet - file footer's key-value metadata section for faithful round-tripping. - See Also - -------- - cudf.io.parquet.write_parquet - """ - cdef bool initialized - cdef unique_ptr[cpp_parquet_chunked_writer] writer - cdef table_input_metadata tbl_meta - cdef cudf_io_types.sink_info sink - cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] _data_sink - cdef cudf_io_types.statistics_freq stat_freq - cdef cudf_io_types.compression_type comp_type - cdef object index - cdef size_t row_group_size_bytes - cdef size_type row_group_size_rows - cdef size_t max_page_size_bytes - cdef size_type max_page_size_rows - cdef size_t max_dictionary_size - cdef cudf_io_types.dictionary_policy dict_policy - cdef bool write_arrow_schema - - def __cinit__(self, object filepath_or_buffer, object index=None, - object compression="snappy", str statistics="ROWGROUP", - size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT, - size_type row_group_size_rows=1000000, - size_t max_page_size_bytes=524288, - size_type max_page_size_rows=20000, - size_t max_dictionary_size=1048576, - bool use_dictionary=True, - bool store_schema=False): - filepaths_or_buffers = ( - list(filepath_or_buffer) - if is_list_like(filepath_or_buffer) - else [filepath_or_buffer] - ) - self.sink = make_sinks_info(filepaths_or_buffers, self._data_sink) - self.stat_freq = _get_stat_freq(statistics) - self.comp_type = _get_comp_type(compression) - self.index = index - self.initialized = False - self.row_group_size_bytes = row_group_size_bytes - self.row_group_size_rows = row_group_size_rows - self.max_page_size_bytes = max_page_size_bytes - self.max_page_size_rows = max_page_size_rows - self.max_dictionary_size = max_dictionary_size - self.dict_policy = ( - cudf_io_types.dictionary_policy.ADAPTIVE - if use_dictionary - else cudf_io_types.dictionary_policy.NEVER - ) - self.write_arrow_schema = store_schema - - def write_table(self, table, object partitions_info=None): - """ Writes a single table to the file """ - if not self.initialized: - self._initialize_chunked_state( - table, - num_partitions=len(partitions_info) if partitions_info else 1 - ) - - cdef table_view tv - if self.index is not False and ( - table._index.name is not None or - isinstance(table._index, cudf.core.multiindex.MultiIndex)): - tv = table_view_from_table(table) - else: - tv = table_view_from_table(table, ignore_index=True) - - cdef vector[cudf_io_types.partition_info] partitions - if partitions_info is not None: - for part in partitions_info: - partitions.push_back( - cudf_io_types.partition_info(part[0], part[1]) - ) - - with nogil: - self.writer.get()[0].write(tv, partitions) - - def close(self, object metadata_file_path=None): - cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef vector[string] column_chunks_file_paths - - if not self.initialized: - return None - - # Update metadata-collection 
options - if metadata_file_path is not None: - if is_list_like(metadata_file_path): - for path in metadata_file_path: - column_chunks_file_paths.push_back(str.encode(path)) - else: - column_chunks_file_paths.push_back( - str.encode(metadata_file_path) - ) - - with nogil: - out_metadata_c = move( - self.writer.get()[0].close(column_chunks_file_paths) - ) - - if metadata_file_path is not None: - out_metadata_py = BufferArrayFromVector.from_unique_ptr( - move(out_metadata_c) - ) - return np.asarray(out_metadata_py) - return None - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - def _initialize_chunked_state(self, table, num_partitions=1): - """ Prepares all the values required to build the - chunked_parquet_writer_options and creates a writer""" - cdef table_view tv - - # Set the table_metadata - num_index_cols_meta = 0 - self.tbl_meta = table_input_metadata( - table_view_from_table(table, ignore_index=True)) - if self.index is not False: - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) - for level, idx_name in enumerate(table._index.names): - self.tbl_meta.column_metadata[level].set_name( - (str.encode(idx_name)) - ) - num_index_cols_meta = len(table._index.names) - else: - if table._index.name is not None: - tv = table_view_from_table(table) - self.tbl_meta = table_input_metadata(tv) - self.tbl_meta.column_metadata[0].set_name( - str.encode(table._index.name) - ) - num_index_cols_meta = 1 - - for i, name in enumerate(table._column_names, num_index_cols_meta): - self.tbl_meta.column_metadata[i].set_name(name.encode()) - _set_col_metadata( - table[name]._column, - self.tbl_meta.column_metadata[i], - ) - - index = ( - False if isinstance(table._index, cudf.RangeIndex) else self.index - ) - pandas_metadata = generate_pandas_metadata(table, index) - cdef map[string, string] tmp_user_data - tmp_user_data[str.encode("pandas")] = str.encode(pandas_metadata) - cdef vector[map[string, string]] user_data - user_data = vector[map[string, string]](num_partitions, tmp_user_data) - - cdef chunked_parquet_writer_options args - with nogil: - args = move( - chunked_parquet_writer_options.builder(self.sink) - .metadata(self.tbl_meta) - .key_value_metadata(move(user_data)) - .compression(self.comp_type) - .stats_level(self.stat_freq) - .row_group_size_bytes(self.row_group_size_bytes) - .row_group_size_rows(self.row_group_size_rows) - .max_page_size_bytes(self.max_page_size_bytes) - .max_page_size_rows(self.max_page_size_rows) - .max_dictionary_size(self.max_dictionary_size) - .write_arrow_schema(self.write_arrow_schema) - .build() - ) - args.set_dictionary_policy(self.dict_policy) - self.writer.reset(new cpp_parquet_chunked_writer(args)) - self.initialized = True - - -cpdef merge_filemetadata(object filemetadata_list): - """ - Cython function to call into libcudf API, see `merge_row_group_metadata`. 
- - See Also - -------- - cudf.io.parquet.merge_row_group_metadata - """ - cdef vector[unique_ptr[vector[uint8_t]]] list_c - cdef vector[uint8_t] blob_c - cdef unique_ptr[vector[uint8_t]] output_c - - for blob_py in filemetadata_list: - blob_c = blob_py - list_c.push_back(move(make_unique[vector[uint8_t]](blob_c))) - - with nogil: - output_c = move(parquet_merge_metadata(list_c)) - - out_metadata_py = BufferArrayFromVector.from_unique_ptr(move(output_c)) - return np.asarray(out_metadata_py) - - -cdef cudf_io_types.statistics_freq _get_stat_freq(object statistics): - statistics = str(statistics).upper() - if statistics == "NONE": - return cudf_io_types.statistics_freq.STATISTICS_NONE - elif statistics == "ROWGROUP": - return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP - elif statistics == "PAGE": - return cudf_io_types.statistics_freq.STATISTICS_PAGE - elif statistics == "COLUMN": - return cudf_io_types.statistics_freq.STATISTICS_COLUMN - else: - raise ValueError("Unsupported `statistics_freq` type") - - -cdef cudf_io_types.compression_type _get_comp_type(object compression): - if compression is None: - return cudf_io_types.compression_type.NONE - - compression = str(compression).upper() - if compression == "SNAPPY": - return cudf_io_types.compression_type.SNAPPY - elif compression == "ZSTD": - return cudf_io_types.compression_type.ZSTD - elif compression == "LZ4": - return cudf_io_types.compression_type.LZ4 - else: - raise ValueError("Unsupported `compression` type") - - -cdef cudf_io_types.column_encoding _get_encoding_type(object encoding): - if encoding is None: - return cudf_io_types.column_encoding.USE_DEFAULT - - enc = str(encoding).upper() - if enc == "PLAIN": - return cudf_io_types.column_encoding.PLAIN - elif enc == "DICTIONARY": - return cudf_io_types.column_encoding.DICTIONARY - elif enc == "DELTA_BINARY_PACKED": - return cudf_io_types.column_encoding.DELTA_BINARY_PACKED - elif enc == "DELTA_LENGTH_BYTE_ARRAY": - return cudf_io_types.column_encoding.DELTA_LENGTH_BYTE_ARRAY - elif enc == "DELTA_BYTE_ARRAY": - return cudf_io_types.column_encoding.DELTA_BYTE_ARRAY - elif enc == "BYTE_STREAM_SPLIT": - return cudf_io_types.column_encoding.BYTE_STREAM_SPLIT - elif enc == "USE_DEFAULT": - return cudf_io_types.column_encoding.USE_DEFAULT - else: - raise ValueError("Unsupported `column_encoding` type") - - -cdef _set_col_metadata( - Column col, - column_in_metadata& col_meta, - bool force_nullable_schema=False, - str path=None, - object skip_compression=None, - object column_encoding=None, - object column_type_length=None, - object output_as_binary=None, -): - need_path = (skip_compression is not None or column_encoding is not None or - column_type_length is not None or output_as_binary is not None) - name = col_meta.get_name().decode('UTF-8') if need_path else None - full_path = path + "." + name if path is not None else name - - if force_nullable_schema: - # Only set nullability if `force_nullable_schema` - # is true. 
- col_meta.set_nullability(True) - - if skip_compression is not None and full_path in skip_compression: - col_meta.set_skip_compression(True) - - if column_encoding is not None and full_path in column_encoding: - col_meta.set_encoding(_get_encoding_type(column_encoding[full_path])) - - if column_type_length is not None and full_path in column_type_length: - col_meta.set_output_as_binary(True) - col_meta.set_type_length(column_type_length[full_path]) - - if output_as_binary is not None and full_path in output_as_binary: - col_meta.set_output_as_binary(True) - - if isinstance(col.dtype, cudf.StructDtype): - for i, (child_col, name) in enumerate( - zip(col.children, list(col.dtype.fields)) - ): - col_meta.child(i).set_name(name.encode()) - _set_col_metadata( - child_col, - col_meta.child(i), - force_nullable_schema, - full_path, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - elif isinstance(col.dtype, cudf.ListDtype): - if full_path is not None: - full_path = full_path + ".list" - col_meta.child(1).set_name("element".encode()) - _set_col_metadata( - col.children[1], - col_meta.child(1), - force_nullable_schema, - full_path, - skip_compression, - column_encoding, - column_type_length, - output_as_binary - ) - elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): - col_meta.set_decimal_precision(col.dtype.precision) diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx deleted file mode 100644 index 13997da8403..00000000000 --- a/python/cudf/cudf/_lib/partitioning.pyx +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - -from cudf._lib.reduce import minmax -from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count - - -@acquire_spill_lock() -def partition(list source_columns, Column partition_map, - object num_partitions): - """Partition source columns given a partitioning map - - Parameters - ---------- - source_columns: list[Column] - Columns to partition - partition_map: Column - Column of integer values that map each row in the input to a - partition - num_partitions: Optional[int] - Number of output partitions (deduced from unique values in - partition_map if None) - - Returns - ------- - Pair of reordered columns and partition offsets - - Raises - ------ - ValueError - If the partition map has invalid entries (not all in [0, - num_partitions)). - """ - - if num_partitions is None: - num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True) - - if partition_map.size > 0: - lo, hi = minmax(partition_map) - if lo < 0 or hi >= num_partitions: - raise ValueError("Partition map has invalid values") - - plc_table, offsets = plc.partitioning.partition( - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), - partition_map.to_pylibcudf(mode="read"), - num_partitions - ) - return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx deleted file mode 100644 index 7666b7ff8da..00000000000 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool -from libcpp.vector cimport vector - -from cudf._lib.column cimport Column -from cudf._lib.types cimport ( - underlying_type_t_interpolation, - underlying_type_t_sorted, -) - -from cudf._lib.types import Interpolation - -from pylibcudf.libcudf.types cimport interpolation, sorted - -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf as plc - - -@acquire_spill_lock() -def quantile( - Column input, - vector[double] q, - str interp, - Column ordered_indices, - bool exact, -): - cdef interpolation c_interp = ( - Interpolation[interp.upper()] - ) - - return Column.from_pylibcudf( - plc.quantiles.quantile( - input.to_pylibcudf(mode="read"), - q, - c_interp, - ordered_indices.to_pylibcudf(mode="read"), - exact - ) - ) - - -def quantile_table( - list source_columns, - vector[double] q, - object interp, - object is_input_sorted, - list column_order, - list null_precedence, -): - - cdef interpolation c_interp = ( - interp - ) - cdef sorted c_is_input_sorted = ( - is_input_sorted - ) - - return columns_from_pylibcudf_table( - plc.quantiles.quantiles( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]), - q, - c_interp, - c_is_input_sorted, - column_order, - null_precedence - ) - ) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx deleted file mode 100644 index 944753d28b8..00000000000 --- a/python/cudf/cudf/_lib/reduce.pyx +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -import warnings - -import cudf -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id - -import pylibcudf - -from cudf._lib.aggregation import make_aggregation - - -@acquire_spill_lock() -def reduce(reduction_op, Column incol, dtype=None, **kwargs): - """ - Top level Cython reduce function wrapping libcudf reductions. - - Parameters - ---------- - reduction_op : string - A string specifying the operation, e.g. sum, prod - incol : Column - A cuDF Column object - dtype: numpy.dtype, optional - A numpy data type to use for the output, defaults - to the same type as the input column - """ - if dtype is not None: - warnings.warn( - "dtype is deprecated and will be remove in a future release. " - "Cast the result (e.g. 
.astype) after the operation instead.", - FutureWarning - ) - col_dtype = dtype - else: - col_dtype = incol._reduction_result_dtype(reduction_op) - - # check empty case - if len(incol) <= incol.null_count: - if reduction_op == 'sum' or reduction_op == 'sum_of_squares': - return incol.dtype.type(0) - if reduction_op == 'product': - return incol.dtype.type(1) - if reduction_op == "any": - return False - - return cudf.utils.dtypes._get_nan_for_dtype(col_dtype) - - result = pylibcudf.reduce.reduce( - incol.to_pylibcudf(mode="read"), - make_aggregation(reduction_op, kwargs).c_obj, - dtype_to_pylibcudf_type(col_dtype), - ) - - if is_decimal_type_id(result.type().id()): - scale = -result.type().scale() - precision = _reduce_precision(col_dtype, reduction_op, len(incol)) - return DeviceScalar.from_pylibcudf( - result, - dtype=col_dtype.__class__(precision, scale), - ).value - scalar = DeviceScalar.from_pylibcudf(result).value - if isinstance(col_dtype, cudf.StructDtype): - # TODO: Utilize column_metadata in libcudf to maintain field labels - return dict(zip(col_dtype.fields.keys(), scalar.values())) - return scalar - - -@acquire_spill_lock() -def scan(scan_op, Column incol, inclusive, **kwargs): - """ - Top level Cython scan function wrapping libcudf scans. - - Parameters - ---------- - incol : Column - A cuDF Column object - scan_op : string - A string specifying the operation, e.g. cumprod - inclusive: bool - Flag for including nulls in relevant scan - """ - return Column.from_pylibcudf( - pylibcudf.reduce.scan( - incol.to_pylibcudf(mode="read"), - make_aggregation(scan_op, kwargs).c_obj, - pylibcudf.reduce.ScanType.INCLUSIVE if inclusive - else pylibcudf.reduce.ScanType.EXCLUSIVE, - ) - ) - - -@acquire_spill_lock() -def minmax(Column incol): - """ - Top level Cython minmax function wrapping libcudf minmax. - - Parameters - ---------- - incol : Column - A cuDF Column object - - Returns - ------- - A pair of ``(min, max)`` values of ``incol`` - """ - min, max = pylibcudf.reduce.minmax(incol.to_pylibcudf(mode="read")) - return ( - cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(min)), - cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(max)), - ) - - -def _reduce_precision(dtype, op, nrows): - """ - Returns the result precision when performing the reduce - operation `op` for the given dtype and column size. - - See: https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - """ # noqa: E501 - p = dtype.precision - if op in ("min", "max"): - new_p = p - elif op == "sum": - new_p = p + nrows - 1 - elif op == "product": - new_p = p * nrows + nrows - 1 - elif op == "sum_of_squares": - new_p = 2 * p + nrows - else: - raise NotImplementedError() - return max(min(new_p, dtype.MAX_PRECISION), 0) diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx deleted file mode 100644 index b50c6dd25e3..00000000000 --- a/python/cudf/cudf/_lib/replace.pyx +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.api.types import is_scalar -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf - -from cudf._lib.scalar import as_device_scalar - - -@acquire_spill_lock() -def replace(Column input_col, Column values_to_replace, - Column replacement_values): - """ - Replaces values from values_to_replace with corresponding value from - replacement_values in input_col - - Parameters - ---------- - input_col : Column whose value will be updated - values_to_replace : Column with values which needs to be replaced - replacement_values : Column with values which will replace - """ - - return Column.from_pylibcudf( - pylibcudf.replace.find_and_replace_all( - input_col.to_pylibcudf(mode="read"), - values_to_replace.to_pylibcudf(mode="read"), - replacement_values.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def replace_nulls_column(Column input_col, Column replacement_values): - """ - Replaces null values in input_col with corresponding values from - replacement_values - - Parameters - ---------- - input_col : Column whose value will be updated - replacement_values : Column with values which will replace nulls - """ - return Column.from_pylibcudf( - pylibcudf.replace.replace_nulls( - input_col.to_pylibcudf(mode="read"), - replacement_values.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def replace_nulls_scalar(Column input_col, DeviceScalar replacement_value): - """ - Replaces null values in input_col with replacement_value - - Parameters - ---------- - input_col : Column whose value will be updated - replacement_value : DeviceScalar with value which will replace nulls - """ - return Column.from_pylibcudf( - pylibcudf.replace.replace_nulls( - input_col.to_pylibcudf(mode="read"), - replacement_value.c_value, - ) - ) - - -@acquire_spill_lock() -def replace_nulls_fill(Column input_col, object method): - """ - Replaces null values in input_col with replacement_value - - Parameters - ---------- - input_col : Column whose value will be updated - method : 'ffill' or 'bfill' - """ - return Column.from_pylibcudf( - pylibcudf.replace.replace_nulls( - input_col.to_pylibcudf(mode="read"), - pylibcudf.replace.ReplacePolicy.PRECEDING - if method == 'ffill' - else pylibcudf.replace.ReplacePolicy.FOLLOWING, - ) - ) - - -def replace_nulls( - Column input_col, - object replacement=None, - object method=None, - object dtype=None -): - """ - Calls one of the version of replace_nulls depending on type - of replacement - """ - - if replacement is None and method is None: - raise ValueError("Must specify a fill 'value' or 'method'.") - - if replacement and method: - raise ValueError("Cannot specify both 'value' and 'method'.") - - if method: - return replace_nulls_fill(input_col, method) - elif is_scalar(replacement): - return replace_nulls_scalar( - input_col, - as_device_scalar(replacement, dtype=dtype) - ) - else: - return replace_nulls_column(input_col, replacement) - - -@acquire_spill_lock() -def clamp(Column input_col, DeviceScalar lo, DeviceScalar hi): - """ - Clip the input_col such that values < lo will be replaced by lo - and > hi will be replaced by hi - - Parameters - ---------- - input_col : Column whose value will be updated - lo : DeviceScalar value for clipping lower values - hi : DeviceScalar value for clipping upper values - """ - return Column.from_pylibcudf( - pylibcudf.replace.clamp( - input_col.to_pylibcudf(mode="read"), - lo.c_value, - hi.c_value, - ) - ) - - 
-@acquire_spill_lock() -def clip(Column input_col, object lo, object hi): - """ - Clip the input_col such that values < lo will be replaced by lo - and > hi will be replaced by hi - """ - - lo_scalar = as_device_scalar(lo, dtype=input_col.dtype) - hi_scalar = as_device_scalar(hi, dtype=input_col.dtype) - - return clamp(input_col, lo_scalar, hi_scalar) - - -@acquire_spill_lock() -def normalize_nans_and_zeros_inplace(Column input_col): - """ - Inplace normalizing - """ - pylibcudf.replace.normalize_nans_and_zeros( - input_col.to_pylibcudf(mode="write"), inplace=True - ) - - -@acquire_spill_lock() -def normalize_nans_and_zeros_column(Column input_col): - """ - Returns a new normalized Column - """ - return Column.from_pylibcudf( - pylibcudf.replace.normalize_nans_and_zeros( - input_col.to_pylibcudf(mode="read") - ) - ) - - -def normalize_nans_and_zeros(Column input_col, in_place=False): - """ - Normalize the NaN and zeros in input_col - Convert -NaN -> NaN - Convert -0.0 -> 0.0 - - Parameters - ---------- - input_col : Column that needs to be normalized - in_place : boolean whether to normalize in place or return new column - """ - - if in_place is True: - normalize_nans_and_zeros_inplace(input_col) - else: - return normalize_nans_and_zeros_column(input_col) diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx deleted file mode 100644 index 6cebeb2bc16..00000000000 --- a/python/cudf/cudf/_lib/reshape.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf as plc - - -@acquire_spill_lock() -def interleave_columns(list source_columns): - return Column.from_pylibcudf( - plc.reshape.interleave_columns( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]) - ) - ) - - -@acquire_spill_lock() -def tile(list source_columns, size_type count): - cdef size_type c_count = count - - return columns_from_pylibcudf_table( - plc.reshape.tile( - plc.Table([ - c.to_pylibcudf(mode="read") for c in source_columns - ]), - c_count - ) - ) diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx deleted file mode 100644 index 687b261c2c7..00000000000 --- a/python/cudf/cudf/_lib/rolling.pyx +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf - -from cudf._lib.aggregation import make_aggregation - - -@acquire_spill_lock() -def rolling(Column source_column, - Column pre_column_window, - Column fwd_column_window, - window, - min_periods, - center, - op, - agg_params): - """ - Rolling on input executing operation within the given window for each row - - Parameters - ---------- - source_column : input column on which rolling operation is executed - pre_column_window : prior window for each element of source_column - fwd_column_window : forward window for each element of source_column - window : Size of the moving window, can be integer or None - min_periods : Minimum number of observations in window required to have - a value (otherwise result is null) - center : Set the labels at the center of the window - op : operation to be executed - agg_params : dict, parameter for the aggregation (e.g. 
ddof for VAR/STD) - - Returns - ------- - A Column with rolling calculations - """ - - if window is None: - if center: - # TODO: we can support this even though Pandas currently does not - raise NotImplementedError( - "center is not implemented for offset-based windows" - ) - pre = pre_column_window.to_pylibcudf(mode="read") - fwd = fwd_column_window.to_pylibcudf(mode="read") - else: - if center: - pre = (window // 2) + 1 - fwd = window - (pre) - else: - pre = window - fwd = 0 - - return Column.from_pylibcudf( - pylibcudf.rolling.rolling_window( - source_column.to_pylibcudf(mode="read"), - pre, - fwd, - min_periods, - make_aggregation( - op, {'dtype': source_column.dtype} if callable(op) else agg_params - ).c_obj, - ) - ) diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx deleted file mode 100644 index f961c09e6f6..00000000000 --- a/python/cudf/cudf/_lib/round.pyx +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc -from pylibcudf.round import RoundingMethod - - -@acquire_spill_lock() -def round(Column input_col, int decimal_places=0, how="half_even"): - """ - Round column values to the given number of decimal places - - Parameters - ---------- - input_col : Column whose values will be rounded - decimal_places : The number or decimal places to round to - - Returns - ------- - A Column with values rounded to the given number of decimal places - """ - if how not in {"half_even", "half_up"}: - raise ValueError("'how' must be either 'half_even' or 'half_up'") - - how = ( - RoundingMethod.HALF_EVEN if how == "half_even" - else RoundingMethod.HALF_UP - ) - - return Column.from_pylibcudf( - plc.round.round( - input_col.to_pylibcudf(mode="read"), - decimal_places, - how - ) - ) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd deleted file mode 100644 index 27095ca02d4..00000000000 --- a/python/cudf/cudf/_lib/scalar.pxd +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr - -from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource - - -cdef class DeviceScalar: - cdef public object c_value - - cdef object _dtype - - cdef const scalar* get_raw_ptr(self) except * - - @staticmethod - cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=*) - - @staticmethod - cdef DeviceScalar from_pylibcudf(pscalar, dtype=*) - - cdef void _set_dtype(self, dtype=*) - - cpdef bool is_valid(DeviceScalar s) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0dde91316fb..e4b828565bf 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,382 +1,5 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import copy - -import numpy as np -import pandas as pd -import pyarrow as pa - -from libc.stdint cimport int64_t -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - import pylibcudf +from rmm._lib.memory_resource cimport DeviceMemoryResource -import cudf -from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ListDtype, StructDtype -from cudf.core.missing import NA, NaT - -cimport pylibcudf.libcudf.types as libcudf_types -# We currently need this cimport because some of the implementations here -# access the c_obj of the scalar, and because we need to be able to call -# pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until -# DeviceScalar is phased out entirely from cuDF Cython (at which point -# cudf.Scalar will be directly backed by pylibcudf.Scalar). -from pylibcudf cimport Scalar as plc_Scalar -from pylibcudf.libcudf.scalar.scalar cimport ( - duration_scalar, - list_scalar, - scalar, - struct_scalar, - timestamp_scalar, -) -from pylibcudf.libcudf.wrappers.durations cimport ( - duration_ms, - duration_ns, - duration_s, - duration_us, -) -from pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_ns, - timestamp_s, - timestamp_us, -) - -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id - - -def _replace_nested(obj, check, replacement): - if isinstance(obj, list): - for i, item in enumerate(obj): - if check(item): - obj[i] = replacement - elif isinstance(item, (dict, list)): - _replace_nested(item, check, replacement) - elif isinstance(obj, dict): - for k, v in obj.items(): - if check(v): - obj[k] = replacement - elif isinstance(v, (dict, list)): - _replace_nested(v, check, replacement) - - -def gather_metadata(dtypes): - """Convert a dict of dtypes to a list of ColumnMetadata objects. - - The metadata is constructed recursively so that nested types are - represented as nested ColumnMetadata objects. - - Parameters - ---------- - dtypes : dict - A dict mapping column names to dtypes. - - Returns - ------- - List[ColumnMetadata] - A list of ColumnMetadata objects. - """ - out = [] - for name, dtype in dtypes.items(): - v = pylibcudf.interop.ColumnMetadata(name) - if isinstance(dtype, cudf.StructDtype): - v.children_meta = gather_metadata(dtype.fields) - elif isinstance(dtype, cudf.ListDtype): - # Offsets column is unnamed and has no children - v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) - v.children_meta.extend( - gather_metadata({"": dtype.element_type}) - ) - out.append(v) - return out - - -cdef class DeviceScalar: - - # TODO: I think this should be removable, except that currently the way - # that from_unique_ptr is implemented is probably dereferencing this in an - # invalid state. See what the best way to fix that is. - def __cinit__(self, *args, **kwargs): - self.c_value = pylibcudf.Scalar.__new__(pylibcudf.Scalar) - - def __init__(self, value, dtype): - """ - Type representing an *immutable* scalar value on the device - - Parameters - ---------- - value : scalar - An object of scalar type, i.e., one for which - `np.isscalar()` returns `True`. Can also be `None`, - to represent a "null" scalar. In this case, - dtype *must* be provided. - dtype : dtype - A NumPy dtype. 
- """ - dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') - - if cudf.utils.utils.is_na_like(value): - value = None - else: - # TODO: For now we always deepcopy the input value to avoid - # overwriting the input values when replacing nulls. Since it's - # just host values it's not that expensive, but we could consider - # alternatives. - value = copy.deepcopy(value) - _replace_nested(value, cudf.utils.utils.is_na_like, None) - - if isinstance(dtype, cudf.core.dtypes._BaseDtype): - pa_type = dtype.to_arrow() - elif pd.api.types.is_string_dtype(dtype): - # Have to manually convert object types, which we use internally - # for strings but pyarrow only supports as unicode 'U' - pa_type = pa.string() - else: - pa_type = pa.from_numpy_dtype(dtype) - - if isinstance(pa_type, pa.ListType) and value is None: - # pyarrow doesn't correctly handle None values for list types, so - # we have to create this one manually. - # https://github.com/apache/arrow/issues/40319 - pa_array = pa.array([None], type=pa_type) - else: - pa_array = pa.array([pa.scalar(value, type=pa_type)]) - - pa_table = pa.Table.from_arrays([pa_array], names=[""]) - table = pylibcudf.interop.from_arrow(pa_table) - - column = table.columns()[0] - if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): - column = pylibcudf.unary.cast( - column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL32, -dtype.scale) - ) - elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): - column = pylibcudf.unary.cast( - column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL64, -dtype.scale) - ) - - self.c_value = pylibcudf.copying.get_element(column, 0) - self._dtype = dtype - - def _to_host_scalar(self): - is_datetime = self.dtype.kind == "M" - is_timedelta = self.dtype.kind == "m" - - null_type = NaT if is_datetime or is_timedelta else NA - - metadata = gather_metadata({"": self.dtype})[0] - ps = pylibcudf.interop.to_arrow(self.c_value, metadata) - if not ps.is_valid: - return null_type - - # TODO: The special handling of specific types below does not currently - # extend to nested types containing those types (e.g. List[timedelta] - # where the timedelta would overflow). We should eventually account for - # those cases, but that will require more careful consideration of how - # to traverse the contents of the nested data. - if is_datetime or is_timedelta: - time_unit, _ = np.datetime_data(self.dtype) - # Cast to int64 to avoid overflow - ps_cast = ps.cast('int64').as_py() - out_type = np.datetime64 if is_datetime else np.timedelta64 - ret = out_type(ps_cast, time_unit) - elif cudf.api.types.is_numeric_dtype(self.dtype): - ret = ps.type.to_pandas_dtype()(ps.as_py()) - else: - ret = ps.as_py() - - _replace_nested(ret, lambda item: item is None, NA) - return ret - - @property - def dtype(self): - """ - The NumPy dtype corresponding to the data type of the underlying - device scalar. - """ - return self._dtype - - @property - def value(self): - """ - Returns a host copy of the underlying device scalar. - """ - return self._to_host_scalar() - - cdef const scalar* get_raw_ptr(self) except *: - return ( self.c_value).c_obj.get() - - cpdef bool is_valid(self): - """ - Returns if the Scalar is valid or not(i.e., ). 
- """ - return self.c_value.is_valid() - - def __repr__(self): - if cudf.utils.utils.is_na_like(self.value): - return ( - f"{self.__class__.__name__}" - f"({self.value}, {repr(self.dtype)})" - ) - else: - return f"{self.__class__.__name__}({repr(self.value)})" - - @staticmethod - cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=None): - """ - Construct a Scalar object from a unique_ptr. - """ - cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) - # Note: This line requires pylibcudf to be cimported - s.c_value = plc_Scalar.from_libcudf(move(ptr)) - s._set_dtype(dtype) - return s - - @staticmethod - cdef DeviceScalar from_pylibcudf(pscalar, dtype=None): - cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) - s.c_value = pscalar - s._set_dtype(dtype) - return s - - cdef void _set_dtype(self, dtype=None): - cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type() - - if dtype is not None: - self._dtype = dtype - elif cdtype.id() in { - libcudf_types.type_id.DECIMAL32, - libcudf_types.type_id.DECIMAL64, - libcudf_types.type_id.DECIMAL128, - }: - raise TypeError( - "Must pass a dtype when constructing from a fixed-point scalar" - ) - elif cdtype.id() == libcudf_types.type_id.STRUCT: - struct_table_view = (self.get_raw_ptr())[0].view() - self._dtype = StructDtype({ - str(i): dtype_from_column_view(struct_table_view.column(i)) - for i in range(struct_table_view.num_columns()) - }) - elif cdtype.id() == libcudf_types.type_id.LIST: - if ( - self.get_raw_ptr() - )[0].view().type().id() == libcudf_types.type_id.LIST: - self._dtype = dtype_from_column_view( - (self.get_raw_ptr())[0].view() - ) - else: - self._dtype = ListDtype( - LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - ( - (self.get_raw_ptr())[0] - .view().type().id() - ) - ] - ) - else: - self._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (cdtype.id()) - ] - - -# TODO: Currently the only uses of this function and the one below are in -# _create_proxy_nat_scalar. See if that code path can be simplified to excise -# or at least simplify these implementations. 
-cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "datetime64[s]": - s.reset( - new timestamp_scalar[timestamp_s](np.int64(value), valid) - ) - elif dtype == "datetime64[ms]": - s.reset( - new timestamp_scalar[timestamp_ms](np.int64(value), valid) - ) - elif dtype == "datetime64[us]": - s.reset( - new timestamp_scalar[timestamp_us](np.int64(value), valid) - ) - elif dtype == "datetime64[ns]": - s.reset( - new timestamp_scalar[timestamp_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - -cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "timedelta64[s]": - s.reset( - new duration_scalar[duration_s](np.int64(value), valid) - ) - elif dtype == "timedelta64[ms]": - s.reset( - new duration_scalar[duration_ms](np.int64(value), valid) - ) - elif dtype == "timedelta64[us]": - s.reset( - new duration_scalar[duration_us](np.int64(value), valid) - ) - elif dtype == "timedelta64[ns]": - s.reset( - new duration_scalar[duration_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - - -def as_device_scalar(val, dtype=None): - if isinstance(val, (cudf.Scalar, DeviceScalar)): - if dtype == val.dtype or dtype is None: - if isinstance(val, DeviceScalar): - return val - else: - return val.device_value - else: - raise TypeError("Can't update dtype of existing GPU scalar") - else: - return cudf.Scalar(val, dtype=dtype).device_value - - -def _is_null_host_scalar(slr): - if cudf.utils.utils.is_na_like(slr): - return True - elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \ - slr is pd.NaT: - return True - else: - return False - - -def _create_proxy_nat_scalar(dtype): - cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar) - - dtype = cudf.dtype(dtype) - if dtype.char in 'mM': - nat = dtype.type('NaT').astype(dtype) - if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - return result - else: - raise TypeError('NAT only valid for datetime and timedelta') +def g(): + pass diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx deleted file mode 100644 index 8108361052b..00000000000 --- a/python/cudf/cudf/_lib/search.pyx +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf - - -@acquire_spill_lock() -def search_sorted( - list source, list values, side, ascending=True, na_position="last" -): - """Find indices where elements should be inserted to maintain order - - Parameters - ---------- - source : list of columns - List of columns to search in - values : List of columns - List of value columns to search for - side : str {'left', 'right'} optional - If 'left', the index of the first suitable location is given. 
- If 'right', return the last such index - """ - # Note: We are ignoring index columns here - column_order = [ - pylibcudf.types.Order.ASCENDING - if ascending - else pylibcudf.types.Order.DESCENDING - ] * len(source) - null_precedence = [ - pylibcudf.types.NullOrder.AFTER - if na_position == "last" - else pylibcudf.types.NullOrder.BEFORE - ] * len(source) - - func = getattr( - pylibcudf.search, - "lower_bound" if side == "left" else "upper_bound", - ) - return Column.from_pylibcudf( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - column_order, - null_precedence, - ) - ) - - -@acquire_spill_lock() -def contains(Column haystack, Column needles): - """Check whether column contains multiple values - - Parameters - ---------- - column : NumericalColumn - Column to search in - needles : - A column of values to search for - """ - return Column.from_pylibcudf( - pylibcudf.search.contains( - haystack.to_pylibcudf(mode="read"), - needles.to_pylibcudf(mode="read"), - ) - ) diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx deleted file mode 100644 index 185552ede82..00000000000 --- a/python/cudf/cudf/_lib/sort.pyx +++ /dev/null @@ -1,401 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from itertools import repeat - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from pylibcudf.libcudf.aggregation cimport rank_method -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.search cimport lower_bound, upper_bound -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport null_order, order as cpp_order - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - table_view_from_columns, -) - -import pylibcudf - - -@acquire_spill_lock() -def is_sorted( - list source_columns, object ascending=None, object null_position=None -): - """ - Checks whether the rows of a `table` are sorted in lexicographical order. - - Parameters - ---------- - source_columns : list of columns - columns to be checked for sort order - ascending : None or list-like of booleans - None or list-like of boolean values indicating expected sort order of - each column. If list-like, size of list-like must be len(columns). If - None, all columns expected sort order is set to ascending. False (0) - - descending, True (1) - ascending. - null_position : None or list-like of booleans - None or list-like of boolean values indicating desired order of nulls - compared to other elements. If list-like, size of list-like must be - len(columns). If None, null order is set to before. False (0) - after, - True (1) - before. - - Returns - ------- - returns : boolean - Returns True, if sorted as expected by ``ascending`` and - ``null_position``, False otherwise. 
- """ - - if ascending is None: - column_order = [pylibcudf.types.Order.ASCENDING] * len(source_columns) - else: - if len(ascending) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(ascending)} for `ascending`" - ) - column_order = [pylibcudf.types.Order.DESCENDING] * len(source_columns) - for idx, val in enumerate(ascending): - if val: - column_order[idx] = pylibcudf.types.Order.ASCENDING - - if null_position is None: - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - else: - if len(null_position) != len(source_columns): - raise ValueError( - f"Expected a list-like of length {len(source_columns)}, " - f"got length {len(null_position)} for `null_position`" - ) - null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns) - for idx, val in enumerate(null_position): - if val: - null_precedence[idx] = pylibcudf.types.NullOrder.BEFORE - - return pylibcudf.sorting.is_sorted( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in source_columns] - ), - column_order, - null_precedence - ) - - -def ordering(column_order, null_precedence): - """ - Construct order and null order vectors - - Parameters - ---------- - column_order - Iterable of bool (True for ascending order, False for descending) - null_precedence - Iterable string for null positions ("first" for start, "last" for end) - - Both iterables must be the same length (not checked) - - Returns - ------- - pair of vectors (order, and null_order) - """ - c_column_order = [] - c_null_precedence = [] - for asc, null in zip(column_order, null_precedence): - c_column_order.append( - pylibcudf.types.Order.ASCENDING if asc else pylibcudf.types.Order.DESCENDING - ) - if asc ^ (null == "first"): - c_null_precedence.append(pylibcudf.types.NullOrder.AFTER) - elif asc ^ (null == "last"): - c_null_precedence.append(pylibcudf.types.NullOrder.BEFORE) - else: - raise ValueError(f"Invalid null precedence {null}") - return c_column_order, c_null_precedence - - -@acquire_spill_lock() -def order_by( - list columns_from_table, - object ascending, - str na_position, - *, - bool stable -): - """ - Get index to sort the table in ascending/descending order. - - Parameters - ---------- - columns_from_table : list[Column] - Columns from the table which will be sorted - ascending : sequence[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : str - Whether null values should show up at the "first" or "last" - position of **all** sorted column. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - Column of indices that sorts the table - """ - order = ordering(ascending, repeat(na_position)) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sorted_order") - - return Column.from_pylibcudf( - func( - pylibcudf.Table( - [c.to_pylibcudf(mode="read") for c in columns_from_table], - ), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort( - list values, - list column_order=None, - list null_precedence=None, -): - """ - Sort the table in ascending/descending order. - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. 
- null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - return columns_from_pylibcudf_table( - pylibcudf.sorting.sort( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def sort_by_key( - list values, - list keys, - object ascending, - object na_position, - *, - bool stable, -): - """ - Sort a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - ascending : list[bool] - Sequence of boolean values which correspond to each column - in the table to be sorted signifying the order of each column - True - Ascending and False - Descending - na_position : list[str] - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - order = ordering(ascending, na_position) - func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sort_by_key") - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def segmented_sort_by_key( - list values, - list keys, - Column segment_offsets, - list column_order=None, - list null_precedence=None, - *, - bool stable, -): - """ - Sort segments of a table by given keys - - Parameters - ---------- - values : list[Column] - Columns of the table which will be sorted - keys : list[Column] - Columns making up the sort key - offsets : Column - Segment offsets - column_order : list[bool], optional - Sequence of boolean values which correspond to each column in - keys providing the sort order (default all True). - With True <=> ascending; False <=> descending. - null_precedence : list[str], optional - Sequence of "first" or "last" values (default "first") - indicating the position of null values when sorting the keys. - stable : bool - Should the sort be stable? (no default) - - Returns - ------- - list[Column] - list of value columns sorted by keys - """ - ncol = len(values) - order = ordering( - column_order or repeat(True, ncol), - null_precedence or repeat("first", ncol), - ) - func = getattr( - pylibcudf.sorting, - f"{'stable_' if stable else ''}segmented_sort_by_key" - ) - return columns_from_pylibcudf_table( - func( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]), - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]), - segment_offsets.to_pylibcudf(mode="read"), - order[0], - order[1], - ) - ) - - -@acquire_spill_lock() -def digitize(list source_columns, list bins, bool right=False): - """ - Return the indices of the bins to which each value in source_table belongs. - - Parameters - ---------- - source_columns : Input columns to be binned. - bins : List containing columns of bins - right : Indicating whether the intervals include the - right or the left bin edge. 
- """ - - cdef table_view bins_view = table_view_from_columns(bins) - cdef table_view source_table_view = table_view_from_columns( - source_columns - ) - cdef vector[cpp_order] column_order = ( - vector[cpp_order]( - bins_view.num_columns(), - cpp_order.ASCENDING - ) - ) - cdef vector[null_order] null_precedence = ( - vector[null_order]( - bins_view.num_columns(), - null_order.BEFORE - ) - ) - - cdef unique_ptr[column] c_result - if right: - with nogil: - c_result = move(lower_bound( - bins_view, - source_table_view, - column_order, - null_precedence) - ) - else: - with nogil: - c_result = move(upper_bound( - bins_view, - source_table_view, - column_order, - null_precedence) - ) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def rank_columns(list source_columns, rank_method method, str na_option, - bool ascending, bool pct - ): - """ - Compute numerical data ranks (1 through n) of each column in the dataframe - """ - column_order = ( - pylibcudf.types.Order.ASCENDING - if ascending - else pylibcudf.types.Order.DESCENDING - ) - # ascending - # #top = na_is_smallest - # #bottom = na_is_largest - # #keep = na_is_largest - # descending - # #top = na_is_largest - # #bottom = na_is_smallest - # #keep = na_is_smallest - if ascending: - if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.BEFORE - else: - null_precedence = pylibcudf.types.NullOrder.AFTER - else: - if na_option == 'top': - null_precedence = pylibcudf.types.NullOrder.AFTER - else: - null_precedence = pylibcudf.types.NullOrder.BEFORE - c_null_handling = ( - pylibcudf.types.NullPolicy.EXCLUDE - if na_option == 'keep' - else pylibcudf.types.NullPolicy.INCLUDE - ) - - return [ - Column.from_pylibcudf( - pylibcudf.sorting.rank( - col.to_pylibcudf(mode="read"), - method, - column_order, - c_null_handling, - null_precedence, - pct, - ) - ) - for col in source_columns - ] diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx deleted file mode 100644 index 1b8831940e3..00000000000 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_pylibcudf_table - -import pylibcudf - - -@acquire_spill_lock() -def drop_nulls(list columns, how="any", keys=None, thresh=None): - """ - Drops null rows from cols depending on key columns. - - Parameters - ---------- - columns : list of columns - how : "any" or "all". If thresh is None, drops rows of cols that have any - nulls or all nulls (respectively) in subset (default: "any") - keys : List of column indices. 
If set, then these columns are checked for - nulls rather than all of columns (optional) - thresh : Minimum number of non-nulls required to keep a row (optional) - - Returns - ------- - columns with null rows dropped - """ - if how not in {"any", "all"}: - raise ValueError("how must be 'any' or 'all'") - - keys = list(keys if keys is not None else range(len(columns))) - - # Note: If how == "all" and thresh is specified this prioritizes thresh - if thresh is not None: - keep_threshold = thresh - elif how == "all": - keep_threshold = 1 - else: - keep_threshold = len(keys) - - return columns_from_pylibcudf_table( - pylibcudf.stream_compaction.drop_nulls( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]), - keys, - keep_threshold, - ) - ) - - -@acquire_spill_lock() -def apply_boolean_mask(list columns, Column boolean_mask): - """ - Drops the rows which correspond to False in boolean_mask. - - Parameters - ---------- - columns : list of columns whose rows are dropped as per boolean_mask - boolean_mask : a boolean column of same size as source_table - - Returns - ------- - columns obtained from applying mask - """ - return columns_from_pylibcudf_table( - pylibcudf.stream_compaction.apply_boolean_mask( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]), - boolean_mask.to_pylibcudf(mode="read"), - ) - ) - - -_keep_options = { - "first": pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - "last": pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_LAST, - False: pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_NONE, -} - - -@acquire_spill_lock() -def drop_duplicates(list columns, - object keys=None, - object keep='first', - bool nulls_are_equal=True): - """ - Drops rows in source_table as per duplicate rows in keys. - - Parameters - ---------- - columns : List of columns - keys : List of column indices. If set, then these columns are checked for - duplicates rather than all of columns (optional) - keep : keep 'first' or 'last' or none of the duplicate rows - nulls_are_equal : if True, nulls are treated equal else not. - - Returns - ------- - columns with duplicate dropped - """ - if (keep_option := _keep_options.get(keep)) is None: - raise ValueError('keep must be either "first", "last" or False') - - return columns_from_pylibcudf_table( - pylibcudf.stream_compaction.stable_distinct( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]), - list(keys if keys is not None else range(len(columns))), - keep_option, - pylibcudf.types.NullEquality.EQUAL - if nulls_are_equal else pylibcudf.types.NullEquality.UNEQUAL, - pylibcudf.types.NanEquality.ALL_EQUAL, - ) - ) - - -@acquire_spill_lock() -def distinct_indices( - list columns, - object keep="first", - bool nulls_equal=True, - bool nans_equal=True, -): - """ - Return indices of the distinct rows in a table. 
- - Parameters - ---------- - columns : list of columns to check for duplicates - keep : treat "first", "last", or (False) none of any duplicate - rows as distinct - nulls_equal : Should nulls compare equal - nans_equal: Should nans compare equal - - Returns - ------- - Column of indices - - See Also - -------- - drop_duplicates - """ - if (keep_option := _keep_options.get(keep)) is None: - raise ValueError('keep must be either "first", "last" or False') - - return Column.from_pylibcudf( - pylibcudf.stream_compaction.distinct_indices( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]), - keep_option, - pylibcudf.types.NullEquality.EQUAL - if nulls_equal else pylibcudf.types.NullEquality.UNEQUAL, - pylibcudf.types.NanEquality.ALL_EQUAL - if nans_equal else pylibcudf.types.NanEquality.UNEQUAL, - ) - ) - - -@acquire_spill_lock() -def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): - """ - Finds number of unique rows in `source_column` - - Parameters - ---------- - source_column : source table checked for unique rows - ignore_nulls : If True nulls are ignored, - else counted as one more distinct value - nan_as_null : If True, NAN is considered NULL, - else counted as one more distinct value - - Returns - ------- - Count of number of unique rows in `source_column` - """ - return pylibcudf.stream_compaction.distinct_count( - source_column.to_pylibcudf(mode="read"), - pylibcudf.types.NullPolicy.EXCLUDE - if ignore_nulls else pylibcudf.types.NullPolicy.INCLUDE, - pylibcudf.types.NanPolicy.NAN_IS_NULL - if nan_as_null else pylibcudf.types.NanPolicy.NAN_IS_VALID, - ) diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx deleted file mode 100644 index 60a6795a402..00000000000 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ /dev/null @@ -1,766 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( - from_booleans as cpp_from_booleans, - to_booleans as cpp_to_booleans, -) -from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - is_timestamp as cpp_is_timestamp, -) -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - from_floats as cpp_from_floats, - to_floats as cpp_to_floats, -) -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - from_integers as cpp_from_integers, - hex_to_integers as cpp_hex_to_integers, - integers_to_hex as cpp_integers_to_hex, - is_hex as cpp_is_hex, - to_integers as cpp_to_integers, -) -from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( - integers_to_ipv4 as cpp_integers_to_ipv4, - ipv4_to_integers as cpp_ipv4_to_integers, - is_ipv4 as cpp_is_ipv4, -) -from pylibcudf.libcudf.types cimport data_type, type_id - -from cudf._lib.types cimport underlying_type_t_type_id - -import pylibcudf as plc - -import cudf - -from cudf._lib.types cimport dtype_to_pylibcudf_type - - -def floating_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_floats( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_floating(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) - ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_floats( - input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) - - -def dtos(Column input_col): - """ - Converting/Casting input column of type double to string column - - Parameters - ---------- - input_col : input column of type double - - Returns - ------- - A Column with double values cast to string - """ - - return floating_to_string(input_col) - - -def stod(Column input_col): - """ - Converting/Casting input column of type string to double - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to double - """ - - return string_to_floating(input_col, cudf.dtype("float64")) - - -def ftos(Column input_col): - """ - Converting/Casting input column of type float to string column - - Parameters - ---------- - input_col : input column of type double - - Returns - ------- - A Column with float values cast to string - """ - - return floating_to_string(input_col) - - -def stof(Column input_col): - """ - Converting/Casting input column of type string to float - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to float - """ - - return string_to_floating(input_col, cudf.dtype("float32")) - - -def integer_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_integers( - 
input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_integer(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) - ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_integers( - input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) - - -def i8tos(Column input_col): - """ - Converting/Casting input column of type int8 to string column - - Parameters - ---------- - input_col : input column of type int8 - - Returns - ------- - A Column with int8 values cast to string - """ - - return integer_to_string(input_col) - - -def stoi8(Column input_col): - """ - Converting/Casting input column of type string to int8 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int8 - """ - - return string_to_integer(input_col, cudf.dtype("int8")) - - -def i16tos(Column input_col): - """ - Converting/Casting input column of type int16 to string column - - Parameters - ---------- - input_col : input column of type int16 - - Returns - ------- - A Column with int16 values cast to string - """ - - return integer_to_string(input_col) - - -def stoi16(Column input_col): - """ - Converting/Casting input column of type string to int16 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int16 - """ - - return string_to_integer(input_col, cudf.dtype("int16")) - - -def itos(Column input_col): - """ - Converting/Casting input column of type int32 to string column - - Parameters - ---------- - input_col : input column of type int32 - - Returns - ------- - A Column with int32 values cast to string - """ - - return integer_to_string(input_col) - - -def stoi(Column input_col): - """ - Converting/Casting input column of type string to int32 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int32 - """ - - return string_to_integer(input_col, cudf.dtype("int32")) - - -def ltos(Column input_col): - """ - Converting/Casting input column of type int64 to string column - - Parameters - ---------- - input_col : input column of type int64 - - Returns - ------- - A Column with int64 values cast to string - """ - - return integer_to_string(input_col) - - -def stol(Column input_col): - """ - Converting/Casting input column of type string to int64 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to int64 - """ - - return string_to_integer(input_col, cudf.dtype("int64")) - - -def ui8tos(Column input_col): - """ - Converting/Casting input column of type uint8 to string column - - Parameters - ---------- - input_col : input column of type uint8 - - Returns - ------- - A Column with uint8 values cast to string - """ - - return integer_to_string(input_col) - - -def stoui8(Column input_col): - """ - Converting/Casting input column of type string to uint8 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint8 - """ - - return string_to_integer(input_col, cudf.dtype("uint8")) - - -def ui16tos(Column input_col): - """ - Converting/Casting input column of type uint16 to string column - - Parameters - ---------- - input_col : input column of 
type uint16 - - Returns - ------- - A Column with uint16 values cast to string - """ - - return integer_to_string(input_col) - - -def stoui16(Column input_col): - """ - Converting/Casting input column of type string to uint16 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint16 - """ - - return string_to_integer(input_col, cudf.dtype("uint16")) - - -def uitos(Column input_col): - """ - Converting/Casting input column of type uint32 to string column - - Parameters - ---------- - input_col : input column of type uint32 - - Returns - ------- - A Column with uint32 values cast to string - """ - - return integer_to_string(input_col) - - -def stoui(Column input_col): - """ - Converting/Casting input column of type string to uint32 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint32 - """ - - return string_to_integer(input_col, cudf.dtype("uint32")) - - -def ultos(Column input_col): - """ - Converting/Casting input column of type uint64 to string column - - Parameters - ---------- - input_col : input column of type uint64 - - Returns - ------- - A Column with uint64 values cast to string - """ - - return integer_to_string(input_col) - - -def stoul(Column input_col): - """ - Converting/Casting input column of type string to uint64 - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with strings cast to uint64 - """ - - return string_to_integer(input_col, cudf.dtype("uint64")) - - -def _to_booleans(Column input_col, object string_true="True"): - """ - Converting/Casting input column of type string to boolean column - - Parameters - ---------- - input_col : input column of type string - string_true : string that represents True - - Returns - ------- - A Column with string values cast to boolean - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_booleans( - input_column_view, - string_scalar_true[0])) - - return Column.from_unique_ptr(move(c_result)) - - -def to_booleans(Column input_col): - - return _to_booleans(input_col) - - -def _from_booleans( - Column input_col, - object string_true="True", - object string_false="False"): - """ - Converting/Casting input column of type boolean to string column - - Parameters - ---------- - input_col : input column of type boolean - string_true : string that represents True - string_false : string that represents False - - Returns - ------- - A Column with boolean values cast to string - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef DeviceScalar str_false = as_device_scalar(string_false) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef const string_scalar* string_scalar_false = ( - str_false.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_booleans( - input_column_view, - string_scalar_true[0], - string_scalar_false[0])) - - return Column.from_unique_ptr(move(c_result)) - - -def from_booleans(Column input_col): - return _from_booleans(input_col) - - -def int2timestamp( - Column input_col, - str format, - Column names): - """ - Converting/Casting input date-time column to 
string - column with specified format - - Parameters - ---------- - input_col : input column of type timestamp in integer format - format : The string specifying output format - names : The string names to use for weekdays ("%a", "%A") and - months ("%b", "%B") - - Returns - ------- - A Column with date-time represented in string format - - """ - cdef string c_timestamp_format = format.encode("UTF-8") - return Column.from_pylibcudf( - plc.strings.convert.convert_datetime.from_timestamps( - input_col.to_pylibcudf(mode="read"), - c_timestamp_format, - names.to_pylibcudf(mode="read") - ) - ) - - -def timestamp2int(Column input_col, dtype, format): - """ - Converting/Casting input string column to date-time column with specified - timestamp_format - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with string represented in date-time format - - """ - dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') - return Column.from_pylibcudf( - plc.strings.convert.convert_datetime.to_timestamps( - input_col.to_pylibcudf(mode="read"), - dtype, - c_timestamp_format - ) - ) - - -def istimestamp(Column input_col, str format): - """ - Check input string column matches the specified timestamp format - - Parameters - ---------- - input_col : input column of type string - - format : format string of timestamp specifiers - - Returns - ------- - A Column of boolean values identifying strings that matched the format. - - """ - if input_col.size == 0: - return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) - cdef column_view input_column_view = input_col.view() - cdef string c_timestamp_format = str(format).encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_is_timestamp( - input_column_view, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) - - -def timedelta2int(Column input_col, dtype, format): - """ - Converting/Casting input string column to TimeDelta column with specified - format - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column with string represented in TimeDelta format - - """ - dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') - return Column.from_pylibcudf( - plc.strings.convert.convert_durations.to_durations( - input_col.to_pylibcudf(mode="read"), - dtype, - c_timestamp_format - ) - ) - - -def int2timedelta(Column input_col, str format): - """ - Converting/Casting input Timedelta column to string - column with specified format - - Parameters - ---------- - input_col : input column of type Timedelta in integer format - - Returns - ------- - A Column with Timedelta represented in string format - - """ - - cdef string c_duration_format = format.encode('UTF-8') - return Column.from_pylibcudf( - plc.strings.convert.convert_durations.from_durations( - input_col.to_pylibcudf(mode="read"), - c_duration_format - ) - ) - - -def int2ip(Column input_col): - """ - Converting/Casting integer column to string column in ipv4 format - - Parameters - ---------- - input_col : input integer column - - Returns - ------- - A Column with integer represented in string ipv4 format - - """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_ipv4(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def ip2int(Column input_col): - """ - Converting 
string ipv4 column to integer column - - Parameters - ---------- - input_col : input string column - - Returns - ------- - A Column with ipv4 represented as integer - - """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_ipv4_to_integers(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def is_ipv4(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn - where nnn is integer digits in [0,255]. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_ipv4( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) - - -def htoi(Column input_col, **kwargs): - """ - Converting input column of type string having hex values - to integer of out_type - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - A Column of integers parsed from hexadecimal string values. - """ - - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")] - ) - ) - cdef data_type c_out_type = data_type(tid) - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_hex_to_integers(input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) - - -def is_hex(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have hex characters. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_hex( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) - - -def itoh(Column input_col): - """ - Converting input column of type integer to a string - column with hexadecimal character digits. - - Parameters - ---------- - input_col : input column of type integer - - Returns - ------- - A Column of strings with hexadecimal characters. - """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_hex(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt deleted file mode 100644 index ceeff71683c..00000000000 --- a/python/cudf/cudf/_lib/strings/CMakeLists.txt +++ /dev/null @@ -1,45 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -set(cython_sources - attributes.pyx - capitalize.pyx - case.pyx - char_types.pyx - combine.pyx - contains.pyx - extract.pyx - find.pyx - find_multiple.pyx - findall.pyx - json.pyx - padding.pyx - repeat.pyx - replace.pyx - replace_re.pyx - strip.pyx - substring.pyx - translate.pyx - wrap.pyx -) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) - -add_subdirectory(convert) -add_subdirectory(split) diff --git a/python/cudf/cudf/_lib/strings/__init__.pxd b/python/cudf/cudf/_lib/strings/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py deleted file mode 100644 index 4bf8a9b1a8f..00000000000 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix -from cudf._lib.nvtext.generate_ngrams import ( - generate_character_ngrams, - generate_ngrams, - hash_character_ngrams, -) -from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import ( - minhash, - minhash64, - word_minhash, - word_minhash64, -) -from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize -from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces -from cudf._lib.nvtext.replace import filter_tokens, replace_tokens -from cudf._lib.nvtext.stemmer import ( - LetterType, - is_letter, - is_letter_multi, - porter_stemmer_measure, -) -from cudf._lib.nvtext.tokenize import ( - _count_tokens_column, - _count_tokens_scalar, - _tokenize_column, - _tokenize_scalar, - character_tokenize, - detokenize, - tokenize_with_vocabulary, -) -from cudf._lib.strings.attributes import ( - code_points, - count_bytes, - count_characters, -) -from cudf._lib.strings.capitalize import capitalize, is_title, title -from cudf._lib.strings.case import swapcase, to_lower, to_upper -from cudf._lib.strings.char_types import ( - filter_alphanum, - is_alnum, - is_alpha, - is_decimal, - is_digit, - is_lower, - is_numeric, - is_space, - is_upper, -) -from cudf._lib.strings.combine import ( - concatenate, - join, - join_lists_with_column, - join_lists_with_scalar, -) -from cudf._lib.strings.contains import contains_re, count_re, like, match_re -from cudf._lib.strings.convert.convert_fixed_point import to_decimal -from cudf._lib.strings.convert.convert_floats import is_float -from cudf._lib.strings.convert.convert_integers import is_integer -from cudf._lib.strings.convert.convert_urls import url_decode, url_encode -from cudf._lib.strings.extract import extract -from cudf._lib.strings.find import ( - contains, - contains_multiple, - endswith, - endswith_multiple, - find, - rfind, - startswith, - startswith_multiple, -) -from cudf._lib.strings.find_multiple import find_multiple -from cudf._lib.strings.findall import findall -from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object -from cudf._lib.strings.padding import ( - SideType, - center, - ljust, - pad, - rjust, - zfill, -) -from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence -from cudf._lib.strings.replace import ( - insert, - replace, - replace_multi, - slice_replace, -) -from cudf._lib.strings.replace_re import ( - replace_multi_re, - replace_re, 
- replace_with_backrefs, -) -from cudf._lib.strings.split.partition import partition, rpartition -from cudf._lib.strings.split.split import ( - rsplit, - rsplit_re, - rsplit_record, - rsplit_record_re, - split, - split_re, - split_record, - split_record_re, -) -from cudf._lib.strings.strip import lstrip, rstrip, strip -from cudf._lib.strings.substring import get, slice_from, slice_strings -from cudf._lib.strings.translate import filter_characters, translate -from cudf._lib.strings.wrap import wrap diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx deleted file mode 100644 index df81b3942b4..00000000000 --- a/python/cudf/cudf/_lib/strings/attributes.pyx +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def count_characters(Column source_strings): - """ - Returns an integer numeric column containing the - length of each string in characters. - """ - plc_column = plc.strings.attributes.count_characters( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def count_bytes(Column source_strings): - """ - Returns an integer numeric column containing the - number of bytes of each string. - """ - plc_column = plc.strings.attributes.count_bytes( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def code_points(Column source_strings): - """ - Creates a numeric column with code point values (integers) - for each character of each string. - """ - plc_column = plc.strings.attributes.code_points( - source_strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx deleted file mode 100644 index 42c40e2e753..00000000000 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def capitalize(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.capitalize( - source_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def title(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.title( - source_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def is_title(Column source_strings): - return Column.from_pylibcudf( - plc.strings.capitalize.is_title( - source_strings.to_pylibcudf(mode="read") - ) - ) diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx deleted file mode 100644 index ad4cbb6f088..00000000000 --- a/python/cudf/cudf/_lib/strings/case.pyx +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import case - - -@acquire_spill_lock() -def to_upper(Column source_strings): - return Column.from_pylibcudf( - case.to_upper( - source_strings.to_pylibcudf(mode='read') - ) - ) - - -@acquire_spill_lock() -def to_lower(Column source_strings): - return Column.from_pylibcudf( - case.to_lower( - source_strings.to_pylibcudf(mode='read') - ) - ) - - -@acquire_spill_lock() -def swapcase(Column source_strings): - return Column.from_pylibcudf( - case.swapcase( - source_strings.to_pylibcudf(mode='read') - ) - ) diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx deleted file mode 100644 index 376a6f8af97..00000000000 --- a/python/cudf/cudf/_lib/strings/char_types.pyx +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.char_types cimport ( - all_characters_of_type as cpp_all_characters_of_type, - filter_characters_of_type as cpp_filter_characters_of_type, - string_character_types, -) - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - - -@acquire_spill_lock() -def filter_alphanum(Column source_strings, object py_repl, bool keep=True): - """ - Returns a Column of strings keeping only alphanumeric character types. - """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_filter_characters_of_type( - source_view, - string_character_types.ALL_TYPES if keep - else string_character_types.ALPHANUM, - scalar_repl[0], - string_character_types.ALPHANUM if keep - else string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def is_decimal(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only decimal characters -- those that can be used - to extract base10 numbers. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.DECIMAL, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def is_alnum(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only alphanumeric characters. - - Equivalent to: is_alpha() or is_digit() or is_numeric() or is_decimal() - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.ALPHANUM, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def is_alpha(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only alphabetic characters. 
- """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.ALPHA, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def is_digit(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only decimal and digit characters. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.DIGIT, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def is_numeric(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only numeric characters. These include digit and - numeric characters. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.NUMERIC, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def is_upper(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only upper-case characters. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.UPPER, - string_character_types.CASE_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def is_lower(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain only lower-case characters. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.LOWER, - string_character_types.CASE_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def is_space(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contains all characters which are spaces only. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_all_characters_of_type( - source_view, - string_character_types.SPACE, - string_character_types.ALL_TYPES - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx deleted file mode 100644 index 76cc13db0da..00000000000 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ /dev/null @@ -1,162 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.combine cimport ( - concatenate as cpp_concatenate, - join_list_elements as cpp_join_list_elements, - join_strings as cpp_join_strings, - output_if_empty_list, - separator_on_nulls, -) -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_columns - - -@acquire_spill_lock() -def concatenate(list source_strings, - object sep, - object na_rep): - """ - Returns a Column by concatenating strings column-wise in `source_strings` - with the specified `sep` between each column and - `na`/`None` values are replaced by `na_rep` - """ - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef table_view source_view = table_view_from_columns(source_strings) - - cdef const string_scalar* scalar_separator = \ - <const string_scalar*>(separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = <const string_scalar*>( - narep.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_concatenate( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def join(Column source_strings, - object sep, - object na_rep): - """ - Returns a Column by concatenating strings row-wise in `source_strings` - with the specified `sep` between each column and - `na`/`None` values are replaced by `na_rep` - """ - - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - <const string_scalar*>(separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = <const string_scalar*>( - narep.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_join_strings( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def join_lists_with_scalar( - Column source_strings, - object py_separator, - object py_narep): - """ - Returns a Column by concatenating Lists of strings row-wise - in `source_strings` with the specified `py_separator` - between each string in lists and ``<NA>``/`None` values - are replaced by `py_narep` - """ - - cdef DeviceScalar separator = py_separator.device_value - cdef DeviceScalar narep = py_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - <const string_scalar*>(separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = <const string_scalar*>( - narep.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - scalar_separator[0], - scalar_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def join_lists_with_column( - Column source_strings, - Column separator_strings, - object py_source_narep, - object py_separator_narep): - """ - Returns a Column by concatenating Lists of strings row-wise in - `source_strings` with a corresponding separator at the same - position in `separator_strings` and 
``<NA>``/`None` values in - `source_strings` are replaced by `py_source_narep` and - ``<NA>``/`None` values in `separator_strings` are replaced - by `py_separator_narep` - """ - - cdef DeviceScalar source_narep = py_source_narep.device_value - cdef DeviceScalar separator_narep = py_separator_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view separator_view = separator_strings.view() - - cdef const string_scalar* scalar_source_narep = \ - <const string_scalar*>(source_narep.get_raw_ptr()) - cdef const string_scalar* scalar_separator_narep = <const string_scalar*>( - separator_narep.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - separator_view, - scalar_separator_narep[0], - scalar_source_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx deleted file mode 100644 index 03b4887f200..00000000000 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf.strings import contains -from pylibcudf.strings.regex_program import RegexProgram - - -@acquire_spill_lock() -def contains_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column of boolean values with True for `source_strings` - that contain regular expression `reg_ex`. - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def count_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column with count of occurrences of `reg_ex` in - each string of `source_strings` - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.count_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def match_re(Column source_strings, object reg_ex, uint32_t flags): - """ - Returns a Column with each value True if the string matches `reg_ex` - regular expression with each record of `source_strings` - """ - prog = RegexProgram.create(str(reg_ex), flags) - return Column.from_pylibcudf( - contains.matches_re(source_strings.to_pylibcudf(mode="read"), prog) - ) - - -@acquire_spill_lock() -def like(Column source_strings, object py_pattern, object py_escape): - """ - Returns a Column with each value True if the string matches the - `py_pattern` like expression with each record of `source_strings` - """ - plc_column = contains.like( - source_strings.to_pylibcudf(mode="read"), - py_pattern.device_value.c_value, - py_escape.device_value.c_value, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt deleted file mode 100644 index e8a76b476a8..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -set(cython_sources convert_fixed_point.pyx convert_floats.pyx convert_integers.pyx - convert_lists.pyx convert_urls.pyx -) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.pxd b/python/cudf/cudf/_lib/strings/convert/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.py b/python/cudf/cudf/_lib/strings/convert/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx deleted file mode 100644 index a8df8c9a92c..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -import cudf - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport ( - from_fixed_point as cpp_from_fixed_point, - is_fixed_point as cpp_is_fixed_point, - to_fixed_point as cpp_to_fixed_point, -) -from pylibcudf.libcudf.types cimport data_type, type_id - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def from_decimal(Column input_col): - """ - Converts a `Decimal64Column` to a `StringColumn`. - - Parameters - ---------- - input_col : input column of type decimal - - Returns - ------- - A column of strings representing the input decimal values. - """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_fixed_point( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def to_decimal(Column input_col, object out_type): - """ - Returns a `Decimal64Column` from the provided `StringColumn` - using the scale in the `out_type`. - - Parameters - ---------- - input_col : input column of type string - out_type : The type and scale of the decimal column expected - - Returns - ------- - A column of decimals parsed from the string values. 
- """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef int scale = out_type.scale - cdef data_type c_out_type - if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(type_id.DECIMAL32, -scale) - elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(type_id.DECIMAL64, -scale) - elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(type_id.DECIMAL128, -scale) - else: - raise TypeError("should be a decimal dtype") - with nogil: - c_result = move( - cpp_to_fixed_point( - input_column_view, - c_out_type)) - - result = Column.from_unique_ptr(move(c_result)) - result.dtype.precision = out_type.precision - return result - - -@acquire_spill_lock() -def is_fixed_point(Column input_col, object dtype): - """ - Returns a Column of boolean values with True for `input_col` - that have fixed-point characters. The output row also has a - False value if the corresponding string would cause an integer - overflow. The scale of the `dtype` is used to determine overflow - in the output row. - - Parameters - ---------- - input_col : input column of type string - dtype : The type and scale of a decimal column - - Returns - ------- - A Column of booleans indicating valid decimal conversion. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = input_col.view() - cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) - with nogil: - c_result = move(cpp_is_fixed_point( - source_view, - c_dtype - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx deleted file mode 100644 index 7965b588703..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - is_float as cpp_is_float, -) - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def is_float(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have floats. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx deleted file mode 100644 index 8b6da2bfa1c..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - is_integer as cpp_is_integer, -) - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def is_integer(Column source_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that have integers. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_integer( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx deleted file mode 100644 index 73aebf8ab35..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_lists cimport ( - format_list_column as cpp_format_list_column, -) - -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - - -@acquire_spill_lock() -def format_list_column(Column source_list, Column separators): - """ - Format a list column of strings into a strings column. - - Parameters - ---------- - input_col : input column of type list with strings child. - - separators: strings used for formatting (', ', '[', ']') - - Returns - ------- - Formatted strings column - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_list.view() - cdef column_view separators_view = separators.view() - # Use 'None' as null-replacement string - cdef DeviceScalar str_na_rep = as_device_scalar("None") - cdef const string_scalar* string_scalar_na_rep = ( - str_na_rep.get_raw_ptr()) - - with nogil: - c_result = move(cpp_format_list_column( - source_view, string_scalar_na_rep[0], separators_view - )) - - return Column.from_unique_ptr( - move(c_result) - ) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx deleted file mode 100644 index e52116d6247..00000000000 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_urls cimport ( - url_decode as cpp_url_decode, - url_encode as cpp_url_encode, -) - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def url_decode(Column source_strings): - """ - Decode each string in column. No format checking is performed. 
- - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - URL decoded string column - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_decode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) - ) - - -@acquire_spill_lock() -def url_encode(Column source_strings): - """ - Encode each string in column. No format checking is performed. - All characters are encoded except for ASCII letters, digits, - and these characters: '.','_','-','~'. Encoding converts to - hex using UTF-8 encoded bytes. - - Parameters - ---------- - input_col : input column of type string - - Returns - ------- - URL encoded string column - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_encode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) - ) diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx deleted file mode 100644 index 5bf336f4f3c..00000000000 --- a/python/cudf/cudf/_lib/strings/extract.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def extract(Column source_strings, object pattern, uint32_t flags): - """ - Returns data which contains extracted capture groups provided in - `pattern` for all `source_strings`. - The returning data contains one row for each subject string, - and one column for each group. - """ - prog = plc.strings.regex_program.RegexProgram.create(str(pattern), flags) - plc_result = plc.strings.extract.extract( - source_strings.to_pylibcudf(mode="read"), prog - ) - return dict(enumerate(Column.from_pylibcudf(col) for col in plc_result.columns())) diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx deleted file mode 100644 index 2d284d1aa9d..00000000000 --- a/python/cudf/cudf/_lib/strings/find.pyx +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def contains(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain the pattern given in `py_target`. - """ - return Column.from_pylibcudf( - plc.strings.find.contains( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def contains_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain the corresponding string in `target_strings`. - """ - return Column.from_pylibcudf( - plc.strings.find.contains( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def endswith(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that end with the pattern given in `py_target`. 
- """ - - return Column.from_pylibcudf( - plc.strings.find.ends_with( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def endswith_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that end with corresponding location - in `target_strings`. - """ - return Column.from_pylibcudf( - plc.strings.find.ends_with( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def startswith(Column source_strings, object py_target): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that start with the pattern given in `py_target`. - """ - return Column.from_pylibcudf( - plc.strings.find.starts_with( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def startswith_multiple(Column source_strings, Column target_strings): - """ - Returns a Column of boolean values with True for `source_strings` - that contain strings that begin with corresponding location - in `target_strings`. - """ - return Column.from_pylibcudf( - plc.strings.find.starts_with( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def find(Column source_strings, - object py_target, - size_type start, - size_type end): - """ - Returns a Column containing lowest indexes in each string of - `source_strings` that fully contain `py_target` string. - Scan portion of strings in `source_strings` can be - controlled by setting `start` and `end` values. - """ - return Column.from_pylibcudf( - plc.strings.find.find( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value, - start, - end - ) - ) - - -@acquire_spill_lock() -def rfind(Column source_strings, - object py_target, - size_type start, - size_type end): - """ - Returns a Column containing highest indexes in each string of - `source_strings` that fully contain `py_target` string. - Scan portion of strings in `source_strings` can be - controlled by setting `start` and `end` values. - """ - - return Column.from_pylibcudf( - plc.strings.find.rfind( - source_strings.to_pylibcudf(mode="read"), - py_target.device_value.c_value, - start, - end - ) - ) diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx deleted file mode 100644 index 1358f8e3c2c..00000000000 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.find_multiple cimport ( - find_multiple as cpp_find_multiple, -) - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def find_multiple(Column source_strings, Column target_strings): - """ - Returns a column with character position values where each - of the `target_strings` are found in each string of `source_strings`. 
- """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - - with nogil: - c_result = move(cpp_find_multiple( - source_view, - target_view - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx deleted file mode 100644 index 0e758d5b322..00000000000 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def findall(Column source_strings, object pattern, uint32_t flags): - """ - Returns data with all non-overlapping matches of `pattern` - in each string of `source_strings` as a lists column. - """ - prog = plc.strings.regex_program.RegexProgram.create( - str(pattern), flags - ) - plc_result = plc.strings.findall.findall( - source_strings.to_pylibcudf(mode="read"), - prog, - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx deleted file mode 100644 index c9b0bba088d..00000000000 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.json cimport ( - get_json_object as cpp_get_json_object, - get_json_object_options, -) - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - - -@acquire_spill_lock() -def get_json_object( - Column col, object py_json_path, GetJsonObjectOptions options): - """ - Apply a JSONPath string to all rows in an input column - of json strings. 
- """ - cdef unique_ptr[column] c_result - - cdef column_view col_view = col.view() - cdef DeviceScalar json_path = py_json_path.device_value - - cdef const string_scalar* scalar_json_path = ( - json_path.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_get_json_object( - col_view, - scalar_json_path[0], - options.options, - )) - - return Column.from_unique_ptr(move(c_result)) - - -cdef class GetJsonObjectOptions: - cdef get_json_object_options options - - def __init__( - self, - *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False - ): - self.options.set_allow_single_quotes(allow_single_quotes) - self.options.set_strip_quotes_from_single_strings( - strip_quotes_from_single_strings - ) - self.options.set_missing_fields_as_nulls(missing_fields_as_nulls) - - @property - def allow_single_quotes(self): - return self.options.get_allow_single_quotes() - - @property - def strip_quotes_from_single_strings(self): - return self.options.get_strip_quotes_from_single_strings() - - @property - def missing_fields_as_nulls(self): - return self.options.get_missing_fields_as_nulls() - - @allow_single_quotes.setter - def allow_single_quotes(self, val): - self.options.set_allow_single_quotes(val) - - @strip_quotes_from_single_strings.setter - def strip_quotes_from_single_strings(self, val): - self.options.set_strip_quotes_from_single_strings(val) - - @missing_fields_as_nulls.setter - def missing_fields_as_nulls(self, val): - self.options.set_missing_fields_as_nulls(val) diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx deleted file mode 100644 index d0239e91ec3..00000000000 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -from enum import IntEnum - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.strings.padding cimport ( - pad as cpp_pad, - zfill as cpp_zfill, -) -from pylibcudf.libcudf.strings.side_type cimport ( - side_type, - underlying_type_t_side_type, -) - - -class SideType(IntEnum): - LEFT = side_type.LEFT - RIGHT = side_type.RIGHT - BOTH = side_type.BOTH - - -@acquire_spill_lock() -def pad(Column source_strings, - size_type width, - fill_char, - side=SideType.LEFT): - """ - Returns a Column by padding strings in `source_strings` - up to the given `width`. Direction of padding is to be specified by `side`. - The additional characters being filled can be changed by specifying - `fill_char`. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - cdef side_type pad_direction = ( - side - ) - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - pad_direction, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def zfill(Column source_strings, - size_type width): - """ - Returns a Column by prepending strings in `source_strings` - with '0' characters up to the given `width`. 
- """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_zfill( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def center(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling left and right side of strings - in `source_strings` with additional character, `fill_char` - up to the given `width`. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.BOTH, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def ljust(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling right side of strings in `source_strings` - with additional character, `fill_char` up to the given `width`. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.RIGHT, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def rjust(Column source_strings, - size_type width, - fill_char): - """ - Returns a Column by filling left side of strings in `source_strings` - with additional character, `fill_char` up to the given `width`. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.LEFT, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx deleted file mode 100644 index 43649d4defe..00000000000 --- a/python/cudf/cudf/_lib/strings/repeat.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -import pylibcudf as plc - - -@acquire_spill_lock() -def repeat_scalar(Column source_strings, - size_type repeats): - """ - Returns a Column after repeating - each string in `source_strings` - `repeats` number of times. - """ - plc_result = plc.strings.repeat.repeat_strings( - source_strings.to_pylibcudf(mode="read"), - repeats - ) - return Column.from_pylibcudf(plc_result) - - -@acquire_spill_lock() -def repeat_sequence(Column source_strings, - Column repeats): - """ - Returns a Column after repeating - each string in `source_strings` - `repeats` number of times. - """ - plc_result = plc.strings.repeat.repeat_strings( - source_strings.to_pylibcudf(mode="read"), - repeats.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx deleted file mode 100644 index a260c4e4f45..00000000000 --- a/python/cudf/cudf/_lib/strings/replace.pyx +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from libc.stdint cimport int32_t - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf as plc - - -@acquire_spill_lock() -def slice_replace(Column source_strings, - size_type start, - size_type stop, - object py_repl): - """ - Returns a Column by replacing specified section - of each string with `py_repl`. Positions can be - specified with `start` and `stop` params. - """ - - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace_slice( - source_strings.to_pylibcudf(mode="read"), - repl.c_value, - start, - stop - )) - - -@acquire_spill_lock() -def insert(Column source_strings, - size_type start, - object py_repl): - """ - Returns a Column by inserting a specified - string `py_repl` at a specific position in all strings. - """ - - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace_slice( - source_strings.to_pylibcudf(mode="read"), - repl.c_value, - start, - start, - )) - - -@acquire_spill_lock() -def replace(Column source_strings, - object py_target, - object py_repl, - int32_t maxrepl): - """ - Returns a Column after replacing occurrences of - patterns `py_target` with `py_repl` in `source_strings`. - `maxrepl` indicates number of replacements to make from start. - """ - cdef DeviceScalar target = py_target.device_value - cdef DeviceScalar repl = py_repl.device_value - - return Column.from_pylibcudf(plc.strings.replace.replace( - source_strings.to_pylibcudf(mode="read"), - target.c_value, - repl.c_value, - maxrepl - )) - - -@acquire_spill_lock() -def replace_multi(Column source_strings, - Column target_strings, - Column repl_strings): - """ - Returns a Column after replacing occurrences of - patterns `target_strings` with `repl_strings` in `source_strings`. - """ - return Column.from_pylibcudf(plc.strings.replace.replace_multiple( - source_strings.to_pylibcudf(mode="read"), - target_strings.to_pylibcudf(mode="read"), - repl_strings.to_pylibcudf(mode="read"), - )) diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx deleted file mode 100644 index fffc8b7c3f6..00000000000 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.replace_re cimport ( - replace_re as cpp_replace_re, - replace_with_backrefs as cpp_replace_with_backrefs, -) -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - - -@acquire_spill_lock() -def replace_re(Column source_strings, - object pattern, - object py_repl, - size_type n): - """ - Returns a Column after replacing occurrences regular - expressions `pattern` with `py_repl` in `source_strings`. 
- `n` indicates the number of replacements to be made from - start. (-1 indicates all) - """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef const string_scalar* scalar_repl = \ - <const string_scalar*>(repl.get_raw_ptr()) - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_replace_re( - source_view, - dereference(c_prog), - scalar_repl[0], - n - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def replace_with_backrefs( - Column source_strings, - object pattern, - object repl): - """ - Returns a Column after using the `repl` back-ref template to create - new string with the extracted elements found using - `pattern` regular expression in `source_strings`. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef string repl_string = str(repl).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_replace_with_backrefs( - source_view, - dereference(c_prog), - repl_string - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def replace_multi_re(Column source_strings, - object patterns, - Column repl_strings): - """ - Returns a Column after replacing occurrences of multiple - regular expressions `patterns` with their corresponding - strings in `repl_strings` in `source_strings`. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repl_view = repl_strings.view() - - cdef int pattern_size = len(patterns) - cdef vector[string] patterns_vector - patterns_vector.reserve(pattern_size) - - for pattern in patterns: - patterns_vector.push_back(str.encode(pattern)) - - with nogil: - c_result = move(cpp_replace_re( - source_view, - patterns_vector, - repl_view - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt deleted file mode 100644 index 4ede0a2fac5..00000000000 --- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt +++ /dev/null @@ -1,22 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -set(cython_sources partition.pyx split.pyx) - -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/strings/split/__init__.pxd b/python/cudf/cudf/_lib/strings/split/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/split/__init__.py b/python/cudf/cudf/_lib/strings/split/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx deleted file mode 100644 index a81fb18e752..00000000000 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.split.partition cimport ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from pylibcudf.libcudf.table.table cimport table - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr - - -@acquire_spill_lock() -def partition(Column source_strings, - object py_delimiter): - """ - Returns data by splitting the `source_strings` - column at the first occurrence of the specified `py_delimiter`. - """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_partition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) - ) - - -@acquire_spill_lock() -def rpartition(Column source_strings, - object py_delimiter): - """ - Returns a Column by splitting the `source_strings` - column at the last occurrence of the specified `py_delimiter`. - """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rpartition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) - ) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx deleted file mode 100644 index f481fea4c51..00000000000 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ /dev/null @@ -1,264 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.split.split cimport ( - rsplit as cpp_rsplit, - rsplit_re as cpp_rsplit_re, - rsplit_record as cpp_rsplit_record, - rsplit_record_re as cpp_rsplit_record_re, - split as cpp_split, - split_re as cpp_split_re, - split_record as cpp_split_record, - split_record_re as cpp_split_record_re, -) -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr - - -@acquire_spill_lock() -def split(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from beginning. - """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) - ) - - -@acquire_spill_lock() -def split_record(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from beginning. - """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), - ) - - -@acquire_spill_lock() -def rsplit(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from the end. - """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) - ) - - -@acquire_spill_lock() -def rsplit_record(Column source_strings, - object py_delimiter, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the specified `py_delimiter`. - The split happens from the end. 
- """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), - ) - - -@acquire_spill_lock() -def split_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the delimiters identified by `pattern`. - """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) - ) - - -@acquire_spill_lock() -def rsplit_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns data by splitting the `source_strings` - column around the delimiters identified by `pattern`. - The delimiters are searched starting from the end of each string. - """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) - ) - - -@acquire_spill_lock() -def split_record_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the delimiters identified by `pattern`. - """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), - ) - - -@acquire_spill_lock() -def rsplit_record_re(Column source_strings, - object pattern, - size_type maxsplit): - """ - Returns a Column by splitting the `source_strings` - column around the delimiters identified by `pattern`. - The delimiters are searched starting from the end of each string. 
- """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), - ) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx deleted file mode 100644 index 38ecb21a94c..00000000000 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.side_type cimport side_type -from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -import pylibcudf as plc - - -@acquire_spill_lock() -def strip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from left and right side - can be specified by `py_repl`. - """ - - cdef DeviceScalar repl = py_repl.device_value - return Column.from_pylibcudf( - plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.SideType.BOTH, - repl.c_value - ) - ) - - -@acquire_spill_lock() -def lstrip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from left side can - be specified by `py_repl`. - """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.LEFT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def rstrip(Column source_strings, - object py_repl): - """ - Returns a Column by removing leading and trailing characters. - The set of characters need be stripped from right side can - be specified by `py_repl`. - """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.RIGHT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx deleted file mode 100644 index db96d99c7b6..00000000000 --- a/python/cudf/cudf/_lib/strings/substring.pyx +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import numpy as np - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - -import pylibcudf as plc - - -@acquire_spill_lock() -def slice_strings(Column source_strings, - object start, - object end, - object step): - """ - Returns a Column by extracting a substring of each string - at given start and end positions. Slicing can also be - performed in steps by skipping `step` number of - characters in a string. - """ - cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32) - cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32) - cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - start_scalar.c_value, - end_scalar.c_value, - step_scalar.c_value - ) - ) - - -@acquire_spill_lock() -def slice_from(Column source_strings, - Column starts, - Column stops): - """ - Returns a Column by extracting a substring of each string - at given starts and stops positions. `starts` and `stops` - here are positions per element in the string-column. - """ - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - starts.to_pylibcudf(mode="read"), - stops.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def get(Column source_strings, - object index): - """ - Returns a Column which contains only single - character from each input string. The index of - characters required can be controlled by passing `index`. - """ - - if index < 0: - next_index = index - 1 - step = -1 - else: - next_index = index + 1 - step = 1 - cdef DeviceScalar start_scalar = as_device_scalar(index, np.int32) - cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32) - cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32) - - return Column.from_pylibcudf( - plc.strings.slice.slice_strings( - source_strings.to_pylibcudf(mode="read"), - start_scalar.c_value, - end_scalar.c_value, - step_scalar.c_value - ) - ) diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx deleted file mode 100644 index 3fad91bbfc0..00000000000 --- a/python/cudf/cudf/_lib/strings/translate.pyx +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.translate cimport ( - filter_characters as cpp_filter_characters, - filter_type, - translate as cpp_translate, -) -from pylibcudf.libcudf.types cimport char_utf8 - -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar - - -@acquire_spill_lock() -def translate(Column source_strings, - object mapping_table): - """ - Translates individual characters within each string - if present in the mapping_table. 
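slice_strings, slice_from, and get, removed in the substring.pyx hunk above, correspond to substring extraction on the Series.str accessor. A minimal sketch, assuming a standard cudf install:

import cudf

s = cudf.Series(["abcdef", "uvwxyz"])
s.str.slice(1, 4)          # characters 1..3 of each string: "bcd", "vwx"
s.str.slice(0, None, 2)    # every other character: "ace", "uwy"
s.str.get(-1)              # one character per row; negative indexes count from the end: "f", "z"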
- """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef int table_size - table_size = len(mapping_table) - - cdef vector[pair[char_utf8, char_utf8]] c_mapping_table - c_mapping_table.reserve(table_size) - - for key in mapping_table: - value = mapping_table[key] - if type(value) is int: - value = chr(value) - if type(value) is str: - value = int.from_bytes(value.encode(), byteorder='big') - if type(key) is int: - key = chr(key) - if type(key) is str: - key = int.from_bytes(key.encode(), byteorder='big') - c_mapping_table.push_back((key, value)) - - with nogil: - c_result = move(cpp_translate(source_view, c_mapping_table)) - - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() -def filter_characters(Column source_strings, - object mapping_table, - bool keep, - object py_repl): - """ - Removes or keeps individual characters within each string - using the provided mapping_table. - """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_repl = ( - repl.get_raw_ptr() - ) - cdef int table_size - table_size = len(mapping_table) - - cdef vector[pair[char_utf8, char_utf8]] c_mapping_table - c_mapping_table.reserve(table_size) - - for key in mapping_table: - value = mapping_table[key] - if type(value) is int: - value = chr(value) - if type(value) is str: - value = int.from_bytes(value.encode(), byteorder='big') - if type(key) is int: - key = chr(key) - if type(key) is str: - key = int.from_bytes(key.encode(), byteorder='big') - c_mapping_table.push_back((key, value)) - - cdef filter_type c_keep - if keep is True: - c_keep = filter_type.KEEP - else: - c_keep = filter_type.REMOVE - - with nogil: - c_result = move(cpp_filter_characters( - source_view, - c_mapping_table, - c_keep, - scalar_repl[0] - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx deleted file mode 100644 index eed5cf33b10..00000000000 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def wrap(Column source_strings, - size_type width): - """ - Returns a Column by wrapping long strings - in the Column to be formatted in paragraphs - with length less than a given `width`. - """ - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_wrap( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index 78fc9f08bd8..952275c925d 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -1,71 +1,11 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-from libc.stdint cimport uint8_t, uint16_t, uintptr_t - from pylibcudf.libcudf.strings_udf cimport ( - get_character_cases_table as cpp_get_character_cases_table, - get_character_flags_table as cpp_get_character_flags_table, - get_special_case_mapping_table as cpp_get_special_case_mapping_table, -) - -import numpy as np - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from cudf.core.buffer import as_buffer - -from pylibcudf.libcudf.column.column cimport column, column_view -from pylibcudf.libcudf.strings_udf cimport ( - column_from_udf_string_array as cpp_column_from_udf_string_array, - free_udf_string_array as cpp_free_udf_string_array, get_cuda_build_version as cpp_get_cuda_build_version, - to_string_view_array as cpp_to_string_view_array, - udf_string, ) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer -from cudf._lib.column cimport Column +from cudf._lib.column import f def get_cuda_build_version(): return cpp_get_cuda_build_version() - - -def column_to_string_view_array(Column strings_col): - cdef unique_ptr[device_buffer] c_buffer - cdef column_view input_view = strings_col.view() - with nogil: - c_buffer = move(cpp_to_string_view_array(input_view)) - - db = DeviceBuffer.c_from_unique_ptr(move(c_buffer)) - return as_buffer(db, exposed=True) - - -def column_from_udf_string_array(DeviceBuffer d_buffer): - cdef size_t size = int(d_buffer.c_size() / sizeof(udf_string)) - cdef udf_string* data = d_buffer.c_data() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_column_from_udf_string_array(data, size)) - cpp_free_udf_string_array(data, size) - - result = Column.from_unique_ptr(move(c_result)) - - return result - - -def get_character_flags_table_ptr(): - cdef const uint8_t* tbl_ptr = cpp_get_character_flags_table() - return np.uintp(tbl_ptr) - - -def get_character_cases_table_ptr(): - cdef const uint16_t* tbl_ptr = cpp_get_character_cases_table() - return np.uintp(tbl_ptr) - - -def get_special_case_mapping_table_ptr(): - cdef const void* tbl_ptr = cpp_get_special_case_mapping_table() - return np.uintp(tbl_ptr) diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx deleted file mode 100644 index b2c7232f549..00000000000 --- a/python/cudf/cudf/_lib/text.pyx +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from io import TextIOBase - -from cython.operator cimport dereference -from libc.stdint cimport uint64_t -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.io.text cimport ( - byte_range_info, - data_chunk_source, - make_source, - make_source_from_bgzip_file, - make_source_from_file, - multibyte_split, - parse_options, -) - -from cudf._lib.column cimport Column - - -def read_text(object filepaths_or_buffers, - object delimiter=None, - object byte_range=None, - object strip_delimiters=False, - object compression=None, - object compression_offsets=None): - """ - Cython function to call into libcudf API, see `multibyte_split`. 
- - See Also - -------- - cudf.io.text.read_text - """ - cdef string delim = delimiter.encode() - - cdef unique_ptr[data_chunk_source] datasource - cdef unique_ptr[column] c_col - - cdef size_t c_byte_range_offset - cdef size_t c_byte_range_size - cdef uint64_t c_compression_begin_offset - cdef uint64_t c_compression_end_offset - cdef parse_options c_options - - if compression is None: - if isinstance(filepaths_or_buffers, TextIOBase): - datasource = move(make_source( - filepaths_or_buffers.read().encode())) - else: - datasource = move(make_source_from_file( - filepaths_or_buffers.encode())) - elif compression == "bgzip": - if isinstance(filepaths_or_buffers, TextIOBase): - raise ValueError("bgzip compression requires a file path") - if compression_offsets is not None: - if len(compression_offsets) != 2: - raise ValueError( - "compression offsets need to consist of two elements") - c_compression_begin_offset = compression_offsets[0] - c_compression_end_offset = compression_offsets[1] - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode(), - c_compression_begin_offset, - c_compression_end_offset)) - else: - datasource = move(make_source_from_bgzip_file( - filepaths_or_buffers.encode())) - else: - raise ValueError("Only bgzip compression is supported at the moment") - - c_options = parse_options() - if byte_range is not None: - c_byte_range_offset = byte_range[0] - c_byte_range_size = byte_range[1] - c_options.byte_range = byte_range_info( - c_byte_range_offset, - c_byte_range_size) - c_options.strip_delimiters = strip_delimiters - with nogil: - c_col = move(multibyte_split( - dereference(datasource), - delim, - c_options)) - - return Column.from_unique_ptr(move(c_col)) diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx deleted file mode 100644 index 54624a5a2fd..00000000000 --- a/python/cudf/cudf/_lib/timezone.pyx +++ /dev/null @@ -1,10 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -def make_timezone_transition_table(tzdir, tzname): - plc_table = plc.io.timezone.make_timezone_transition_table(tzdir, tzname) - return [Column.from_pylibcudf(col) for col in plc_table.columns()] diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx deleted file mode 100644 index 40d0c9eac3a..00000000000 --- a/python/cudf/cudf/_lib/transform.pyx +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
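read_text above is the binding behind cudf.read_text (libcudf's multibyte_split). A minimal sketch; "records.txt" and the ";" delimiter are placeholder values:

import cudf

# One row per delimited record from a large text file.
s = cudf.read_text("records.txt", delimiter=";", strip_delimiters=True)

# Read only a byte range (offset, size), e.g. for chunked or distributed reads.
part = cudf.read_text("records.txt", delimiter=";", byte_range=(0, 1_000_000))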
- -from numba.np import numpy_support - -import cudf -from cudf.core._internals.expressions import parse_expression -from cudf.core.buffer import acquire_spill_lock, as_buffer -from cudf.utils import cudautils - -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -cimport pylibcudf.libcudf.transform as libcudf_transform -from pylibcudf cimport transform as plc_transform -from pylibcudf.expressions cimport Expression -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.expressions cimport expression -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from cudf._lib.utils cimport table_view_from_columns - -import pylibcudf as plc - - -@acquire_spill_lock() -def bools_to_mask(Column col): - """ - Given an int8 (boolean) column, compress the data from booleans to bits and - return a Buffer - """ - mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read")) - return as_buffer(mask) - - -@acquire_spill_lock() -def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit): - """ - Given a mask buffer, returns a boolean column representng bit 0 -> False - and 1 -> True within range of [begin_bit, end_bit), - """ - if not isinstance(mask_buffer, cudf.core.buffer.Buffer): - raise TypeError("mask_buffer is not an instance of " - "cudf.core.buffer.Buffer") - plc_column = plc_transform.mask_to_bools( - mask_buffer.get_ptr(mode="read"), begin_bit, end_bit - ) - return Column.from_pylibcudf(plc_column) - - -@acquire_spill_lock() -def nans_to_nulls(Column input): - mask, _ = plc_transform.nans_to_nulls( - input.to_pylibcudf(mode="read") - ) - return as_buffer(mask) - - -@acquire_spill_lock() -def transform(Column input, op): - nb_type = numpy_support.from_dtype(input.dtype) - nb_signature = (nb_type,) - compiled_op = cudautils.compile_udf(op, nb_signature) - np_dtype = cudf.dtype(compiled_op[1]) - - plc_column = plc_transform.transform( - input.to_pylibcudf(mode="read"), - compiled_op[0], - plc.column._datatype_from_dtype_desc(np_dtype.str[1:]), - True - ) - return Column.from_pylibcudf(plc_column) - - -def table_encode(list source_columns): - plc_table, plc_column = plc_transform.encode( - plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]) - ) - - return ( - [Column.from_pylibcudf(col) for col in plc_table.columns()], - Column.from_pylibcudf(plc_column) - ) - - -def one_hot_encode(Column input_column, Column categories): - plc_table = plc_transform.one_hot_encode( - input_column.to_pylibcudf(mode="read"), - categories.to_pylibcudf(mode="read"), - ) - result_columns = [ - Column.from_pylibcudf(col, data_ptr_exposed=True) - for col in plc_table.columns() - ] - result_labels = [ - x if x is not None else '' - for x in categories.to_arrow().to_pylist() - ] - return dict(zip(result_labels, result_columns)) - - -@acquire_spill_lock() -def compute_column(list columns, tuple column_names, expr: str): - """Compute a new column by evaluating an expression on a set of columns. - - Parameters - ---------- - columns : list - The set of columns forming the table to evaluate the expression on. - column_names : tuple[str] - The names associated with each column. These names are necessary to map - column names in the expression to indices in the provided list of - columns, which are what will be used by libcudf to evaluate the - expression on the table. - expr : str - The expression to evaluate. 
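Most helpers in the transform.pyx hunk above have direct user-facing counterparts; for example, nans_to_nulls and one_hot_encode roughly map to Series.nans_to_nulls and cudf.get_dummies. A small sketch, assuming a standard cudf install:

import cudf

s = cudf.Series([1.0, float("nan"), 3.0])
s.nans_to_nulls()                          # convert NaN values into nulls

df = cudf.DataFrame({"color": ["red", "blue", "red"]})
cudf.get_dummies(df, columns=["color"])    # one-hot encode the "color" column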
- """ - visitor = parse_expression(expr, column_names) - - # At the end, all the stack contains is the expression to evaluate. - cdef Expression cudf_expr = visitor.expression - cdef table_view tbl = table_view_from_columns(columns) - cdef unique_ptr[column] col - with nogil: - col = move( - libcudf_transform.compute_column( - tbl, - dereference(cudf_expr.c_obj.get()) - ) - ) - return Column.from_unique_ptr(move(col)) diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx deleted file mode 100644 index 995d278cb88..00000000000 --- a/python/cudf/cudf/_lib/transpose.pyx +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -def transpose(list source_columns): - """Transpose m n-row columns into n m-row columns - """ - input_table = plc.table.Table( - [col.to_pylibcudf(mode="read") for col in source_columns] - ) - result_table = plc.transpose.transpose(input_table) - return [ - Column.from_pylibcudf(col, data_ptr_exposed=True) - for col in result_table.columns() - ] diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd deleted file mode 100644 index 4fd3d31841e..00000000000 --- a/python/cudf/cudf/_lib/types.pxd +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t -from libcpp cimport bool - -cimport pylibcudf.libcudf.types as libcudf_types -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view - -ctypedef bool underlying_type_t_order -ctypedef bool underlying_type_t_null_order -ctypedef bool underlying_type_t_sorted -ctypedef int32_t underlying_type_t_interpolation -ctypedef int32_t underlying_type_t_type_id -ctypedef bool underlying_type_t_null_policy - -cdef dtype_from_column_view(column_view cv) - -cdef libcudf_types.data_type dtype_to_data_type(dtype) except * -cpdef dtype_to_pylibcudf_type(dtype) -cdef bool is_decimal_type_id(libcudf_types.type_id tid) except * diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx deleted file mode 100644 index 861bb063707..00000000000 --- a/python/cudf/cudf/_lib/types.pyx +++ /dev/null @@ -1,343 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from enum import IntEnum - -import numpy as np -import pandas as pd - -from libcpp.memory cimport make_shared, shared_ptr - -cimport pylibcudf.libcudf.types as libcudf_types -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view - -from cudf._lib.types cimport ( - underlying_type_t_interpolation, - underlying_type_t_order, - underlying_type_t_sorted, -) - -import pylibcudf - -import cudf - - -class TypeId(IntEnum): - EMPTY = libcudf_types.type_id.EMPTY - INT8 = libcudf_types.type_id.INT8 - INT16 = libcudf_types.type_id.INT16 - INT32 = libcudf_types.type_id.INT32 - INT64 = libcudf_types.type_id.INT64 - UINT8 = libcudf_types.type_id.UINT8 - UINT16 = libcudf_types.type_id.UINT16 - UINT32 = libcudf_types.type_id.UINT32 - UINT64 = libcudf_types.type_id.UINT64 - FLOAT32 = libcudf_types.type_id.FLOAT32 - FLOAT64 = libcudf_types.type_id.FLOAT64 - BOOL8 = libcudf_types.type_id.BOOL8 - TIMESTAMP_DAYS = ( - libcudf_types.type_id.TIMESTAMP_DAYS - ) - TIMESTAMP_SECONDS = ( - libcudf_types.type_id.TIMESTAMP_SECONDS - ) - TIMESTAMP_MILLISECONDS = ( - ( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - ) - TIMESTAMP_MICROSECONDS = ( - ( - libcudf_types.type_id.TIMESTAMP_MICROSECONDS - ) - ) - TIMESTAMP_NANOSECONDS = ( - libcudf_types.type_id.TIMESTAMP_NANOSECONDS - ) - DURATION_SECONDS = ( - libcudf_types.type_id.DURATION_SECONDS - ) - DURATION_MILLISECONDS = ( - libcudf_types.type_id.DURATION_MILLISECONDS - ) - DURATION_MICROSECONDS = ( - libcudf_types.type_id.DURATION_MICROSECONDS - ) - DURATION_NANOSECONDS = ( - libcudf_types.type_id.DURATION_NANOSECONDS - ) - STRING = libcudf_types.type_id.STRING - DECIMAL32 = libcudf_types.type_id.DECIMAL32 - DECIMAL64 = libcudf_types.type_id.DECIMAL64 - DECIMAL128 = libcudf_types.type_id.DECIMAL128 - STRUCT = libcudf_types.type_id.STRUCT - - -SUPPORTED_NUMPY_TO_LIBCUDF_TYPES = { - np.dtype("int8"): TypeId.INT8, - np.dtype("int16"): TypeId.INT16, - np.dtype("int32"): TypeId.INT32, - np.dtype("int64"): TypeId.INT64, - np.dtype("uint8"): TypeId.UINT8, - np.dtype("uint16"): TypeId.UINT16, - np.dtype("uint32"): TypeId.UINT32, - np.dtype("uint64"): TypeId.UINT64, - np.dtype("float32"): TypeId.FLOAT32, - np.dtype("float64"): TypeId.FLOAT64, - np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): TypeId.STRING, - np.dtype("bool"): TypeId.BOOL8, - np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS, -} - -SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { - k: pylibcudf.TypeId(v).value - for k, v in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES.items() -} - -LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - # There's no equivalent to EMPTY in cudf. We translate EMPTY - # columns from libcudf to ``int8`` columns of all nulls in Python. - # ``int8`` is chosen because it uses the least amount of memory. 
- TypeId.EMPTY: np.dtype("int8"), - TypeId.INT8: np.dtype("int8"), - TypeId.INT16: np.dtype("int16"), - TypeId.INT32: np.dtype("int32"), - TypeId.INT64: np.dtype("int64"), - TypeId.UINT8: np.dtype("uint8"), - TypeId.UINT16: np.dtype("uint16"), - TypeId.UINT32: np.dtype("uint32"), - TypeId.UINT64: np.dtype("uint64"), - TypeId.FLOAT32: np.dtype("float32"), - TypeId.FLOAT64: np.dtype("float64"), - TypeId.BOOL8: np.dtype("bool"), - TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"), - TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"), - TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"), - TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"), - TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"), - TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"), - TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"), - TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"), - TypeId.STRING: np.dtype("object"), - TypeId.STRUCT: np.dtype("object"), -} - -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - pylibcudf.TypeId(k).value: v - for k, v in LIBCUDF_TO_SUPPORTED_NUMPY_TYPES.items() -} - -duration_unit_map = { - TypeId.DURATION_SECONDS: "s", - TypeId.DURATION_MILLISECONDS: "ms", - TypeId.DURATION_MICROSECONDS: "us", - TypeId.DURATION_NANOSECONDS: "ns" -} - -datetime_unit_map = { - TypeId.TIMESTAMP_SECONDS: "s", - TypeId.TIMESTAMP_MILLISECONDS: "ms", - TypeId.TIMESTAMP_MICROSECONDS: "us", - TypeId.TIMESTAMP_NANOSECONDS: "ns", -} - -size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] - - -class Interpolation(IntEnum): - LINEAR = ( - libcudf_types.interpolation.LINEAR - ) - LOWER = ( - libcudf_types.interpolation.LOWER - ) - HIGHER = ( - libcudf_types.interpolation.HIGHER - ) - MIDPOINT = ( - libcudf_types.interpolation.MIDPOINT - ) - NEAREST = ( - libcudf_types.interpolation.NEAREST - ) - - -class Order(IntEnum): - ASCENDING = libcudf_types.order.ASCENDING - DESCENDING = libcudf_types.order.DESCENDING - - -class Sorted(IntEnum): - YES = libcudf_types.sorted.YES - NO = libcudf_types.sorted.NO - - -class NullOrder(IntEnum): - BEFORE = libcudf_types.null_order.BEFORE - AFTER = libcudf_types.null_order.AFTER - - -class NullHandling(IntEnum): - INCLUDE = libcudf_types.null_policy.INCLUDE - EXCLUDE = libcudf_types.null_policy.EXCLUDE - - -cdef dtype_from_lists_column_view(column_view cv): - # lists_column_view have no default constructor, so we heap - # allocate it to get around Cython's limitation of requiring - # default constructors for stack allocated objects - cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) - cdef column_view child = lv.get()[0].child() - - if child.type().id() == libcudf_types.type_id.LIST: - return cudf.ListDtype(dtype_from_lists_column_view(child)) - elif child.type().id() == libcudf_types.type_id.EMPTY: - return cudf.ListDtype("int8") - else: - return cudf.ListDtype( - dtype_from_column_view(child) - ) - -cdef dtype_from_structs_column_view(column_view cv): - fields = { - str(i): dtype_from_column_view(cv.child(i)) - for i in range(cv.num_children()) - } - return cudf.StructDtype(fields) - -cdef dtype_from_column_view(column_view cv): - cdef libcudf_types.type_id tid = cv.type().id() - if tid == libcudf_types.type_id.LIST: - return dtype_from_lists_column_view(cv) - elif tid == libcudf_types.type_id.STRUCT: - return dtype_from_structs_column_view(cv) - elif tid == libcudf_types.type_id.DECIMAL64: - return cudf.Decimal64Dtype( - precision=cudf.Decimal64Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - 
elif tid == libcudf_types.type_id.DECIMAL32: - return cudf.Decimal32Dtype( - precision=cudf.Decimal32Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL128: - return cudf.Decimal128Dtype( - precision=cudf.Decimal128Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - else: - return LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (tid) - ] - -cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: - # Note: This function is to be phased out in favor of - # dtype_to_pylibcudf_type which will return a pylibcudf - # DataType object - cdef libcudf_types.type_id tid - if isinstance(dtype, cudf.ListDtype): - tid = libcudf_types.type_id.LIST - elif isinstance(dtype, cudf.StructDtype): - tid = libcudf_types.type_id.STRUCT - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = libcudf_types.type_id.DECIMAL128 - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = libcudf_types.type_id.DECIMAL64 - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = libcudf_types.type_id.DECIMAL32 - else: - tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[np.dtype(dtype)])) - - if is_decimal_type_id(tid): - return libcudf_types.data_type(tid, -dtype.scale) - else: - return libcudf_types.data_type(tid) - -cpdef dtype_to_pylibcudf_type(dtype): - if isinstance(dtype, cudf.ListDtype): - return pylibcudf.DataType(pylibcudf.TypeId.LIST) - elif isinstance(dtype, cudf.StructDtype): - return pylibcudf.DataType(pylibcudf.TypeId.STRUCT) - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = pylibcudf.TypeId.DECIMAL128 - return pylibcudf.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = pylibcudf.TypeId.DECIMAL64 - return pylibcudf.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = pylibcudf.TypeId.DECIMAL32 - return pylibcudf.DataType(tid, -dtype.scale) - # libcudf types don't support localization so convert to the base type - elif isinstance(dtype, pd.DatetimeTZDtype): - dtype = np.dtype(f"(tid) - ] diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx deleted file mode 100644 index d5602fd5a1c..00000000000 --- a/python/cudf/cudf/_lib/unary.pyx +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
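One detail worth noting in the decimal branches above: libcudf stores a decimal scale with the opposite sign from cuDF's Decimal*Dtype, which is why every branch passes -dtype.scale. A small illustration, assuming pylibcudf exposes DataType.scale() as used elsewhere in this codebase:

import cudf
import pylibcudf

d = cudf.Decimal64Dtype(precision=10, scale=2)                       # values like 12.34
plc_type = pylibcudf.DataType(pylibcudf.TypeId.DECIMAL64, -d.scale)  # note the sign flip
plc_type.scale()                                                     # -2 on the libcudf side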
- -from cudf._lib.column cimport Column -from cudf._lib.types cimport dtype_to_pylibcudf_type - -import numpy as np - -import pylibcudf - -from cudf.api.types import is_decimal_dtype -from cudf.core.buffer import acquire_spill_lock - - -@acquire_spill_lock() -def unary_operation(Column input, object op): - return Column.from_pylibcudf( - pylibcudf.unary.unary_operation(input.to_pylibcudf(mode="read"), op) - ) - - -@acquire_spill_lock() -def is_null(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_null(input.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def is_valid(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_valid(input.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def cast(Column input, object dtype=np.float64): - result = Column.from_pylibcudf( - pylibcudf.unary.cast( - input.to_pylibcudf(mode="read"), - dtype_to_pylibcudf_type(dtype) - ) - ) - - if is_decimal_dtype(result.dtype): - result.dtype.precision = dtype.precision - return result - - -@acquire_spill_lock() -def is_nan(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_nan(input.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def is_non_nan(Column input): - return Column.from_pylibcudf( - pylibcudf.unary.is_not_nan(input.to_pylibcudf(mode="read")) - ) diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd deleted file mode 100644 index 7254db5c43d..00000000000 --- a/python/cudf/cudf/_lib/utils.pxd +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.vector cimport vector - -from pylibcudf.libcudf.column.column cimport column_view -from pylibcudf.libcudf.table.table cimport table, table_view - - -cdef data_from_unique_ptr( - unique_ptr[table] c_tbl, column_names, index_names=*) -cdef data_from_pylibcudf_table(tbl, column_names, index_names=*) -cdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *) -cdef data_from_table_view( - table_view tv, object owner, object column_names, object index_names=*) -cdef table_view table_view_from_columns(columns) except * -cdef table_view table_view_from_table(tbl, ignore_index=*) except* -cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) -cdef columns_from_table_view(table_view tv, object owners) -cdef columns_from_pylibcudf_table(tbl) -cdef _data_from_columns(columns, column_names, index_names=*) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx deleted file mode 100644 index 9e5b99f64eb..00000000000 --- a/python/cudf/cudf/_lib/utils.pyx +++ /dev/null @@ -1,400 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
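The unary wrappers removed above (is_null, is_valid, cast, is_nan) are reachable through ordinary Series methods. A brief sketch, assuming a standard cudf install:

import cudf

s = cudf.Series([1.5, None, 3.0])
s.isnull()           # element-wise null test (the is_null path)
s.notnull()          # element-wise validity test (the is_valid path)
s.astype("float32")  # cast; the target dtype is translated via dtype_to_pylibcudf_type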
- -import numpy as np -import pyarrow as pa - -import cudf - -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from pylibcudf.libcudf.column.column cimport column, column_view -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -try: - import ujson as json -except ImportError: - import json - -from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype - -PARQUET_META_TYPE_MAP = { - str(cudf_dtype): str(pandas_dtype) - for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items() -} - -cdef table_view table_view_from_columns(columns) except*: - """Create a cudf::table_view from an iterable of Columns.""" - cdef vector[column_view] column_views - - cdef Column col - for col in columns: - column_views.push_back(col.view()) - - return table_view(column_views) - - -cdef table_view table_view_from_table(tbl, ignore_index=False) except*: - """Create a cudf::table_view from a Table. - - Parameters - ---------- - ignore_index : bool, default False - If True, don't include the index in the columns. - """ - return table_view_from_columns( - tbl._index._columns + tbl._columns - if not ignore_index and tbl._index is not None - else tbl._columns - ) - - -cpdef generate_pandas_metadata(table, index): - col_names = [] - types = [] - index_levels = [] - index_descriptors = [] - columns_to_convert = list(table._columns) - # Columns - for name, col in table._column_labels_and_values: - if cudf.get_option("mode.pandas_compatible"): - # in pandas-compat mode, non-string column names are stringified. - col_names.append(str(name)) - else: - col_names.append(name) - - if isinstance(col.dtype, cudf.CategoricalDtype): - raise ValueError( - "'category' column dtypes are currently not " - + "supported by the gpu accelerated parquet writer" - ) - elif isinstance(col.dtype, ( - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype - )): - types.append(col.dtype.to_arrow()) - else: - # A boolean element takes 8 bits in cudf and 1 bit in - # pyarrow. To make sure the cudf format is interperable - # in arrow, we use `int8` type when converting from a - # cudf boolean array. - if col.dtype.type == np.bool_: - types.append(pa.int8()) - else: - types.append(np_to_pa_dtype(col.dtype)) - - # Indexes - materialize_index = False - if index is not False: - for level, name in enumerate(table._index.names): - if isinstance(table._index, cudf.MultiIndex): - idx = table.index.get_level_values(level) - else: - idx = table.index - - if isinstance(idx, cudf.RangeIndex): - if index is None: - descr = { - "kind": "range", - "name": table.index.name, - "start": table.index.start, - "stop": table.index.stop, - "step": table.index.step, - } - else: - materialize_index = True - # When `index=True`, RangeIndex needs to be materialized. 
- materialized_idx = idx._as_int_index() - descr = _index_level_name( - index_name=materialized_idx.name, - level=level, - column_names=col_names - ) - index_levels.append(materialized_idx) - columns_to_convert.append(materialized_idx._values) - col_names.append(descr) - types.append(np_to_pa_dtype(materialized_idx.dtype)) - else: - descr = _index_level_name( - index_name=idx.name, - level=level, - column_names=col_names - ) - columns_to_convert.append(idx._values) - col_names.append(descr) - if isinstance(idx.dtype, cudf.CategoricalDtype): - raise ValueError( - "'category' column dtypes are currently not " - + "supported by the gpu accelerated parquet writer" - ) - elif isinstance(idx.dtype, cudf.ListDtype): - types.append(col.dtype.to_arrow()) - else: - # A boolean element takes 8 bits in cudf and 1 bit in - # pyarrow. To make sure the cudf format is interperable - # in arrow, we use `int8` type when converting from a - # cudf boolean array. - if idx.dtype.type == np.bool_: - types.append(pa.int8()) - else: - types.append(np_to_pa_dtype(idx.dtype)) - - index_levels.append(idx) - index_descriptors.append(descr) - - df_meta = table.head(0) - if materialize_index: - df_meta.index = df_meta.index._as_int_index() - metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=columns_to_convert, - # It is OKAY to do `.head(0).to_pandas()` because - # this method will extract `.columns` metadata only - df=df_meta.to_pandas(), - column_names=col_names, - index_levels=index_levels, - index_descriptors=index_descriptors, - preserve_index=index, - types=types, - ) - - md_dict = json.loads(metadata[b"pandas"]) - - # correct metadata for list and struct and nullable numeric types - for col_meta in md_dict["columns"]: - if ( - col_meta["name"] in table._column_names - and table._data[col_meta["name"]].nullable - and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP - and col_meta["pandas_type"] != "decimal" - ): - col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[ - col_meta["numpy_type"] - ] - if col_meta["numpy_type"] in ("list", "struct"): - col_meta["numpy_type"] = "object" - - return json.dumps(md_dict) - - -def _index_level_name(index_name, level, column_names): - """ - Return the name of an index level or a default name - if `index_name` is None or is already a column name. - - Parameters - ---------- - index_name : name of an Index object - level : level of the Index object - - Returns - ------- - name : str - """ - if index_name is not None and index_name not in column_names: - return index_name - else: - return f"__index_level_{level}__" - - -cdef columns_from_unique_ptr( - unique_ptr[table] c_tbl -): - """Convert a libcudf table into list of columns. - - Parameters - ---------- - c_tbl : unique_ptr[cudf::table] - The libcudf table whose columns will be extracted - - Returns - ------- - list[Column] - A list of columns. - """ - cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) - cdef vector[unique_ptr[column]].iterator it = c_columns.begin() - - cdef size_t i - - columns = [Column.from_unique_ptr(move(dereference(it+i))) - for i in range(c_columns.size())] - - return columns - - -cdef columns_from_pylibcudf_table(tbl): - """Convert a pylibcudf table into list of columns. - - Parameters - ---------- - tbl : pylibcudf.Table - The pylibcudf table whose columns will be extracted - - Returns - ------- - list[Column] - A list of columns. 
- """ - return [Column.from_pylibcudf(plc) for plc in tbl.columns()] - - -cdef _data_from_columns(columns, column_names, index_names=None): - """Convert a list of columns into a dict with an index. - - This method is intended to provide the bridge between the columns returned - from calls to libcudf or pylibcudf APIs and the cuDF Python Frame objects, which - require named columns and a separate index. - - Since cuDF Python has an independent representation of a table as a - collection of columns, this function simply returns a dict of columns - suitable for conversion into data to be passed to cuDF constructors. - This method returns the columns of the table in the order they are - stored in libcudf, but calling code is responsible for partitioning and - labeling them as needed. - - Parameters - ---------- - columns : list[Column] - The columns to be extracted - column_names : iterable - The keys associated with the columns in the output data. - index_names : iterable, optional - If provided, an iterable of strings that will be used to label the - corresponding first set of columns into a (Multi)Index. If this - argument is omitted, all columns are assumed to be part of the output - table and no index is constructed. - """ - # First construct the index, if any - index = ( - # TODO: For performance, the _from_data methods of Frame types assume - # that the passed index object is already an Index because cudf.Index - # and cudf.as_index are expensive. As a result, this function is - # currently somewhat inconsistent in returning a dict of columns for - # the data while actually constructing the Index object here (instead - # of just returning a dict for that as well). As we clean up the - # Frame factories we may want to look for a less dissonant approach - # that does not impose performance penalties. The same applies to - # data_from_table_view below. - cudf.core.index._index_from_data( - { - name: columns[i] - for i, name in enumerate(index_names) - } - ) - if index_names is not None - else None - ) - n_index_columns = len(index_names) if index_names is not None else 0 - data = { - name: columns[i + n_index_columns] - for i, name in enumerate(column_names) - } - return data, index - - -cdef data_from_unique_ptr( - unique_ptr[table] c_tbl, column_names, index_names=None -): - return _data_from_columns( - columns_from_unique_ptr(move(c_tbl)), - column_names, - index_names - ) - - -cdef data_from_pylibcudf_table(tbl, column_names, index_names=None): - return _data_from_columns( - columns_from_pylibcudf_table(tbl), - column_names, - index_names - ) - -cdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None): - """ - Unpacks the TableWithMetadata from libcudf I/O - into a dict of columns and an Index (cuDF format) - """ - if column_names is None: - column_names = tbl_with_meta.column_names(include_children=False) - return _data_from_columns( - columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], - column_names=column_names, - index_names=index_names - ) - -cdef columns_from_table_view( - table_view tv, - object owners, -): - """ - Given a ``cudf::table_view``, constructs a list of columns from it, - along with referencing an owner Python object that owns the memory - lifetime. owner must be either None or a list of column. If owner - is a list of columns, the owner of the `i`th ``cudf::column_view`` - in the table view is ``owners[i]``. For more about memory ownership, - see ``Column.from_column_view``. 
- """ - - return [ - Column.from_column_view( - tv.column(i), owners[i] if isinstance(owners, list) else None - ) for i in range(tv.num_columns()) - ] - -cdef data_from_table_view( - table_view tv, - object owner, - object column_names, - object index_names=None -): - """ - Given a ``cudf::table_view``, constructs a Frame from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. If ``owner`` is a Frame we reach inside of it and - reach inside of each ``cudf.Column`` to make the owner of each newly - created ``Buffer`` underneath the ``cudf.Column`` objects of the - created Frame the respective ``Buffer`` from the relevant - ``cudf.Column`` of the ``owner`` Frame - """ - cdef size_type column_idx = 0 - table_owner = isinstance(owner, cudf.core.frame.Frame) - - # First construct the index, if any - index = None - if index_names is not None: - index_columns = [] - for _ in index_names: - column_owner = owner - if table_owner: - column_owner = owner._index._columns[column_idx] - index_columns.append( - Column.from_column_view( - tv.column(column_idx), - column_owner - ) - ) - column_idx += 1 - index = cudf.core.index._index_from_data( - dict(zip(index_names, index_columns))) - - # Construct the data dict - cdef size_type source_column_idx = 0 - data_columns = [] - for _ in column_names: - column_owner = owner - if table_owner: - column_owner = owner._columns[source_column_idx] - data_columns.append( - Column.from_column_view(tv.column(column_idx), column_owner) - ) - column_idx += 1 - source_column_idx += 1 - - return dict(zip(column_names, data_columns)), index diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py deleted file mode 100644 index 6e8ad556b08..00000000000 --- a/python/cudf/cudf/_typing.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -import sys -from collections.abc import Callable -from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union - -import numpy as np -from pandas import Period, Timedelta, Timestamp - -if TYPE_CHECKING: - from pandas.api.extensions import ExtensionDtype - - import cudf - -# Backwards compat: mypy >= 0.790 rejects Type[NotImplemented], but -# NotImplementedType is only introduced in 3.10 -if sys.version_info >= (3, 10): - from types import NotImplementedType -else: - NotImplementedType = Any - -# Many of these are from -# https://github.com/pandas-dev/pandas/blob/master/pandas/_typing.py - -Dtype = Union["ExtensionDtype", str, np.dtype] -DtypeObj = Union["ExtensionDtype", np.dtype] - -# scalars -DatetimeLikeScalar = TypeVar( - "DatetimeLikeScalar", Period, Timestamp, Timedelta -) -ScalarLike = Any - -# columns -ColumnLike = Any - -# binary operation -ColumnBinaryOperand = Union["cudf.Scalar", "cudf.core.column.ColumnBase"] - -DataFrameOrSeries = Union["cudf.Series", "cudf.DataFrame"] -SeriesOrIndex = Union["cudf.Series", "cudf.core.index.BaseIndex"] -SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] - -# Groupby aggregation -AggType = Union[str, Callable] -MultiColumnAggType = Union[ - AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] -] diff --git a/python/cudf/cudf/_version.py b/python/cudf/cudf/_version.py deleted file mode 100644 index 7dd732b4905..00000000000 --- a/python/cudf/cudf/_version.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import importlib.resources - -__version__ = ( - importlib.resources.files(__package__) - .joinpath("VERSION") - .read_text() - .strip() -) -try: - __git_commit__ = ( - importlib.resources.files(__package__) - .joinpath("GIT_COMMIT") - .read_text() - .strip() - ) -except FileNotFoundError: - __git_commit__ = "" - -__all__ = ["__git_commit__", "__version__"] diff --git a/python/cudf/cudf/api/__init__.py b/python/cudf/cudf/api/__init__.py deleted file mode 100644 index c66bfb4efeb..00000000000 --- a/python/cudf/cudf/api/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -from cudf.api import extensions, types - -__all__ = ["extensions", "types"] diff --git a/python/cudf/cudf/api/extensions/__init__.py b/python/cudf/cudf/api/extensions/__init__.py deleted file mode 100644 index 6118b6bf620..00000000000 --- a/python/cudf/cudf/api/extensions/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -from pandas.api.extensions import no_default - -from cudf.api.extensions.accessor import ( - register_dataframe_accessor, - register_index_accessor, - register_series_accessor, -) - -__all__ = [ - "no_default", - "register_dataframe_accessor", - "register_index_accessor", - "register_series_accessor", -] diff --git a/python/cudf/cudf/api/extensions/accessor.py b/python/cudf/cudf/api/extensions/accessor.py deleted file mode 100644 index e4988c1fa68..00000000000 --- a/python/cudf/cudf/api/extensions/accessor.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import warnings - -from pandas.core.accessor import CachedAccessor - -import cudf -from cudf.utils.docutils import docfmt_partial - -_docstring_register_accessor = """ - Extends `cudf.{klass}` with custom defined accessor - - Parameters - ---------- - name : str - The name to be registered in `{klass}` for the custom accessor - - Returns - ------- - decorator : callable - Decorator function for accessor - - Notes - ----- - The `{klass}` object will be passed to your custom accessor upon first - invocation. And will be cached for future calls. - - If the data passed to your accessor is of wrong datatype, you should - raise an `AttributeError` in consistent with other cudf methods. - - - Examples - -------- - {example} -""" - -_dataframe_example = """ - In your library code: - - >>> import cudf - >>> @cudf.api.extensions.register_dataframe_accessor("point") - ... class PointsAccessor: - ... def __init__(self, obj): - ... self._validate(obj) - ... self._obj = obj - ... @staticmethod - ... def _validate(obj): - ... cols = obj.columns - ... if not all([vertex in cols for vertex in ["x", "y"]]): - ... raise AttributeError("Must have vertices 'x', 'y'.") - ... @property - ... def bounding_box(self): - ... xs, ys = self._obj["x"], self._obj["y"] - ... min_x, min_y = xs.min(), ys.min() - ... max_x, max_y = xs.max(), ys.max() - ... 
return (min_x, min_y, max_x, max_y) - - Then in user code: - - >>> df = cudf.DataFrame({'x': [1,2,3,4,5,6], 'y':[7,6,5,4,3,2]}) - >>> df.point.bounding_box - (1, 2, 6, 7) - -""" - -_index_example = """ - In your library code: - - >>> import cudf - >>> @cudf.api.extensions.register_index_accessor("odd") - ... class OddRowAccessor: - ... def __init__(self, obj): - ... self._obj = obj - ... def __getitem__(self, i): - ... return self._obj[2 * i - 1] - - Then in user code: - - >>> gs = cudf.Index(list(range(0, 50))) - >>> gs.odd[1] - 1 - >>> gs.odd[2] - 3 - >>> gs.odd[3] - 5 - -""" - -_series_example = """ - In your library code: - - >>> import cudf - >>> @cudf.api.extensions.register_series_accessor("odd") - ... class OddRowAccessor: - ... def __init__(self, obj): - ... self._obj = obj - ... def __getitem__(self, i): - ... return self._obj[2 * i - 1] - - Then in user code: - - >>> gs = cudf.Series(list(range(0, 50))) - >>> gs.odd[1] - 1 - >>> gs.odd[2] - 3 - >>> gs.odd[3] - 5 - -""" - - -doc_register_dataframe_accessor = docfmt_partial( - docstring=_docstring_register_accessor.format( - klass="DataFrame", example=_dataframe_example - ) -) - -doc_register_index_accessor = docfmt_partial( - docstring=_docstring_register_accessor.format( - klass="Index", example=_index_example - ) -) - -doc_register_series_accessor = docfmt_partial( - docstring=_docstring_register_accessor.format( - klass="Series", example=_series_example - ) -) - - -def _register_accessor(name, cls): - def decorator(accessor): - if hasattr(cls, name): - msg = f"Attribute {name} will be overridden in {cls.__name__}" - warnings.warn(msg) - cached_accessor = CachedAccessor(name, accessor) - cls._accessors.add(name) - setattr(cls, name, cached_accessor) - - return accessor - - return decorator - - -@doc_register_dataframe_accessor() -def register_dataframe_accessor(name): - """{docstring}""" - return _register_accessor(name, cudf.DataFrame) - - -@doc_register_index_accessor() -def register_index_accessor(name): - """{docstring}""" - return _register_accessor(name, cudf.BaseIndex) - - -@doc_register_series_accessor() -def register_series_accessor(name): - """{docstring}""" - return _register_accessor(name, cudf.Series) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py deleted file mode 100644 index 9c436dfad18..00000000000 --- a/python/cudf/cudf/api/types.py +++ /dev/null @@ -1,562 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -"""Define common type operations.""" - -from __future__ import annotations - -import warnings -from collections import abc -from functools import wraps -from inspect import isclass -from typing import cast - -import cupy as cp -import numpy as np -import pandas as pd -from pandas.api import types as pd_types - -import cudf -from cudf.core._compat import PANDAS_LT_300 -from cudf.core.dtypes import ( # noqa: F401 - _BaseDtype, - _is_categorical_dtype, - _is_interval_dtype, - dtype, - is_categorical_dtype, - is_decimal32_dtype, - is_decimal64_dtype, - is_decimal128_dtype, - is_decimal_dtype, - is_interval_dtype, - is_list_dtype, - is_struct_dtype, -) - - -def is_numeric_dtype(obj): - """Check whether the provided array or dtype is of a numeric dtype. - - Parameters - ---------- - obj : array-like or dtype - The array or dtype to check. - - Returns - ------- - bool - Whether or not the array or dtype is of a numeric dtype. 
- """ - if isclass(obj): - if issubclass(obj, cudf.core.dtypes.DecimalDtype): - return True - if issubclass(obj, _BaseDtype): - return False - else: - if isinstance( - obj, - (cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype), - ) or isinstance( - getattr(obj, "dtype", None), - (cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype), - ): - return True - if isinstance(obj, _BaseDtype) or isinstance( - getattr(obj, "dtype", None), _BaseDtype - ): - return False - if isinstance(obj, cudf.BaseIndex): - return obj._is_numeric() - return pd_types.is_numeric_dtype(obj) - - -# A version of numerical type check that does not include cudf decimals for -# places where we need to distinguish fixed and floating point numbers. -def _is_non_decimal_numeric_dtype(obj): - if isinstance(obj, _BaseDtype) or isinstance( - getattr(obj, "dtype", None), _BaseDtype - ): - return False - try: - return pd_types.is_numeric_dtype(obj) - except TypeError: - return False - - -def is_integer(obj): - """Return True if given object is integer. - - Returns - ------- - bool - """ - if isinstance(obj, cudf.Scalar): - return obj.dtype.kind in "iu" - return pd.api.types.is_integer(obj) - - -def is_string_dtype(obj): - """Check whether the provided array or dtype is of the string dtype. - - Parameters - ---------- - obj : array-like or dtype - The array or dtype to check. - - Returns - ------- - bool - Whether or not the array or dtype is of the string dtype. - """ - return ( - ( - isinstance(obj, (cudf.Index, cudf.Series)) - and obj.dtype == cudf.dtype("O") - ) - or (isinstance(obj, cudf.core.column.StringColumn)) - or ( - pd.api.types.is_string_dtype(obj) - # Reject all cudf extension types. - and not _is_categorical_dtype(obj) - and not is_decimal_dtype(obj) - and not is_list_dtype(obj) - and not is_struct_dtype(obj) - and not _is_interval_dtype(obj) - ) - ) - - -def is_scalar(val): - """Return True if given object is scalar. - - Parameters - ---------- - val : object - Possibly scalar object. - - Returns - ------- - bool - Return True if given object is scalar. - """ - return isinstance( - val, - ( - cudf.Scalar, - cudf._lib.scalar.DeviceScalar, - cudf.core.tools.datetimes.DateOffset, - ), - ) or ( - pd_types.is_scalar(val) - # Pytorch tensors advertise that they support the number - # protocol, and therefore return True for PyNumber_Check even - # when they have a shape. So, if we get through this, let's - # additionally check that if they have a shape property that - # it is empty. - # See https://github.com/pytorch/pytorch/issues/99646 - # and https://github.com/pandas-dev/pandas/issues/52701 - and len(getattr(val, "shape", ())) == 0 - ) - - -def _is_scalar_or_zero_d_array(val): - """Return True if given object is scalar or a 0d array. - - This is an internal function primarily used by indexing applications that - need to flatten dimensions that are indexed by 0d arrays. - - Parameters - ---------- - val : object - Possibly scalar object. - - Returns - ------- - bool - Return True if given object is scalar. - """ - return ( - isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0 - ) or is_scalar(val) - - -# TODO: We should be able to reuse the pandas function for this, need to figure -# out why we can't. -def is_list_like(obj): - """Return `True` if the given `obj` is list-like (list, tuple, Series...). - - Parameters - ---------- - obj : object of any type which needs to be validated. - - Returns - ------- - bool - Return True if given object is list-like. 
- """ - return isinstance(obj, (abc.Sequence, np.ndarray)) and not isinstance( - obj, (str, bytes) - ) - - -# These methods are aliased directly into this namespace, but can be modified -# later if we determine that there is a need. - - -def _wrap_pandas_is_dtype_api(func): - """Wrap a pandas dtype checking function to ignore cudf types.""" - - @wraps(func) - def wrapped_func(obj): - if ( - (isclass(obj) and issubclass(obj, _BaseDtype)) - or isinstance(obj, _BaseDtype) - or isinstance(getattr(obj, "dtype", None), _BaseDtype) - ): - return False - return func(obj) - - return wrapped_func - - -def _union_categoricals( - to_union: list[cudf.Series | cudf.CategoricalIndex], - sort_categories: bool = False, - ignore_order: bool = False, -): - """Combine categorical data. - - This API is currently internal but should be exposed once full support for - cudf.Categorical is ready. - """ - # TODO(s) in the order specified : - # 1. The return type needs to be changed - # to cudf.Categorical once it is implemented. - # 2. Make this API public (i.e., to resemble - # pd.api.types.union_categoricals) - - if ignore_order: - raise TypeError("ignore_order is not yet implemented") - - result_col = cudf.core.column.CategoricalColumn._concat( - [ - cast(cudf.core.column.CategoricalColumn, obj._column) - for obj in to_union - ] - ) - if sort_categories: - sorted_categories = result_col.categories.sort_values(ascending=True) - result_col = result_col.reorder_categories( - new_categories=sorted_categories - ) - - return cudf.CategoricalIndex._from_column(result_col) - - -def is_bool_dtype(arr_or_dtype): - """ - Check whether the provided array or dtype is of a boolean dtype. - - Parameters - ---------- - arr_or_dtype : array-like or dtype - The array or dtype to check. - - Returns - ------- - boolean - Whether or not the array or dtype is of a boolean dtype. - - Examples - -------- - >>> from cudf.api.types import is_bool_dtype - >>> import numpy as np - >>> import cudf - >>> is_bool_dtype(str) - False - >>> is_bool_dtype(int) - False - >>> is_bool_dtype(bool) - True - >>> is_bool_dtype(np.bool_) - True - >>> is_bool_dtype(np.array(['a', 'b'])) - False - >>> is_bool_dtype(cudf.Series([1, 2])) - False - >>> is_bool_dtype(np.array([True, False])) - True - >>> is_bool_dtype(cudf.Series([True, False], dtype='category')) - True - """ - if isinstance(arr_or_dtype, cudf.BaseIndex): - return arr_or_dtype._is_boolean() - elif isinstance(arr_or_dtype, cudf.Series): - if isinstance(arr_or_dtype.dtype, cudf.CategoricalDtype): - return is_bool_dtype(arr_or_dtype=arr_or_dtype.dtype) - else: - return pd_types.is_bool_dtype(arr_or_dtype=arr_or_dtype.dtype) - elif isinstance(arr_or_dtype, cudf.CategoricalDtype): - return pd_types.is_bool_dtype( - arr_or_dtype=arr_or_dtype.categories.dtype - ) - else: - return pd_types.is_bool_dtype(arr_or_dtype=arr_or_dtype) - - -def is_object_dtype(arr_or_dtype): - """ - Check whether an array-like or dtype is of the object dtype. - - Parameters - ---------- - arr_or_dtype : array-like or dtype - The array-like or dtype to check. - - Returns - ------- - boolean - Whether or not the array-like or dtype is of the object dtype. 
- - Examples - -------- - >>> from cudf.api.types import is_object_dtype - >>> import numpy as np - >>> is_object_dtype(object) - True - >>> is_object_dtype(int) - False - >>> is_object_dtype(np.array([], dtype=object)) - True - >>> is_object_dtype(np.array([], dtype=int)) - False - >>> is_object_dtype([1, 2, 3]) - False - """ - if isinstance(arr_or_dtype, cudf.BaseIndex): - return arr_or_dtype._is_object() - elif isinstance(arr_or_dtype, cudf.Series): - return pd_types.is_object_dtype(arr_or_dtype=arr_or_dtype.dtype) - else: - return pd_types.is_object_dtype(arr_or_dtype=arr_or_dtype) - - -def is_float_dtype(arr_or_dtype) -> bool: - """ - Check whether the provided array or dtype is of a float dtype. - - Parameters - ---------- - arr_or_dtype : array-like or dtype - The array or dtype to check. - - Returns - ------- - boolean - Whether or not the array or dtype is of a float dtype. - - Examples - -------- - >>> from cudf.api.types import is_float_dtype - >>> import numpy as np - >>> import cudf - >>> is_float_dtype(str) - False - >>> is_float_dtype(int) - False - >>> is_float_dtype(float) - True - >>> is_float_dtype(np.array(['a', 'b'])) - False - >>> is_float_dtype(cudf.Series([1, 2])) - False - >>> is_float_dtype(cudf.Index([1, 2.])) - True - """ - if isinstance(arr_or_dtype, cudf.BaseIndex): - return arr_or_dtype._is_floating() - return _wrap_pandas_is_dtype_api(pd_types.is_float_dtype)(arr_or_dtype) - - -def is_integer_dtype(arr_or_dtype) -> bool: - """ - Check whether the provided array or dtype is of an integer dtype. - Unlike in `is_any_int_dtype`, timedelta64 instances will return False. - - Parameters - ---------- - arr_or_dtype : array-like or dtype - The array or dtype to check. - - Returns - ------- - boolean - Whether or not the array or dtype is of an integer dtype and - not an instance of timedelta64. - - Examples - -------- - >>> from cudf.api.types import is_integer_dtype - >>> import numpy as np - >>> import cudf - >>> is_integer_dtype(str) - False - >>> is_integer_dtype(int) - True - >>> is_integer_dtype(float) - False - >>> is_integer_dtype(np.uint64) - True - >>> is_integer_dtype('int8') - True - >>> is_integer_dtype('Int8') - True - >>> is_integer_dtype(np.datetime64) - False - >>> is_integer_dtype(np.timedelta64) - False - >>> is_integer_dtype(np.array(['a', 'b'])) - False - >>> is_integer_dtype(cudf.Series([1, 2])) - True - >>> is_integer_dtype(np.array([], dtype=np.timedelta64)) - False - >>> is_integer_dtype(cudf.Index([1, 2.])) # float - False - """ - if isinstance(arr_or_dtype, cudf.BaseIndex): - return arr_or_dtype._is_integer() - return _wrap_pandas_is_dtype_api(pd_types.is_integer_dtype)(arr_or_dtype) - - -def is_any_real_numeric_dtype(arr_or_dtype) -> bool: - """ - Check whether the provided array or dtype is of a real number dtype. - - Parameters - ---------- - arr_or_dtype : array-like or dtype - The array or dtype to check. - - Returns - ------- - boolean - Whether or not the array or dtype is of a real number dtype. 
- - Examples - -------- - >>> from cudf.api.types import is_any_real_numeric_dtype - >>> import cudf - >>> is_any_real_numeric_dtype(int) - True - >>> is_any_real_numeric_dtype(float) - True - >>> is_any_real_numeric_dtype(object) - False - >>> is_any_real_numeric_dtype(str) - False - >>> is_any_real_numeric_dtype(complex(1, 2)) - False - >>> is_any_real_numeric_dtype(bool) - False - >>> is_any_real_numeric_dtype(cudf.Index([1, 2, 3])) - True - """ - return ( - is_numeric_dtype(arr_or_dtype) - and not is_complex_dtype(arr_or_dtype) - and not is_bool_dtype(arr_or_dtype) - ) - - -def _is_datetime64tz_dtype(obj): - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." - return _wrap_pandas_is_dtype_api(pd_types.is_datetime64tz_dtype)(obj) - - -def is_datetime64tz_dtype(obj): - # Do not remove until pandas 3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." - warnings.warn( - "is_datetime64tz_dtype is deprecated and will be removed in a future " - "version.", - FutureWarning, - ) - return _is_datetime64tz_dtype(obj) - - -def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool: - if isinstance( - dtype_to_check, - ( - pd.UInt8Dtype, - pd.UInt16Dtype, - pd.UInt32Dtype, - pd.UInt64Dtype, - pd.Int8Dtype, - pd.Int16Dtype, - pd.Int32Dtype, - pd.Int64Dtype, - pd.Float32Dtype, - pd.Float64Dtype, - pd.BooleanDtype, - pd.StringDtype, - pd.ArrowDtype, - ), - ): - return True - elif isinstance(dtype_to_check, pd.CategoricalDtype): - if dtype_to_check.categories is None: - return False - return _is_pandas_nullable_extension_dtype( - dtype_to_check.categories.dtype - ) - elif isinstance(dtype_to_check, pd.IntervalDtype): - return _is_pandas_nullable_extension_dtype(dtype_to_check.subtype) - return False - - -# TODO: The below alias is removed for now since improving cudf categorical -# support is ongoing and we don't want to introduce any ambiguities. The above -# method _union_categoricals will take its place once exposed. -# union_categoricals = pd_types.union_categoricals -infer_dtype = pd_types.infer_dtype -pandas_dtype = pd_types.pandas_dtype -is_complex_dtype = pd_types.is_complex_dtype -# TODO: Evaluate which of the datetime types need special handling for cudf. 
-is_datetime_dtype = _wrap_pandas_is_dtype_api(pd_types.is_datetime64_dtype) -is_datetime64_any_dtype = pd_types.is_datetime64_any_dtype -is_datetime64_dtype = _wrap_pandas_is_dtype_api(pd_types.is_datetime64_dtype) -is_datetime64_ns_dtype = _wrap_pandas_is_dtype_api( - pd_types.is_datetime64_ns_dtype -) -is_extension_array_dtype = pd_types.is_extension_array_dtype -is_int64_dtype = pd_types.is_int64_dtype -is_period_dtype = pd_types.is_period_dtype -is_signed_integer_dtype = pd_types.is_signed_integer_dtype -is_timedelta_dtype = _wrap_pandas_is_dtype_api(pd_types.is_timedelta64_dtype) -is_timedelta64_dtype = _wrap_pandas_is_dtype_api(pd_types.is_timedelta64_dtype) -is_timedelta64_ns_dtype = _wrap_pandas_is_dtype_api( - pd_types.is_timedelta64_ns_dtype -) -is_unsigned_integer_dtype = pd_types.is_unsigned_integer_dtype -is_sparse = pd_types.is_sparse -# is_list_like = pd_types.is_list_like -is_dict_like = pd_types.is_dict_like -is_file_like = pd_types.is_file_like -is_named_tuple = pd_types.is_named_tuple -is_iterator = pd_types.is_iterator -is_bool = pd_types.is_bool -is_complex = pd_types.is_complex -is_float = pd_types.is_float -is_hashable = pd_types.is_hashable -is_interval = pd_types.is_interval -is_number = pd_types.is_number -is_re = pd_types.is_re -is_re_compilable = pd_types.is_re_compilable -is_dtype_equal = pd_types.is_dtype_equal - - -# Aliases of numpy dtype functionality. -issubdtype = np.issubdtype diff --git a/python/cudf/cudf/comm/__init__.py b/python/cudf/cudf/comm/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/comm/serialize.py b/python/cudf/cudf/comm/serialize.py deleted file mode 100644 index 9fb28907e73..00000000000 --- a/python/cudf/cudf/comm/serialize.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. - -import cudf # noqa: F401 -from cudf.core.abc import Serializable - -try: - from distributed.protocol import dask_deserialize, dask_serialize - from distributed.protocol.cuda import cuda_deserialize, cuda_serialize - from distributed.utils import log_errors - - @cuda_serialize.register(Serializable) - def cuda_serialize_cudf_object(x): - with log_errors(): - return x.device_serialize() - - @dask_serialize.register(Serializable) - def dask_serialize_cudf_object(x): - with log_errors(): - return x.host_serialize() - - @cuda_deserialize.register(Serializable) - def cuda_deserialize_cudf_object(header, frames): - with log_errors(): - return Serializable.device_deserialize(header, frames) - - @dask_deserialize.register(Serializable) - def dask_deserialize_cudf_object(header, frames): - with log_errors(): - return Serializable.host_deserialize(header, frames) - -except ImportError: - # distributed is probably not installed on the system - pass diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py deleted file mode 100644 index ec4878b332d..00000000000 --- a/python/cudf/cudf/core/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py deleted file mode 100644 index a6abd63d042..00000000000 --- a/python/cudf/cudf/core/_base_index.py +++ /dev/null @@ -1,2178 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
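The comm/serialize.py hunk above registers (de)serializers for cudf's Serializable base class with dask.distributed, wrapped in a try/except so a missing distributed install is tolerated. Below is a minimal sketch of that registration pattern under the assumption that distributed is available; `Blob` and its byte-payload layout are hypothetical, whereas the real cudf handlers simply delegate to device_serialize/host_serialize.

try:
    from distributed.protocol import dask_deserialize, dask_serialize

    class Blob:
        """Hypothetical object carrying a bytes payload."""

        def __init__(self, payload: bytes):
            self.payload = payload

    @dask_serialize.register(Blob)
    def _serialize_blob(x):
        # header is msgpack-friendly metadata; frames are bytes-like buffers.
        header = {"nbytes": len(x.payload)}
        frames = [x.payload]
        return header, frames

    @dask_deserialize.register(Blob)
    def _deserialize_blob(header, frames):
        return Blob(bytes(frames[0]))

except ImportError:
    # distributed is optional, mirroring the removed module's behaviour.
    pass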
- -from __future__ import annotations - -import pickle -import warnings -from functools import cached_property -from typing import TYPE_CHECKING, Any, Literal - -import pandas as pd -from typing_extensions import Self - -import cudf -from cudf._lib.copying import _gather_map_is_valid, gather -from cudf._lib.stream_compaction import ( - apply_boolean_mask, - drop_duplicates, - drop_nulls, -) -from cudf._lib.types import size_type_dtype -from cudf.api.extensions import no_default -from cudf.api.types import is_integer, is_list_like, is_scalar -from cudf.core.abc import Serializable -from cudf.core.column import ColumnBase, column -from cudf.errors import MixedTypeError -from cudf.utils import ioutils -from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype -from cudf.utils.utils import _is_same_name - -if TYPE_CHECKING: - from collections.abc import Generator - - import cupy - - from cudf.core.column_accessor import ColumnAccessor - - -class BaseIndex(Serializable): - """Base class for all cudf Index types.""" - - _accessors: set[Any] = set() - _data: ColumnAccessor - - @property - def _columns(self) -> tuple[Any, ...]: - raise NotImplementedError - - @cached_property - def _values(self) -> ColumnBase: - raise NotImplementedError - - def copy(self, deep: bool = True) -> Self: - raise NotImplementedError - - def __len__(self): - raise NotImplementedError - - def __bool__(self): - raise ValueError( - f"The truth value of a {type(self).__name__} is ambiguous. Use " - "a.empty, a.bool(), a.item(), a.any() or a.all()." - ) - - @property - def size(self): - # The size of an index is always its length irrespective of dimension. - return len(self) - - def astype(self, dtype, copy: bool = True): - """Create an Index with values cast to dtypes. - - The class of a new Index is determined by dtype. When conversion is - impossible, a ValueError exception is raised. - - Parameters - ---------- - dtype : :class:`numpy.dtype` - Use a :class:`numpy.dtype` to cast entire Index object to. - copy : bool, default False - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. - - Returns - ------- - Index - Index with values cast to specified dtype. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, 3]) - >>> index - Index([1, 2, 3], dtype='int64') - >>> index.astype('float64') - Index([1.0, 2.0, 3.0], dtype='float64') - """ - raise NotImplementedError - - def argsort(self, *args, **kwargs) -> cupy.ndarray: - """Return the integer indices that would sort the index. - - Parameters vary by subclass. - """ - raise NotImplementedError - - @property - def dtype(self): - raise NotImplementedError - - @property - def empty(self): - return self.size == 0 - - @property - def is_unique(self): - """Return if the index has unique values.""" - raise NotImplementedError - - def memory_usage(self, deep=False): - """Return the memory usage of an object. - - Parameters - ---------- - deep : bool - The deep parameter is ignored and is only included for pandas - compatibility. - - Returns - ------- - The total bytes used. - """ - raise NotImplementedError - - def tolist(self): # noqa: D102 - raise TypeError( - "cuDF does not support conversion to host memory " - "via the `tolist()` method. Consider using " - "`.to_arrow().to_pylist()` to construct a Python list." 
- ) - - to_list = tolist - - @property - def name(self): - """Returns the name of the Index.""" - raise NotImplementedError - - @property # type: ignore - def ndim(self) -> int: # noqa: D401 - """Number of dimensions of the underlying data, by definition 1.""" - return 1 - - def equals(self, other) -> bool: - """ - Determine if two Index objects contain the same elements. - - Returns - ------- - out: bool - True if "other" is an Index and it has the same elements - as calling index; False otherwise. - """ - raise NotImplementedError - - def shift(self, periods=1, freq=None): - """Not yet implemented""" - raise NotImplementedError - - @property - def shape(self): - """Get a tuple representing the dimensionality of the data.""" - return (len(self),) - - @property - def str(self): - """Not yet implemented.""" - raise NotImplementedError - - @property - def values(self): - raise NotImplementedError - - def get_indexer(self, target, method=None, limit=None, tolerance=None): - """ - Compute indexer and mask for new index given the current index. - - The indexer should be then used as an input to ndarray.take to align - the current data to the new index. - - Parameters - ---------- - target : Index - method : {None, 'pad'/'fill', 'backfill'/'bfill', 'nearest'}, optional - - default: exact matches only. - - pad / ffill: find the PREVIOUS index value if no exact match. - - backfill / bfill: use NEXT index value if no exact match. - - nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index - value. - tolerance : int or float, optional - Maximum distance from index value for inexact matches. The value - of the index at the matching location must satisfy the equation - ``abs(index[loc] - target) <= tolerance``. - - Returns - ------- - cupy.ndarray - Integers from 0 to n - 1 indicating that the index at these - positions matches the corresponding target values. - Missing values in the target are marked by -1. - - Examples - -------- - >>> import cudf - >>> index = cudf.Index(['c', 'a', 'b']) - >>> index - Index(['c', 'a', 'b'], dtype='object') - >>> index.get_indexer(['a', 'b', 'x']) - array([ 1, 2, -1], dtype=int32) - """ - raise NotImplementedError - - def get_loc(self, key): - """ - Get integer location, slice or boolean mask for requested label. 
- - Parameters - ---------- - key : label - - Returns - ------- - int or slice or boolean mask - - If result is unique, return integer index - - If index is monotonic, loc is returned as a slice object - - Otherwise, a boolean mask is returned - - Examples - -------- - >>> import cudf - >>> unique_index = cudf.Index(list('abc')) - >>> unique_index.get_loc('b') - 1 - >>> monotonic_index = cudf.Index(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) - >>> non_monotonic_index = cudf.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True]) - >>> numeric_unique_index = cudf.Index([1, 2, 3]) - >>> numeric_unique_index.get_loc(3) - 2 - - **MultiIndex** - - >>> multi_index = cudf.MultiIndex.from_tuples([('a', 'd'), ('b', 'e'), ('b', 'f')]) - >>> multi_index - MultiIndex([('a', 'd'), - ('b', 'e'), - ('b', 'f')], - ) - >>> multi_index.get_loc('b') - slice(1, 3, None) - >>> multi_index.get_loc(('b', 'e')) - 1 - """ # noqa: E501 - - def max(self): - """The maximum value of the index.""" - raise NotImplementedError - - def min(self): - """The minimum value of the index.""" - raise NotImplementedError - - def __getitem__(self, key): - raise NotImplementedError() - - def __contains__(self, item): - hash(item) - return item in self._values - - def _copy_type_metadata(self: Self, other: Self) -> Self: - raise NotImplementedError - - def get_level_values(self, level): - """ - Return an Index of values for requested level. - - This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatibility. - - Parameters - ---------- - level : int or str - It is either the integer position or the name of the level. - - Returns - ------- - Index - Calling object, as there is only one level in the Index. - - See Also - -------- - cudf.MultiIndex.get_level_values : Get values for - a level of a MultiIndex. - - Notes - ----- - For Index, level should be 0, since there are no multiple levels. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(["a", "b", "c"]) - >>> idx.get_level_values(0) - Index(['a', 'b', 'c'], dtype='object') - """ - - if level == self.name: - return self - elif is_integer(level): - if level != 0: - raise IndexError( - f"Cannot get level: {level} " f"for index with 1 level" - ) - return self - else: - raise KeyError(f"Requested level with name {level} " "not found") - - @classmethod - def deserialize(cls, header, frames): - # Dispatch deserialization to the appropriate index type in case - # deserialization is ever attempted with the base class directly. - idx_type = pickle.loads(header["type-serialized"]) - return idx_type.deserialize(header, frames) - - @property - def names(self): - """ - Returns a FrozenList containing the name of the Index. - """ - return pd.core.indexes.frozen.FrozenList([self.name]) - - @names.setter - def names(self, values): - if not is_list_like(values): - raise ValueError("Names must be a list-like") - - num_values = len(values) - if num_values > 1: - raise ValueError( - "Length of new names must be 1, got %d" % num_values - ) - - self.name = values[0] - - def _clean_nulls_from_index(self): - """ - Convert all na values(if any) in Index object - to `` as a preprocessing step to `__repr__` methods. - - This will involve changing type of Index object - to string dtype but it is the responsibility of the `__repr__` - methods using this method to replace or handle representation - of the actual types correctly. 
- """ - raise NotImplementedError - - @property - def is_monotonic_increasing(self): - """Return boolean if values in the object are monotonically increasing. - - Returns - ------- - bool - """ - raise NotImplementedError - - @property - def is_monotonic_decreasing(self): - """Return boolean if values in the object are monotonically decreasing. - - Returns - ------- - bool - """ - raise NotImplementedError - - @property - def hasnans(self): - """ - Return True if there are any NaNs or nulls. - - Returns - ------- - out : bool - If Series has at least one NaN or null value, return True, - if not return False. - - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> index = cudf.Index([1, 2, np.nan, 3, 4], nan_as_null=False) - >>> index - Index([1.0, 2.0, nan, 3.0, 4.0], dtype='float64') - >>> index.hasnans - True - - `hasnans` returns `True` for the presence of any `NA` values: - - >>> index = cudf.Index([1, 2, None, 3, 4]) - >>> index - Index([1, 2, , 3, 4], dtype='int64') - >>> index.hasnans - True - """ - raise NotImplementedError - - @property - def nlevels(self): - """ - Number of levels. - """ - return 1 - - def _set_names(self, names, inplace=False): - if inplace: - idx = self - else: - idx = self.copy(deep=False) - - idx.names = names - if not inplace: - return idx - - def set_names(self, names, level=None, inplace=False): - """ - Set Index or MultiIndex name. - Able to set new names partially and by level. - - Parameters - ---------- - names : label or list of label - Name(s) to set. - level : int, label or list of int or label, optional - If the index is a MultiIndex, level(s) to set (None for all - levels). Otherwise level must be None. - inplace : bool, default False - Modifies the object directly, instead of creating a new Index or - MultiIndex. - - Returns - ------- - Index - The same type as the caller or None if inplace is True. - - See Also - -------- - cudf.Index.rename : Able to set new names without level. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([1, 2, 3, 4]) - >>> idx - Index([1, 2, 3, 4], dtype='int64') - >>> idx.set_names('quarter') - Index([1, 2, 3, 4], dtype='int64', name='quarter') - >>> idx = cudf.MultiIndex.from_product([['python', 'cobra'], - ... [2018, 2019]]) - >>> idx - MultiIndex([('python', 2018), - ('python', 2019), - ( 'cobra', 2018), - ( 'cobra', 2019)], - ) - >>> idx.names - FrozenList([None, None]) - >>> idx.set_names(['kind', 'year'], inplace=True) - >>> idx.names - FrozenList(['kind', 'year']) - >>> idx.set_names('species', level=0, inplace=True) - >>> idx.names - FrozenList(['species', 'year']) - """ - if level is not None: - raise ValueError("Level must be None for non-MultiIndex") - - if not is_list_like(names): - names = [names] - - return self._set_names(names=names, inplace=inplace) - - @property - def has_duplicates(self): - return not self.is_unique - - def where(self, cond, other=None, inplace=False): - """ - Replace values where the condition is False. - - The replacement is taken from other. - - Parameters - ---------- - cond : bool array-like with the same length as self - Condition to select the values on. - other : scalar, or array-like, default None - Replacement if the condition is False. - - Returns - ------- - cudf.Index - A copy of self with values replaced from other - where the condition is False. 
- """ - raise NotImplementedError - - def factorize(self, sort: bool = False, use_na_sentinel: bool = True): - raise NotImplementedError - - def union(self, other, sort=None): - """ - Form the union of two Index objects. - - Parameters - ---------- - other : Index or array-like - sort : bool or None, default None - Whether to sort the resulting Index. - - * None : Sort the result, except when - - 1. `self` and `other` are equal. - 2. `self` or `other` has length 0. - - * False : do not sort the result. - * True : Sort the result (which may raise TypeError). - - Returns - ------- - union : Index - - Examples - -------- - Union of an Index - >>> import cudf - >>> import pandas as pd - >>> idx1 = cudf.Index([1, 2, 3, 4]) - >>> idx2 = cudf.Index([3, 4, 5, 6]) - >>> idx1.union(idx2) - Index([1, 2, 3, 4, 5, 6], dtype='int64') - - MultiIndex case - - >>> idx1 = cudf.MultiIndex.from_pandas( - ... pd.MultiIndex.from_arrays( - ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ... ) - ... ) - >>> idx1 - MultiIndex([(1, 'Red'), - (1, 'Blue'), - (2, 'Red'), - (2, 'Blue')], - ) - >>> idx2 = cudf.MultiIndex.from_pandas( - ... pd.MultiIndex.from_arrays( - ... [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] - ... ) - ... ) - >>> idx2 - MultiIndex([(3, 'Red'), - (3, 'Green'), - (2, 'Red'), - (2, 'Green')], - ) - >>> idx1.union(idx2) - MultiIndex([(1, 'Blue'), - (1, 'Red'), - (2, 'Blue'), - (2, 'Green'), - (2, 'Red'), - (3, 'Green'), - (3, 'Red')], - ) - >>> idx1.union(idx2, sort=False) - MultiIndex([(1, 'Red'), - (1, 'Blue'), - (2, 'Red'), - (2, 'Blue'), - (3, 'Red'), - (3, 'Green'), - (2, 'Green')], - ) - """ - if not isinstance(other, BaseIndex): - other = cudf.Index(other, name=self.name) - - if sort not in {None, False, True}: - raise ValueError( - f"The 'sort' keyword only takes the values of " - f"[None, False, True]; {sort} was passed." - ) - - if cudf.get_option("mode.pandas_compatible"): - if (self.dtype.kind == "b" and other.dtype.kind != "b") or ( - self.dtype.kind != "b" and other.dtype.kind == "b" - ): - # Bools + other types will result in mixed type. - # This is not yet consistent in pandas and specific to APIs. - raise MixedTypeError("Cannot perform union with mixed types") - if (self.dtype.kind == "i" and other.dtype.kind == "u") or ( - self.dtype.kind == "u" and other.dtype.kind == "i" - ): - # signed + unsigned types will result in - # mixed type for union in pandas. - raise MixedTypeError("Cannot perform union with mixed types") - - if not len(other) or self.equals(other): - common_dtype = cudf.utils.dtypes.find_common_type( - [self.dtype, other.dtype] - ) - res = self._get_reconciled_name_object(other).astype(common_dtype) - if sort: - return res.sort_values() - return res - elif not len(self): - common_dtype = cudf.utils.dtypes.find_common_type( - [self.dtype, other.dtype] - ) - res = other._get_reconciled_name_object(self).astype(common_dtype) - if sort: - return res.sort_values() - return res - - result = self._union(other, sort=sort) - result.name = _get_result_name(self.name, other.name) - return result - - def intersection(self, other, sort=False): - """ - Form the intersection of two Index objects. - - This returns a new Index with elements common to the index and `other`. - - Parameters - ---------- - other : Index or array-like - sort : False or None, default False - Whether to sort the resulting index. - - * False : do not sort the result. - * None : sort the result, except when `self` and `other` are equal - or when the values cannot be compared. 
- * True : Sort the result (which may raise TypeError). - - Returns - ------- - intersection : Index - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> idx1 = cudf.Index([1, 2, 3, 4]) - >>> idx2 = cudf.Index([3, 4, 5, 6]) - >>> idx1.intersection(idx2) - Index([3, 4], dtype='int64') - - MultiIndex case - - >>> idx1 = cudf.MultiIndex.from_pandas( - ... pd.MultiIndex.from_arrays( - ... [[1, 1, 3, 4], ["Red", "Blue", "Red", "Blue"]] - ... ) - ... ) - >>> idx2 = cudf.MultiIndex.from_pandas( - ... pd.MultiIndex.from_arrays( - ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ... ) - ... ) - >>> idx1 - MultiIndex([(1, 'Red'), - (1, 'Blue'), - (3, 'Red'), - (4, 'Blue')], - ) - >>> idx2 - MultiIndex([(1, 'Red'), - (1, 'Blue'), - (2, 'Red'), - (2, 'Blue')], - ) - >>> idx1.intersection(idx2) - MultiIndex([(1, 'Red'), - (1, 'Blue')], - ) - >>> idx1.intersection(idx2, sort=False) - MultiIndex([(1, 'Red'), - (1, 'Blue')], - ) - """ - if not can_convert_to_column(other): - raise TypeError("Input must be Index or array-like") - - if not isinstance(other, BaseIndex): - other = cudf.Index( - other, - name=getattr(other, "name", self.name), - ) - - if sort not in {None, False, True}: - raise ValueError( - f"The 'sort' keyword only takes the values of " - f"[None, False, True]; {sort} was passed." - ) - - if not len(self) or not len(other) or self.equals(other): - common_dtype = cudf.utils.dtypes._dtype_pandas_compatible( - cudf.utils.dtypes.find_common_type([self.dtype, other.dtype]) - ) - - lhs = self.unique() if self.has_duplicates else self - rhs = other - if not len(other): - lhs, rhs = rhs, lhs - - return lhs._get_reconciled_name_object(rhs).astype(common_dtype) - - res_name = _get_result_name(self.name, other.name) - - if (self._is_boolean() and other._is_numeric()) or ( - self._is_numeric() and other._is_boolean() - ): - if isinstance(self, cudf.MultiIndex): - return self[:0].rename(res_name) - else: - return cudf.Index([], name=res_name) - - if self.has_duplicates: - lhs = self.unique() - else: - lhs = self - if other.has_duplicates: - rhs = other.unique() - else: - rhs = other - result = lhs._intersection(rhs, sort=sort) - result.name = res_name - return result - - def _get_reconciled_name_object(self, other): - """ - If the result of a set operation will be self, - return self, unless the name changes, in which - case make a shallow copy of self. - """ - name = _get_result_name(self.name, other.name) - if not _is_same_name(self.name, name): - return self.rename(name) - return self - - def fillna(self, value, downcast=None): - """ - Fill null values with the specified value. - - Parameters - ---------- - value : scalar - Scalar value to use to fill nulls. This value cannot be a - list-likes. - - downcast : dict, default is None - This Parameter is currently NON-FUNCTIONAL. - - Returns - ------- - filled : Index - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, None, 4]) - >>> index - Index([1, 2, , 4], dtype='int64') - >>> index.fillna(3) - Index([1, 2, 3, 4], dtype='int64') - """ - if downcast is not None: - raise NotImplementedError( - "`downcast` parameter is not yet supported" - ) - - return super().fillna(value=value) - - def to_arrow(self): - """Convert to a suitable Arrow object.""" - raise NotImplementedError - - def to_cupy(self): - """Convert to a cupy array.""" - raise NotImplementedError - - def to_numpy(self): - """Convert to a numpy array.""" - raise NotImplementedError - - def to_flat_index(self) -> Self: - """ - Identity method. 
- - This is implemented for compatibility with subclass implementations - when chaining. - - Returns - ------- - pd.Index - Caller. - - See Also - -------- - MultiIndex.to_flat_index : Subclass implementation. - """ - return self - - def any(self): - """ - Return whether any elements is True in Index. - """ - raise NotImplementedError - - def isna(self): - """ - Detect missing values. - - Return a boolean same-sized object indicating if the values are NA. - NA values, such as ``None``, `numpy.NAN` or `cudf.NA`, get - mapped to ``True`` values. - Everything else get mapped to ``False`` values. - - Returns - ------- - numpy.ndarray[bool] - A boolean array to indicate which entries are NA. - - """ - raise NotImplementedError - - def notna(self): - """ - Detect existing (non-missing) values. - - Return a boolean same-sized object indicating if the values are not NA. - Non-missing values get mapped to ``True``. - NA values, such as None or `numpy.NAN`, get mapped to ``False`` - values. - - Returns - ------- - numpy.ndarray[bool] - A boolean array to indicate which entries are not NA. - """ - raise NotImplementedError - - def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False): - """ - Convert to a Pandas Index. - - Parameters - ---------- - nullable : bool, Default False - If ``nullable`` is ``True``, the resulting index will have - a corresponding nullable Pandas dtype. - If there is no corresponding nullable Pandas dtype present, - the resulting dtype will be a regular pandas dtype. - If ``nullable`` is ``False``, the resulting index will - either convert null values to ``np.nan`` or ``None`` - depending on the dtype. - arrow_type : bool, Default False - Return the Index with a ``pandas.ArrowDtype`` - - Notes - ----- - nullable and arrow_type cannot both be set to ``True`` - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([-3, 10, 15, 20]) - >>> idx - Index([-3, 10, 15, 20], dtype='int64') - >>> idx.to_pandas() - Index([-3, 10, 15, 20], dtype='int64') - >>> type(idx.to_pandas()) - - >>> type(idx) - - >>> idx.to_pandas(arrow_type=True) - Index([-3, 10, 15, 20], dtype='int64[pyarrow]') - """ - raise NotImplementedError - - def isin(self, values, level=None): - """Return a boolean array where the index values are in values. - - Compute boolean array of whether each index value is found in - the passed set of values. The length of the returned boolean - array matches the length of the index. - - Parameters - ---------- - values : set, list-like, Index - Sought values. - level : str or int, optional - Name or position of the index level to use (if the index is a - `MultiIndex`). - - Returns - ------- - is_contained : cupy array - CuPy array of boolean values. - - Examples - -------- - >>> idx = cudf.Index([1,2,3]) - >>> idx - Index([1, 2, 3], dtype='int64') - - Check whether each index value in a list of values. - - >>> idx.isin([1, 4]) - array([ True, False, False]) - """ - # To match pandas behavior, even though only list-like objects are - # supposed to be passed, only scalars throw errors. Other types (like - # dicts) just transparently return False (see the implementation of - # ColumnBase.isin). - raise NotImplementedError - - def unique(self, level: int | None = None): - """ - Return unique values in the index. - - Returns - ------- - Index without duplicates - """ - raise NotImplementedError - - def to_series(self, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys. 
- Useful with map for returning an indexer based on an index. - - Parameters - ---------- - index : Index, optional - Index of resulting Series. If None, defaults to original index. - name : str, optional - Name of resulting Series. If None, defaults to name of original - index. - - Returns - ------- - Series - The dtype will be based on the type of the Index values. - """ - return cudf.Series._from_data( - self._data, - index=self.copy(deep=False) if index is None else index, - name=self.name if name is None else name, - ) - - @ioutils.doc_to_dlpack() - def to_dlpack(self): - """{docstring}""" - - return cudf.io.dlpack.to_dlpack(self) - - def append(self, other): - """ - Append a collection of Index objects together. - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([1, 2, 10, 100]) - >>> idx - Index([1, 2, 10, 100], dtype='int64') - >>> other = cudf.Index([200, 400, 50]) - >>> other - Index([200, 400, 50], dtype='int64') - >>> idx.append(other) - Index([1, 2, 10, 100, 200, 400, 50], dtype='int64') - - append accepts list of Index objects - - >>> idx.append([other, other]) - Index([1, 2, 10, 100, 200, 400, 50, 200, 400, 50], dtype='int64') - """ - raise NotImplementedError - - def difference(self, other, sort=None): - """ - Return a new Index with elements from the index that are not in - `other`. - - This is the set difference of two Index objects. - - Parameters - ---------- - other : Index or array-like - sort : False or None, default None - Whether to sort the resulting index. By default, the - values are attempted to be sorted, but any TypeError from - incomparable elements is caught by cudf. - - * None : Attempt to sort the result, but catch any TypeErrors - from comparing incomparable elements. - * False : Do not sort the result. - * True : Sort the result (which may raise TypeError). - - Returns - ------- - difference : Index - - Examples - -------- - >>> import cudf - >>> idx1 = cudf.Index([2, 1, 3, 4]) - >>> idx1 - Index([2, 1, 3, 4], dtype='int64') - >>> idx2 = cudf.Index([3, 4, 5, 6]) - >>> idx2 - Index([3, 4, 5, 6], dtype='int64') - >>> idx1.difference(idx2) - Index([1, 2], dtype='int64') - >>> idx1.difference(idx2, sort=False) - Index([2, 1], dtype='int64') - """ - - if not can_convert_to_column(other): - raise TypeError("Input must be Index or array-like") - - if sort not in {None, False, True}: - raise ValueError( - f"The 'sort' keyword only takes the values " - f"of [None, False, True]; {sort} was passed." 
- ) - - if not isinstance(other, BaseIndex): - other = cudf.Index( - other, - name=getattr(other, "name", self.name), - ) - - if not len(other): - res = self._get_reconciled_name_object(other).unique() - if sort: - return res.sort_values() - return res - elif self.equals(other): - res = self[:0]._get_reconciled_name_object(other).unique() - if sort: - return res.sort_values() - return res - - res_name = _get_result_name(self.name, other.name) - - if is_mixed_with_object_dtype(self, other) or len(other) == 0: - difference = self.unique() - difference.name = res_name - if sort is True: - return difference.sort_values() - else: - other = other.copy(deep=False) - difference = cudf.core.index._index_from_data( - cudf.DataFrame._from_data({"None": self._column.unique()}) - .merge( - cudf.DataFrame._from_data({"None": other._column}), - how="leftanti", - on="None", - ) - ._data - ) - difference.name = res_name - - if self.dtype != other.dtype: - difference = difference.astype(self.dtype) - - if sort in {None, True} and len(other): - return difference.sort_values() - - return difference - - def is_numeric(self): - """ - Check if the Index only consists of numeric data. - - .. deprecated:: 23.04 - Use `cudf.api.types.is_any_real_numeric_dtype` instead. - - Returns - ------- - bool - Whether or not the Index only consists of numeric data. - - See Also - -------- - is_boolean : Check if the Index only consists of booleans. - is_integer : Check if the Index only consists of integers. - is_floating : Check if the Index is a floating type. - is_object : Check if the Index is of the object dtype. - is_categorical : Check if the Index holds categorical data. - is_interval : Check if the Index holds Interval objects. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) - >>> idx.is_numeric() - True - >>> idx = cudf.Index([1, 2, 3, 4.0]) - >>> idx.is_numeric() - True - >>> idx = cudf.Index([1, 2, 3, 4]) - >>> idx.is_numeric() - True - >>> idx = cudf.Index([1, 2, 3, 4.0, np.nan]) - >>> idx.is_numeric() - True - >>> idx = cudf.Index(["Apple", "cold"]) - >>> idx.is_numeric() - False - """ - # Do not remove until pandas removes this. - warnings.warn( - f"{type(self).__name__}.is_numeric is deprecated. " - "Use cudf.api.types.is_any_real_numeric_dtype instead", - FutureWarning, - ) - return self._is_numeric() - - def _is_numeric(self): - raise NotImplementedError - - def is_boolean(self): - """ - Check if the Index only consists of booleans. - - .. deprecated:: 23.04 - Use `cudf.api.types.is_bool_dtype` instead. - - Returns - ------- - bool - Whether or not the Index only consists of booleans. - - See Also - -------- - is_integer : Check if the Index only consists of integers. - is_floating : Check if the Index is a floating type. - is_numeric : Check if the Index only consists of numeric data. - is_object : Check if the Index is of the object dtype. - is_categorical : Check if the Index holds categorical data. - is_interval : Check if the Index holds Interval objects. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([True, False, True]) - >>> idx.is_boolean() - True - >>> idx = cudf.Index(["True", "False", "True"]) - >>> idx.is_boolean() - False - >>> idx = cudf.Index([1, 2, 3]) - >>> idx.is_boolean() - False - """ - # Do not remove until pandas removes this. - warnings.warn( - f"{type(self).__name__}.is_boolean is deprecated. 
" - "Use cudf.api.types.is_bool_dtype instead", - FutureWarning, - ) - return self._is_boolean() - - def _is_boolean(self): - raise NotImplementedError - - def is_integer(self): - """ - Check if the Index only consists of integers. - - .. deprecated:: 23.04 - Use `cudf.api.types.is_integer_dtype` instead. - - Returns - ------- - bool - Whether or not the Index only consists of integers. - - See Also - -------- - is_boolean : Check if the Index only consists of booleans. - is_floating : Check if the Index is a floating type. - is_numeric : Check if the Index only consists of numeric data. - is_object : Check if the Index is of the object dtype. - is_categorical : Check if the Index holds categorical data. - is_interval : Check if the Index holds Interval objects. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([1, 2, 3, 4]) - >>> idx.is_integer() - True - >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) - >>> idx.is_integer() - False - >>> idx = cudf.Index(["Apple", "Mango", "Watermelon"]) - >>> idx.is_integer() - False - """ - # Do not remove until pandas removes this. - warnings.warn( - f"{type(self).__name__}.is_integer is deprecated. " - "Use cudf.api.types.is_integer_dtype instead", - FutureWarning, - ) - return self._is_integer() - - def _is_integer(self): - raise NotImplementedError - - def is_floating(self): - """ - Check if the Index is a floating type. - - The Index may consist of only floats, NaNs, or a mix of floats, - integers, or NaNs. - - .. deprecated:: 23.04 - Use `cudf.api.types.is_float_dtype` instead. - - Returns - ------- - bool - Whether or not the Index only consists of only consists - of floats, NaNs, or a mix of floats, integers, or NaNs. - - See Also - -------- - is_boolean : Check if the Index only consists of booleans. - is_integer : Check if the Index only consists of integers. - is_numeric : Check if the Index only consists of numeric data. - is_object : Check if the Index is of the object dtype. - is_categorical : Check if the Index holds categorical data. - is_interval : Check if the Index holds Interval objects. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) - >>> idx.is_floating() - True - >>> idx = cudf.Index([1.0, 2.0, np.nan, 4.0]) - >>> idx.is_floating() - True - >>> idx = cudf.Index([1, 2, 3, 4, np.nan], nan_as_null=False) - >>> idx.is_floating() - True - >>> idx = cudf.Index([1, 2, 3, 4]) - >>> idx.is_floating() - False - """ - # Do not remove until pandas removes this. - warnings.warn( - f"{type(self).__name__}.is_floating is deprecated. " - "Use cudf.api.types.is_float_dtype instead", - FutureWarning, - ) - return self._is_floating() - - def _is_floating(self): - raise NotImplementedError - - def is_object(self): - """ - Check if the Index is of the object dtype. - - .. deprecated:: 23.04 - Use `cudf.api.types.is_object_dtype` instead. - - Returns - ------- - bool - Whether or not the Index is of the object dtype. - - See Also - -------- - is_boolean : Check if the Index only consists of booleans. - is_integer : Check if the Index only consists of integers. - is_floating : Check if the Index is a floating type. - is_numeric : Check if the Index only consists of numeric data. - is_categorical : Check if the Index holds categorical data. - is_interval : Check if the Index holds Interval objects. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(["Apple", "Mango", "Watermelon"]) - >>> idx.is_object() - True - >>> idx = cudf.Index(["Watermelon", "Orange", "Apple", - ... 
"Watermelon"]).astype("category") - >>> idx.is_object() - False - >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) - >>> idx.is_object() - False - """ - # Do not remove until pandas removes this. - warnings.warn( - f"{type(self).__name__}.is_object is deprecated. " - "Use cudf.api.types.is_object_dtype instead", - FutureWarning, - ) - return self._is_object() - - def _is_object(self): - raise NotImplementedError - - def is_categorical(self): - """ - Check if the Index holds categorical data. - - .. deprecated:: 23.04 - Use `cudf.api.types.is_categorical_dtype` instead. - - Returns - ------- - bool - True if the Index is categorical. - - See Also - -------- - CategoricalIndex : Index for categorical data. - is_boolean : Check if the Index only consists of booleans. - is_integer : Check if the Index only consists of integers. - is_floating : Check if the Index is a floating type. - is_numeric : Check if the Index only consists of numeric data. - is_object : Check if the Index is of the object dtype. - is_interval : Check if the Index holds Interval objects. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(["Watermelon", "Orange", "Apple", - ... "Watermelon"]).astype("category") - >>> idx.is_categorical() - True - >>> idx = cudf.Index([1, 3, 5, 7]) - >>> idx.is_categorical() - False - >>> s = cudf.Series(["Peter", "Victor", "Elisabeth", "Mar"]) - >>> s - 0 Peter - 1 Victor - 2 Elisabeth - 3 Mar - dtype: object - >>> s.index.is_categorical() - False - """ - # Do not remove until pandas removes this. - warnings.warn( - f"{type(self).__name__}.is_categorical is deprecated. " - "Use cudf.api.types.is_categorical_dtype instead", - FutureWarning, - ) - return self._is_categorical() - - def _is_categorical(self): - raise NotImplementedError - - def is_interval(self): - """ - Check if the Index holds Interval objects. - - .. deprecated:: 23.04 - Use `cudf.api.types.is_interval_dtype` instead. - - Returns - ------- - bool - Whether or not the Index holds Interval objects. - - See Also - -------- - IntervalIndex : Index for Interval objects. - is_boolean : Check if the Index only consists of booleans. - is_integer : Check if the Index only consists of integers. - is_floating : Check if the Index is a floating type. - is_numeric : Check if the Index only consists of numeric data. - is_object : Check if the Index is of the object dtype. - is_categorical : Check if the Index holds categorical data. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> idx = cudf.from_pandas( - ... pd.Index([pd.Interval(left=0, right=5), - ... pd.Interval(left=5, right=10)]) - ... ) - >>> idx.is_interval() - True - >>> idx = cudf.Index([1, 3, 5, 7]) - >>> idx.is_interval() - False - """ - # Do not remove until pandas removes this. - warnings.warn( - f"{type(self).__name__}.is_interval is deprecated. 
" - "Use cudf.api.types.is_interval_dtype instead", - FutureWarning, - ) - return self._is_interval() - - def _is_interval(self): - raise NotImplementedError - - def _union(self, other, sort=None): - # TODO: As a future optimization we should explore - # not doing `to_frame` - self_df = self.to_frame(index=False, name=0) - other_df = other.to_frame(index=False, name=0) - self_df["order"] = self_df.index - other_df["order"] = other_df.index - res = self_df.merge(other_df, on=[0], how="outer") - res = res.sort_values( - by=res._data.to_pandas_index()[1:], ignore_index=True - ) - union_result = cudf.core.index._index_from_data({0: res._data[0]}) - - if sort in {None, True} and len(other): - return union_result.sort_values() - return union_result - - def _intersection(self, other, sort=None): - intersection_result = cudf.core.index._index_from_data( - cudf.DataFrame._from_data({"None": self.unique()._column}) - .merge( - cudf.DataFrame._from_data({"None": other.unique()._column}), - how="inner", - on="None", - ) - ._data - ) - - if sort is {None, True} and len(other): - return intersection_result.sort_values() - return intersection_result - - def sort_values( - self, - return_indexer=False, - ascending=True, - na_position="last", - key=None, - ) -> Self | tuple[Self, cupy.ndarray]: - """ - Return a sorted copy of the index, and optionally return the indices - that sorted the index itself. - - Parameters - ---------- - return_indexer : bool, default False - Should the indices that would sort the index be returned. - ascending : bool, default True - Should the index values be sorted in an ascending order. - na_position : {'first' or 'last'}, default 'last' - Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at - the end. - key : None, optional - This parameter is NON-FUNCTIONAL. - - Returns - ------- - sorted_index : Index - Sorted copy of the index. - indexer : cupy.ndarray, optional - The indices that the index itself was sorted by. - - See Also - -------- - cudf.Series.min : Sort values of a Series. - cudf.DataFrame.sort_values : Sort values in a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([10, 100, 1, 1000]) - >>> idx - Index([10, 100, 1, 1000], dtype='int64') - - Sort values in ascending order (default behavior). - - >>> idx.sort_values() - Index([1, 10, 100, 1000], dtype='int64') - - Sort values in descending order, and also get the indices `idx` was - sorted by. - - >>> idx.sort_values(ascending=False, return_indexer=True) - (Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2], - dtype=int32)) - - Sorting values in a MultiIndex: - - >>> midx = cudf.MultiIndex( - ... levels=[[1, 3, 4, -10], [1, 11, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... 
) - >>> midx - MultiIndex([( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11), - (-10, 1)], - names=['x', 'y']) - >>> midx.sort_values() - MultiIndex([(-10, 1), - ( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11)], - names=['x', 'y']) - >>> midx.sort_values(ascending=False) - MultiIndex([( 4, 11), - ( 3, 11), - ( 1, 5), - ( 1, 1), - (-10, 1)], - names=['x', 'y']) - """ - if key is not None: - raise NotImplementedError("key parameter is not yet implemented.") - if na_position not in {"first", "last"}: - raise ValueError(f"invalid na_position: {na_position}") - - indices = self.argsort(ascending=ascending, na_position=na_position) - index_sorted = self.take(indices) - - if return_indexer: - return index_sorted, indices - else: - return index_sorted - - def join( - self, other, how="left", level=None, return_indexers=False, sort=False - ): - """ - Compute join_index and indexers to conform data structures - to the new index. - - Parameters - ---------- - other : Index. - how : {'left', 'right', 'inner', 'outer'} - return_indexers : bool, default False - sort : bool, default False - Sort the join keys lexicographically in the result Index. If False, - the order of the join keys depends on the join type (how keyword). - - Returns: index - - Examples - -------- - >>> import cudf - >>> lhs = cudf.DataFrame({ - ... "a": [2, 3, 1], - ... "b": [3, 4, 2], - ... }).set_index(['a', 'b']).index - >>> lhs - MultiIndex([(2, 3), - (3, 4), - (1, 2)], - names=['a', 'b']) - >>> rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index('a').index - >>> rhs - Index([1, 4, 3], dtype='int64', name='a') - >>> lhs.join(rhs, how='inner') - MultiIndex([(3, 4), - (1, 2)], - names=['a', 'b']) - """ - if return_indexers is not False: - raise NotImplementedError("return_indexers is not implemented") - self_is_multi = isinstance(self, cudf.MultiIndex) - other_is_multi = isinstance(other, cudf.MultiIndex) - if level is not None: - if self_is_multi and other_is_multi: - raise TypeError( - "Join on level between two MultiIndex objects is ambiguous" - ) - - if not is_scalar(level): - raise ValueError("level should be an int or a label only") - - if other_is_multi: - if how == "left": - how = "right" - elif how == "right": - how = "left" - rhs = self.copy(deep=False) - lhs = other.copy(deep=False) - else: - lhs = self.copy(deep=False) - rhs = other.copy(deep=False) - same_names = lhs.names == rhs.names - # There should be no `None` values in Joined indices, - # so essentially it would be `left/right` or 'inner' - # in case of MultiIndex - if isinstance(lhs, cudf.MultiIndex): - on = ( - lhs._data.get_labels_by_index(level)[0] - if isinstance(level, int) - else level - ) - - if on is not None: - rhs.names = (on,) - on = rhs.names[0] - if how == "outer": - how = "left" - elif how == "right": - how = "inner" - else: - # Both are normal indices - on = lhs.names[0] - rhs.names = lhs.names - - lhs = lhs.to_frame() - rhs = rhs.to_frame() - - output = lhs.merge(rhs, how=how, on=on, sort=sort) - - # If both inputs were MultiIndexes, the output is a MultiIndex. - # Otherwise, the output is only a MultiIndex if there are multiple - # columns - if self_is_multi and other_is_multi: - return cudf.MultiIndex._from_data(output._data) - else: - idx = cudf.core.index._index_from_data(output._data) - idx.name = self.name if same_names else None - return idx - - def rename(self, name, inplace=False): - """ - Alter Index name. - - Defaults to returning new index. - - Parameters - ---------- - name : label - Name(s) to set. 
- - Returns - ------- - Index - - Examples - -------- - >>> import cudf - >>> index = cudf.Index([1, 2, 3], name='one') - >>> index - Index([1, 2, 3], dtype='int64', name='one') - >>> index.name - 'one' - >>> renamed_index = index.rename('two') - >>> renamed_index - Index([1, 2, 3], dtype='int64', name='two') - >>> renamed_index.name - 'two' - """ - if inplace is True: - self.name = name - return None - else: - out = self.copy(deep=False) - out.name = name - return out - - def _indices_of(self, value) -> cudf.core.column.NumericalColumn: - """ - Return indices corresponding to value - - Parameters - ---------- - value - Value to look for in index - - Returns - ------- - Column of indices - """ - raise NotImplementedError - - def find_label_range(self, loc: slice) -> slice: - """ - Translate a label-based slice to an index-based slice - - Parameters - ---------- - loc - slice to search for. - - Notes - ----- - As with all label-based searches, the slice is right-closed. - - Returns - ------- - New slice translated into integer indices of the index (right-open). - """ - start = loc.start - stop = loc.stop - step = 1 if loc.step is None else loc.step - start_side: Literal["left", "right"] - stop_side: Literal["left", "right"] - if step < 0: - start_side, stop_side = "right", "left" - else: - start_side, stop_side = "left", "right" - istart = ( - None - if start is None - else self.get_slice_bound(start, side=start_side) - ) - istop = ( - None - if stop is None - else self.get_slice_bound(stop, side=stop_side) - ) - if step < 0: - # Fencepost - istart = None if istart is None else max(istart - 1, 0) - istop = None if (istop is None or istop == 0) else istop - 1 - return slice(istart, istop, step) - - def searchsorted( - self, - value, - side: Literal["left", "right"] = "left", - ascending: bool = True, - na_position: Literal["first", "last"] = "last", - ): - """Find index where elements should be inserted to maintain order - - Parameters - ---------- - value : - Value to be hypothetically inserted into Self - side : str {'left', 'right'} optional, default 'left' - If 'left', the index of the first suitable location found is given - If 'right', return the last such index - ascending : bool optional, default True - Index is in ascending order (otherwise descending) - na_position : str {'last', 'first'} optional, default 'last' - Position of null values in sorted order - - Returns - ------- - Insertion point. - - Notes - ----- - As a precondition the index must be sorted in the same order - as requested by the `ascending` flag. - """ - raise NotImplementedError - - def get_slice_bound( - self, - label, - side: Literal["left", "right"], - ) -> int: - """ - Calculate slice bound that corresponds to given label. - Returns leftmost (one-past-the-rightmost if ``side=='right'``) position - of given label. - - Parameters - ---------- - label : object - side : {'left', 'right'} - - Returns - ------- - int - Index of label. 
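# find_label_range above turns a right-closed label slice into a right-open
# positional slice by calling get_slice_bound on each end; for a monotonically
# increasing index that bound is just a binary search (sketch with bisect):
import bisect

labels = [10, 20, 30, 40, 50]                     # sorted "index" values
loc = slice(20, 40)                               # label-based, right-closed
istart = bisect.bisect_left(labels, loc.start)    # side="left"  -> 1
istop = bisect.bisect_right(labels, loc.stop)     # side="right" -> 4
print(labels[istart:istop])                       # [20, 30, 40]: stop label included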
- """ - if side not in {"left", "right"}: - raise ValueError(f"Invalid side argument {side}") - if self.is_monotonic_increasing or self.is_monotonic_decreasing: - return self.searchsorted( - label, side=side, ascending=self.is_monotonic_increasing - ) - else: - try: - left, right = self._values._find_first_and_last(label) - except ValueError: - raise KeyError(f"{label=} not in index") - if left != right: - raise KeyError( - f"Cannot get slice bound for non-unique label {label=}" - ) - if side == "left": - return left - else: - return right + 1 - - def __array_function__(self, func, types, args, kwargs): - # check if the function is implemented for the current type - cudf_index_module = type(self) - for submodule in func.__module__.split(".")[1:]: - # point cudf_index_module to the correct submodule - if hasattr(cudf_index_module, submodule): - cudf_index_module = getattr(cudf_index_module, submodule) - else: - return NotImplemented - - fname = func.__name__ - - handled_types = [BaseIndex, cudf.Series] - - # check if we don't handle any of the types (including sub-class) - for t in types: - if not any( - issubclass(t, handled_type) for handled_type in handled_types - ): - return NotImplemented - - if hasattr(cudf_index_module, fname): - cudf_func = getattr(cudf_index_module, fname) - # Handle case if cudf_func is same as numpy function - if cudf_func is func: - return NotImplemented - else: - result = cudf_func(*args, **kwargs) - if fname == "unique": - # NumPy expects a sorted result for `unique`, which is not - # guaranteed by cudf.Index.unique. - result = result.sort_values() - return result - - else: - return NotImplemented - - @classmethod - def from_pandas(cls, index: pd.Index, nan_as_null=no_default): - """ - Convert from a Pandas Index. - - Parameters - ---------- - index : Pandas Index object - A Pandas Index object which has to be converted - to cuDF Index. - nan_as_null : bool, Default None - If ``None``/``True``, converts ``np.nan`` values - to ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - - Raises - ------ - TypeError for invalid input type. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> import numpy as np - >>> data = [10, 20, 30, np.nan] - >>> pdi = pd.Index(data) - >>> cudf.Index.from_pandas(pdi) - Index([10.0, 20.0, 30.0, ], dtype='float64') - >>> cudf.Index.from_pandas(pdi, nan_as_null=False) - Index([10.0, 20.0, 30.0, nan], dtype='float64') - """ - if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) - - if not isinstance(index, pd.Index): - raise TypeError("not a pandas.Index") - if isinstance(index, pd.RangeIndex): - return cudf.RangeIndex( - start=index.start, - stop=index.stop, - step=index.step, - name=index.name, - ) - else: - return cudf.Index._from_column( - column.as_column(index, nan_as_null=nan_as_null), - name=index.name, - ) - - @property - def _constructor_expanddim(self): - return cudf.MultiIndex - - def drop_duplicates( - self, - keep="first", - nulls_are_equal=True, - ): - """ - Drop duplicate rows in index. - - keep : {"first", "last", False}, default "first" - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. - nulls_are_equal: bool, default True - Null elements are considered equal to other null elements. - """ - - # This utilizes the fact that all `Index` is also a `Frame`. - # Except RangeIndex. 
- return self._from_columns_like_self( - drop_duplicates( - list(self._columns), - keys=range(len(self._columns)), - keep=keep, - nulls_are_equal=nulls_are_equal, - ), - self._column_names, - ) - - def duplicated(self, keep="first") -> cupy.ndarray: - """ - Indicate duplicate index values. - - Duplicated values are indicated as ``True`` values in the resulting - array. Either all duplicates, all except the first, or all except the - last occurrence of duplicates can be indicated. - - Parameters - ---------- - keep : {'first', 'last', False}, default 'first' - The value or values in a set of duplicates to mark as missing. - - - ``'first'`` : Mark duplicates as ``True`` except for the first - occurrence. - - ``'last'`` : Mark duplicates as ``True`` except for the last - occurrence. - - ``False`` : Mark all duplicates as ``True``. - - Returns - ------- - cupy.ndarray[bool] - - See Also - -------- - Series.duplicated : Equivalent method on cudf.Series. - DataFrame.duplicated : Equivalent method on cudf.DataFrame. - Index.drop_duplicates : Remove duplicate values from Index. - - Examples - -------- - By default, for each set of duplicated values, the first occurrence is - set to False and all others to True: - - >>> import cudf - >>> idx = cudf.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) - >>> idx.duplicated() - array([False, False, True, False, True]) - - which is equivalent to - - >>> idx.duplicated(keep='first') - array([False, False, True, False, True]) - - By using 'last', the last occurrence of each set of duplicated values - is set to False and all others to True: - - >>> idx.duplicated(keep='last') - array([ True, False, True, False, False]) - - By setting keep to ``False``, all duplicates are True: - - >>> idx.duplicated(keep=False) - array([ True, False, True, False, True]) - """ - return self.to_series().duplicated(keep=keep).to_cupy() - - def dropna(self, how="any"): - """ - Drop null rows from Index. - - how : {"any", "all"}, default "any" - Specifies how to decide whether to drop a row. - "any" (default) drops rows containing at least - one null value. "all" drops only rows containing - *all* null values. - """ - if how not in {"any", "all"}: - raise ValueError(f"{how=} must be 'any' or 'all'") - try: - if not self.hasnans: - return self.copy(deep=False) - except NotImplementedError: - pass - # This is to be consistent with IndexedFrame.dropna to handle nans - # as nulls by default - data_columns = [col.nans_to_nulls() for col in self._columns] - - return self._from_columns_like_self( - drop_nulls( - data_columns, - how=how, - keys=range(len(data_columns)), - ), - self._column_names, - ) - - def _gather(self, gather_map, nullify=False, check_bounds=True): - """Gather rows of index specified by indices in `gather_map`. - - Skip bounds checking if check_bounds is False. - Set rows to null for all out of bound indices if nullify is `True`. - """ - gather_map = cudf.core.column.as_column(gather_map) - - # TODO: For performance, the check and conversion of gather map should - # be done by the caller. This check will be removed in future release. 
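# The gather step above is a positional take guarded by a bounds check on the
# gather map; a simplified NumPy rendition of the non-nullifying path:
import numpy as np

values = np.array(["a", "b", "c", "d", "e"])
gather_map = np.array([2, 0, 4, 3])
if gather_map.min() < 0 or gather_map.max() >= len(values):
    raise IndexError("Gather map index is out of bounds.")
print(values[gather_map].tolist())   # ['c', 'a', 'e', 'd']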
- if gather_map.dtype.kind not in "iu": - gather_map = gather_map.astype(size_type_dtype) - - if not _gather_map_is_valid( - gather_map, len(self), check_bounds, nullify - ): - raise IndexError("Gather map index is out of bounds.") - - return self._from_columns_like_self( - gather(list(self._columns), gather_map, nullify=nullify), - self._column_names, - ) - - def take(self, indices, axis=0, allow_fill=True, fill_value=None): - """Return a new index containing the rows specified by *indices* - - Parameters - ---------- - indices : array-like - Array of ints indicating which positions to take. - axis : int - The axis over which to select values, always 0. - allow_fill : Unsupported - fill_value : Unsupported - - Returns - ------- - out : Index - New object with desired subset of rows. - - Examples - -------- - >>> idx = cudf.Index(['a', 'b', 'c', 'd', 'e']) - >>> idx.take([2, 0, 4, 3]) - Index(['c', 'a', 'e', 'd'], dtype='object') - """ - - if axis not in {0, "index"}: - raise NotImplementedError( - "Gather along column axis is not yet supported." - ) - if not allow_fill or fill_value is not None: - raise NotImplementedError( - "`allow_fill` and `fill_value` are unsupported." - ) - - return self._gather(indices) - - def _apply_boolean_mask(self, boolean_mask): - """Apply boolean mask to each row of `self`. - - Rows corresponding to `False` is dropped. - """ - boolean_mask = cudf.core.column.as_column(boolean_mask) - if boolean_mask.dtype.kind != "b": - raise ValueError("boolean_mask is not boolean type.") - - return self._from_columns_like_self( - apply_boolean_mask(list(self._columns), boolean_mask), - column_names=self._column_names, - ) - - def repeat(self, repeats, axis=None): - """Repeat elements of a Index. - - Returns a new Index where each element of the current Index is repeated - consecutively a given number of times. - - Parameters - ---------- - repeats : int, or array of ints - The number of repetitions for each element. This should - be a non-negative integer. Repeating 0 times will return - an empty object. - - Returns - ------- - Index - A newly created object of same type as caller with repeated - elements. - - Examples - -------- - >>> index = cudf.Index([10, 22, 33, 55]) - >>> index - Index([10, 22, 33, 55], dtype='int64') - >>> index.repeat(5) - Index([10, 10, 10, 10, 10, 22, 22, 22, 22, 22, 33, - 33, 33, 33, 33, 55, 55, 55, 55, 55], - dtype='int64') - """ - raise NotImplementedError - - def _new_index_for_reset_index( - self, levels: tuple | None, name - ) -> None | BaseIndex: - """Return the new index after .reset_index""" - # None is caught later to return RangeIndex - return None - - def _columns_for_reset_index( - self, levels: tuple | None - ) -> Generator[tuple[Any, ColumnBase], None, None]: - """Return the columns and column names for .reset_index""" - yield ( - "index" if self.name is None else self.name, - next(iter(self._columns)), - ) - - def _split(self, splits): - raise NotImplementedError - - -def _get_result_name(left_name, right_name): - return left_name if _is_same_name(left_name, right_name) else None - - -def _return_get_indexer_result(result): - if cudf.get_option("mode.pandas_compatible"): - return result.astype("int64") - return result diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py deleted file mode 100644 index e2bdecbe67a..00000000000 --- a/python/cudf/cudf/core/_compat.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
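# The version flags defined in _compat.py just below follow the usual
# packaging.version pattern; a typical consumer gates a test or code path on
# the installed pandas version (the test body here is a placeholder):
import pandas as pd
import pytest
from packaging import version

PANDAS_GE_220 = version.parse(pd.__version__) >= version.parse("2.2.0")

@pytest.mark.skipif(not PANDAS_GE_220, reason="requires pandas>=2.2")
def test_pandas_22_only_behavior():
    ...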
- -import pandas as pd -from packaging import version - -PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.2") -PANDAS_VERSION = version.parse(pd.__version__) - - -PANDAS_GE_210 = PANDAS_VERSION >= version.parse("2.1.0") -PANDAS_GE_220 = PANDAS_VERSION >= version.parse("2.2.0") -PANDAS_LT_300 = PANDAS_VERSION < version.parse("3.0.0") diff --git a/python/cudf/cudf/core/_internals/__init__.py b/python/cudf/cudf/core/_internals/__init__.py deleted file mode 100644 index 6faeeffdbec..00000000000 --- a/python/cudf/cudf/core/_internals/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py deleted file mode 100644 index 90d9118027a..00000000000 --- a/python/cudf/cudf/core/_internals/expressions.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import ast -import functools - -import pyarrow as pa - -import pylibcudf as plc -from pylibcudf.expressions import ( - ASTOperator, - ColumnReference, - Expression, - Literal, - Operation, -) - -# This dictionary encodes the mapping from Python AST operators to their cudf -# counterparts. -python_cudf_operator_map = { - # Binary operators - ast.Add: ASTOperator.ADD, - ast.Sub: ASTOperator.SUB, - ast.Mult: ASTOperator.MUL, - ast.Div: ASTOperator.DIV, - ast.FloorDiv: ASTOperator.FLOOR_DIV, - ast.Mod: ASTOperator.PYMOD, - ast.Pow: ASTOperator.POW, - ast.Eq: ASTOperator.EQUAL, - ast.NotEq: ASTOperator.NOT_EQUAL, - ast.Lt: ASTOperator.LESS, - ast.Gt: ASTOperator.GREATER, - ast.LtE: ASTOperator.LESS_EQUAL, - ast.GtE: ASTOperator.GREATER_EQUAL, - ast.BitXor: ASTOperator.BITWISE_XOR, - # TODO: The mapping of logical/bitwise operators here is inconsistent with - # pandas. In pandas, Both `BitAnd` and `And` map to - # `ASTOperator.LOGICAL_AND` for booleans, while they map to - # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to - # encode this at present because expressions can be arbitrarily nested so - # we won't know the dtype of the input without inserting a much more - # complex traversal of the expression tree to determine the output types at - # each node. For now, we'll rely on users to use the appropriate operator. - ast.BitAnd: ASTOperator.BITWISE_AND, - ast.BitOr: ASTOperator.BITWISE_OR, - ast.And: ASTOperator.LOGICAL_AND, - ast.Or: ASTOperator.LOGICAL_OR, - # Unary operators - ast.Invert: ASTOperator.BIT_INVERT, - ast.Not: ASTOperator.NOT, - # TODO: Missing USub, possibility other unary ops? -} - - -# Mapping between Python function names encode in an ast.Call node and the -# corresponding libcudf C++ AST operators. -python_cudf_function_map = { - # TODO: Operators listed on - # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval # noqa: E501 - # that we don't support yet: - # expm1, log1p, arctan2 and log10. 
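# How the operator map above gets consumed: an expression string is parsed
# with the standard ast module and each operator node's *type* is looked up
# in the dict. Pure-Python illustration with the ASTOperator values stubbed
# out as strings:
import ast

operator_map = {ast.Add: "ADD", ast.Lt: "LESS", ast.BitAnd: "BITWISE_AND"}

tree = ast.parse("(a + b) < c", mode="eval")
compare = tree.body                        # ast.Compare
binop = compare.left                       # ast.BinOp
print(operator_map[type(binop.op)])        # ADD
print(operator_map[type(compare.ops[0])])  # LESS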
- "isnull": ASTOperator.IS_NULL, - "isna": ASTOperator.IS_NULL, - "sin": ASTOperator.SIN, - "cos": ASTOperator.COS, - "tan": ASTOperator.TAN, - "arcsin": ASTOperator.ARCSIN, - "arccos": ASTOperator.ARCCOS, - "arctan": ASTOperator.ARCTAN, - "sinh": ASTOperator.SINH, - "cosh": ASTOperator.COSH, - "tanh": ASTOperator.TANH, - "arcsinh": ASTOperator.ARCSINH, - "arccosh": ASTOperator.ARCCOSH, - "arctanh": ASTOperator.ARCTANH, - "exp": ASTOperator.EXP, - "log": ASTOperator.LOG, - "sqrt": ASTOperator.SQRT, - "abs": ASTOperator.ABS, - "ceil": ASTOperator.CEIL, - "floor": ASTOperator.FLOOR, - # TODO: Operators supported by libcudf with no Python function analog. - # ast.rint: ASTOperator.RINT, - # ast.cbrt: ASTOperator.CBRT, -} - - -class libcudfASTVisitor(ast.NodeVisitor): - """A NodeVisitor specialized for constructing a libcudf expression tree. - - This visitor is designed to handle AST nodes that have libcudf equivalents. - It constructs column references from names and literals from constants, - then builds up operations. The final result can be accessed using the - `expression` property. The visitor must be kept in scope for as long as the - expression is needed because all of the underlying libcudf expressions will - be destroyed when the libcudfASTVisitor is. - - Parameters - ---------- - col_names : Tuple[str] - The column names used to map the names in an expression. - """ - - def __init__(self, col_names: tuple[str]): - self.stack: list[Expression] = [] - self.nodes: list[Expression] = [] - self.col_names = col_names - - @property - def expression(self): - """Expression: The result of parsing an AST.""" - assert len(self.stack) == 1 - return self.stack[-1] - - def visit_Name(self, node): - try: - col_id = self.col_names.index(node.id) - except ValueError: - raise ValueError(f"Unknown column name {node.id}") - self.stack.append(ColumnReference(col_id)) - - def visit_Constant(self, node): - if not isinstance(node.value, (float, int, str, complex)): - raise ValueError( - f"Unsupported literal {repr(node.value)} of type " - "{type(node.value).__name__}" - ) - self.stack.append( - Literal(plc.interop.from_arrow(pa.scalar(node.value))) - ) - - def visit_UnaryOp(self, node): - self.visit(node.operand) - self.nodes.append(self.stack.pop()) - if isinstance(node.op, ast.USub): - # TODO: Except for leaf nodes, we won't know the type of the - # operand, so there's no way to know whether this should be a float - # or an int. We should maybe see what Spark does, and this will - # probably require casting. - self.nodes.append(Literal(plc.interop.from_arrow(pa.scalar(-1)))) - op = ASTOperator.MUL - self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) - elif isinstance(node.op, ast.UAdd): - self.stack.append(self.nodes[-1]) - else: - op = python_cudf_operator_map[type(node.op)] - self.stack.append(Operation(op, self.nodes[-1])) - - def visit_BinOp(self, node): - self.visit(node.left) - self.visit(node.right) - self.nodes.append(self.stack.pop()) - self.nodes.append(self.stack.pop()) - - op = python_cudf_operator_map[type(node.op)] - self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) - - def _visit_BoolOp_Compare(self, operators, operands, has_multiple_ops): - # Helper function handling the common components of parsing BoolOp and - # Compare AST nodes. These two types of nodes both support chaining - # (e.g. `a > b > c` is equivalent to `a > b and b > c`, so this - # function helps standardize that. 
- - # TODO: Whether And/Or and BitAnd/BitOr actually correspond to - # logical or bitwise operators depends on the data types that they - # are applied to. We'll need to add logic to map to that. - inner_ops = [] - for op, (left, right) in zip(operators, operands): - # Note that this will lead to duplicate nodes, e.g. if - # the comparison is `a < b < c` that will be encoded as - # `a < b and b < c`. We could potentially optimize by caching - # expressions by name so that we only construct them once. - self.visit(left) - self.visit(right) - - self.nodes.append(self.stack.pop()) - self.nodes.append(self.stack.pop()) - - op = python_cudf_operator_map[type(op)] - inner_ops.append(Operation(op, self.nodes[-1], self.nodes[-2])) - - self.nodes.extend(inner_ops) - - # If we have more than one comparator, we need to link them - # together with LOGICAL_AND operators. - if has_multiple_ops: - op = ASTOperator.LOGICAL_AND - - def _combine_compare_ops(left, right): - self.nodes.append(Operation(op, left, right)) - return self.nodes[-1] - - functools.reduce(_combine_compare_ops, inner_ops) - - self.stack.append(self.nodes[-1]) - - def visit_BoolOp(self, node): - operators = [node.op] * (len(node.values) - 1) - operands = zip(node.values[:-1], node.values[1:]) - self._visit_BoolOp_Compare(operators, operands, len(node.values) > 2) - - def visit_Compare(self, node): - operands = (node.left, *node.comparators) - has_multiple_ops = len(operands) > 2 - operands = zip(operands[:-1], operands[1:]) - self._visit_BoolOp_Compare(node.ops, operands, has_multiple_ops) - - def visit_Call(self, node): - try: - op = python_cudf_function_map[node.func.id] - except KeyError: - raise ValueError(f"Unsupported function {node.func}.") - # Assuming only unary functions are supported, which is checked above. - if len(node.args) != 1 or node.keywords: - raise ValueError( - f"Function {node.func} only accepts one positional " - "argument." - ) - self.visit(node.args[0]) - - self.nodes.append(self.stack.pop()) - self.stack.append(Operation(op, self.nodes[-1])) - - -@functools.lru_cache(256) -def parse_expression(expr: str, col_names: tuple[str]): - visitor = libcudfASTVisitor(col_names) - visitor.visit(ast.parse(expr)) - return visitor diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py deleted file mode 100644 index fd89904e766..00000000000 --- a/python/cudf/cudf/core/_internals/timezones.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import datetime -import os -import zoneinfo -from functools import lru_cache -from typing import TYPE_CHECKING, Literal - -import numpy as np -import pandas as pd - -import cudf -from cudf._lib.timezone import make_timezone_transition_table - -if TYPE_CHECKING: - from cudf.core.column.datetime import DatetimeColumn - from cudf.core.column.timedelta import TimeDeltaColumn - - -def get_compatible_timezone(dtype: pd.DatetimeTZDtype) -> pd.DatetimeTZDtype: - """Convert dtype.tz object to zoneinfo object if possible.""" - tz = dtype.tz - if isinstance(tz, zoneinfo.ZoneInfo): - return dtype - if cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError( - f"{tz} must be a zoneinfo.ZoneInfo object in pandas_compatible mode." 
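# parse_expression above is wrapped in functools.lru_cache, so repeated calls
# with the same expression string and column-name tuple reuse the expression
# tree that was already built (both arguments are hashable, which is what
# makes the cache work). A minimal stand-alone analogue:
import ast
import functools

@functools.lru_cache(maxsize=256)
def parse_once(expr, col_names):
    print(f"parsing {expr!r}")
    return ast.parse(expr, mode="eval")

parse_once("a + b", ("a", "b"))   # parses and caches
parse_once("a + b", ("a", "b"))   # cache hit: no second "parsing" line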
- ) - elif (tzname := getattr(tz, "zone", None)) is not None: - # pytz-like - key = tzname - elif (tz_file := getattr(tz, "_filename", None)) is not None: - # dateutil-like - key = tz_file.split("zoneinfo/")[-1] - elif isinstance(tz, datetime.tzinfo): - # Try to get UTC-like tzinfos - reference = datetime.datetime.now() - key = tz.tzname(reference) - if not (isinstance(key, str) and key.lower() == "utc"): - raise NotImplementedError(f"cudf does not support {tz}") - else: - raise NotImplementedError(f"cudf does not support {tz}") - new_tz = zoneinfo.ZoneInfo(key) - return pd.DatetimeTZDtype(dtype.unit, new_tz) - - -@lru_cache(maxsize=20) -def get_tz_data(zone_name: str) -> tuple[DatetimeColumn, TimeDeltaColumn]: - """ - Return timezone data (transition times and UTC offsets) for the - given IANA time zone. - - Parameters - ---------- - zone_name: str - IANA time zone name - - Returns - ------- - Tuple with two columns containing the transition times - and corresponding UTC offsets. - """ - try: - # like zoneinfo, we first look in TZPATH - tz_table = _find_and_read_tzfile_tzpath(zone_name) - except zoneinfo.ZoneInfoNotFoundError: - # if that fails, we fall back to using `tzdata` - tz_table = _find_and_read_tzfile_tzdata(zone_name) - return tz_table - - -def _find_and_read_tzfile_tzpath( - zone_name: str, -) -> tuple[DatetimeColumn, TimeDeltaColumn]: - for search_path in zoneinfo.TZPATH: - if os.path.isfile(os.path.join(search_path, zone_name)): - return _read_tzfile_as_columns(search_path, zone_name) - raise zoneinfo.ZoneInfoNotFoundError(zone_name) - - -def _find_and_read_tzfile_tzdata( - zone_name: str, -) -> tuple[DatetimeColumn, TimeDeltaColumn]: - import importlib.resources - - package_base = "tzdata.zoneinfo" - try: - return _read_tzfile_as_columns( - str(importlib.resources.files(package_base)), zone_name - ) - # TODO: make it so that the call to libcudf raises a - # FileNotFoundError instead of a RuntimeError - except (ImportError, FileNotFoundError, UnicodeEncodeError, RuntimeError): - # the "except" part of this try-except is basically vendored - # from the zoneinfo library. - # - # There are three types of exception that can be raised that all amount - # to "we cannot find this key": - # - # ImportError: If package_name doesn't exist (e.g. if tzdata is not - # installed, or if there's an error in the folder name like - # Amrica/New_York) - # FileNotFoundError: If resource_name doesn't exist in the package - # (e.g. Europe/Krasnoy) - # UnicodeEncodeError: If package_name or resource_name are not UTF-8, - # such as keys containing a surrogate character. 
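# The two-step lookup above mirrors the zoneinfo module: search the system
# TZPATH first, then fall back to the bundled tzdata wheel. Sketch of the
# file-location part only (ImportError if the tzdata package is missing;
# the tzdata path returned here is not validated against the zone name):
import os
import zoneinfo
import importlib.resources

def find_tzfile(zone_name):
    for search_path in zoneinfo.TZPATH:
        candidate = os.path.join(search_path, zone_name)
        if os.path.isfile(candidate):
            return candidate
    return str(importlib.resources.files("tzdata.zoneinfo") / zone_name)

print(find_tzfile("America/New_York"))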
- raise zoneinfo.ZoneInfoNotFoundError(zone_name) - - -def _read_tzfile_as_columns( - tzdir, zone_name: str -) -> tuple[DatetimeColumn, TimeDeltaColumn]: - transition_times_and_offsets = make_timezone_transition_table( - tzdir, zone_name - ) - - if not transition_times_and_offsets: - from cudf.core.column.column import as_column - - # this happens for UTC-like zones - min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]") - return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) # type: ignore[return-value] - return tuple(transition_times_and_offsets) # type: ignore[return-value] - - -def check_ambiguous_and_nonexistent( - ambiguous: Literal["NaT"], nonexistent: Literal["NaT"] -) -> tuple[Literal["NaT"], Literal["NaT"]]: - if ambiguous != "NaT": - raise NotImplementedError( - "Only ambiguous='NaT' is currently supported" - ) - if nonexistent != "NaT": - raise NotImplementedError( - "Only nonexistent='NaT' is currently supported" - ) - return ambiguous, nonexistent diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py deleted file mode 100644 index 2199d4d5ba5..00000000000 --- a/python/cudf/cudf/core/_internals/where.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING - -import numpy as np - -import cudf -from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar -from cudf.core.dtypes import CategoricalDtype -from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype - -if TYPE_CHECKING: - from cudf._typing import ScalarLike - from cudf.core.column import ColumnBase - - -def _normalize_categorical(input_col, other): - if isinstance(input_col, cudf.core.column.CategoricalColumn): - if cudf.api.types.is_scalar(other): - try: - other = input_col._encode(other) - except ValueError: - # When other is not present in categories, - # fill with Null. - other = None - other = cudf.Scalar(other, dtype=input_col.codes.dtype) - elif isinstance(other, cudf.core.column.CategoricalColumn): - other = other.codes - - input_col = input_col.codes - return input_col, other - - -def _check_and_cast_columns_with_other( - source_col: ColumnBase, - other: ScalarLike | ColumnBase, - inplace: bool, -) -> tuple[ColumnBase, ScalarLike | ColumnBase]: - # Returns type-casted `source_col` & `other` based on `inplace`. - from cudf.core.column import as_column - - source_dtype = source_col.dtype - if isinstance(source_dtype, CategoricalDtype): - return _normalize_categorical(source_col, other) - - other_is_scalar = is_scalar(other) - if other_is_scalar: - if isinstance(other, (float, np.floating)) and not np.isnan(other): - try: - is_safe = source_dtype.type(other) == other - except OverflowError: - is_safe = False - - if not is_safe: - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {source_dtype.name}" - ) - - if cudf.utils.utils.is_na_like(other): - return _normalize_categorical( - source_col, cudf.Scalar(other, dtype=source_dtype) - ) - - mixed_err = ( - "cudf does not support mixed types, please type-cast the column of " - "dataframe/series and other to same dtypes." 
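# The scalar safety check above in isolation: casting the scalar to the
# column dtype must round-trip exactly, otherwise the assignment is rejected;
# an OverflowError from the cast also counts as unsafe.
import numpy as np

def is_safe_cast(value, dtype):
    try:
        return bool(dtype.type(value) == value)
    except OverflowError:
        return False

print(is_safe_cast(3.0, np.dtype("int64")))   # True  (round-trips exactly)
print(is_safe_cast(3.5, np.dtype("int64")))   # False (truncates to 3)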
- ) - - if inplace: - other = cudf.Scalar(other) if other_is_scalar else other - if is_mixed_with_object_dtype(other, source_col): - raise TypeError(mixed_err) - - if not _can_cast(other.dtype, source_dtype): - warnings.warn( - f"Type-casting from {other.dtype} " - f"to {source_dtype}, there could be potential data loss" - ) - return _normalize_categorical(source_col, other.astype(source_dtype)) - - if _is_non_decimal_numeric_dtype(source_dtype) and as_column( - other - ).can_cast_safely(source_dtype): - common_dtype = source_dtype - else: - common_dtype = find_common_type( - [ - source_dtype, - np.min_scalar_type(other) if other_is_scalar else other.dtype, - ] - ) - - if other_is_scalar: - other = cudf.Scalar(other) - - if is_mixed_with_object_dtype(other, source_col) or ( - source_dtype.kind == "b" and common_dtype.kind != "b" - ): - raise TypeError(mixed_err) - - other = other.astype(common_dtype) - - return _normalize_categorical(source_col.astype(common_dtype), other) - - -def _can_cast(from_dtype, to_dtype): - """ - Utility function to determine if we can cast - from `from_dtype` to `to_dtype`. This function primarily calls - `np.can_cast` but with some special handling around - cudf specific dtypes. - """ - if cudf.utils.utils.is_na_like(from_dtype): - return True - if isinstance(from_dtype, type): - from_dtype = cudf.dtype(from_dtype) - if isinstance(to_dtype, type): - to_dtype = cudf.dtype(to_dtype) - - # TODO : Add precision & scale checking for - # decimal types in future - - if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype): - if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - return True - elif isinstance(to_dtype, np.dtype): - if to_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(from_dtype, np.dtype): - if isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype, to_dtype) - elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype): - if from_dtype.kind in {"i", "f", "u", "U", "O"}: - return True - else: - return False - elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): - return True - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): - # TODO: Add level based checks too once casting of - # list columns is supported - if isinstance(to_dtype, cudf.core.dtypes.ListDtype): - return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) - else: - return False - elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): - if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): - return True - elif isinstance(to_dtype, np.dtype): - return np.can_cast(from_dtype._categories.dtype, to_dtype) - else: - return False - else: - return np.can_cast(from_dtype, to_dtype) diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py deleted file mode 100644 index ce6bb83bc77..00000000000 --- a/python/cudf/cudf/core/abc.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -"""Common abstract base classes for cudf.""" - -import pickle - -import numpy - -import cudf - - -class Serializable: - """A serializable object composed of device memory buffers. - - This base class defines a standard serialization protocol for objects - encapsulating device memory buffers. Serialization proceeds by copying - device data onto the host, then returning it along with suitable metadata - for reconstruction of the object. Deserialization performs the reverse - process, copying the serialized data from the host to new device buffers. 
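# The common-dtype branch above leans on NumPy's promotion helpers: the
# scalar is reduced to its minimal dtype and then promoted together with the
# column dtype (cudf's find_common_type plays the role of np.result_type in
# this sketch).
import numpy as np

col_dtype = np.dtype("int8")
other = -1000                              # does not fit in int8
scalar_dtype = np.min_scalar_type(other)   # int16
common = np.result_type(col_dtype, scalar_dtype)
print(scalar_dtype, common)                # int16 int16
print(np.can_cast(np.dtype("int32"), np.dtype("float64")))   # True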
- Subclasses must define the abstract methods :meth:`~.serialize` and - :meth:`~.deserialize`. The former defines the conversion of the object - into a representative collection of metadata and data buffers, while the - latter converts back from that representation into an equivalent object. - """ - - def serialize(self): - """Generate an equivalent serializable representation of an object. - - Subclasses must implement this method to define how the attributes of - the object are converted into a serializable representation. A common - solution is to construct a list containing device buffer attributes in - a well-defined order that can be reinterpreted upon deserialization, - then place all other lightweight attributes into the metadata - dictionary. - - Returns - ------- - Tuple[Dict, List] - The first element of the returned tuple is a dict containing any - serializable metadata required to reconstruct the object. The - second element is a list containing the device data buffers - or memoryviews of the object. - - :meta private: - """ - raise NotImplementedError( - "Subclasses of Serializable must implement serialize" - ) - - @classmethod - def deserialize(cls, header, frames): - """Generate an object from a serialized representation. - - Subclasses must implement this method to define how objects of that - class can be constructed from a serialized representation generalized - by :meth:`serialize`. - - Parameters - ---------- - header : dict - The metadata required to reconstruct the object. - frames : list - The Buffers or memoryviews that the object should contain. - - Returns - ------- - Serializable - A new instance of `cls` (a subclass of `Serializable`) equivalent - to the instance that was serialized to produce the header and - frames. - - :meta private: - """ - raise NotImplementedError( - "Subclasses of Serializable must implement deserialize" - ) - - def device_serialize(self): - """Serialize data and metadata associated with device memory. - - Returns - ------- - header : dict - The metadata required to reconstruct the object. - frames : list - The Buffer or memoryview objects that the object - should contain. - - :meta private: - """ - header, frames = self.serialize() - assert all( - isinstance( - f, - ( - cudf.core.buffer.Buffer, - memoryview, - ), - ) - for f in frames - ) - header["type-serialized"] = pickle.dumps(type(self)) - header["is-cuda"] = [ - hasattr(f, "__cuda_array_interface__") for f in frames - ] - header["lengths"] = [f.nbytes for f in frames] - return header, frames - - @classmethod - def device_deserialize(cls, header, frames): - """Perform device-side deserialization tasks. - - The primary purpose of this method is the creation of device memory - buffers from host buffers where necessary. - - Parameters - ---------- - header : dict - The metadata required to reconstruct the object. - frames : list - The Buffers or memoryviews that the object should contain. - - Returns - ------- - Serializable - A new instance of `cls` (a subclass of `Serializable`) equivalent - to the instance that was serialized to produce the header and - frames. - - :meta private: - """ - typ = pickle.loads(header["type-serialized"]) - frames = [ - cudf.core.buffer.as_buffer(f) if c else memoryview(f) - for c, f in zip(header["is-cuda"], frames) - ] - return typ.deserialize(header, frames) - - def host_serialize(self): - """Serialize data and metadata associated with host memory. - - Returns - ------- - header : dict - The metadata required to reconstruct the object. 
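# A toy subclass honoring the serialize/deserialize contract documented above
# (Serializable is the base class defined in this module); a plain bytes
# payload stands in for the device buffers a real subclass would carry, and
# the class/attribute names are invented for illustration.
class TaggedBytes(Serializable):
    def __init__(self, tag, payload):
        self.tag = tag            # lightweight metadata
        self.payload = payload    # stand-in for device data

    def serialize(self):
        header = {"tag": self.tag}            # reconstruction metadata
        frames = [memoryview(self.payload)]   # data buffers / memoryviews
        return header, frames

    @classmethod
    def deserialize(cls, header, frames):
        return cls(header["tag"], bytes(frames[0]))

# Round trip: TaggedBytes.deserialize(*obj.serialize()) rebuilds the object.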
- frames : list - The Buffers or memoryviews that the object should contain. - - :meta private: - """ - header, frames = self.device_serialize() - header["writeable"] = len(frames) * (None,) - frames = [ - f.memoryview() if c else memoryview(f) - for c, f in zip(header["is-cuda"], frames) - ] - return header, frames - - @classmethod - def host_deserialize(cls, header, frames): - """Perform device-side deserialization tasks. - - Parameters - ---------- - header : dict - The metadata required to reconstruct the object. - frames : list - The Buffers or memoryviews that the object should contain. - - Returns - ------- - Serializable - A new instance of `cls` (a subclass of `Serializable`) equivalent - to the instance that was serialized to produce the header and - frames. - - :meta private: - """ - frames = [ - cudf.core.buffer.as_buffer(f) if c else f - for c, f in zip(header["is-cuda"], map(memoryview, frames)) - ] - obj = cls.device_deserialize(header, frames) - return obj - - def __reduce_ex__(self, protocol): - header, frames = self.host_serialize() - - # Since memoryviews are not pickable, we convert them to numpy - # arrays (zero-copy). This works seamlessly because host_deserialize - # converts the frames back into memoryviews. - frames = [numpy.asarray(f) for f in frames] - return self.host_deserialize, (header, frames) diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py deleted file mode 100644 index b28fce6d343..00000000000 --- a/python/cudf/cudf/core/algorithms.py +++ /dev/null @@ -1,269 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING - -import cupy as cp -import numpy as np - -import cudf -from cudf.core.column import as_column -from cudf.core.index import Index, RangeIndex -from cudf.core.scalar import Scalar -from cudf.options import get_option -from cudf.utils.dtypes import can_convert_to_column - -if TYPE_CHECKING: - from cudf.core.column.column import ColumnBase - from cudf.core.index import BaseIndex - - -def factorize(values, sort=False, use_na_sentinel=True, size_hint=None): - """Encode the input values as integer labels - - Parameters - ---------- - values: Series, Index, or CuPy array - The data to be factorized. - sort : bool, default True - Sort uniques and shuffle codes to maintain the relationship. - use_na_sentinel : bool, default True - If True, the sentinel -1 will be used for NA values. - If False, NA values will be encoded as non-negative - integers and will not drop the NA from the uniques - of the values. - - Returns - ------- - (labels, cats) : (cupy.ndarray, cupy.ndarray or Index) - - *labels* contains the encoded values - - *cats* contains the categories in order that the N-th - item corresponds to the (N-1) code. - - See Also - -------- - cudf.Series.factorize : Encode the input values of Series. - - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> data = cudf.Series(['a', 'c', 'c']) - >>> codes, uniques = cudf.factorize(data) - >>> codes - array([0, 1, 1], dtype=int8) - >>> uniques - Index(['a' 'c'], dtype='object') - - When ``use_na_sentinel=True`` (the default), missing values are indicated - in the `codes` with the sentinel value ``-1`` and missing values are not - included in `uniques`. 
- - >>> codes, uniques = cudf.factorize(['b', None, 'a', 'c', 'b']) - >>> codes - array([ 1, -1, 0, 2, 1], dtype=int8) - >>> uniques - Index(['a', 'b', 'c'], dtype='object') - - If NA is in the values, and we want to include NA in the uniques of the - values, it can be achieved by setting ``use_na_sentinel=False``. - - >>> values = np.array([1, 2, 1, np.nan]) - >>> codes, uniques = cudf.factorize(values) - >>> codes - array([ 0, 1, 0, -1], dtype=int8) - >>> uniques - Index([1.0, 2.0], dtype='float64') - >>> codes, uniques = cudf.factorize(values, use_na_sentinel=False) - >>> codes - array([1, 2, 1, 0], dtype=int8) - >>> uniques - Index([, 1.0, 2.0], dtype='float64') - """ - - return_cupy_array = isinstance(values, cp.ndarray) - - if not can_convert_to_column(values): - raise TypeError( - "'values' can only be a Series, Index, or CuPy array, " - f"got {type(values)}" - ) - - values = as_column(values) - - if size_hint: - warnings.warn("size_hint is not applicable for cudf.factorize") - - if use_na_sentinel: - na_sentinel = Scalar(-1) - cats = values.dropna() - else: - na_sentinel = Scalar(None, dtype=values.dtype) - cats = values - - cats = cats.unique().astype(values.dtype) - - if sort: - cats = cats.sort_values() - - labels = values._label_encoding( - cats=cats, - na_sentinel=na_sentinel, - dtype="int64" if get_option("mode.pandas_compatible") else None, - ).values - - return labels, cats.values if return_cupy_array else Index._from_column( - cats - ) - - -def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase: - """ - Interpolate over a float column. assumes a linear interpolation - strategy using the index of the data to denote spacing of the x - values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4] - would result in [1.0, 3.0, 4.0]. - """ - # figure out where the nans are - mask = column.isnull() - - # trivial cases, all nan or no nans - if not mask.any() or mask.all(): - return column.copy() - - valid_locs = ~mask - if isinstance(index, RangeIndex): - # Each point is evenly spaced, index values don't matter - known_x = cp.flatnonzero(valid_locs.values) - else: - known_x = index._column.apply_boolean_mask(valid_locs).values # type: ignore[attr-defined] - known_y = column.apply_boolean_mask(valid_locs).values - - result = cp.interp(index.to_cupy(), known_x, known_y) - - # find the first nan - first_nan_idx = valid_locs.values.argmax().item() - result[:first_nan_idx] = np.nan - return as_column(result) - - -def unique(values): - """ - Return unique values from array-like - - Parameters - ---------- - values : 1d array-like - - Returns - ------- - cudf.Series, - - The return can be: - - * Index : when the input is an Index - * cudf.Series : when the input is a Series - * cupy.ndarray : when the input is a cupy.ndarray - - Return cudf.Series, cudf.Index, or cupy.ndarray. - - See Also - -------- - Index.unique : Return unique values from an Index. - Series.unique : Return unique values of Series object. - - Examples - -------- - >>> cudf.unique(cudf.Series([2, 1, 3, 3])) - 0 2 - 1 1 - 2 3 - dtype: int64 - - >>> cudf.unique(cudf.Series([2] + [1] * 5)) - 0 2 - 1 1 - dtype: int64 - - >>> cudf.unique(cudf.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) - 0 2016-01-01 - dtype: datetime64[ns] - - >>> cudf.unique( - ... cudf.Series( - ... [ - ... pd.Timestamp("20160101", tz="US/Eastern"), - ... pd.Timestamp("20160101", tz="US/Eastern"), - ... pd.Timestamp("20160103", tz="US/Eastern"), - ... ] - ... ) - ... 
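# The interpolation helper above, restated with NumPy: known (x, y) pairs are
# taken from the non-null positions and np.interp fills the gaps, matching
# the [1.0, NaN, 4.0] / [1, 3, 4] example from its docstring.
import numpy as np

values = np.array([1.0, np.nan, 4.0])
index = np.array([1, 3, 4])
valid = ~np.isnan(values)
filled = np.interp(index, index[valid], values[valid])
print(filled)   # [1. 3. 4.]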
) - 0 2016-01-01 00:00:00-05:00 - 1 2016-01-03 00:00:00-05:00 - dtype: datetime64[ns, US/Eastern] - - >>> cudf.unique( - ... cudf.Index( - ... [ - ... pd.Timestamp("20160101", tz="US/Eastern"), - ... pd.Timestamp("20160101", tz="US/Eastern"), - ... pd.Timestamp("20160103", tz="US/Eastern"), - ... ] - ... ) - ... ) - DatetimeIndex(['2016-01-01 00:00:00-05:00', '2016-01-03 00:00:00-05:00'],dtype='datetime64[ns, US/Eastern]') - - An unordered Categorical will return categories in the - order of appearance. - - >>> cudf.unique(cudf.Series(pd.Categorical(list("baabc")))) - 0 b - 1 a - 2 c - dtype: category - Categories (3, object): ['a', 'b', 'c'] - - >>> cudf.unique(cudf.Series(pd.Categorical(list("baabc"), categories=list("abc")))) - 0 b - 1 a - 2 c - dtype: category - Categories (3, object): ['a', 'b', 'c'] - - An ordered Categorical preserves the category ordering. - - >>> pd.unique( - ... pd.Series( - ... pd.Categorical(list("baabc"), categories=list("abc"), ordered=True) - ... ) - ... ) - 0 b - 1 a - 2 c - dtype: category - Categories (3, object): ['a' < 'b' < 'c'] - - An array of tuples - - >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values) - array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) - """ - if not isinstance(values, (cudf.Series, cudf.Index, cp.ndarray)): - raise ValueError( - "Must pass cudf.Series, cudf.Index, or cupy.ndarray object" - ) - if isinstance(values, cp.ndarray): - # pandas.unique will not sort the values in the result - # while cupy.unique documents it will, so we pass cupy.ndarray - # through cudf.Index to maintain the original order. - return cp.asarray(cudf.Index(values).unique()) - if isinstance(values, cudf.Series): - if get_option("mode.pandas_compatible"): - if isinstance(values.dtype, cudf.CategoricalDtype): - raise NotImplementedError( - "cudf.Categorical is not implemented" - ) - else: - return cp.asarray(values.unique()) - return values.unique() diff --git a/python/cudf/cudf/core/buffer/__init__.py b/python/cudf/cudf/core/buffer/__init__.py deleted file mode 100644 index 9b9774c12be..00000000000 --- a/python/cudf/cudf/core/buffer/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from cudf.core.buffer.buffer import ( - Buffer, - BufferOwner, - cuda_array_interface_wrapper, -) -from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer -from cudf.core.buffer.spillable_buffer import SpillableBuffer, SpillLock -from cudf.core.buffer.utils import ( - acquire_spill_lock, - as_buffer, - get_spill_lock, -) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py deleted file mode 100644 index 32ae8c5ee53..00000000000 --- a/python/cudf/cudf/core/buffer/buffer.py +++ /dev/null @@ -1,511 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import math -import pickle -import weakref -from types import SimpleNamespace -from typing import Any, Literal, Mapping - -import numpy -from typing_extensions import Self - -import pylibcudf -import rmm - -import cudf -from cudf.core.abc import Serializable -from cudf.utils.string import format_bytes - - -def host_memory_allocation(nbytes: int) -> memoryview: - """Allocate host memory using NumPy - - This is an alternative to `bytearray` to avoid memory initialization cost. - A `bytearray` is zero-initialized using `calloc`, which we don't need. - Additionally, `numpy.empty` both skips the zero-initialization and uses - hugepages when available . 
- - Parameters - ---------- - nbytes : int - Size of the new host allocation in bytes. - - Return - ------ - memoryview - The new host allocation. - """ - return numpy.empty((nbytes,), dtype="u1").data - - -def cuda_array_interface_wrapper( - ptr: int, - size: int, - owner: object | None = None, - readonly=False, - typestr="|u1", - version=0, -): - """Wrap device pointer in an object that exposes `__cuda_array_interface__` - - See - - Parameters - ---------- - ptr : int - An integer representing a pointer to device memory. - size : int, optional - Size of device memory in bytes. - owner : object, optional - Python object to which the lifetime of the memory allocation is tied. - A reference to this object is kept in the returned wrapper object. - readonly: bool, optional - Mark the interface read-only. - typestr: str, optional - The type string of the interface. By default this is "|u1", which - means "an unsigned integer with a not relevant byteorder". See: - - version : bool, optional - The version of the interface. - - Return - ------ - SimpleNamespace - An object that exposes `__cuda_array_interface__` and keeps a reference - to `owner`. - """ - - if size < 0: - raise ValueError("size cannot be negative") - - return SimpleNamespace( - __cuda_array_interface__={ - "data": (ptr, readonly), - "shape": (size,), - "strides": None, - "typestr": typestr, - "version": version, - }, - owner=owner, - ) - - -class BufferOwner(Serializable): - """An owning buffer that represents device memory. - - This class isn't meant to be used throughout cuDF. Instead, it - standardizes data owning by wrapping any data object that - represents device memory. Multiple `Buffer` instances, which are - the ones used throughout cuDF, can then refer to the same - `BufferOwner` instance. - - In order to implement copy-on-write and spillable buffers, we need the - ability to detect external access to the underlying memory. We say that - the buffer has been exposed if the device pointer (integer or void*) has - been accessed outside of BufferOwner. In this case, we have no control - over knowing if the data is being modified by a third party. - - Use `from_device_memory` and `from_host_memory` to create - a new instance from either device or host memory respectively. - - Parameters - ---------- - ptr - An integer representing a pointer to memory. - size - The size of the memory in nbytes - owner - Python object to which the lifetime of the memory allocation is tied. - This buffer will keep a reference to `owner`. - exposed - Pointer to the underlying memory - - Raises - ------ - ValueError - If size is negative - """ - - _ptr: int - _size: int - _owner: object - _exposed: bool - # The set of buffers that point to this owner. - _slices: weakref.WeakSet[Buffer] - - def __init__( - self, - *, - ptr: int, - size: int, - owner: object, - exposed: bool, - ): - if size < 0: - raise ValueError("size cannot be negative") - - self._ptr = ptr - self._size = size - self._owner = owner - self._exposed = exposed - self._slices = weakref.WeakSet() - - @classmethod - def from_device_memory(cls, data: Any, exposed: bool) -> Self: - """Create from an object providing a `__cuda_array_interface__`. - - No data is being copied. - - Parameters - ---------- - data : device-buffer-like - An object implementing the CUDA Array Interface. - exposed : bool - Mark the buffer as permanently exposed. This is used by - ExposureTrackedBuffer to determine when a deep copy is required - and by SpillableBuffer to mark the buffer unspillable. 
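# What cuda_array_interface_wrapper above produces: a tiny namespace object
# whose only job is to expose __cuda_array_interface__ for a raw (ptr, size)
# pair while keeping `owner` alive; the pointer below is a dummy value, not a
# real allocation.
from types import SimpleNamespace

wrapped = SimpleNamespace(
    __cuda_array_interface__={
        "data": (0x7F00DEAD0000, False),  # (device pointer, read-only flag)
        "shape": (1024,),
        "strides": None,                  # C-contiguous
        "typestr": "|u1",                 # raw unsigned bytes
        "version": 0,
    },
    owner=None,
)
print(wrapped.__cuda_array_interface__["shape"])   # (1024,)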
- - Returns - ------- - BufferOwner - BufferOwner wrapping `data` - - Raises - ------ - AttributeError - If data does not support the cuda array interface - ValueError - If the resulting buffer has negative size - """ - - if isinstance(data, rmm.DeviceBuffer): # Common case shortcut - ptr = data.ptr - size = data.size - else: - ptr, size = get_ptr_and_size(data.__cuda_array_interface__) - return cls(ptr=ptr, size=size, owner=data, exposed=exposed) - - @classmethod - def from_host_memory(cls, data: Any) -> Self: - """Create an owner from a buffer or array like object - - Data must implement `__array_interface__`, the buffer protocol, and/or - be convertible to a buffer object using `numpy.asanyarray()` - - The host memory is copied to a new device allocation. - - Raises ValueError if array isn't C-contiguous. - - Parameters - ---------- - data : Any - An object that represents host memory. - - Returns - ------- - BufferOwner - BufferOwner wrapping a device copy of `data`. - """ - - # Convert to numpy array, this will not copy data in most cases. - ary = numpy.asanyarray(data) - # Extract pointer and size - ptr, size = get_ptr_and_size(ary.__array_interface__) - # Copy to device memory - buf = rmm.DeviceBuffer(ptr=ptr, size=size) - # Create from device memory - return cls.from_device_memory(buf, exposed=False) - - @property - def size(self) -> int: - """Size of the buffer in bytes.""" - return self._size - - @property - def nbytes(self) -> int: - """Size of the buffer in bytes.""" - return self._size - - @property - def owner(self) -> object: - """Object owning the memory of the buffer.""" - return self._owner - - @property - def exposed(self) -> bool: - """The current exposure status of the buffer - - This is used by ExposureTrackedBuffer to determine when a deep copy - is required and by SpillableBuffer to mark the buffer unspillable. - """ - return self._exposed - - def mark_exposed(self) -> None: - """Mark the buffer as "exposed" permanently - - This is used by ExposureTrackedBuffer to determine when a deep copy - is required and by SpillableBuffer to mark the buffer unspillable. - - Notice, once the exposure status becomes True, it will never change - back. - """ - self._exposed = True - - def get_ptr(self, *, mode: Literal["read", "write"]) -> int: - """Device pointer to the start of the buffer. - - Parameters - ---------- - mode : str - Supported values are {"read", "write"} - If "write", the data pointed to may be modified - by the caller. If "read", the data pointed to - must not be modified by the caller. - Failure to fulfill this contract will cause - incorrect behavior. - - Returns - ------- - int - The device pointer as an integer - - See Also - -------- - SpillableBuffer.get_ptr - ExposureTrackedBuffer.get_ptr - """ - return self._ptr - - def memoryview( - self, *, offset: int = 0, size: int | None = None - ) -> memoryview: - """Read-only access to the buffer through host memory.""" - size = self._size if size is None else size - host_buf = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( - self.get_ptr(mode="read") + offset, host_buf - ) - return memoryview(host_buf).toreadonly() - - def __str__(self) -> str: - return ( - f"<{self.__class__.__name__} size={format_bytes(self._size)} " - f"ptr={hex(self._ptr)} owner={self._owner!r}>" - ) - - -class Buffer(Serializable): - """A buffer that represents a slice or view of a `BufferOwner`. - - Use the factory function `as_buffer` to create a Buffer instance. 
- - Note - ---- - This buffer is untyped, so all indexing and sizes are in bytes. - - Parameters - ---------- - owner - The owning exposure buffer this refers to. - offset - The offset relative to the start memory of owner (in bytes). - size - The size of the buffer (in bytes). If None, use the size of owner. - """ - - def __init__( - self, - *, - owner: BufferOwner, - offset: int = 0, - size: int | None = None, - ) -> None: - size = owner.size if size is None else size - if size < 0: - raise ValueError("size cannot be negative") - if offset < 0: - raise ValueError("offset cannot be negative") - if offset + size > owner.size: - raise ValueError( - "offset+size cannot be greater than the size of owner" - ) - self._owner = owner - self._offset = offset - self._size = size - - @property - def size(self) -> int: - """Size of the buffer in bytes.""" - return self._size - - @property - def nbytes(self) -> int: - """Size of the buffer in bytes.""" - return self._size - - @property - def owner(self) -> BufferOwner: - """Object owning the memory of the buffer.""" - return self._owner - - def __getitem__(self, key: slice) -> Self: - """Create a new slice of the buffer.""" - if not isinstance(key, slice): - raise TypeError( - "Argument 'key' has incorrect type " - f"(expected slice, got {key.__class__.__name__})" - ) - start, stop, step = key.indices(self.size) - if step != 1: - raise ValueError("slice must be C-contiguous") - return self.__class__( - owner=self._owner, offset=self._offset + start, size=stop - start - ) - - def get_ptr(self, *, mode: Literal["read", "write"]) -> int: - return self._owner.get_ptr(mode=mode) + self._offset - - def memoryview(self) -> memoryview: - return self._owner.memoryview(offset=self._offset, size=self._size) - - def copy(self, deep: bool = True) -> Self: - """Return a copy of Buffer. - - Parameters - ---------- - deep : bool, default True - - If deep=True, returns a deep copy of the underlying data. - - If deep=False, returns a new `Buffer` instance that refers - to the same `BufferOwner` as this one. Thus, no device - data are being copied. - - Returns - ------- - Buffer - A new buffer that either refers to either a new or an existing - `BufferOwner` depending on the `deep` argument (see above). - """ - - # When doing a shallow copy, we just return a new slice - if not deep: - return self.__class__( - owner=self._owner, offset=self._offset, size=self._size - ) - - # Otherwise, we create a new copy of the memory - owner = self._owner.from_device_memory( - rmm.DeviceBuffer( - ptr=self._owner.get_ptr(mode="read") + self._offset, - size=self.size, - ), - exposed=False, - ) - return self.__class__(owner=owner, offset=0, size=owner.size) - - @property - def __cuda_array_interface__(self) -> Mapping: - """Implementation of the CUDA Array Interface.""" - return { - "data": (self.get_ptr(mode="write"), False), - "shape": (self.size,), - "strides": None, - "typestr": "|u1", - "version": 0, - } - - def serialize(self) -> tuple[dict, list]: - """Serialize the buffer into header and frames. - - The frames can be a mixture of memoryview, Buffer, and BufferOwner - objects. - - Returns - ------- - Tuple[dict, List] - The first element of the returned tuple is a dict containing any - serializable metadata required to reconstruct the object. The - second element is a list containing single frame. 
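# The slicing rule in __getitem__ above, in isolation: slice.indices()
# normalizes start/stop against the buffer size, and only step == 1 (a
# contiguous byte range) is accepted.
size = 100                                   # buffer size in bytes
start, stop, step = slice(10, 50).indices(size)
assert step == 1
print(start, stop - start)                   # 10 40  -> new offset and size

start, stop, step = slice(-20, None).indices(size)
print(start, stop - start)                   # 80 20  -> negative start wraps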
- """ - header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) - header["owner-type-serialized"] = pickle.dumps(type(self._owner)) - header["frame_count"] = 1 - frames = [self] - return header, frames - - @classmethod - def deserialize(cls, header: dict, frames: list) -> Self: - """Create an Buffer from a serialized representation. - - Parameters - ---------- - header : dict - The metadata required to reconstruct the object. - frames : list - The Buffer and memoryview that makes up the Buffer. - - Returns - ------- - Buffer - The deserialized Buffer. - """ - if header["frame_count"] != 1: - raise ValueError("Deserializing a Buffer expect a single frame") - frame = frames[0] - if isinstance(frame, cls): - return frame # The frame is already deserialized - - owner_type: BufferOwner = pickle.loads(header["owner-type-serialized"]) - if hasattr(frame, "__cuda_array_interface__"): - owner = owner_type.from_device_memory(frame, exposed=False) - else: - owner = owner_type.from_host_memory(frame) - return cls( - owner=owner, - offset=0, - size=owner.size, - ) - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}(owner={self._owner!r}, " - f"offset={self._offset!r}, size={self._size!r})" - ) - - def __str__(self) -> str: - return ( - f"<{self.__class__.__name__} size={format_bytes(self._size)} " - f"offset={format_bytes(self._offset)} of {self._owner}>" - ) - - -def get_ptr_and_size(array_interface: Mapping) -> tuple[int, int]: - """Retrieve the pointer and size from an array interface. - - Raises ValueError if array isn't C-contiguous. - - Parameters - ---------- - array_interface : Mapping - The array interface metadata. - - Return - ------ - pointer : int - The pointer to device or host memory - size : int - The size in bytes - """ - - shape = array_interface["shape"] or (1,) - strides = array_interface["strides"] - itemsize = cudf.dtype(array_interface["typestr"]).itemsize - if strides is None or pylibcudf.column.is_c_contiguous( - shape, strides, itemsize - ): - nelem = math.prod(shape) - ptr = array_interface["data"][0] or 0 - return ptr, nelem * itemsize - raise ValueError("Buffer data must be C-contiguous") diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py deleted file mode 100644 index 0bd8d6054b3..00000000000 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ /dev/null @@ -1,102 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -from typing import Literal, Mapping - -from typing_extensions import Self - -import cudf -from cudf.core.buffer.buffer import Buffer, BufferOwner - - -class ExposureTrackedBuffer(Buffer): - """An exposure tracked buffer. - - Parameters - ---------- - owner - The owning exposure tracked buffer this refers to. - offset - The offset relative to the start memory of owner (in bytes). - size - The size of the slice (in bytes) - """ - - def __init__( - self, - owner: BufferOwner, - offset: int = 0, - size: int | None = None, - ) -> None: - super().__init__(owner=owner, offset=offset, size=size) - self.owner._slices.add(self) - - def get_ptr(self, *, mode: Literal["read", "write"]) -> int: - if mode == "write" and cudf.get_option("copy_on_write"): - self.make_single_owner_inplace() - return super().get_ptr(mode=mode) - - def copy(self, deep: bool = True) -> Self: - """Return a copy of Buffer. - - What actually happens when `deep == False` depends on the - "copy_on_write" option. 
When copy-on-write is enabled, a shallow copy - becomes a deep copy if the buffer has been exposed. This is because we - have no control over knowing if the data is being modified when the - buffer has been exposed to third-party. - - Parameters - ---------- - deep : bool, default True - The semantics when copy-on-write is disabled: - - If deep=True, returns a deep copy of the underlying data. - - If deep=False, returns a shallow copy of the Buffer pointing - to the same underlying data. - The semantics when copy-on-write is enabled: - - From the users perspective, always a deep copy of the - underlying data. However, the data isn't actually copied - until someone writers to the returned buffer. - - Returns - ------- - ExposureTrackedBuffer - A slice pointing to either a new or the existing owner - depending on the expose status of the owner and the - copy-on-write option (see above). - """ - if cudf.get_option("copy_on_write"): - return super().copy(deep=deep or self.owner.exposed) - return super().copy(deep=deep) - - @property - def __cuda_array_interface__(self) -> Mapping: - if cudf.get_option("copy_on_write"): - self.make_single_owner_inplace() - return super().__cuda_array_interface__ - - def make_single_owner_inplace(self) -> None: - """Make sure this slice is the only one pointing to the owner. - - This is used by copy-on-write to trigger a deep copy when write - access is detected. - - Parameters - ---------- - data : device-buffer-like - An object implementing the CUDA Array Interface. - - Returns - ------- - Buffer - Buffer representing the same device memory as `data` - """ - - if len(self.owner._slices) > 1: - # If this is not the only slice pointing to `self.owner`, we - # point to a new copy of our slice of `self.owner`. - t = self.copy(deep=True) - self._owner = t.owner - self._offset = t._offset - self._size = t._size - self._owner._slices.add(self) diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py deleted file mode 100644 index ed351a6b107..00000000000 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ /dev/null @@ -1,504 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
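The copy semantics described in this docstring are driven by the global `copy_on_write` option. A minimal user-level sketch (assuming a working cudf/GPU install; exact scalar reprs may differ):

import cudf

cudf.set_option("copy_on_write", True)

s = cudf.Series([1, 2, 3])
t = s.copy(deep=False)   # behaves like a deep copy from the user's point of view...
t[0] = 99                # ...but the device data is only copied on this first write
print(s[0], t[0])        # 1 99 -- the original buffer is untouched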
- -from __future__ import annotations - -import gc -import io -import textwrap -import threading -import traceback -import warnings -import weakref -from collections import defaultdict -from contextlib import contextmanager -from dataclasses import dataclass -from functools import partial -from typing import TYPE_CHECKING - -import rmm.mr - -from cudf.options import get_option -from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.string import format_bytes - -if TYPE_CHECKING: - from cudf.core.buffer.spillable_buffer import SpillableBufferOwner - -_spill_cudf_nvtx_annotate = partial( - _performance_tracking, domain="cudf_python-spill" -) - - -def get_traceback() -> str: - """Pretty print current traceback to a string""" - with io.StringIO() as f: - traceback.print_stack(file=f) - f.seek(0) - return f.read() - - -def get_rmm_memory_resource_stack( - mr: rmm.mr.DeviceMemoryResource, -) -> list[rmm.mr.DeviceMemoryResource]: - """Get the RMM resource stack - - Parameters - ---------- - mr : rmm.mr.DeviceMemoryResource - Top of the resource stack - - Return - ------ - list - List of RMM resources - """ - - if hasattr(mr, "upstream_mr"): - return [mr] + get_rmm_memory_resource_stack(mr.upstream_mr) - return [mr] - - -class SpillStatistics: - """Gather spill statistics - - Levels of information gathered: - 0 - disabled (no overhead). - 1+ - duration and number of bytes spilled (very low overhead). - 2+ - a traceback for each time a spillable buffer is exposed - permanently (potential high overhead). - - The statistics are printed when spilling-on-demand fails to find - any buffer to spill. It is possible to retrieve the statistics - manually through the spill manager, see example below. - - Parameters - ---------- - level : int - If not 0, enables statistics at the specified level. - - Examples - -------- - >>> import cudf - >>> from cudf.core.buffer.spill_manager import get_global_manager - >>> manager = get_global_manager() - >>> manager.statistics - - >>> df = cudf.DataFrame({"a": [1,2,3]}) - >>> manager.spill_to_device_limit(1) # Spill df - 24 - >>> print(get_global_manager().statistics) - Spill Statistics (level=1): - Spilling (level >= 1): - gpu => cpu: 24B in 0.0033579860000827466s - """ - - @dataclass - class Expose: - traceback: str - count: int = 1 - total_nbytes: int = 0 - spilled_nbytes: int = 0 - - spill_totals: dict[tuple[str, str], tuple[int, float]] - - def __init__(self, level) -> None: - self.lock = threading.Lock() - self.level = level - self.spill_totals = defaultdict(lambda: (0, 0)) - # Maps each traceback to a Expose - self.exposes: dict[str, SpillStatistics.Expose] = {} - - def log_spill(self, src: str, dst: str, nbytes: int, time: float) -> None: - """Log a (un-)spilling event - - Parameters - ---------- - src : str - The memory location before spilling. - dst : str - The memory location after spilling. - nbytes : int - Number of bytes (un-)spilled. - nbytes : float - Elapsed time the event took in seconds. - """ - if self.level < 1: - return - with self.lock: - total_nbytes, total_time = self.spill_totals[(src, dst)] - self.spill_totals[(src, dst)] = ( - total_nbytes + nbytes, - total_time + time, - ) - - def log_expose(self, buf: SpillableBufferOwner) -> None: - """Log an expose event - - We track logged exposes by grouping them by their traceback such - that `self.exposes` maps tracebacks (as strings) to their logged - data (as `Expose`). - - Parameters - ---------- - buf : spillabe-buffer - The buffer being exposed. 
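Level-1 statistics are enabled together with spilling through cudf options, after which the docstring example above applies. A sketch, assuming the `spill` and `spill_stats` options are set before the first device allocation (the global manager is created lazily on first use):

import cudf
from cudf.core.buffer.spill_manager import get_global_manager

cudf.set_option("spill", True)     # enable the global spill manager
cudf.set_option("spill_stats", 1)  # level 1: bytes and duration per spill

df = cudf.DataFrame({"a": range(1_000)})
manager = get_global_manager()
manager.spill_to_device_limit(0)   # spill everything spillable to host
print(manager.statistics)          # per (src, dst) totals, as shown above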
- """ - if self.level < 2: - return - with self.lock: - tb = get_traceback() - stat = self.exposes.get(tb, None) - spilled_nbytes = buf.nbytes if buf.is_spilled else 0 - if stat is None: - self.exposes[tb] = self.Expose( - traceback=tb, - total_nbytes=buf.nbytes, - spilled_nbytes=spilled_nbytes, - ) - else: - stat.count += 1 - stat.total_nbytes += buf.nbytes - stat.spilled_nbytes += spilled_nbytes - - def __repr__(self) -> str: - return f"" - - def __str__(self) -> str: - with self.lock: - ret = f"Spill Statistics (level={self.level}):\n" - if self.level == 0: - return ret[:-1] + " N/A" - - # Print spilling stats - ret += " Spilling (level >= 1):" - if len(self.spill_totals) == 0: - ret += " None" - ret += "\n" - for (src, dst), (nbytes, time) in self.spill_totals.items(): - ret += f" {src} => {dst}: " - ret += f"{format_bytes(nbytes)} in {time:.3f}s\n" - - # Print expose stats - ret += " Exposed buffers (level >= 2): " - if self.level < 2: - return ret + "disabled" - if len(self.exposes) == 0: - ret += "None" - ret += "\n" - for s in sorted(self.exposes.values(), key=lambda x: -x.count): - ret += textwrap.indent( - ( - f"exposed {s.count} times, " - f"total: {format_bytes(s.total_nbytes)}, " - f"spilled: {format_bytes(s.spilled_nbytes)}, " - f"traceback:\n{s.traceback}" - ), - prefix=" " * 4, - ) - return ret[:-1] # Remove last `\n` - - -class SpillManager: - """Manager of spillable buffers. - - This class implements tracking of all known spillable buffers, on-demand - spilling of said buffers, and (optionally) maintains a memory usage limit. - - When `device_memory_limit=`, the manager will try keep - the device memory usage below the specified limit by spilling of spillable - buffers continuously, which will introduce a modest overhead. - Notice, this is a soft limit. The memory usage might exceed the limit if - too many buffers are unspillable. - - Parameters - ---------- - device_memory_limit: int, optional - If not None, this is the device memory limit in bytes that triggers - device to host spilling. The global manager sets this to the value - of `CUDF_SPILL_DEVICE_LIMIT` or None. - statistic_level: int, optional - If not 0, enables statistics at the specified level. See - SpillStatistics for the different levels. - """ - - _buffers: weakref.WeakValueDictionary[int, SpillableBufferOwner] - statistics: SpillStatistics - - def __init__( - self, - *, - device_memory_limit: int | None = None, - statistic_level: int = 0, - ) -> None: - self._lock = threading.Lock() - self._buffers = weakref.WeakValueDictionary() - self._id_counter = 0 - self._device_memory_limit = device_memory_limit - self.statistics = SpillStatistics(statistic_level) - - def _out_of_memory_handle(self, nbytes: int, *, retry_once=True) -> bool: - """Try to handle an out-of-memory error by spilling - - This can by used as the callback function to RMM's - `FailureCallbackResourceAdaptor` - - Parameters - ---------- - nbytes : int - Number of bytes to try to spill. - retry_once : bool, optional - If True, call `gc.collect()` and retry once. - - Return - ------ - bool - True if any buffers were freed otherwise False. - - Warning - ------- - In order to avoid deadlock, this function should not lock - already locked buffers. 
- """ - # Let's try to spill device memory - - spilled = self.spill_device_memory(nbytes=nbytes) - - if spilled > 0: - return True # Ask RMM to retry the allocation - - if retry_once: - # Let's collect garbage and try one more time - gc.collect() - return self._out_of_memory_handle(nbytes, retry_once=False) - - # TODO: write to log instead of stdout - print( - f"[WARNING] RMM allocation of {format_bytes(nbytes)} bytes " - "failed, spill-on-demand couldn't find any device memory to " - f"spill:\n{repr(self)}\ntraceback:\n{get_traceback()}\n" - f"{self.statistics}" - ) - return False # Since we didn't find anything to spill, we give up - - def add(self, buffer: SpillableBufferOwner) -> None: - """Add buffer to the set of managed buffers - - The manager keeps a weak reference to the buffer - - Parameters - ---------- - buffer : SpillableBufferOwner - The buffer to manage - """ - if buffer.size > 0 and not buffer.exposed: - with self._lock: - self._buffers[self._id_counter] = buffer - self._id_counter += 1 - self.spill_to_device_limit() - - def buffers( - self, order_by_access_time: bool = False - ) -> tuple[SpillableBufferOwner, ...]: - """Get all managed buffers - - Parameters - ---------- - order_by_access_time : bool, optional - Order the buffer by access time (ascending order) - - Return - ------ - tuple - Tuple of buffers - """ - with self._lock: - ret = tuple(self._buffers.values()) - if order_by_access_time: - ret = tuple(sorted(ret, key=lambda b: b.last_accessed)) - return ret - - @_spill_cudf_nvtx_annotate - def spill_device_memory(self, nbytes: int) -> int: - """Try to spill device memory - - This function is safe to call doing spill-on-demand - since it does not lock buffers already locked. - - Parameters - ---------- - nbytes : int - Number of bytes to try to spill - - Return - ------ - int - Number of actually bytes spilled. - """ - spilled = 0 - for buf in self.buffers(order_by_access_time=True): - if buf.lock.acquire(blocking=False): - try: - if not buf.is_spilled and buf.spillable: - buf.spill(target="cpu") - spilled += buf.size - if spilled >= nbytes: - break - finally: - buf.lock.release() - return spilled - - def spill_to_device_limit(self, device_limit: int | None = None) -> int: - """Try to spill device memory until device limit - - Notice, by default this is a no-op. - - Parameters - ---------- - device_limit : int, optional - Limit in bytes. If None, the value of the environment variable - `CUDF_SPILL_DEVICE_LIMIT` is used. If this is not set, the method - does nothing and returns 0. - - Return - ------ - int - The number of bytes spilled. 
- """ - limit = ( - self._device_memory_limit if device_limit is None else device_limit - ) - if limit is None: - return 0 - unspilled = sum( - buf.size for buf in self.buffers() if not buf.is_spilled - ) - return self.spill_device_memory(nbytes=unspilled - limit) - - def __repr__(self) -> str: - spilled = sum(buf.size for buf in self.buffers() if buf.is_spilled) - unspilled = sum( - buf.size for buf in self.buffers() if not buf.is_spilled - ) - unspillable = 0 - for buf in self.buffers(): - if not (buf.is_spilled or buf.spillable): - unspillable += buf.size - unspillable_ratio = unspillable / unspilled if unspilled else 0 - - dev_limit = "N/A" - if self._device_memory_limit is not None: - dev_limit = format_bytes(self._device_memory_limit) - - return ( - f"" - ) - - -# The global manager has three states: -# - Uninitialized -# - Initialized to None (spilling disabled) -# - Initialized to a SpillManager instance (spilling enabled) -_global_manager_uninitialized: bool = True -_global_manager: SpillManager | None = None - - -def set_global_manager(manager: SpillManager | None) -> None: - """Set the global manager, which if None disables spilling""" - - global _global_manager, _global_manager_uninitialized - if _global_manager is not None: - gc.collect() - buffers = _global_manager.buffers() - if len(buffers) > 0: - warnings.warn(f"overwriting non-empty manager: {buffers}") - - _global_manager = manager - _global_manager_uninitialized = False - - -def get_global_manager() -> SpillManager | None: - """Get the global manager or None if spilling is disabled""" - global _global_manager_uninitialized - if _global_manager_uninitialized: - if get_option("spill"): - manager = SpillManager( - device_memory_limit=get_option("spill_device_limit"), - statistic_level=get_option("spill_stats"), - ) - set_global_manager(manager) - if get_option("spill_on_demand"): - set_spill_on_demand_globally() - else: - set_global_manager(None) - return _global_manager - - -def set_spill_on_demand_globally() -> None: - """Enable spill on demand in the current global spill manager. - - Warning: this modifies the current RMM memory resource. A memory resource - to handle out-of-memory errors is pushed onto the RMM memory resource stack. - - Raises - ------ - ValueError - If no global spill manager exists (spilling is disabled). - ValueError - If a failure callback resource is already in the resource stack. - """ - - manager = get_global_manager() - if manager is None: - raise ValueError( - "Cannot enable spill on demand with no global spill manager" - ) - mr = rmm.mr.get_current_device_resource() - if any( - isinstance(m, rmm.mr.FailureCallbackResourceAdaptor) - for m in get_rmm_memory_resource_stack(mr) - ): - raise ValueError( - "Spill on demand (or another failure callback resource) " - "is already registered" - ) - rmm.mr.set_current_device_resource( - rmm.mr.FailureCallbackResourceAdaptor( - mr, manager._out_of_memory_handle - ) - ) - - -@contextmanager -def spill_on_demand_globally(): - """Context to enable spill on demand temporarily. - - Warning: this modifies the current RMM memory resource. A memory resource - to handle out-of-memory errors is pushed onto the RMM memory resource stack - when entering the context and popped again when exiting. - - Raises - ------ - ValueError - If no global spill manager exists (spilling is disabled). - ValueError - If a failure callback resource is already in the resource stack. - ValueError - If the RMM memory source stack was changed while in the context. 
- """ - set_spill_on_demand_globally() - # Save the new memory resource stack for later cleanup - mr_stack = get_rmm_memory_resource_stack( - rmm.mr.get_current_device_resource() - ) - try: - yield - finally: - mr = rmm.mr.get_current_device_resource() - if mr_stack != get_rmm_memory_resource_stack(mr): - raise ValueError( - "RMM memory source stack was changed while in the context" - ) - rmm.mr.set_current_device_resource(mr_stack[1]) diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py deleted file mode 100644 index 4c9e524ee05..00000000000 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ /dev/null @@ -1,466 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import collections.abc -import pickle -import time -import weakref -from threading import RLock -from typing import TYPE_CHECKING, Any, Literal - -import numpy -import nvtx -from typing_extensions import Self - -import rmm - -from cudf.core.buffer.buffer import ( - Buffer, - BufferOwner, - cuda_array_interface_wrapper, - host_memory_allocation, -) -from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer -from cudf.utils.performance_tracking import _get_color_for_nvtx -from cudf.utils.string import format_bytes - -if TYPE_CHECKING: - from cudf.core.buffer.spill_manager import SpillManager - - -class SpillLock: - pass - - -class DelayedPointerTuple(collections.abc.Sequence): - """ - A delayed version of the "data" field in __cuda_array_interface__. - - The idea is to delay the access to `Buffer.ptr` until the user - actually accesses the data pointer. - - For instance, in many cases __cuda_array_interface__ is accessed - only to determine whether an object is a CUDA object or not. - - TODO: this doesn't support libraries such as PyTorch that declare - the tuple of __cuda_array_interface__["data"] in Cython. In such - cases, Cython will raise an error because DelayedPointerTuple - isn't a "real" tuple. - """ - - def __init__(self, buffer) -> None: - self._buf = buffer - - def __len__(self): - return 2 - - def __getitem__(self, i): - if i == 0: - return self._buf.get_ptr(mode="write") - elif i == 1: - return False - raise IndexError("tuple index out of range") - - -class SpillableBufferOwner(BufferOwner): - """A Buffer that supports spilling memory off the GPU to avoid OOMs. - - This buffer supports spilling the represented data to host memory. - Spilling can be done manually by calling `.spill(target="cpu")` but - usually the associated spilling manager triggers spilling based on current - device memory usage see `cudf.core.buffer.spill_manager.SpillManager`. - Unspill is triggered automatically when accessing the data of the buffer. - - The buffer might not be spillable, which is based on the "expose" status of - the buffer. We say that the buffer has been exposed if the device pointer - (integer or void*) has been accessed outside of SpillableBufferOwner. - In this case, we cannot invalidate the device pointer by moving the data - to host. - - A buffer can be exposed permanently at creation or by accessing the `.ptr` - property. To avoid this, one can use `.get_ptr()` instead, which support - exposing the buffer temporarily. - - Use the factory function `as_buffer` to create a SpillableBufferOwner - instance. 
- """ - - lock: RLock - _spill_locks: weakref.WeakSet - _last_accessed: float - _ptr_desc: dict[str, Any] - _manager: SpillManager - - def _finalize_init(self, ptr_desc: dict[str, Any]) -> None: - """Finish initialization of the spillable buffer - - This implements the common initialization that `from_device_memory` - and `from_host_memory` are missing. - - Parameters - ---------- - ptr_desc : dict - Description of the memory. - """ - - from cudf.core.buffer.spill_manager import get_global_manager - - self.lock = RLock() - self._spill_locks = weakref.WeakSet() - self._last_accessed = time.monotonic() - self._ptr_desc = ptr_desc - manager = get_global_manager() - if manager is None: - raise ValueError( - f"cannot create {self.__class__} without " - "a global spill manager" - ) - - self._manager = manager - self._manager.add(self) - - @classmethod - def from_device_memory(cls, data: Any, exposed: bool) -> Self: - """Create a spillabe buffer from device memory. - - No data is being copied. - - Parameters - ---------- - data : device-buffer-like - An object implementing the CUDA Array Interface. - exposed : bool - Mark the buffer as permanently exposed (unspillable). - - Returns - ------- - SpillableBufferOwner - Buffer representing the same device memory as `data` - """ - ret = super().from_device_memory(data, exposed=exposed) - ret._finalize_init(ptr_desc={"type": "gpu"}) - return ret - - @classmethod - def from_host_memory(cls, data: Any) -> Self: - """Create a spillabe buffer from host memory. - - Data must implement `__array_interface__`, the buffer protocol, and/or - be convertible to a buffer object using `numpy.asanyarray()` - - The new buffer is marked as spilled to host memory already. - - Raises ValueError if array isn't C-contiguous. - - Parameters - ---------- - data : Any - An object that represents host memory. - - Returns - ------- - SpillableBufferOwner - Buffer representing a copy of `data`. - """ - - # Convert to a memoryview using numpy array, this will not copy data - # in most cases. - data = memoryview(numpy.asanyarray(data)) - if not data.c_contiguous: - raise ValueError("Buffer data must be C-contiguous") - data = data.cast("B") # Make sure itemsize==1 - - # Create an already spilled buffer - ret = cls(ptr=0, size=data.nbytes, owner=None, exposed=False) - ret._finalize_init(ptr_desc={"type": "cpu", "memoryview": data}) - return ret - - @property - def is_spilled(self) -> bool: - return self._ptr_desc["type"] != "gpu" - - def spill(self, target: str = "cpu") -> None: - """Spill or un-spill this buffer in-place - - Parameters - ---------- - target : str - The target of the spilling. - """ - - time_start = time.perf_counter() - with self.lock: - ptr_type = self._ptr_desc["type"] - if ptr_type == target: - return - - if not self.spillable: - raise ValueError( - f"Cannot in-place move an unspillable buffer: {self}" - ) - - if (ptr_type, target) == ("gpu", "cpu"): - with nvtx.annotate( - message="SpillDtoH", - color=_get_color_for_nvtx("SpillDtoH"), - domain="cudf_python-spill", - ): - host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host( - self._ptr, host_mem - ) - self._ptr_desc["memoryview"] = host_mem - self._ptr = 0 - self._owner = None - elif (ptr_type, target) == ("cpu", "gpu"): - # Notice, this operation is prone to deadlock because the RMM - # allocation might trigger spilling-on-demand which in turn - # trigger a new call to this buffer's `spill()`. 
- # Therefore, it is important that spilling-on-demand doesn't - # try to unspill an already locked buffer! - with nvtx.annotate( - message="SpillHtoD", - color=_get_color_for_nvtx("SpillHtoD"), - domain="cudf_python-spill", - ): - dev_mem = rmm.DeviceBuffer.to_device( - self._ptr_desc.pop("memoryview") - ) - self._ptr = dev_mem.ptr - self._owner = dev_mem - assert self._size == dev_mem.size - else: - # TODO: support moving to disk - raise ValueError(f"Unknown target: {target}") - self._ptr_desc["type"] = target - - time_end = time.perf_counter() - self._manager.statistics.log_spill( - src=ptr_type, - dst=target, - nbytes=self.size, - time=time_end - time_start, - ) - - def mark_exposed(self) -> None: - """Mark the buffer as "exposed" and make it unspillable permanently. - - This also unspills the buffer (unspillable buffers cannot be spilled!). - """ - - self._manager.spill_to_device_limit() - with self.lock: - if not self.exposed: - self._manager.statistics.log_expose(self) - self.spill(target="gpu") - super().mark_exposed() - self._last_accessed = time.monotonic() - - def spill_lock(self, spill_lock: SpillLock) -> None: - """Spill lock the buffer - - Mark the buffer as unspillable while `spill_lock` is alive, - which is tracked by monitoring a weakref to `spill_lock`. - - Parameters - ---------- - spill_lock : SpillLock - The object that defines the scope of the lock. - """ - - with self.lock: - self.spill(target="gpu") - self._spill_locks.add(spill_lock) - - def get_ptr(self, *, mode: Literal["read", "write"]) -> int: - """Get a device pointer to the memory of the buffer. - - If this is called within an `acquire_spill_lock` context, - a reference to this buffer is added to spill_lock, which - disable spilling of this buffer while in the context. - - If this is *not* called within a `acquire_spill_lock` context, - this buffer is marked as unspillable permanently. - - Returns - ------- - int - The device pointer as an integer - """ - from cudf.core.buffer.utils import get_spill_lock - - spill_lock = get_spill_lock() - if spill_lock is None: - self.mark_exposed() - else: - self.spill_lock(spill_lock) - self._last_accessed = time.monotonic() - return self._ptr - - def memory_info(self) -> tuple[int, int, str]: - """Get pointer, size, and device type of this buffer. - - Warning, it is not safe to access the pointer value without - spill lock the buffer manually. This method neither exposes - nor spill locks the buffer. 
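With spilling enabled, device allocations made through cudf are backed by these owners, and the spill/unspill round trip can be observed directly. A sketch that pokes at internal, unstable attributes (`_column`, `base_data`) and assumes spilling was enabled before the Series was created:

import cudf

cudf.set_option("spill", True)

s = cudf.Series([1, 2, 3])
buf = s._column.base_data  # a SpillableBuffer slice over a SpillableBufferOwner
print(buf.is_spilled)      # False -- the data lives on the device
buf.spill(target="cpu")    # move the owner's bytes to host memory
print(buf.is_spilled)      # True
print(s.sum())             # 6 -- access unspills transparently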
- - Return - ------ - int - The memory pointer as an integer (device or host memory) - int - The size of the memory in bytes - str - The device type as a string ("cpu" or "gpu") - """ - - if self._ptr_desc["type"] == "gpu": - ptr = self._ptr - elif self._ptr_desc["type"] == "cpu": - ptr = numpy.array( - self._ptr_desc["memoryview"], copy=False - ).__array_interface__["data"][0] - return (ptr, self.nbytes, self._ptr_desc["type"]) - - @property - def spillable(self) -> bool: - return not self.exposed and len(self._spill_locks) == 0 - - @property - def last_accessed(self) -> float: - return self._last_accessed - - @property - def __cuda_array_interface__(self) -> dict: - return { - "data": DelayedPointerTuple(self), - "shape": (self.size,), - "strides": None, - "typestr": "|u1", - "version": 0, - } - - def memoryview( - self, *, offset: int = 0, size: int | None = None - ) -> memoryview: - size = self._size if size is None else size - with self.lock: - if self.spillable: - self.spill(target="cpu") - return self._ptr_desc["memoryview"][offset : offset + size] - else: - assert self._ptr_desc["type"] == "gpu" - ret = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( - self._ptr + offset, ret - ) - return ret - - def __str__(self) -> str: - if self._ptr_desc["type"] != "gpu": - ptr_info = str(self._ptr_desc) - else: - ptr_info = str(hex(self._ptr)) - return ( - f"<{self.__class__.__name__} size={format_bytes(self._size)} " - f"spillable={self.spillable} exposed={self.exposed} " - f"num-spill-locks={len(self._spill_locks)} " - f"ptr={ptr_info} owner={repr(self._owner)}>" - ) - - -class SpillableBuffer(ExposureTrackedBuffer): - """A slice of a spillable buffer""" - - _owner: SpillableBufferOwner - - def spill(self, target: str = "cpu") -> None: - return self._owner.spill(target=target) - - @property - def is_spilled(self) -> bool: - return self._owner.is_spilled - - @property - def spillable(self) -> bool: - return self._owner.spillable - - def spill_lock(self, spill_lock: SpillLock) -> None: - self._owner.spill_lock(spill_lock=spill_lock) - - def memory_info(self) -> tuple[int, int, str]: - (ptr, _, device_type) = self._owner.memory_info() - return (ptr + self._offset, self.nbytes, device_type) - - def serialize(self) -> tuple[dict, list]: - """Serialize the Buffer - - Normally, we would use `[self]` as the frames. This would work but - also mean that `self` becomes exposed permanently if the frames are - later accessed through `__cuda_array_interface__`, which is exactly - what libraries like Dask+UCX would do when communicating! - - The sound solution is to modify Dask et al. so that they access the - frames through `.get_ptr()` and holds on to the `spill_lock` until - the frame has been transferred. However, until this adaptation we - use a hack where the frame is a `Buffer` with a `spill_lock` as the - owner, which makes `self` unspillable while the frame is alive but - doesn't expose `self` when `__cuda_array_interface__` is accessed. - - Warning, this hack means that the returned frame must be copied before - given to `.deserialize()`, otherwise we would have a `Buffer` pointing - to memory already owned by an existing `SpillableBufferOwner`. 
- """ - header: dict[str, Any] = {} - frames: list[Buffer | memoryview] - with self._owner.lock: - header["type-serialized"] = pickle.dumps(self.__class__) - header["owner-type-serialized"] = pickle.dumps(type(self._owner)) - header["frame_count"] = 1 - if self.is_spilled: - frames = [self.memoryview()] - else: - # TODO: Use `frames=[self]` instead of this hack, see doc above - spill_lock = SpillLock() - self.spill_lock(spill_lock) - ptr, size, _ = self.memory_info() - frames = [ - Buffer( - owner=BufferOwner.from_device_memory( - cuda_array_interface_wrapper( - ptr=ptr, - size=size, - owner=(self._owner, spill_lock), - ), - exposed=False, - ) - ) - ] - return header, frames - - def copy(self, deep: bool = True) -> Self: - from cudf.core.buffer.utils import acquire_spill_lock - - if not deep: - return super().copy(deep=False) - - if self.is_spilled: - # In this case, we make the new copy point to the same spilled - # data in host memory. We can do this since spilled data is never - # modified. - owner = self._owner.from_host_memory(self.memoryview()) - return self.__class__(owner=owner, offset=0, size=owner.size) - - with acquire_spill_lock(): - return super().copy(deep=deep) - - @property - def __cuda_array_interface__(self) -> dict: - return { - "data": DelayedPointerTuple(self), - "shape": (self.size,), - "strides": None, - "typestr": "|u1", - "version": 0, - } diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py deleted file mode 100644 index 42a1501c914..00000000000 --- a/python/cudf/cudf/core/buffer/utils.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import threading -from contextlib import ContextDecorator -from typing import Any - -from cudf.core.buffer.buffer import ( - Buffer, - BufferOwner, - cuda_array_interface_wrapper, - get_ptr_and_size, -) -from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer -from cudf.core.buffer.spill_manager import get_global_manager -from cudf.core.buffer.spillable_buffer import ( - SpillableBuffer, - SpillableBufferOwner, - SpillLock, -) -from cudf.options import get_option - - -def get_buffer_owner(data: Any) -> BufferOwner | None: - """Get the owner of `data`, if one exists - - Search through the stack of data owners in order to find an - owner BufferOwner (incl. subclasses). - - Parameters - ---------- - data - The data object to search for a BufferOwner instance - - Return - ------ - BufferOwner or None - The owner of `data` if found otherwise None. - """ - - if isinstance(data, BufferOwner): - return data - if hasattr(data, "owner"): - return get_buffer_owner(data.owner) - return None - - -def as_buffer( - data: int | Any, - *, - size: int | None = None, - owner: object | None = None, - exposed: bool = False, -) -> Buffer: - """Factory function to wrap `data` in a Buffer object. - - If `data` isn't a buffer already, a new buffer that points to the memory of - `data` is created. If `data` represents host memory, it is copied to a new - `rmm.DeviceBuffer` device allocation. Otherwise, the memory of `data` is - **not** copied, instead the new buffer keeps a reference to `data` in order - to retain its lifetime. - - If `data` is an integer, it is assumed to point to device memory. - - Raises ValueError if `data` isn't C-contiguous. - - If copy-on-write is enabled, an ExposureTrackedBuffer is returned. - - If spilling is enabled, a SpillableBuffer that refers to a - SpillableBufferOwner is returned. 
If `data` is owned by a spillable buffer, - it must either be "exposed" or spill locked (called within an - acquire_spill_lock context). This is to guarantee that the memory of `data` - isn't spilled before this function gets to calculate the offset of the new - SpillableBuffer. - - - Parameters - ---------- - data : int or buffer-like or array-like - An integer representing a pointer to device memory or a buffer-like - or array-like object. When not an integer, `size` and `owner` must - be None. - size : int, optional - Size of device memory in bytes. Must be specified if `data` is an - integer. - owner : object, optional - Python object to which the lifetime of the memory allocation is tied. - A reference to this object is kept in the returned Buffer. - exposed : bool, optional - Mark the buffer as permanently exposed. This is used by - ExposureTrackedBuffer to determine when a deep copy is required and - by SpillableBuffer to mark the buffer unspillable. - - Return - ------ - Buffer - A buffer instance that represents the device memory of `data`. - """ - - if isinstance(data, Buffer): - return data - - # We handle the integer argument in the factory function by wrapping - # the pointer in a `__cuda_array_interface__` exposing object so that - # the Buffer (and its sub-classes) do not have to. - if isinstance(data, int): - if size is None: - raise ValueError( - "size must be specified when `data` is an integer" - ) - data = cuda_array_interface_wrapper(ptr=data, size=size, owner=owner) - elif size is not None or owner is not None: - raise ValueError( - "`size` and `owner` must be None when " - "`data` is a buffer-like or array-like object" - ) - - # Find the buffer types to return based on the current config - owner_class: type[BufferOwner] - buffer_class: type[Buffer] - if get_global_manager() is not None: - owner_class = SpillableBufferOwner - buffer_class = SpillableBuffer - elif get_option("copy_on_write"): - owner_class = BufferOwner - buffer_class = ExposureTrackedBuffer - else: - owner_class = BufferOwner - buffer_class = Buffer - - # Handle host memory, - if not hasattr(data, "__cuda_array_interface__"): - if exposed: - raise ValueError("cannot created exposed host memory") - return buffer_class(owner=owner_class.from_host_memory(data)) - - # Check if `data` is owned by a known class - owner = get_buffer_owner(data) - if owner is None: # `data` is new device memory - return buffer_class( - owner=owner_class.from_device_memory(data, exposed=exposed) - ) - - # At this point, we know that `data` is owned by a known class, which - # should be the same class as specified by the current config (see above) - assert owner.__class__ is owner_class - if ( - isinstance(owner, SpillableBufferOwner) - and not owner.exposed - and get_spill_lock() is None - ): - raise ValueError( - "An owning spillable buffer must " - "either be exposed or spill locked." 
- ) - ptr, size = get_ptr_and_size(data.__cuda_array_interface__) - base_ptr = owner.get_ptr(mode="read") - if size > 0 and base_ptr == 0: - raise ValueError("Cannot create a non-empty slice of a null buffer") - return buffer_class(owner=owner, offset=ptr - base_ptr, size=size) - - -_thread_spill_locks: dict[int, tuple[SpillLock | None, int]] = {} - - -def _push_thread_spill_lock() -> None: - _id = threading.get_ident() - spill_lock, count = _thread_spill_locks.get(_id, (None, 0)) - if spill_lock is None: - spill_lock = SpillLock() - _thread_spill_locks[_id] = (spill_lock, count + 1) - - -def _pop_thread_spill_lock() -> None: - _id = threading.get_ident() - spill_lock, count = _thread_spill_locks[_id] - if count == 1: - spill_lock = None - _thread_spill_locks[_id] = (spill_lock, count - 1) - - -class acquire_spill_lock(ContextDecorator): - """Decorator and context to set spill lock automatically. - - All calls to `get_spill_lock()` within the decorated function or context - will return a spill lock with a lifetime bound to the function or context. - - Developer Notes - --------------- - We use the global variable `_thread_spill_locks` to track the global spill - lock state. To support concurrency, each thread tracks its own state by - pushing and popping from `_thread_spill_locks` using its thread ID. - """ - - def __enter__(self) -> SpillLock | None: - _push_thread_spill_lock() - return get_spill_lock() - - def __exit__(self, *exc): - _pop_thread_spill_lock() - - -def get_spill_lock() -> SpillLock | None: - """Return a spill lock within the context of `acquire_spill_lock` or None - - Returns None, if spilling is disabled. - """ - - if get_global_manager() is None: - return None - _id = threading.get_ident() - spill_lock, _ = _thread_spill_locks.get(_id, (None, 0)) - return spill_lock diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py deleted file mode 100644 index 6ca64a0a2be..00000000000 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import cudf -from cudf._lib.nvtext.byte_pair_encode import ( - BPEMergePairs as cpp_merge_pairs, - byte_pair_encoding as cpp_byte_pair_encoding, -) - - -class BytePairEncoder: - """ - Given a merge pairs strings series, performs byte pair encoding on - a strings series using the provided separator. - - Parameters - ---------- - merges_pairs : str - Strings column of merge pairs - - Returns - ------- - BytePairEncoder - """ - - def __init__(self, merges_pair: "cudf.Series"): - self.merge_pairs = cpp_merge_pairs(merges_pair._column) - - def __call__(self, text, separator: str = " ") -> cudf.Series: - """ - - Parameters - ---------- - text : cudf string series - The strings to be encoded. - - Returns - ------- - Encoded strings - - Examples - -------- - >>> import cudf - >>> from cudf.core.byte_pair_encoding import BytePairEncoder - >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t", - ... "c e", "es t", "en ce", "T h", "Th is", - ... 
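The factory and the spill-lock helpers above are the intended entry points for the rest of cudf. A combined usage sketch (when spilling is disabled, `get_spill_lock()` is simply None and the buffers are plain `Buffer` objects):

import rmm

from cudf.core.buffer.utils import acquire_spill_lock, as_buffer, get_spill_lock

dbuf = rmm.DeviceBuffer(size=64)
whole = as_buffer(dbuf)                          # wrap existing device memory, no copy
head = as_buffer(dbuf.ptr, size=32, owner=dbuf)  # pointer form: size and owner required
print(whole.size, head.size)                     # 64 32

with acquire_spill_lock():
    # Inside the context, get_ptr() attaches the spill lock instead of
    # permanently exposing (and thus unspillable-marking) the buffer.
    print(get_spill_lock() is not None)
    ptr = whole.get_ptr(mode="read")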
"t est", "s ent", "t h", "th is"]) - >>> bpe = BytePairEncoder(mps) - >>> str_series = cudf.Series(['This is the sentence', 'thisisit']) - >>> bpe(str_series) - 0 This is a sent ence - 1 this is it - dtype: object - """ - sep = cudf.Scalar(separator, dtype="str") - result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) - - return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py deleted file mode 100644 index 06791df7dc0..00000000000 --- a/python/cudf/cudf/core/column/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -""" -isort: skip_file -""" - -from cudf.core.column.categorical import CategoricalColumn -from cudf.core.column.column import ( - ColumnBase, - as_column, - build_column, - column_empty, - column_empty_like, - concat_columns, - deserialize_columns, - serialize_columns, -) -from cudf.core.column.datetime import DatetimeColumn # noqa: F401 -from cudf.core.column.datetime import DatetimeTZColumn # noqa: F401 -from cudf.core.column.lists import ListColumn # noqa: F401 -from cudf.core.column.numerical import NumericalColumn # noqa: F401 -from cudf.core.column.string import StringColumn # noqa: F401 -from cudf.core.column.struct import StructColumn # noqa: F401 -from cudf.core.column.timedelta import TimeDeltaColumn # noqa: F401 -from cudf.core.column.interval import IntervalColumn # noqa: F401 -from cudf.core.column.decimal import ( # noqa: F401 - Decimal32Column, - Decimal64Column, - Decimal128Column, - DecimalBaseColumn, -) -from cudf.core.column.interval import IntervalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py deleted file mode 100644 index 864e87b5377..00000000000 --- a/python/cudf/cudf/core/column/categorical.py +++ /dev/null @@ -1,1478 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import warnings -from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast - -import numpy as np -import pandas as pd -import pyarrow as pa -from typing_extensions import Self - -import cudf -from cudf import _lib as libcudf -from cudf._lib.transform import bools_to_mask -from cudf.core.column import column -from cudf.core.column.methods import ColumnMethods -from cudf.core.dtypes import CategoricalDtype, IntervalDtype -from cudf.utils.dtypes import ( - find_common_type, - is_mixed_with_object_dtype, - min_signed_type, - min_unsigned_type, -) - -if TYPE_CHECKING: - from collections import abc - - import numba.cuda - - from cudf._typing import ( - ColumnBinaryOperand, - ColumnLike, - Dtype, - ScalarLike, - SeriesOrIndex, - SeriesOrSingleColumnIndex, - ) - from cudf.core.buffer import Buffer - from cudf.core.column import ( - ColumnBase, - DatetimeColumn, - NumericalColumn, - StringColumn, - TimeDeltaColumn, - ) - - -# Using np.int8(-1) to allow silent wrap-around when casting to uint -# it may make sense to make this dtype specific or a function. -_DEFAULT_CATEGORICAL_VALUE = np.int8(-1) - - -def as_unsigned_codes( - num_cats: int, codes: NumericalColumn -) -> NumericalColumn: - codes_dtype = min_unsigned_type(num_cats) - return cast( - cudf.core.column.numerical.NumericalColumn, codes.astype(codes_dtype) - ) - - -class CategoricalAccessor(ColumnMethods): - """ - Accessor object for categorical properties of the Series values. 
- Be aware that assigning to `categories` is a inplace operation, - while all methods return new categorical data per default. - - Parameters - ---------- - column : Column - parent : Series or CategoricalIndex - - Examples - -------- - >>> s = cudf.Series([1,2,3], dtype='category') - >>> s - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1, 2, 3] - >>> s.cat.categories - Index([1, 2, 3], dtype='int64') - >>> s.cat.reorder_categories([3,2,1]) - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [3, 2, 1] - >>> s.cat.remove_categories([1]) - 0 - 1 2 - 2 3 - dtype: category - Categories (2, int64): [2, 3] - >>> s.cat.set_categories(list('abcde')) - 0 - 1 - 2 - dtype: category - Categories (5, object): ['a', 'b', 'c', 'd', 'e'] - >>> s.cat.as_ordered() - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1 < 2 < 3] - >>> s.cat.as_unordered() - 0 1 - 1 2 - 2 3 - dtype: category - Categories (3, int64): [1, 2, 3] - """ - - _column: CategoricalColumn - - def __init__(self, parent: SeriesOrSingleColumnIndex): - if not isinstance(parent.dtype, CategoricalDtype): - raise AttributeError( - "Can only use .cat accessor with a 'category' dtype" - ) - super().__init__(parent=parent) - - @property - def categories(self) -> "cudf.core.index.Index": - """ - The categories of this categorical. - """ - return self._column.dtype.categories - - @property - def codes(self) -> cudf.Series: - """ - Return Series of codes as well as the index. - """ - index = ( - self._parent.index - if isinstance(self._parent, cudf.Series) - else None - ) - return cudf.Series._from_column(self._column.codes, index=index) - - @property - def ordered(self) -> bool: - """ - Whether the categories have an ordered relationship. - """ - return self._column.ordered - - def as_ordered(self) -> SeriesOrIndex | None: - """ - Set the Categorical to be ordered. - - Returns - ------- - Categorical - Ordered Categorical. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([10, 1, 1, 2, 10, 2, 10], dtype="category") - >>> s - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1, 2, 10] - >>> s.cat.as_ordered() - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1 < 2 < 10] - """ - return self._return_or_inplace(self._column.as_ordered(ordered=True)) - - def as_unordered(self) -> SeriesOrIndex | None: - """ - Set the Categorical to be unordered. - - Returns - ------- - Categorical - Unordered Categorical or None if inplace. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([10, 1, 1, 2, 10, 2, 10], dtype="category") - >>> s - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1, 2, 10] - >>> s = s.cat.as_ordered() - >>> s - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1 < 2 < 10] - >>> s.cat.as_unordered() - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1, 2, 10] - """ - return self._return_or_inplace(self._column.as_ordered(ordered=False)) - - def add_categories(self, new_categories: Any) -> SeriesOrIndex | None: - """ - Add new categories. - - `new_categories` will be included at the last/highest - place in the categories and will be unused directly - after this call. - - Parameters - ---------- - new_categories : category or list-like of category - The new categories to be included. - - Returns - ------- - cat - Categorical with new categories added. 
- - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 2], dtype="category") - >>> s - 0 1 - 1 2 - dtype: category - Categories (2, int64): [1, 2] - >>> s.cat.add_categories([0, 3, 4]) - 0 1 - 1 2 - dtype: category - Categories (5, int64): [1, 2, 0, 3, 4] - >>> s - 0 1 - 1 2 - dtype: category - Categories (2, int64): [1, 2] - """ - return self._return_or_inplace( - self._column.add_categories(new_categories=new_categories) - ) - - def remove_categories( - self, - removals: Any, - ) -> SeriesOrIndex | None: - """ - Remove the specified categories. - - `removals` must be included in the - old categories. Values which were in the - removed categories will be set to null. - - Parameters - ---------- - removals : category or list-like of category - The categories which should be removed. - - Returns - ------- - cat - Categorical with removed categories - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([10, 1, 1, 2, 10, 2, 10], dtype="category") - >>> s - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1, 2, 10] - >>> s.cat.remove_categories([1]) - 0 10 - 1 - 2 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (2, int64): [2, 10] - >>> s - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1, 2, 10] - """ - return self._return_or_inplace( - self._column.remove_categories(removals=removals) - ) - - def set_categories( - self, - new_categories: Any, - ordered: bool = False, - rename: bool = False, - ) -> SeriesOrIndex | None: - """ - Set the categories to the specified new_categories. - - - `new_categories` can include new categories (which - will result in unused categories) or remove old categories - (which results in values set to null). If `rename==True`, - the categories will simple be renamed (less or more items - than in old categories will result in values set to null or - in unused categories respectively). - - This method can be used to perform more than one action - of adding, removing, and reordering simultaneously and - is therefore faster than performing the individual steps - via the more specialised methods. - - On the other hand this methods does not do checks - (e.g., whether the old categories are included in the - new categories on a reorder), which can result in - surprising changes. - - Parameters - ---------- - new_categories : list-like - The categories in new order. - ordered : bool, default None - Whether or not the categorical is treated as - a ordered categorical. If not given, do - not change the ordered information. - rename : bool, default False - Whether or not the `new_categories` should be - considered as a rename of the old categories - or as reordered categories. - - Returns - ------- - cat - Categorical with reordered categories - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 1, 2, 10, 2, 10], dtype='category') - >>> s - 0 1 - 1 1 - 2 2 - 3 10 - 4 2 - 5 10 - dtype: category - Categories (3, int64): [1, 2, 10] - >>> s.cat.set_categories([1, 10]) - 0 1 - 1 1 - 2 - 3 10 - 4 - 5 10 - dtype: category - Categories (2, int64): [1, 10] - """ - return self._return_or_inplace( - self._column.set_categories( - new_categories=new_categories, ordered=ordered, rename=rename - ) - ) - - def reorder_categories( - self, - new_categories: Any, - ordered: bool = False, - ) -> SeriesOrIndex | None: - """ - Reorder categories as specified in new_categories. - - `new_categories` need to include all old categories - and no new category items. 
- - Parameters - ---------- - new_categories : Index-like - The categories in new order. - ordered : bool, optional - Whether or not the categorical is treated - as a ordered categorical. If not given, do - not change the ordered information. - - Returns - ------- - cat - Categorical with reordered categories - - Raises - ------ - ValueError - If the new categories do not contain all old - category items or any new ones. - - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([10, 1, 1, 2, 10, 2, 10], dtype="category") - >>> s - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [1, 2, 10] - >>> s.cat.reorder_categories([10, 1, 2]) - 0 10 - 1 1 - 2 1 - 3 2 - 4 10 - 5 2 - 6 10 - dtype: category - Categories (3, int64): [10, 1, 2] - >>> s.cat.reorder_categories([10, 1]) - ValueError: items in new_categories are not the same as in - old categories - """ - return self._return_or_inplace( - self._column.reorder_categories(new_categories, ordered=ordered), - ) - - -def validate_categorical_children(children) -> None: - if not ( - len(children) == 1 - and isinstance(children[0], cudf.core.column.numerical.NumericalColumn) - and children[0].dtype.kind in "iu" - ): - # TODO: Enforce unsigned integer? - raise ValueError( - "Must specify exactly one child NumericalColumn of integers for representing the codes." - ) - - -class CategoricalColumn(column.ColumnBase): - """ - Implements operations for Columns of Categorical type - - Parameters - ---------- - dtype : CategoricalDtype - mask : Buffer - The validity mask - offset : int - Data offset - children : Tuple[ColumnBase] - Two non-null columns containing the categories and codes - respectively - """ - - dtype: CategoricalDtype - _children: tuple[NumericalColumn] - _VALID_REDUCTIONS = { - "max", - "min", - } - _VALID_BINARY_OPERATIONS = { - "__eq__", - "__ne__", - "__lt__", - "__le__", - "__gt__", - "__ge__", - } - - def __init__( - self, - data: None, - size: int | None, - dtype: CategoricalDtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple[NumericalColumn] = (), # type: ignore[assignment] - ): - if data is not None: - raise ValueError(f"{data=} must be None") - validate_categorical_children(children) - if size is None: - child = children[0] - assert child.offset == 0 - assert child.base_mask is None - size = child.size - size = size - offset - if not isinstance(dtype, CategoricalDtype): - raise ValueError( - f"{dtype=} must be cudf.CategoricalDtype instance." 
- ) - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - self._codes = self.children[0].set_mask(self.mask) - - @property - def base_size(self) -> int: - return int( - (self.base_children[0].size) / self.base_children[0].dtype.itemsize - ) - - def __contains__(self, item: ScalarLike) -> bool: - try: - self._encode(item) - except ValueError: - return False - return self._encode(item) in self.codes - - def set_base_data(self, value): - if value is not None: - raise RuntimeError( - "CategoricalColumns do not use data attribute of Column, use " - "`set_base_children` instead" - ) - else: - super().set_base_data(value) - - def _process_values_for_isin( - self, values: Sequence - ) -> tuple[ColumnBase, ColumnBase]: - lhs = self - # We need to convert values to same type as self, - # hence passing dtype=self.dtype - rhs = cudf.core.column.as_column(values, dtype=self.dtype) - return lhs, rhs - - def set_base_mask(self, value: Buffer | None) -> None: - super().set_base_mask(value) - self._codes = self.children[0].set_mask(self.mask) - - def set_base_children(self, value: tuple[NumericalColumn]) -> None: # type: ignore[override] - super().set_base_children(value) - validate_categorical_children(value) - self._codes = value[0].set_mask(self.mask) - - @property - def children(self) -> tuple[NumericalColumn]: - if self._children is None: - codes_column = self.base_children[0] - start = self.offset * codes_column.dtype.itemsize - end = start + self.size * codes_column.dtype.itemsize - codes_column = cudf.core.column.NumericalColumn( - data=codes_column.base_data[start:end], - dtype=codes_column.dtype, - size=self.size, - ) - self._children = (codes_column,) - return self._children - - @property - def categories(self) -> ColumnBase: - return self.dtype.categories._values - - @property - def codes(self) -> NumericalColumn: - return self._codes - - @property - def ordered(self) -> bool: - return self.dtype.ordered - - def __setitem__(self, key, value): - if cudf.api.types.is_scalar( - value - ) and cudf._lib.scalar._is_null_host_scalar(value): - to_add_categories = 0 - else: - if cudf.api.types.is_scalar(value): - arr = column.as_column(value, length=1, nan_as_null=False) - else: - arr = column.as_column(value, nan_as_null=False) - to_add_categories = len( - cudf.Index._from_column(arr).difference( - cudf.Index._from_column(self.categories) - ) - ) - - if to_add_categories > 0: - raise TypeError( - "Cannot setitem on a Categorical with a new " - "category, set the categories first" - ) - - if cudf.api.types.is_scalar(value): - value = self._encode(value) if value is not None else value - else: - value = cudf.core.column.as_column(value).astype(self.dtype) - value = value.codes - codes = self.codes - codes[key] = value - out = type(self)( - data=self.data, - size=codes.size, - dtype=self.dtype, - mask=codes.base_mask, - children=(codes,), - ) - self._mimic_inplace(out, inplace=True) - - def _fill( - self, - fill_value: ScalarLike, - begin: int, - end: int, - inplace: bool = False, - ) -> Self: - if end <= begin or begin >= self.size: - return self if inplace else self.copy() - - fill_code = self._encode(fill_value) - fill_scalar = cudf._lib.scalar.as_device_scalar( - fill_code, self.codes.dtype - ) - - result = self if inplace else self.copy() - - libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) - return result - - def slice(self, start: int, stop: int, stride: int | None = None) -> Self: - codes = 
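A user-visible consequence of the `__setitem__` logic above is that assignment may only use values that are already categories. A sketch:

import cudf

s = cudf.Series(["a", "b", "a"], dtype="category")
s[0] = "b"        # fine: "b" is an existing category
try:
    s[1] = "z"    # "z" is not a category
except TypeError as err:
    print(err)    # Cannot setitem on a Categorical with a new category, ...
s = s.cat.add_categories(["z"])
s[1] = "z"        # works once the category has been added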
self.codes.slice(start, stop, stride) - return type(self)( - data=self.data, # type: ignore[arg-type] - size=codes.size, - dtype=self.dtype, - mask=codes.base_mask, - offset=codes.offset, - children=(codes,), - ) - - def _reduce( - self, - op: str, - skipna: bool | None = None, - min_count: int = 0, - *args, - **kwargs, - ) -> ScalarLike: - # Only valid reductions are min and max - if not self.ordered: - raise TypeError( - f"Categorical is not ordered for operation {op} " - "you can use .as_ordered() to change the Categorical " - "to an ordered one." - ) - return self._decode( - self.codes._reduce(op, skipna, min_count, *args, **kwargs) - ) - - def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: - other = self._wrap_binop_normalization(other) - # TODO: This is currently just here to make mypy happy, but eventually - # we'll need to properly establish the APIs for these methods. - if not isinstance(other, CategoricalColumn): - raise ValueError - # Note: at this stage we are guaranteed that the dtypes are equal. - if not self.ordered and op not in { - "__eq__", - "__ne__", - "NULL_EQUALS", - "NULL_NOT_EQUALS", - }: - raise TypeError( - "The only binary operations supported by unordered " - "categorical columns are equality and inequality." - ) - return self.codes._binaryop(other.codes, op) - - def normalize_binop_value(self, other: ScalarLike) -> Self: - if isinstance(other, column.ColumnBase): - if not isinstance(other, CategoricalColumn): - return NotImplemented - if other.dtype != self.dtype: - raise TypeError( - "Categoricals can only compare with the same type" - ) - return cast(Self, other) - codes = column.as_column( - self._encode(other), length=len(self), dtype=self.codes.dtype - ) - return type(self)( - data=None, - size=self.size, - dtype=self.dtype, - mask=self.base_mask, - children=(codes,), # type: ignore[arg-type] - ) - - def sort_values(self, ascending: bool = True, na_position="last") -> Self: - codes = self.codes.sort_values(ascending, na_position) - return type(self)( - data=self.data, # type: ignore[arg-type] - size=codes.size, - dtype=self.dtype, - mask=codes.base_mask, - children=(codes,), - ) - - def element_indexing(self, index: int) -> ScalarLike: - val = self.codes.element_indexing(index) - return self._decode(int(val)) if val is not None else val - - @property - def __cuda_array_interface__(self) -> Mapping[str, Any]: - raise TypeError( - "Categorical does not support `__cuda_array_interface__`." - " Please consider using `.codes` or `.categories`" - " if you need this functionality." - ) - - def to_pandas( - self, - *, - nullable: bool = False, - arrow_type: bool = False, - ) -> pd.Index: - if nullable: - return super().to_pandas(nullable=nullable, arrow_type=arrow_type) - elif arrow_type: - raise NotImplementedError(f"{arrow_type=} is not implemented.") - - if self.categories.dtype.kind == "f": - new_mask = bools_to_mask(self.notnull()) - col = type(self)( - data=self.data, # type: ignore[arg-type] - size=self.size, - dtype=self.dtype, - mask=new_mask, - children=self.children, - ) - else: - col = self - - signed_dtype = min_signed_type(len(col.categories)) - codes = ( - col.codes.astype(signed_dtype) - .fillna(_DEFAULT_CATEGORICAL_VALUE) - .values_host - ) - - cats = col.categories.nans_to_nulls() - if not isinstance(cats.dtype, IntervalDtype): - # leaving out dropna because it temporarily changes an interval - # index into a struct and throws off results. 
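The ordering requirements enforced by `_reduce` and `_binaryop` above surface directly in user code: min/max need an ordered categorical, while equality comparisons are always allowed. A sketch:

import cudf

s = cudf.Series([3, 1, 2], dtype="category")  # unordered by default
try:
    s.min()
except TypeError as err:
    print(err)                                # suggests .as_ordered()

print(s.cat.as_ordered().min())               # 1
print((s == 2).values_host)                   # [False False  True]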
- # TODO: work on interval index dropna - cats = cats.dropna() - data = pd.Categorical.from_codes( - codes, categories=cats.to_pandas(), ordered=col.ordered - ) - return pd.Index(data) - - def to_arrow(self) -> pa.Array: - """Convert to PyArrow Array.""" - # arrow doesn't support unsigned codes - signed_type = ( - min_signed_type(self.codes.max()) - if self.codes.size > 0 - else np.int8 - ) - codes = self.codes.astype(signed_type) - categories = self.categories - - out_indices = codes.to_arrow() - out_dictionary = categories.to_arrow() - - return pa.DictionaryArray.from_arrays( - out_indices, - out_dictionary, - ordered=self.ordered, - ) - - @property - def values_host(self) -> np.ndarray: - """ - Return a numpy representation of the CategoricalColumn. - """ - return self.to_pandas().values - - @property - def values(self): - """ - Return a CuPy representation of the CategoricalColumn. - """ - raise NotImplementedError("cudf.Categorical is not yet implemented") - - def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": - return ( - self.astype(self.categories.dtype).clip(lo, hi).astype(self.dtype) - ) - - def data_array_view( - self, *, mode="write" - ) -> numba.cuda.devicearray.DeviceNDArray: - return self.codes.data_array_view(mode=mode) - - def unique(self) -> Self: - codes = self.codes.unique() - return type(self)( - data=self.data, # type: ignore[arg-type] - size=codes.size, - dtype=self.dtype, - mask=codes.base_mask, - offset=codes.offset, - children=(codes,), - ) - - def _encode(self, value) -> ScalarLike: - return self.categories.find_first_value(value) - - def _decode(self, value: int) -> ScalarLike: - if value == _DEFAULT_CATEGORICAL_VALUE: - return None - return self.categories.element_indexing(value) - - def find_and_replace( - self, - to_replace: ColumnLike, - replacement: ColumnLike, - all_nan: bool = False, - ) -> CategoricalColumn: - """ - Return col with *to_replace* replaced with *replacement*. - """ - to_replace_col = column.as_column(to_replace) - if len(to_replace_col) == to_replace_col.null_count: - to_replace_col = to_replace_col.astype(self.categories.dtype) - replacement_col = column.as_column(replacement) - if len(replacement_col) == replacement_col.null_count: - replacement_col = replacement_col.astype(self.categories.dtype) - - if type(to_replace_col) != type(replacement_col): - raise TypeError( - f"to_replace and value should be of same types," - f"got to_replace dtype: {to_replace_col.dtype} and " - f"value dtype: {replacement_col.dtype}" - ) - df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} - ) - df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) - if df._data["old"].null_count == 1: - fill_value = ( - df._data["new"] - .apply_boolean_mask(df._data["old"].isnull()) - .element_indexing(0) - ) - # TODO: This line of code does not work because we cannot use the - # `in` operator on self.categories (which is a column). mypy - # realizes that this is wrong because __iter__ is not implemented. - # However, it seems that this functionality has been broken for a - # long time so for now we're just having mypy ignore and we'll come - # back to this. 
- if fill_value in self.categories: # type: ignore - replaced = self.fillna(fill_value) - else: - new_categories = self.categories.append( - column.as_column([fill_value]) - ) - replaced = self._set_categories(new_categories) - replaced = replaced.fillna(fill_value) - df = df.dropna(subset=["old"]) - to_replace_col = df._data["old"] - replacement_col = df._data["new"] - else: - replaced = self - if df._data["new"].null_count > 0: - drop_values = df._data["old"].apply_boolean_mask( - df._data["new"].isnull() - ) - cur_categories = replaced.categories - new_categories = cur_categories.apply_boolean_mask( - cur_categories.isin(drop_values).unary_operator("not") - ) - replaced = replaced._set_categories(new_categories) - df = df.dropna(subset=["new"]) - to_replace_col = df._data["old"] - replacement_col = df._data["new"] - - # create a dataframe containing the pre-replacement categories - # and a column with the appropriate labels replaced. - # The index of this dataframe represents the original - # ints that map to the categories - cats_col = column.as_column(replaced.dtype.categories) - old_cats = cudf.DataFrame._from_data( - { - "cats": cats_col, - "cats_replace": cats_col.find_and_replace( - to_replace_col, replacement_col - ), - } - ) - - # Construct the new categorical labels - # If a category is being replaced by an existing one, we - # want to map it to None. If it's totally new, we want to - # map it to the new label it is to be replaced by - dtype_replace = cudf.Series._from_column(replacement_col) - dtype_replace[dtype_replace.isin(cats_col)] = None - new_cats_col = cats_col.find_and_replace( - to_replace_col, dtype_replace._column - ) - - # anything we mapped to None, we want to now filter out since - # those categories don't exist anymore - # Resetting the index creates a column 'index' that associates - # the original integers to the new labels - bmask = new_cats_col.notnull() - new_cats_col = new_cats_col.apply_boolean_mask(bmask) - new_cats = cudf.DataFrame._from_data( - { - "index": column.as_column(range(len(new_cats_col))), - "cats": new_cats_col, - } - ) - - # old_cats contains replaced categories and the ints that - # previously mapped to those categories and the index of - # new_cats is a RangeIndex that contains the new ints - catmap = old_cats.merge( - new_cats, left_on="cats_replace", right_on="cats", how="inner" - ) - - # The index of this frame is now the old ints, but the column - # named 'index', which came from the filtered categories, - # contains the new ints that we need to map to - to_replace_col = column.as_column(catmap.index).astype( - replaced.codes.dtype - ) - replacement_col = catmap._data["index"].astype(replaced.codes.dtype) - - replaced_codes = column.as_column(replaced.codes) - output = libcudf.replace.replace( - replaced_codes, to_replace_col, replacement_col - ) - codes = as_unsigned_codes(len(new_cats["cats"]), output) - - result = type(self)( - data=self.data, # type: ignore[arg-type] - size=codes.size, - dtype=CategoricalDtype( - categories=new_cats["cats"], ordered=self.dtype.ordered - ), - mask=codes.base_mask, - offset=codes.offset, - children=(codes,), - ) - if result.dtype != self.dtype: - warnings.warn( - "The behavior of replace with " - "CategoricalDtype is deprecated. In a future version, replace " - "will only be used for cases that preserve the categories. 
" - "To change the categories, use ser.cat.rename_categories " - "instead.", - FutureWarning, - ) - return result - - def isnull(self) -> ColumnBase: - """ - Identify missing values in a CategoricalColumn. - """ - result = libcudf.unary.is_null(self) - - if self.categories.dtype.kind == "f": - # Need to consider `np.nan` values in case - # of an underlying float column - categories = libcudf.unary.is_nan(self.categories) - if categories.any(): - code = self._encode(np.nan) - result = result | (self.codes == cudf.Scalar(code)) - - return result - - def notnull(self) -> ColumnBase: - """ - Identify non-missing values in a CategoricalColumn. - """ - result = libcudf.unary.is_valid(self) - - if self.categories.dtype.kind == "f": - # Need to consider `np.nan` values in case - # of an underlying float column - categories = libcudf.unary.is_nan(self.categories) - if categories.any(): - code = self._encode(np.nan) - result = result & (self.codes != cudf.Scalar(code)) - - return result - - def _validate_fillna_value( - self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: - """Align fill_value for .fillna based on column type.""" - if cudf.api.types.is_scalar(fill_value): - if fill_value != _DEFAULT_CATEGORICAL_VALUE: - try: - fill_value = self._encode(fill_value) - except ValueError as err: - raise ValueError( - f"{fill_value=} must be in categories" - ) from err - return cudf.Scalar(fill_value, dtype=self.codes.dtype) - else: - fill_value = column.as_column(fill_value, nan_as_null=False) - if isinstance(fill_value.dtype, CategoricalDtype): - if self.dtype != fill_value.dtype: - raise TypeError( - "Cannot set a categorical with another without identical categories" - ) - else: - raise TypeError( - "Cannot set a categorical with non-categorical data" - ) - fill_value = cast(CategoricalColumn, fill_value)._set_categories( - self.categories, - ) - return fill_value.codes.astype(self.codes.dtype) - - def indices_of( - self, value: ScalarLike - ) -> cudf.core.column.NumericalColumn: - return self.codes.indices_of(self._encode(value)) - - @property - def is_monotonic_increasing(self) -> bool: - return bool(self.ordered) and self.codes.is_monotonic_increasing - - @property - def is_monotonic_decreasing(self) -> bool: - return bool(self.ordered) and self.codes.is_monotonic_decreasing - - def as_categorical_column(self, dtype: Dtype) -> Self: - if isinstance(dtype, str) and dtype == "category": - return self - if isinstance(dtype, pd.CategoricalDtype): - dtype = cudf.CategoricalDtype.from_pandas(dtype) - if ( - isinstance(dtype, cudf.CategoricalDtype) - and dtype.categories is None - and dtype.ordered is None - ): - return self - elif not isinstance(dtype, CategoricalDtype): - raise ValueError("dtype must be CategoricalDtype") - - if not isinstance(self.categories, type(dtype.categories._column)): - # If both categories are of different Column types, - # return a column full of Nulls. 
- codes = cast( - cudf.core.column.numerical.NumericalColumn, - column.as_column( - _DEFAULT_CATEGORICAL_VALUE, - length=self.size, - dtype=self.codes.dtype, - ), - ) - codes = as_unsigned_codes(len(dtype.categories), codes) - return type(self)( - data=self.data, # type: ignore[arg-type] - size=self.size, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - children=(codes,), - ) - - return self.set_categories( - new_categories=dtype.categories, ordered=bool(dtype.ordered) - ) - - def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: - return self._get_decategorized_column().as_numerical_column(dtype) - - def as_string_column(self) -> StringColumn: - return self._get_decategorized_column().as_string_column() - - def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: - return self._get_decategorized_column().as_datetime_column(dtype) - - def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: - return self._get_decategorized_column().as_timedelta_column(dtype) - - def _get_decategorized_column(self) -> ColumnBase: - if self.null_count == len(self): - # self.categories is empty; just return codes - return self.codes - gather_map = self.codes.astype(libcudf.types.size_type_dtype).fillna(0) - out = self.categories.take(gather_map) - out = out.set_mask(self.mask) - return out - - def copy(self, deep: bool = True) -> Self: - result_col = super().copy(deep=deep) - if deep: - dtype_copy = CategoricalDtype( - categories=self.categories.copy(), - ordered=self.ordered, - ) - result_col = cast(Self, result_col._with_type_metadata(dtype_copy)) - return result_col - - @cached_property - def memory_usage(self) -> int: - return self.categories.memory_usage + self.codes.memory_usage - - def _mimic_inplace( - self, other_col: ColumnBase, inplace: bool = False - ) -> Self | None: - out = super()._mimic_inplace(other_col, inplace=inplace) - if inplace and isinstance(other_col, CategoricalColumn): - self._codes = other_col.codes - return out - - def view(self, dtype: Dtype) -> ColumnBase: - raise NotImplementedError( - "Categorical column views are not currently supported" - ) - - @staticmethod - def _concat( - objs: abc.MutableSequence[CategoricalColumn], - ) -> CategoricalColumn: - # TODO: This function currently assumes it is being called from - # column.concat_columns, at least to the extent that all the - # preprocessing in that function has already been done. That should be - # improved as the concatenation API is solidified. - - # Find the first non-null column: - head = next( - (obj for obj in objs if obj.null_count != len(obj)), objs[0] - ) - - # Combine and de-dupe the categories - cats = column.concat_columns([o.categories for o in objs]).unique() - objs = [o._set_categories(cats, is_unique=True) for o in objs] - codes = [o.codes for o in objs] - - newsize = sum(map(len, codes)) - if newsize > libcudf.MAX_COLUMN_SIZE: - raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.MAX_COLUMN_SIZE_STR}" - ) - elif newsize == 0: - codes_col = column.column_empty(0, head.codes.dtype, masked=True) - else: - # Filter out inputs that have 0 length, then concatenate. 
- codes = [o for o in codes if len(o)] - codes_col = libcudf.concat.concat_columns(objs) - - codes_col = as_unsigned_codes( - len(cats), - cast(cudf.core.column.numerical.NumericalColumn, codes_col), - ) - return CategoricalColumn( - data=None, - size=codes_col.size, - dtype=CategoricalDtype(categories=cats), - mask=codes_col.base_mask, - offset=codes_col.offset, - children=(codes_col,), # type: ignore[arg-type] - ) - - def _with_type_metadata(self: Self, dtype: Dtype) -> Self: - if isinstance(dtype, CategoricalDtype): - return type(self)( - data=self.data, # type: ignore[arg-type] - size=self.codes.size, - dtype=dtype, - mask=self.codes.base_mask, - offset=self.codes.offset, - null_count=self.codes.null_count, - children=(self.codes,), - ) - return self - - def set_categories( - self, - new_categories: Any, - ordered: bool = False, - rename: bool = False, - ) -> Self: - # See CategoricalAccessor.set_categories. - - ordered = ordered if ordered is not None else self.ordered - new_categories = column.as_column(new_categories) - - if isinstance(new_categories, CategoricalColumn): - new_categories = new_categories.categories - - # when called with rename=True, the pandas behavior is - # to replace the current category values with the new - # categories. - if rename: - # enforce same length - if len(new_categories) != len(self.categories): - raise ValueError( - "new_categories must have the same " - "number of items as old categories" - ) - out_col = type(self)( - data=self.data, # type: ignore[arg-type] - size=self.size, - dtype=CategoricalDtype( - categories=new_categories, ordered=ordered - ), - mask=self.base_mask, - offset=self.offset, - children=(self.codes,), - ) - else: - out_col = self - if type(out_col.categories) is not type(new_categories): - # If both categories are of different Column types, - # return a column full of Nulls. - new_codes = cast( - cudf.core.column.numerical.NumericalColumn, - column.as_column( - _DEFAULT_CATEGORICAL_VALUE, - length=self.size, - dtype=self.codes.dtype, - ), - ) - new_codes = as_unsigned_codes(len(new_categories), new_codes) - out_col = type(self)( - data=self.data, # type: ignore[arg-type] - size=self.size, - dtype=CategoricalDtype( - categories=new_categories, ordered=ordered - ), - mask=self.base_mask, - offset=self.offset, - children=(new_codes,), - ) - elif ( - not out_col._categories_equal(new_categories, ordered=True) - or not self.ordered == ordered - ): - out_col = out_col._set_categories( - new_categories, - ordered=ordered, - ) - return out_col - - def _categories_equal( - self, new_categories: ColumnBase, ordered=False - ) -> bool: - cur_categories = self.categories - if len(new_categories) != len(cur_categories): - return False - if new_categories.dtype != cur_categories.dtype: - return False - # if order doesn't matter, sort before the equals call below - if not ordered: - cur_categories = cur_categories.sort_values() - new_categories = new_categories.sort_values() - return cur_categories.equals(new_categories) - - def _set_categories( - self, - new_categories: Any, - is_unique: bool = False, - ordered: bool = False, - ) -> Self: - """Returns a new CategoricalColumn with the categories set to the - specified *new_categories*. 
- - Notes - ----- - Assumes ``new_categories`` is the same dtype as the current categories - """ - - cur_cats = column.as_column(self.categories) - new_cats = column.as_column(new_categories) - - # Join the old and new categories to build a map from - # old to new codes, inserting na_sentinel for any old - # categories that don't exist in the new categories - - # Ensure new_categories is unique first - if not (is_unique or new_cats.is_unique): - new_cats = new_cats.unique() - - if cur_cats.equals(new_cats, check_dtypes=True): - # TODO: Internal usages don't always need a copy; add a copy keyword - # as_ordered shallow copies - return self.copy().as_ordered(ordered=ordered) - - cur_codes = self.codes - out_code_dtype = min_unsigned_type(max(len(cur_cats), len(new_cats))) - - cur_order = column.as_column(range(len(cur_codes))) - old_codes = column.as_column( - range(len(cur_cats)), dtype=out_code_dtype - ) - new_codes = column.as_column( - range(len(new_cats)), dtype=out_code_dtype - ) - - new_df = cudf.DataFrame._from_data( - data={"new_codes": new_codes, "cats": new_cats} - ) - old_df = cudf.DataFrame._from_data( - data={"old_codes": old_codes, "cats": cur_cats} - ) - cur_df = cudf.DataFrame._from_data( - data={"old_codes": cur_codes, "order": cur_order} - ) - - # Join the old and new categories and line up their codes - df = old_df.merge(new_df, on="cats", how="left") - # Join the old and new codes to "recode" the codes data buffer - df = cur_df.merge(df, on="old_codes", how="left") - df = df.sort_values(by="order") - df.reset_index(drop=True, inplace=True) - - ordered = ordered if ordered is not None else self.ordered - new_codes = cast( - cudf.core.column.numerical.NumericalColumn, df._data["new_codes"] - ) - - # codes can't have masks, so take mask out before moving in - new_codes = as_unsigned_codes(len(new_cats), new_codes) - return type(self)( - data=self.data, # type: ignore[arg-type] - size=new_codes.size, - dtype=CategoricalDtype(categories=new_cats, ordered=ordered), - mask=new_codes.base_mask, - offset=new_codes.offset, - children=(new_codes,), - ) - - def add_categories(self, new_categories: Any) -> Self: - old_categories = self.categories - new_categories = column.as_column( - new_categories, - dtype=old_categories.dtype if len(new_categories) == 0 else None, - ) - if is_mixed_with_object_dtype(old_categories, new_categories): - raise TypeError( - f"cudf does not support adding categories with existing " - f"categories of dtype `{old_categories.dtype}` and new " - f"categories of dtype `{new_categories.dtype}`, please " - f"type-cast new_categories to the same type as " - f"existing categories." - ) - common_dtype = find_common_type( - [old_categories.dtype, new_categories.dtype] - ) - - new_categories = new_categories.astype(common_dtype) - old_categories = old_categories.astype(common_dtype) - - if old_categories.isin(new_categories).any(): - raise ValueError("new categories must not include old categories") - - new_categories = old_categories.append(new_categories) - if not self._categories_equal(new_categories): - return self._set_categories(new_categories) - return self - - def remove_categories( - self, - removals: Any, - ) -> Self: - removals = column.as_column(removals).astype(self.categories.dtype) - removals_mask = removals.isin(self.categories) - - # ensure all the removals are in the current categories - # list. 
If not, raise an error to match Pandas behavior - if not removals_mask.all(): - raise ValueError("removals must all be in old categories") - - new_categories = self.categories.apply_boolean_mask( - self.categories.isin(removals).unary_operator("not") - ) - if not self._categories_equal(new_categories): - return self._set_categories(new_categories) - return self - - def reorder_categories( - self, - new_categories: Any, - ordered: bool = False, - ) -> CategoricalColumn: - new_categories = column.as_column(new_categories) - # Compare new_categories against current categories. - # Ignore order for comparison because we're only interested - # in whether new_categories has all the same values as the - # current set of categories. - if not self._categories_equal(new_categories, ordered=False): - raise ValueError( - "items in new_categories are not the same as in " - "old categories" - ) - return self._set_categories(new_categories, ordered=ordered) - - def rename_categories(self, new_categories) -> CategoricalColumn: - raise NotImplementedError( - "rename_categories is currently not supported." - ) - - def remove_unused_categories(self) -> Self: - raise NotImplementedError( - "remove_unused_categories is currently not supported." - ) - - def as_ordered(self, ordered: bool) -> Self: - if self.dtype.ordered == ordered: - return self - return type(self)( - data=self.data, # type: ignore[arg-type] - size=self.size, - dtype=CategoricalDtype( - categories=self.categories, ordered=ordered - ), - mask=self.base_mask, - offset=self.offset, - children=self.children, - ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py deleted file mode 100644 index 7674565e2c3..00000000000 --- a/python/cudf/cudf/core/column/column.py +++ /dev/null @@ -1,2301 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from __future__ import annotations - -import pickle -from collections import abc -from functools import cached_property -from itertools import chain -from types import SimpleNamespace -from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast - -import cupy -import numpy as np -import pandas as pd -import pyarrow as pa -import pyarrow.compute as pc -from numba import cuda -from pandas.core.arrays.arrow.extension_types import ArrowIntervalType -from typing_extensions import Self - -import rmm - -import cudf -from cudf import _lib as libcudf -from cudf._lib.column import Column -from cudf._lib.null_mask import ( - MaskState, - bitmask_allocation_size_bytes, - create_null_mask, -) -from cudf._lib.scalar import as_device_scalar -from cudf._lib.stream_compaction import ( - apply_boolean_mask, - distinct_count as cpp_distinct_count, - drop_duplicates, - drop_nulls, -) -from cudf._lib.transform import bools_to_mask -from cudf._lib.types import size_type_dtype -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - _is_pandas_nullable_extension_dtype, - infer_dtype, - is_dtype_equal, - is_scalar, - is_string_dtype, -) -from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals.timezones import get_compatible_timezone -from cudf.core.abc import Serializable -from cudf.core.buffer import ( - Buffer, - acquire_spill_lock, - as_buffer, - cuda_array_interface_wrapper, -) -from cudf.core.dtypes import ( - CategoricalDtype, - DecimalDtype, - IntervalDtype, - ListDtype, - StructDtype, -) -from cudf.core.mixins import BinaryOperand, Reducible -from cudf.errors import MixedTypeError -from cudf.utils.dtypes import ( - _maybe_convert_to_default_type, - cudf_dtype_from_pa_type, - cudf_dtype_to_pa_type, - find_common_type, - get_time_unit, - is_column_like, - is_mixed_with_object_dtype, - min_signed_type, - min_unsigned_type, -) -from cudf.utils.utils import _array_ufunc, mask_dtype - -if TYPE_CHECKING: - import builtins - - from cudf._typing import ColumnLike, Dtype, ScalarLike - -if PANDAS_GE_210: - NumpyExtensionArray = pd.arrays.NumpyExtensionArray -else: - NumpyExtensionArray = pd.arrays.PandasArray - - -class ColumnBase(Column, Serializable, BinaryOperand, Reducible): - _VALID_REDUCTIONS = { - "any", - "all", - "max", - "min", - } - - def data_array_view( - self, *, mode: Literal["write", "read"] = "write" - ) -> "cuda.devicearray.DeviceNDArray": - """ - View the data as a device array object - - Parameters - ---------- - mode : str, default 'write' - Supported values are {'read', 'write'} - If 'write' is passed, a device array object - with readonly flag set to False in CAI is returned. - If 'read' is passed, a device array object - with readonly flag set to True in CAI is returned. - This also means, If the caller wishes to modify - the data returned through this view, they must - pass mode="write", else pass mode="read". 
- - Returns - ------- - numba.cuda.cudadrv.devicearray.DeviceNDArray - """ - if self.data is not None: - if mode == "read": - obj = cuda_array_interface_wrapper( - ptr=self.data.get_ptr(mode="read"), - size=self.data.size, - owner=self.data, - ) - elif mode == "write": - obj = self.data - else: - raise ValueError(f"Unsupported mode: {mode}") - else: - obj = None - return cuda.as_cuda_array(obj).view(self.dtype) - - def mask_array_view( - self, *, mode: Literal["write", "read"] = "write" - ) -> "cuda.devicearray.DeviceNDArray": - """ - View the mask as a device array - - Parameters - ---------- - mode : str, default 'write' - Supported values are {'read', 'write'} - If 'write' is passed, a device array object - with readonly flag set to False in CAI is returned. - If 'read' is passed, a device array object - with readonly flag set to True in CAI is returned. - This also means, If the caller wishes to modify - the data returned through this view, they must - pass mode="write", else pass mode="read". - - Returns - ------- - numba.cuda.cudadrv.devicearray.DeviceNDArray - """ - if self.mask is not None: - if mode == "read": - obj = cuda_array_interface_wrapper( - ptr=self.mask.get_ptr(mode="read"), - size=self.mask.size, - owner=self.mask, - ) - elif mode == "write": - obj = self.mask - else: - raise ValueError(f"Unsupported mode: {mode}") - else: - obj = None - return cuda.as_cuda_array(obj).view(mask_dtype) - - def __len__(self) -> int: - return self.size - - def __repr__(self): - return ( - f"{object.__repr__(self)}\n" - f"{self.to_arrow().to_string()}\n" - f"dtype: {self.dtype}" - ) - - def to_pandas( - self, - *, - nullable: bool = False, - arrow_type: bool = False, - ) -> pd.Index: - """Convert object to pandas type. - - The default implementation falls back to PyArrow for the conversion. - """ - # This default implementation does not handle nulls in any meaningful - # way - if arrow_type and nullable: - raise ValueError( - f"{arrow_type=} and {nullable=} cannot both be set." - ) - elif nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - pa_array = self.to_arrow() - if arrow_type: - return pd.Index(pd.arrays.ArrowExtensionArray(pa_array)) - else: - return pd.Index(pa_array.to_pandas()) - - @property - def values_host(self) -> np.ndarray: - """ - Return a numpy representation of the Column. - """ - if len(self) == 0: - return np.array([], dtype=self.dtype) - - if self.has_nulls(): - raise ValueError("Column must have no nulls.") - - with acquire_spill_lock(): - return self.data_array_view(mode="read").copy_to_host() - - @property - def values(self) -> cupy.ndarray: - """ - Return a CuPy representation of the Column. 
- """ - if len(self) == 0: - return cupy.array([], dtype=self.dtype) - - if self.has_nulls(): - raise ValueError("Column must have no nulls.") - - return cupy.asarray(self.data_array_view(mode="write")) - - def find_and_replace( - self, - to_replace: ColumnLike, - replacement: ColumnLike, - all_nan: bool = False, - ) -> Self: - raise NotImplementedError - - def clip(self, lo: ScalarLike, hi: ScalarLike) -> ColumnBase: - return libcudf.replace.clip(self, lo, hi) - - def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: - if self is other: - return True - if other is None or len(self) != len(other): - return False - if check_dtypes and (self.dtype != other.dtype): - return False - ret = self._binaryop(other, "NULL_EQUALS") - if ret is NotImplemented: - raise TypeError(f"Cannot compare equality with {type(other)}") - return ret.all() - - def all(self, skipna: bool = True) -> bool: - # The skipna argument is only used for numerical columns. - # If all entries are null the result is True, including when the column - # is empty. - - if self.null_count == self.size: - return True - - return libcudf.reduce.reduce("all", self) - - def any(self, skipna: bool = True) -> bool: - # Early exit for fast cases. - - if not skipna and self.has_nulls(): - return True - elif skipna and self.null_count == self.size: - return False - - return libcudf.reduce.reduce("any", self) - - def dropna(self) -> Self: - if self.has_nulls(): - return drop_nulls([self])[0]._with_type_metadata(self.dtype) - else: - return self.copy() - - def to_arrow(self) -> pa.Array: - """Convert to PyArrow Array - - Examples - -------- - >>> import cudf - >>> col = cudf.core.column.as_column([1, 2, 3, 4]) - >>> col.to_arrow() - - [ - 1, - 2, - 3, - 4 - ] - """ - return libcudf.interop.to_arrow([self], [("None", self.dtype)])[ - "None" - ].chunk(0) - - @classmethod - def from_arrow(cls, array: pa.Array) -> ColumnBase: - """ - Convert PyArrow Array/ChunkedArray to column - - Parameters - ---------- - array : PyArrow Array/ChunkedArray - - Returns - ------- - column - - Examples - -------- - >>> import pyarrow as pa - >>> import cudf - >>> cudf.core.column.ColumnBase.from_arrow(pa.array([1, 2, 3, 4])) - - """ - if not isinstance(array, (pa.Array, pa.ChunkedArray)): - raise TypeError("array should be PyArrow array or chunked array") - elif pa.types.is_float16(array.type): - raise NotImplementedError( - "Type casting from `float16` to `float32` is not " - "yet supported in pyarrow, see: " - "https://github.com/apache/arrow/issues/20213" - ) - elif isinstance(array.type, ArrowIntervalType): - return cudf.core.column.IntervalColumn.from_arrow(array) - - data = pa.table([array], [None]) - - if isinstance(array.type, pa.DictionaryType): - indices_table = pa.table( - { - "None": pa.chunked_array( - [chunk.indices for chunk in data["None"].chunks], - type=array.type.index_type, - ) - } - ) - dictionaries_table = pa.table( - { - "None": pa.chunked_array( - [chunk.dictionary for chunk in data["None"].chunks], - type=array.type.value_type, - ) - } - ) - - codes = libcudf.interop.from_arrow(indices_table)[0] - categories = libcudf.interop.from_arrow(dictionaries_table)[0] - codes = cudf.core.column.categorical.as_unsigned_codes( - len(categories), codes - ) - return cudf.core.column.CategoricalColumn( - data=None, - size=codes.size, - dtype=CategoricalDtype( - categories=categories, ordered=array.type.ordered - ), - mask=codes.base_mask, - children=(codes,), - ) - - result = libcudf.interop.from_arrow(data)[0] - - return 
result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) - - def _get_mask_as_column(self) -> ColumnBase: - return libcudf.transform.mask_to_bools( - self.base_mask, self.offset, self.offset + len(self) - ) - - @cached_property - def memory_usage(self) -> int: - n = 0 - if self.data is not None: - n += self.data.size - if self.nullable: - n += bitmask_allocation_size_bytes(self.size) - return n - - def _fill( - self, - fill_value: ScalarLike, - begin: int, - end: int, - inplace: bool = False, - ) -> Self | None: - if end <= begin or begin >= self.size: - return self if inplace else self.copy() - - # Constructing a cuDF scalar can cut unnecessary DtoH copy if - # the scalar is None when calling `is_valid`. - slr = cudf.Scalar(fill_value, dtype=self.dtype) - - if not inplace: - return libcudf.filling.fill(self, begin, end, slr.device_value) - - if is_string_dtype(self.dtype): - return self._mimic_inplace( - libcudf.filling.fill(self, begin, end, slr.device_value), - inplace=True, - ) - - if not slr.is_valid() and not self.nullable: - mask = create_null_mask(self.size, state=MaskState.ALL_VALID) - self.set_base_mask(mask) - - libcudf.filling.fill_in_place(self, begin, end, slr.device_value) - - return self - - def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase: - return libcudf.copying.shift(self, offset, fill_value) - - @property - def nullmask(self) -> Buffer: - """The gpu buffer for the null-mask""" - if not self.nullable: - raise ValueError("Column has no null mask") - return self.mask_array_view(mode="read") - - def copy(self, deep: bool = True) -> Self: - """ - Makes a copy of the Column. - - Parameters - ---------- - deep : bool, default True - If True, a true physical copy of the column - is made. - If False and `copy_on_write` is False, the same - memory is shared between the buffers of the Column - and changes made to one Column will propagate to - its copy and vice-versa. - If False and `copy_on_write` is True, the same - memory is shared between the buffers of the Column - until there is a write operation being performed on - them. - """ - if deep: - result = libcudf.copying.copy_column(self) - return result._with_type_metadata(self.dtype) - else: - return cast( - Self, - build_column( - data=self.base_data - if self.base_data is None - else self.base_data.copy(deep=False), - dtype=self.dtype, - mask=self.base_mask - if self.base_mask is None - else self.base_mask.copy(deep=False), - size=self.size, - offset=self.offset, - children=tuple( - col.copy(deep=False) for col in self.base_children - ), - ), - ) - - def view(self, dtype: Dtype) -> ColumnBase: - """ - View the data underlying a column as different dtype. - The source column must divide evenly into the size of - the desired data type. 
Columns with nulls may only be - viewed as dtypes with size equal to source dtype size - - Parameters - ---------- - dtype : NumPy dtype, string - The dtype to view the data as - - """ - - dtype = cudf.dtype(dtype) - - if dtype.kind in ("o", "u", "s"): - raise TypeError( - "Bytes viewed as str without metadata is ambiguous" - ) - - if self.dtype.itemsize == dtype.itemsize: - return build_column( - self.base_data, - dtype=dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - ) - - else: - if self.null_count > 0: - raise ValueError( - "Can not produce a view of a column with nulls" - ) - - if (self.size * self.dtype.itemsize) % dtype.itemsize: - raise ValueError( - f"Can not divide {self.size * self.dtype.itemsize}" - + f" total bytes into {dtype} with size {dtype.itemsize}" - ) - - # This assertion prevents mypy errors below. - assert self.base_data is not None - - start = self.offset * self.dtype.itemsize - end = start + self.size * self.dtype.itemsize - return build_column(self.base_data[start:end], dtype=dtype) - - def element_indexing(self, index: int): - """Default implementation for indexing to an element - - Raises - ------ - ``IndexError`` if out-of-bound - """ - idx = np.int32(index) - if idx < 0: - idx = len(self) + idx - if idx > len(self) - 1 or idx < 0: - raise IndexError("single positional indexer is out-of-bounds") - return libcudf.copying.get_element(self, idx).value - - def slice(self, start: int, stop: int, stride: int | None = None) -> Self: - stride = 1 if stride is None else stride - if start < 0: - start = start + len(self) - if stop < 0 and not (stride < 0 and stop == -1): - stop = stop + len(self) - if (stride > 0 and start >= stop) or (stride < 0 and start <= stop): - return cast(Self, column_empty(0, self.dtype, masked=True)) - # compute mask slice - if stride == 1: - return libcudf.copying.column_slice(self, [start, stop])[ - 0 - ]._with_type_metadata(self.dtype) - else: - # Need to create a gather map for given slice with stride - gather_map = as_column( - range(start, stop, stride), - dtype=cudf.dtype(np.int32), - ) - return self.take(gather_map) - - def __setitem__(self, key: Any, value: Any): - """ - Set the value of ``self[key]`` to ``value``. - - If ``value`` and ``self`` are of different types, ``value`` is coerced - to ``self.dtype``. Assumes ``self`` and ``value`` are index-aligned. - """ - - # Normalize value to scalar/column - value_normalized: cudf.Scalar | ColumnBase = ( - cudf.Scalar(value, dtype=self.dtype) - if is_scalar(value) - else as_column(value, dtype=self.dtype) - ) - - out: ColumnBase | None # If None, no need to perform mimic inplace. 
- if isinstance(key, slice): - out = self._scatter_by_slice(key, value_normalized) - else: - key = as_column(key) - if not isinstance(key, cudf.core.column.NumericalColumn): - raise ValueError(f"Invalid scatter map type {key.dtype}.") - out = self._scatter_by_column(key, value_normalized) - - if out: - self._mimic_inplace(out, inplace=True) - - def _wrap_binop_normalization(self, other): - if cudf.utils.utils.is_na_like(other): - return cudf.Scalar(other, dtype=self.dtype) - if isinstance(other, np.ndarray) and other.ndim == 0: - # Try and maintain the dtype - other = other.dtype.type(other.item()) - return self.normalize_binop_value(other) - - def _scatter_by_slice( - self, - key: builtins.slice, - value: cudf.core.scalar.Scalar | ColumnBase, - ) -> Self | None: - """If this function returns None, it's either a no-op (slice is empty), - or the inplace replacement is already performed (fill-in-place). - """ - start, stop, step = key.indices(len(self)) - if start >= stop: - return None - rng = range(start, stop, step) - num_keys = len(rng) - - self._check_scatter_key_length(num_keys, value) - - if step == 1 and not isinstance( - self, (cudf.core.column.StructColumn, cudf.core.column.ListColumn) - ): - # NOTE: List & Struct dtypes aren't supported by both - # inplace & out-of-place fill. Hence we need to use scatter for - # these two types. - if isinstance(value, cudf.core.scalar.Scalar): - return self._fill(value, start, stop, inplace=True) - else: - return libcudf.copying.copy_range( - value, self, 0, num_keys, start, stop, False - ) - - # step != 1, create a scatter map with arange - scatter_map = cast( - cudf.core.column.NumericalColumn, - as_column( - rng, - dtype=cudf.dtype(np.int32), - ), - ) - - return self._scatter_by_column(scatter_map, value) - - def _scatter_by_column( - self, - key: cudf.core.column.NumericalColumn, - value: cudf.core.scalar.Scalar | ColumnBase, - ) -> Self: - if key.dtype.kind == "b": - # `key` is boolean mask - if len(key) != len(self): - raise ValueError( - "Boolean mask must be of same length as column" - ) - if isinstance(value, ColumnBase) and len(self) == len(value): - # Both value and key are aligned to self. Thus, the values - # corresponding to the false values in key should be - # ignored. - value = value.apply_boolean_mask(key) - # After applying boolean mask, the length of value equals - # the number of elements to scatter, we can skip computing - # the sum of ``key`` below. - num_keys = len(value) - else: - # Compute the number of element to scatter by summing all - # `True`s in the boolean mask. - num_keys = key.sum() - else: - # `key` is integer scatter map - num_keys = len(key) - - self._check_scatter_key_length(num_keys, value) - - if key.dtype.kind == "b": - return libcudf.copying.boolean_mask_scatter([value], [self], key)[ - 0 - ]._with_type_metadata(self.dtype) - else: - return libcudf.copying.scatter([value], key, [self])[ - 0 - ]._with_type_metadata(self.dtype) - - def _check_scatter_key_length( - self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase - ) -> None: - """`num_keys` is the number of keys to scatter. Should equal to the - number of rows in ``value`` if ``value`` is a column. 
- """ - if isinstance(value, ColumnBase) and len(value) != num_keys: - raise ValueError( - f"Size mismatch: cannot set value " - f"of size {len(value)} to indexing result of size " - f"{num_keys}" - ) - - def _validate_fillna_value( - self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: - """Align fill_value for .fillna based on column type.""" - if is_scalar(fill_value): - return cudf.Scalar(fill_value, dtype=self.dtype) - return as_column(fill_value) - - def fillna( - self, - fill_value: ScalarLike | ColumnLike, - method: Literal["ffill", "bfill", None] = None, - ) -> Self: - """Fill null values with ``value``. - - Returns a copy with null filled. - """ - if not self.has_nulls(include_nan=True): - return self.copy() - elif method is None: - if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar( - fill_value - ): - return self.copy() - else: - fill_value = self._validate_fillna_value(fill_value) - return libcudf.replace.replace_nulls( - input_col=self.nans_to_nulls(), - replacement=fill_value, - method=method, - )._with_type_metadata(self.dtype) - - def isnull(self) -> ColumnBase: - """Identify missing values in a Column.""" - if not self.has_nulls(include_nan=self.dtype.kind == "f"): - return as_column(False, length=len(self)) - - result = libcudf.unary.is_null(self) - - if self.dtype.kind == "f": - # Need to consider `np.nan` values in case - # of a float column - result = result | libcudf.unary.is_nan(self) - - return result - - def notnull(self) -> ColumnBase: - """Identify non-missing values in a Column.""" - if not self.has_nulls(include_nan=self.dtype.kind == "f"): - return as_column(True, length=len(self)) - - result = libcudf.unary.is_valid(self) - - if self.dtype.kind == "f": - # Need to consider `np.nan` values in case - # of a float column - result = result & libcudf.unary.is_non_nan(self) - - return result - - def indices_of( - self, value: ScalarLike - ) -> cudf.core.column.NumericalColumn: - """ - Find locations of value in the column - - Parameters - ---------- - value - Scalar to look for (cast to dtype of column), or a length-1 column - - Returns - ------- - Column of indices that match value - """ - if not is_scalar(value): - raise ValueError("value must be a scalar") - else: - value = as_column(value, dtype=self.dtype, length=1) - mask = libcudf.search.contains(value, self) - return apply_boolean_mask( - [as_column(range(0, len(self)), dtype=size_type_dtype)], mask - )[0] - - def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: - indices = self.indices_of(value) - if n := len(indices): - return ( - indices.element_indexing(0), - indices.element_indexing(n - 1), - ) - else: - raise ValueError(f"Value {value} not found in column") - - def find_first_value(self, value: ScalarLike) -> int: - """ - Return index of first value that matches - - Parameters - ---------- - value - Value to search for (cast to dtype of column) - - Returns - ------- - Index of value - - Raises - ------ - ValueError if value is not found - """ - first, _ = self._find_first_and_last(value) - return first - - def find_last_value(self, value: ScalarLike) -> int: - """ - Return index of last value that matches - - Parameters - ---------- - value - Value to search for (cast to dtype of column) - - Returns - ------- - Index of value - - Raises - ------ - ValueError if value is not found - """ - _, last = self._find_first_and_last(value) - return last - - def append(self, other: ColumnBase) -> ColumnBase: - return concat_columns([self, as_column(other)]) - 
- def quantile( - self, - q: np.ndarray, - interpolation: str, - exact: bool, - return_scalar: bool, - ) -> ColumnBase: - raise TypeError(f"cannot perform quantile with type {self.dtype}") - - def take( - self, indices: ColumnBase, nullify: bool = False, check_bounds=True - ) -> Self: - """Return Column by taking values from the corresponding *indices*. - - Skip bounds checking if check_bounds is False. - Set rows to null for all out of bound indices if nullify is `True`. - """ - # Handle zero size - if indices.size == 0: - return cast(Self, column_empty_like(self, newsize=0)) - - # TODO: For performance, the check and conversion of gather map should - # be done by the caller. This check will be removed in future release. - if indices.dtype.kind not in {"u", "i"}: - indices = indices.astype(libcudf.types.size_type_dtype) - if not libcudf.copying._gather_map_is_valid( - indices, len(self), check_bounds, nullify - ): - raise IndexError("Gather map index is out of bounds.") - - return libcudf.copying.gather([self], indices, nullify=nullify)[ - 0 - ]._with_type_metadata(self.dtype) - - def isin(self, values: Sequence) -> ColumnBase: - """Check whether values are contained in the Column. - - Parameters - ---------- - values : set or list-like - The sequence of values to test. Passing in a single string will - raise a TypeError. Instead, turn a single string into a list - of one element. - - Returns - ------- - result: Column - Column of booleans indicating if each element is in values. - """ - try: - lhs, rhs = self._process_values_for_isin(values) - res = lhs._isin_earlystop(rhs) - if res is not None: - return res - except ValueError: - # pandas functionally returns all False when cleansing via - # typecasting fails - return as_column(False, length=len(self), dtype="bool") - - return lhs._obtain_isin_result(rhs) - - def _process_values_for_isin( - self, values: Sequence - ) -> tuple[ColumnBase, ColumnBase]: - """ - Helper function for `isin` which pre-process `values` based on `self`. - """ - lhs = self - rhs = as_column(values, nan_as_null=False) - if lhs.null_count == len(lhs): - lhs = lhs.astype(rhs.dtype) - elif rhs.null_count == len(rhs): - rhs = rhs.astype(lhs.dtype) - return lhs, rhs - - def _isin_earlystop(self, rhs: ColumnBase) -> ColumnBase | None: - """ - Helper function for `isin` which determines possibility of - early-stopping or not. - """ - if self.dtype != rhs.dtype: - if self.null_count and rhs.null_count: - return self.isnull() - else: - return as_column(False, length=len(self), dtype="bool") - elif self.null_count == 0 and (rhs.null_count == len(rhs)): - return as_column(False, length=len(self), dtype="bool") - else: - return None - - def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: - """ - Helper function for `isin` which merges `self` & `rhs` - to determine what values of `rhs` exist in `self`. - """ - # We've already matched dtypes by now - # self.isin(other) asks "which values of self are in other" - # contains(haystack, needles) asks "which needles are in haystack" - # hence this argument ordering. - result = libcudf.search.contains(rhs, self) - if self.null_count > 0: - # If one of the needles is null, then the result contains - # nulls, these nulls should be replaced by whether or not the - # haystack contains a null. - # TODO: this is unnecessary if we resolve - # https://github.com/rapidsai/cudf/issues/14515 by - # providing a mode in which cudf::contains does not mask - # the result. 
- result = result.fillna(cudf.Scalar(rhs.null_count > 0)) - return result - - def as_mask(self) -> Buffer: - """Convert booleans to bitmask - - Returns - ------- - Buffer - """ - - if self.has_nulls(): - raise ValueError("Column must have no nulls.") - - return bools_to_mask(self) - - @property - def is_unique(self) -> bool: - # distinct_count might already be cached - return self.distinct_count(dropna=False) == len(self) - - @cached_property - def is_monotonic_increasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( - [self], [True], None - ) - - @cached_property - def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted( - [self], [False], None - ) - - def sort_values( - self: Self, - ascending: bool = True, - na_position: str = "last", - ) -> Self: - if (not ascending and self.is_monotonic_decreasing) or ( - ascending and self.is_monotonic_increasing - ): - return self.copy() - return libcudf.sort.sort( - [self], column_order=[ascending], null_precedence=[na_position] - )[0] - - def distinct_count(self, dropna: bool = True) -> int: - try: - return self._distinct_count[dropna] - except KeyError: - self._distinct_count[dropna] = cpp_distinct_count( - self, ignore_nulls=dropna - ) - return self._distinct_count[dropna] - - def can_cast_safely(self, to_dtype: Dtype) -> bool: - raise NotImplementedError() - - def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase: - if len(self) == 0: - dtype = cudf.dtype(dtype) - if self.dtype == dtype: - result = self - else: - result = column_empty(0, dtype=dtype, masked=self.nullable) - elif dtype == "category": - # TODO: Figure out why `cudf.dtype("category")` - # astype's different than just the string - result = self.as_categorical_column(dtype) - elif ( - isinstance(dtype, str) - and dtype == "interval" - and isinstance(self.dtype, cudf.IntervalDtype) - ): - # astype("interval") (the string only) should no-op - result = self - else: - was_object = dtype == object or dtype == np.dtype(object) - dtype = cudf.dtype(dtype) - if self.dtype == dtype: - result = self - elif isinstance(dtype, CategoricalDtype): - result = self.as_categorical_column(dtype) - elif isinstance(dtype, IntervalDtype): - result = self.as_interval_column(dtype) - elif isinstance(dtype, (ListDtype, StructDtype)): - if not self.dtype == dtype: - raise NotImplementedError( - f"Casting {self.dtype} columns not currently supported" - ) - result = self - elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): - result = self.as_decimal_column(dtype) - elif dtype.kind == "M": - result = self.as_datetime_column(dtype) - elif dtype.kind == "m": - result = self.as_timedelta_column(dtype) - elif dtype.kind == "O": - if cudf.get_option("mode.pandas_compatible") and was_object: - raise ValueError( - f"Casting to {dtype} is not supported, use " - "`.astype('str')` instead." - ) - result = self.as_string_column() - else: - result = self.as_numerical_column(dtype) - - if copy and result is self: - return result.copy() - return result - - def as_categorical_column(self, dtype) -> ColumnBase: - if isinstance(dtype, pd.CategoricalDtype): - dtype = cudf.CategoricalDtype.from_pandas(dtype) - if isinstance(dtype, cudf.CategoricalDtype): - ordered = dtype.ordered - else: - ordered = False - - # Re-label self w.r.t. 
the provided categories - if ( - isinstance(dtype, cudf.CategoricalDtype) - and dtype._categories is not None - ): - cat_col = dtype._categories - codes = self._label_encoding(cats=cat_col) - codes = cudf.core.column.categorical.as_unsigned_codes( - len(cat_col), codes - ) - return cudf.core.column.categorical.CategoricalColumn( - data=None, - size=None, - dtype=dtype, - mask=self.mask, - children=(codes,), - ) - - # Categories must be unique and sorted in ascending order. - cats = self.unique().sort_values().astype(self.dtype) - label_dtype = min_unsigned_type(len(cats)) - labels = self._label_encoding( - cats=cats, dtype=label_dtype, na_sentinel=cudf.Scalar(1) - ) - # columns include null index in factorization; remove: - if self.has_nulls(): - cats = cats.dropna() - - labels = cudf.core.column.categorical.as_unsigned_codes( - len(cats), labels - ) - return cudf.core.column.categorical.CategoricalColumn( - data=None, - size=None, - dtype=CategoricalDtype(categories=cats, ordered=ordered), - mask=self.mask, - children=(labels,), - ) - - def as_numerical_column( - self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": - raise NotImplementedError - - def as_datetime_column( - self, dtype: Dtype - ) -> cudf.core.column.DatetimeColumn: - raise NotImplementedError - - def as_interval_column( - self, dtype: Dtype - ) -> "cudf.core.column.IntervalColumn": - raise NotImplementedError - - def as_timedelta_column( - self, dtype: Dtype - ) -> cudf.core.column.TimeDeltaColumn: - raise NotImplementedError - - def as_string_column(self) -> cudf.core.column.StringColumn: - raise NotImplementedError - - def as_decimal_column( - self, dtype: Dtype - ) -> "cudf.core.column.decimal.DecimalBaseColumn": - raise NotImplementedError - - def apply_boolean_mask(self, mask) -> ColumnBase: - mask = as_column(mask) - if mask.dtype.kind != "b": - raise ValueError("boolean_mask is not boolean type.") - - return apply_boolean_mask([self], mask)[0]._with_type_metadata( - self.dtype - ) - - def argsort( - self, - ascending: bool = True, - na_position: Literal["first", "last"] = "last", - ) -> cudf.core.column.NumericalColumn: - if (ascending and self.is_monotonic_increasing) or ( - not ascending and self.is_monotonic_decreasing - ): - return cast( - cudf.core.column.NumericalColumn, as_column(range(len(self))) - ) - elif (ascending and self.is_monotonic_decreasing) or ( - not ascending and self.is_monotonic_increasing - ): - return cast( - cudf.core.column.NumericalColumn, - as_column(range(len(self) - 1, -1, -1)), - ) - else: - return libcudf.sort.order_by( - [self], [ascending], na_position, stable=True - ) - - def __arrow_array__(self, type=None): - raise TypeError( - "Implicit conversion to a host PyArrow Array via __arrow_array__ " - "is not allowed, To explicitly construct a PyArrow Array, " - "consider using .to_arrow()" - ) - - @property - def __cuda_array_interface__(self) -> abc.Mapping[str, Any]: - output = { - "shape": (len(self),), - "strides": (self.dtype.itemsize,), - "typestr": self.dtype.str, - "data": (self.data_ptr, False), - "version": 1, - } - - if self.nullable and self.has_nulls(): - # Create a simple Python object that exposes the - # `__cuda_array_interface__` attribute here since we need to modify - # some of the attributes from the numba device array - output["mask"] = cuda_array_interface_wrapper( - ptr=self.mask_ptr, - size=len(self), - owner=self.mask, - readonly=True, - typestr=" Self: - if not isinstance(value, ColumnBase) or value.dtype != self.dtype: - raise ValueError( - "Column 
searchsorted expects values to be column of same dtype" - ) - return libcudf.search.search_sorted( - [self], - [value], - side=side, - ascending=ascending, - na_position=na_position, - ) - - def unique(self) -> Self: - """ - Get unique values in the data - """ - if self.is_unique: - return self.copy() - else: - return drop_duplicates([self], keep="first")[ - 0 - ]._with_type_metadata(self.dtype) - - def serialize(self) -> tuple[dict, list]: - # data model: - - # Serialization produces a nested metadata "header" and a flattened - # list of memoryviews/buffers that reference data (frames). Each - # header advertises a frame_count slot which indicates how many - # frames deserialization will consume. The class used to construct - # an object is named under the key "type-serialized" to match with - # Dask's serialization protocol (see - # distributed.protocol.serialize). Since column dtypes may either be - # cudf native or foreign some special-casing is required here for - # serialization. - - header: dict[Any, Any] = {} - frames = [] - header["type-serialized"] = pickle.dumps(type(self)) - try: - dtype, dtype_frames = self.dtype.serialize() - header["dtype"] = dtype - frames.extend(dtype_frames) - header["dtype-is-cudf-serialized"] = True - except AttributeError: - header["dtype"] = pickle.dumps(self.dtype) - header["dtype-is-cudf-serialized"] = False - - if self.data is not None: - data_header, data_frames = self.data.serialize() - header["data"] = data_header - frames.extend(data_frames) - - if self.mask is not None: - mask_header, mask_frames = self.mask.serialize() - header["mask"] = mask_header - frames.extend(mask_frames) - if self.children: - child_headers, child_frames = zip( - *(c.serialize() for c in self.children) - ) - header["subheaders"] = list(child_headers) - frames.extend(chain(*child_frames)) - header["size"] = self.size - header["frame_count"] = len(frames) - return header, frames - - @classmethod - def deserialize(cls, header: dict, frames: list) -> ColumnBase: - def unpack(header, frames) -> tuple[Any, list]: - count = header["frame_count"] - klass = pickle.loads(header["type-serialized"]) - obj = klass.deserialize(header, frames[:count]) - return obj, frames[count:] - - assert header["frame_count"] == len(frames), ( - f"Deserialization expected {header['frame_count']} frames, " - f"but received {len(frames)}" - ) - if header["dtype-is-cudf-serialized"]: - dtype, frames = unpack(header["dtype"], frames) - else: - dtype = pickle.loads(header["dtype"]) - if "data" in header: - data, frames = unpack(header["data"], frames) - else: - data = None - if "mask" in header: - mask, frames = unpack(header["mask"], frames) - else: - mask = None - children = [] - if "subheaders" in header: - for h in header["subheaders"]: - child, frames = unpack(h, frames) - children.append(child) - assert len(frames) == 0, "Deserialization did not consume all frames" - return build_column( - data=data, - dtype=dtype, - mask=mask, - size=header.get("size", None), - children=tuple(children), - ) - - def unary_operator(self, unaryop: str): - raise TypeError( - f"Operation {unaryop} not supported for dtype {self.dtype}." - ) - - def nans_to_nulls(self: Self) -> Self: - """Convert NaN to NA.""" - return self - - def normalize_binop_value( - self, other: ScalarLike - ) -> ColumnBase | ScalarLike: - raise NotImplementedError - - def _reduce( - self, - op: str, - skipna: bool | None = None, - min_count: int = 0, - *args, - **kwargs, - ) -> ScalarLike: - """Compute {op} of column values. 
- - skipna : bool - Whether or not na values must be skipped. - min_count : int, default 0 - The minimum number of entries for the reduction, otherwise the - reduction returns NaN. - """ - preprocessed = self._process_for_reduction( - skipna=skipna, min_count=min_count - ) - if isinstance(preprocessed, ColumnBase): - dtype = kwargs.pop("dtype", None) - return libcudf.reduce.reduce( - op, preprocessed, dtype=dtype, **kwargs - ) - return preprocessed - - def _process_for_reduction( - self, skipna: bool | None = None, min_count: int = 0 - ) -> ColumnBase | ScalarLike: - if skipna is None: - skipna = True - - if self.has_nulls(): - if skipna: - result_col = self.dropna() - else: - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - result_col = self - - # TODO: If and when pandas decides to validate that `min_count` >= 0 we - # should insert comparable behavior. - # https://github.com/pandas-dev/pandas/issues/50022 - if min_count > 0: - valid_count = len(result_col) - result_col.null_count - if valid_count < min_count: - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - return result_col - - def _reduction_result_dtype(self, reduction_op: str) -> Dtype: - """ - Determine the correct dtype to pass to libcudf based on - the input dtype, data dtype, and specific reduction op - """ - if reduction_op in {"any", "all"}: - return np.dtype(np.bool_) - return self.dtype - - def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: - """ - Copies type metadata from self onto other, returning a new column. - - When ``self`` is a nested column, recursively apply this function on - the children of ``self``. - """ - return self - - def _label_encoding( - self, - cats: ColumnBase, - dtype: Dtype | None = None, - na_sentinel: cudf.Scalar | None = None, - ): - """ - Convert each value in `self` into an integer code, with `cats` - providing the mapping between codes and values. - - Examples - -------- - >>> from cudf.core.column import as_column - >>> col = as_column(['foo', 'bar', 'foo', 'baz']) - >>> cats = as_column(['foo', 'bar', 'baz']) - >>> col._label_encoding(cats) - - [ - 0, - 1, - 0, - 2 - ] - dtype: int8 - >>> cats = as_column(['foo', 'bar']) - >>> col._label_encoding(cats) - - [ - 0, - 1, - 0, - -1 - ] - dtype: int8 - """ - from cudf._lib.join import join as cpp_join - - if na_sentinel is None or na_sentinel.value is cudf.NA: - na_sentinel = cudf.Scalar(-1) - - def _return_sentinel_column(): - return as_column(na_sentinel, dtype=dtype, length=len(self)) - - if dtype is None: - dtype = min_signed_type(max(len(cats), na_sentinel.value), 8) - - if is_mixed_with_object_dtype(self, cats): - return _return_sentinel_column() - - try: - # Where there is a type-cast failure, we have - # to catch the exception and return encoded labels - # with na_sentinel values as there would be no corresponding - # encoded values of cats in self. 
- cats = cats.astype(self.dtype) - except ValueError: - return _return_sentinel_column() - - left_gather_map, right_gather_map = cpp_join( - [self], [cats], how="left" - ) - codes = libcudf.copying.gather( - [as_column(range(len(cats)), dtype=dtype)], - right_gather_map, - nullify=True, - ) - del right_gather_map - # reorder `codes` so that its values correspond to the - # values of `self`: - (codes,) = libcudf.sort.sort_by_key( - codes, [left_gather_map], [True], ["last"], stable=True - ) - return codes.fillna(na_sentinel.value) - - -def column_empty_like( - column: ColumnBase, - dtype: Dtype | None = None, - masked: bool = False, - newsize: int | None = None, -) -> ColumnBase: - """Allocate a new column like the given *column*""" - if dtype is None: - dtype = column.dtype - row_count = len(column) if newsize is None else newsize - - if ( - hasattr(column, "dtype") - and isinstance(column.dtype, cudf.CategoricalDtype) - and dtype == column.dtype - ): - catcolumn = cast("cudf.core.column.CategoricalColumn", column) - codes = column_empty_like( - catcolumn.codes, masked=masked, newsize=newsize - ) - return build_column( - data=None, - dtype=dtype, - mask=codes.base_mask, - children=(codes,), - size=codes.size, - ) - - return column_empty(row_count, dtype, masked) - - -def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool: - """Check if an object dtype Series or array contains NaN.""" - return any( - isinstance(x, (float, np.floating)) and np.isnan(x) - for x in np.asarray(arbitrary) - ) - - -def column_empty( - row_count: int, dtype: Dtype = "object", masked: bool = False -) -> ColumnBase: - """Allocate a new column like the given row_count and dtype.""" - dtype = cudf.dtype(dtype) - children: tuple[ColumnBase, ...] = () - - if isinstance(dtype, StructDtype): - data = None - children = tuple( - column_empty(row_count, field_dtype) - for field_dtype in dtype.fields.values() - ) - elif isinstance(dtype, ListDtype): - data = None - children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), - column_empty(row_count, dtype=dtype.element_type), - ) - elif isinstance(dtype, CategoricalDtype): - data = None - children = ( - cudf.core.column.NumericalColumn( - data=as_buffer( - rmm.DeviceBuffer( - size=row_count - * cudf.dtype(libcudf.types.size_type_dtype).itemsize - ) - ), - size=None, - dtype=libcudf.types.size_type_dtype, - ), - ) - elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): - data = as_buffer(rmm.DeviceBuffer(size=0)) - children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), - ) - else: - data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) - - if masked: - mask = create_null_mask(row_count, state=MaskState.ALL_NULL) - else: - mask = None - - return build_column( - data, dtype, mask=mask, size=row_count, children=children - ) - - -def build_column( - data: Buffer | None, - dtype: Dtype, - *, - size: int | None = None, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple[ColumnBase, ...] 
= (), -) -> ColumnBase: - """ - Build a Column of the appropriate type from the given parameters - - Parameters - ---------- - data : Buffer - The data buffer (can be None if constructing certain Column - types like StringColumn, ListColumn, or CategoricalColumn) - dtype - The dtype associated with the Column to construct - mask : Buffer, optional - The mask buffer - size : int, optional - offset : int, optional - children : tuple, optional - """ - dtype = cudf.dtype(dtype) - - if _is_non_decimal_numeric_dtype(dtype): - assert data is not None - col = cudf.core.column.NumericalColumn( - data=data, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - ) - return col - - if isinstance(dtype, CategoricalDtype): - return cudf.core.column.CategoricalColumn( - data=data, # type: ignore[arg-type] - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=children, # type: ignore[arg-type] - ) - elif dtype.type is np.datetime64: - return cudf.core.column.DatetimeColumn( - data=data, # type: ignore[arg-type] - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - ) - elif isinstance(dtype, pd.DatetimeTZDtype): - return cudf.core.column.datetime.DatetimeTZColumn( - data=data, # type: ignore[arg-type] - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - ) - elif dtype.type is np.timedelta64: - return cudf.core.column.TimeDeltaColumn( - data=data, # type: ignore[arg-type] - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - ) - elif dtype.type in (np.object_, np.str_): - return cudf.core.column.StringColumn( - data=data, - mask=mask, - size=size, - offset=offset, - children=children, - null_count=null_count, - ) - elif isinstance(dtype, ListDtype): - return cudf.core.column.ListColumn( - data=None, - size=size, # type: ignore[arg-type] - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, # type: ignore[arg-type] - ) - elif isinstance(dtype, IntervalDtype): - return cudf.core.column.IntervalColumn( - data=None, - size=size, # type: ignore[arg-type] - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, # type: ignore[arg-type] - ) - elif isinstance(dtype, StructDtype): - return cudf.core.column.StructColumn( - data=None, - size=size, # type: ignore[arg-type] - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, # type: ignore[arg-type] - ) - elif isinstance(dtype, cudf.Decimal64Dtype): - return cudf.core.column.Decimal64Column( - data=data, # type: ignore[arg-type] - size=size, # type: ignore[arg-type] - offset=offset, - dtype=dtype, - mask=mask, - null_count=null_count, - children=children, - ) - elif isinstance(dtype, cudf.Decimal32Dtype): - return cudf.core.column.Decimal32Column( - data=data, # type: ignore[arg-type] - size=size, # type: ignore[arg-type] - offset=offset, - dtype=dtype, - mask=mask, - null_count=null_count, - children=children, - ) - elif isinstance(dtype, cudf.Decimal128Dtype): - return cudf.core.column.Decimal128Column( - data=data, # type: ignore[arg-type] - size=size, # type: ignore[arg-type] - offset=offset, - dtype=dtype, - mask=mask, - null_count=null_count, - children=children, - ) - else: - raise TypeError(f"Unrecognized dtype: {dtype}") - - -def check_invalid_array(shape: tuple, dtype): - """Invalid ndarrays properties that are not supported""" - if len(shape) > 1: - raise ValueError("Data must be 
1-dimensional") - elif dtype == "float16": - raise TypeError("Unsupported type float16") - - -def as_memoryview(arbitrary: Any) -> memoryview | None: - try: - return memoryview(arbitrary) - except TypeError: - return None - - -def as_column( - arbitrary: Any, - nan_as_null: bool | None = None, - dtype: Dtype | None = None, - length: int | None = None, -) -> ColumnBase: - """Create a Column from an arbitrary object - - Parameters - ---------- - arbitrary : object - Object to construct the Column from. See *Notes*. - nan_as_null : bool, optional, default None - If None (default), treats NaN values in arbitrary as null if there is - no mask passed along with it. If True, combines the mask and NaNs to - form a new validity mask. If False, leaves NaN values as is. - Only applies when arbitrary is not a cudf object - (Index, Series, Column). - dtype : optional - Optionally typecast the constructed Column to the given - dtype. - length : int, optional - If `arbitrary` is a scalar, broadcast into a Column of - the given length. - - Returns - ------- - A Column of the appropriate type and size. - - Notes - ----- - Currently support inputs are: - - * ``Column`` - * ``Series`` - * ``Index`` - * Scalars (can be broadcasted to a specified `length`) - * Objects exposing ``__cuda_array_interface__`` (e.g., numba device arrays) - * Objects exposing ``__array_interface__``(e.g., numpy arrays) - * pyarrow array - * pandas.Categorical objects - * range objects - """ - if isinstance(arbitrary, (range, pd.RangeIndex, cudf.RangeIndex)): - column = libcudf.filling.sequence( - len(arbitrary), - as_device_scalar(arbitrary.start, dtype=cudf.dtype("int64")), - as_device_scalar(arbitrary.step, dtype=cudf.dtype("int64")), - ) - if cudf.get_option("default_integer_bitwidth") and dtype is None: - dtype = cudf.dtype( - f'i{cudf.get_option("default_integer_bitwidth")//8}' - ) - if dtype is not None: - return column.astype(dtype) - return column - elif isinstance(arbitrary, (ColumnBase, cudf.Series, cudf.BaseIndex)): - # Ignoring nan_as_null per the docstring - if isinstance(arbitrary, cudf.Series): - arbitrary = arbitrary._column - elif isinstance(arbitrary, cudf.BaseIndex): - arbitrary = arbitrary._values - if dtype is not None: - return arbitrary.astype(dtype) - return arbitrary - elif hasattr(arbitrary, "__cuda_array_interface__"): - desc = arbitrary.__cuda_array_interface__ - check_invalid_array(desc["shape"], np.dtype(desc["typestr"])) - - if desc.get("mask", None) is not None: - # Extract and remove the mask from arbitrary before - # passing to cupy.asarray - cai_copy = desc.copy() - mask = _mask_from_cuda_array_interface_desc( - arbitrary, cai_copy.pop("mask") - ) - arbitrary = SimpleNamespace(__cuda_array_interface__=cai_copy) - else: - mask = None - - arbitrary = cupy.asarray(arbitrary, order="C") - - data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write")) - col = build_column(data, dtype=arbitrary.dtype, mask=mask) - if nan_as_null or (mask is None and nan_as_null is None): - col = col.nans_to_nulls() - if dtype is not None: - col = col.astype(dtype) - return col - - elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)): - if (nan_as_null is None or nan_as_null) and pa.types.is_floating( - arbitrary.type - ): - arbitrary = pc.if_else( - pc.is_nan(arbitrary), - pa.nulls(len(arbitrary), type=arbitrary.type), - arbitrary, - ) - elif dtype is None and pa.types.is_null(arbitrary.type): - # default "empty" type - dtype = "str" - col = ColumnBase.from_arrow(arbitrary) - - if dtype is not None: - col = 
col.astype(dtype) - - return col - - elif isinstance( - arbitrary, (pd.Series, pd.Index, pd.api.extensions.ExtensionArray) - ): - if isinstance(arbitrary.dtype, (pd.SparseDtype, pd.PeriodDtype)): - raise NotImplementedError( - f"cuDF does not yet support {type(arbitrary.dtype).__name__}" - ) - elif ( - cudf.get_option("mode.pandas_compatible") - and isinstance(arbitrary, (pd.DatetimeIndex, pd.TimedeltaIndex)) - and arbitrary.freq is not None - ): - raise NotImplementedError("freq is not implemented yet") - elif isinstance(arbitrary.dtype, pd.IntervalDtype) and isinstance( - arbitrary.dtype.subtype, pd.DatetimeTZDtype - ): - raise NotImplementedError( - "cuDF does not yet support Intervals with timezone-aware datetimes" - ) - elif _is_pandas_nullable_extension_dtype(arbitrary.dtype): - if cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError("not supported") - if isinstance(arbitrary, (pd.Series, pd.Index)): - # pandas arrays define __arrow_array__ for better - # pyarrow.array conversion - arbitrary = arbitrary.array - return as_column( - pa.array(arbitrary, from_pandas=True), - nan_as_null=nan_as_null, - dtype=dtype, - length=length, - ) - elif isinstance( - arbitrary.dtype, - (pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype), - ): - if isinstance(arbitrary.dtype, pd.DatetimeTZDtype): - new_tz = get_compatible_timezone(arbitrary.dtype) - arbitrary = arbitrary.astype(new_tz) - if isinstance(arbitrary.dtype, pd.CategoricalDtype) and isinstance( - arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype - ): - new_tz = get_compatible_timezone( - arbitrary.dtype.categories.dtype - ) - new_cats = arbitrary.dtype.categories.astype(new_tz) - new_dtype = pd.CategoricalDtype( - categories=new_cats, ordered=arbitrary.dtype.ordered - ) - arbitrary = arbitrary.astype(new_dtype) - - return as_column( - pa.array(arbitrary, from_pandas=True), - nan_as_null=nan_as_null, - dtype=dtype, - length=length, - ) - elif isinstance( - arbitrary.dtype, pd.api.extensions.ExtensionDtype - ) and not isinstance(arbitrary, NumpyExtensionArray): - raise NotImplementedError( - "Custom pandas ExtensionDtypes are not supported" - ) - elif arbitrary.dtype.kind in "fiubmM": - # numpy dtype like - if isinstance(arbitrary, NumpyExtensionArray): - arbitrary = np.array(arbitrary) - arb_dtype = np.dtype(arbitrary.dtype) - if arb_dtype.kind == "f" and arb_dtype.itemsize == 2: - raise TypeError("Unsupported type float16") - elif arb_dtype.kind in "mM": - # not supported by cupy - arbitrary = np.asarray(arbitrary) - else: - arbitrary = cupy.asarray(arbitrary) - return as_column( - arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length - ) - elif arbitrary.dtype.kind == "O": - if isinstance(arbitrary, NumpyExtensionArray): - # infer_dtype does not handle NumpyExtensionArray - arbitrary = np.array(arbitrary, dtype=object) - inferred_dtype = infer_dtype(arbitrary) - if inferred_dtype in ("mixed-integer", "mixed-integer-float"): - raise MixedTypeError("Cannot create column with mixed types") - elif dtype is None and inferred_dtype not in ( - "mixed", - "decimal", - "string", - "empty", - "boolean", - ): - raise TypeError( - f"Cannot convert a {inferred_dtype} of object type" - ) - elif inferred_dtype == "boolean": - if cudf.get_option("mode.pandas_compatible"): - if dtype != np.dtype("bool") or pd.isna(arbitrary).any(): - raise MixedTypeError( - f"Cannot have mixed values with {inferred_dtype}" - ) - elif nan_as_null is False and _has_any_nan(arbitrary): - raise MixedTypeError( - f"Cannot have mixed values 
with {inferred_dtype}" - ) - elif ( - nan_as_null is False - and inferred_dtype not in ("decimal", "empty") - and _has_any_nan(arbitrary) - ): - # Decimal can hold float("nan") - # All np.nan is not restricted by type - raise MixedTypeError(f"Cannot have NaN with {inferred_dtype}") - - pyarrow_array = pa.array( - arbitrary, - from_pandas=True, - ) - return as_column( - pyarrow_array, - dtype=dtype, - nan_as_null=nan_as_null, - length=length, - ) - else: - raise NotImplementedError( - f"{type(arbitrary).__name__} with " - f"{type(arbitrary.dtype).__name__} is not supported." - ) - elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview): - if length is None: - length = 1 - elif length < 0: - raise ValueError(f"{length=} must be >=0.") - if isinstance( - arbitrary, pd.Interval - ) or cudf.api.types._is_categorical_dtype(dtype): - # No cudf.Scalar support yet - return as_column( - pd.Series([arbitrary] * length), - nan_as_null=nan_as_null, - dtype=dtype, - length=length, - ) - if ( - nan_as_null is True - and isinstance(arbitrary, (np.floating, float)) - and np.isnan(arbitrary) - ): - if dtype is None: - dtype = getattr(arbitrary, "dtype", cudf.dtype("float64")) - arbitrary = None - arbitrary = cudf.Scalar(arbitrary, dtype=dtype) - if length == 0: - return column_empty(length, dtype=arbitrary.dtype) - else: - return ColumnBase.from_scalar(arbitrary, length) - - elif hasattr(arbitrary, "__array_interface__"): - desc = arbitrary.__array_interface__ - check_invalid_array(desc["shape"], np.dtype(desc["typestr"])) - - # CUDF assumes values are always contiguous - arbitrary = np.asarray(arbitrary, order="C") - - if arbitrary.ndim == 0: - # TODO: Or treat as scalar? - arbitrary = arbitrary[np.newaxis] - - if arbitrary.dtype.kind in "OSU": - if pd.isna(arbitrary).any(): - arbitrary = pa.array(arbitrary) - else: - # Let pandas potentially infer object type - # e.g. np.array([pd.Timestamp(...)], dtype=object) -> datetime64 - arbitrary = pd.Series(arbitrary) - return as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null) - elif arbitrary.dtype.kind in "biuf": - from_pandas = nan_as_null is None or nan_as_null - return as_column( - pa.array(arbitrary, from_pandas=from_pandas), - dtype=dtype, - nan_as_null=nan_as_null, - ) - elif arbitrary.dtype.kind in "mM": - time_unit = get_time_unit(arbitrary) - if time_unit in ("D", "W", "M", "Y"): - # TODO: Raise in these cases instead of downcasting to s? - new_type = f"{arbitrary.dtype.type.__name__}[s]" - arbitrary = arbitrary.astype(new_type) - elif time_unit == "generic": - # TODO: This should probably be in cudf.dtype - raise TypeError( - f"{arbitrary.dtype.type.__name__} must have a unit specified" - ) - - is_nat = np.isnat(arbitrary) - mask = None - if is_nat.any(): - if nan_as_null is None or nan_as_null: - # Convert NaT to NA, which pyarrow does by default - return as_column( - pa.array(arbitrary), - dtype=dtype, - nan_as_null=nan_as_null, - ) - # Consider NaT as NA in the mask - # but maintain NaT as a value - bool_mask = as_column(~is_nat) - mask = as_buffer(bools_to_mask(bool_mask)) - buffer = as_buffer(arbitrary.view("|u1")) - col = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype) - if dtype: - col = col.astype(dtype) - return col - else: - raise NotImplementedError(f"{arbitrary.dtype} not supported") - elif (view := as_memoryview(arbitrary)) is not None: - return as_column( - np.asarray(view), dtype=dtype, nan_as_null=nan_as_null - ) - elif hasattr(arbitrary, "__array__"): - # e.g. 
test_cuda_array_interface_pytorch - try: - arbitrary = cupy.asarray(arbitrary) - except (ValueError, TypeError): - arbitrary = np.asarray(arbitrary) - return as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null) - elif not isinstance(arbitrary, (abc.Iterable, abc.Sequence)): - raise TypeError( - f"{type(arbitrary).__name__} must be an iterable or sequence." - ) - elif isinstance(arbitrary, abc.Iterator): - arbitrary = list(arbitrary) - - # Start of arbitrary that's not handed above but dtype provided - if isinstance(dtype, pd.DatetimeTZDtype): - raise NotImplementedError( - "Use `tz_localize()` to construct timezone aware data." - ) - elif isinstance(dtype, cudf.core.dtypes.DecimalDtype): - # Arrow throws a type error if the input is of - # mixed-precision and cannot fit into the provided - # decimal type properly, see: - # https://github.com/apache/arrow/pull/9948 - # Hence we should let the exception propagate to - # the user. - data = pa.array( - arbitrary, - type=pa.decimal128(precision=dtype.precision, scale=dtype.scale), - ) - if isinstance(dtype, cudf.core.dtypes.Decimal128Dtype): - return cudf.core.column.Decimal128Column.from_arrow(data) - elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): - return cudf.core.column.Decimal64Column.from_arrow(data) - elif isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): - return cudf.core.column.Decimal32Column.from_arrow(data) - else: - raise NotImplementedError(f"{dtype} not implemented") - elif isinstance( - dtype, - ( - pd.CategoricalDtype, - cudf.CategoricalDtype, - pd.IntervalDtype, - cudf.IntervalDtype, - ), - ) or dtype in { - "category", - "interval", - "str", - str, - np.str_, - object, - np.dtype(object), - }: - if isinstance(dtype, (cudf.CategoricalDtype, cudf.IntervalDtype)): - dtype = dtype.to_pandas() - elif dtype == object: - # Unlike pandas, interpret object as "str" instead of "python object" - dtype = "str" - ser = pd.Series(arbitrary, dtype=dtype) - return as_column(ser, nan_as_null=nan_as_null) - elif isinstance(dtype, (cudf.StructDtype, cudf.ListDtype)): - try: - data = pa.array(arbitrary, type=dtype.to_arrow()) - except (pa.ArrowInvalid, pa.ArrowTypeError): - if isinstance(dtype, cudf.ListDtype): - # e.g. test_cudf_list_struct_write - return cudf.core.column.ListColumn.from_sequences(arbitrary) - raise - return as_column(data, nan_as_null=nan_as_null) - - from_pandas = nan_as_null is None or nan_as_null - if dtype is not None: - dtype = cudf.dtype(dtype) - try: - arbitrary = pa.array( - arbitrary, - type=cudf_dtype_to_pa_type(dtype), - from_pandas=from_pandas, - ) - except (pa.ArrowInvalid, pa.ArrowTypeError): - if not isinstance(dtype, np.dtype): - dtype = dtype.to_pandas() - arbitrary = pd.Series(arbitrary, dtype=dtype) - return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) - else: - for element in arbitrary: - # Carve-outs that cannot be parsed by pyarrow/pandas - if is_column_like(element): - # e.g. test_nested_series_from_sequence_data - return cudf.core.column.ListColumn.from_sequences(arbitrary) - elif isinstance(element, cupy.ndarray): - # e.g. test_series_from_cupy_scalars - return as_column( - cupy.array(arbitrary), - dtype=dtype, - nan_as_null=nan_as_null, - length=length, - ) - elif ( - isinstance(element, (pd.Timestamp, pd.Timedelta)) - or element is pd.NaT - ): - # TODO: Remove this after - # https://github.com/apache/arrow/issues/26492 - # is fixed. 
- return as_column( - pd.Series(arbitrary), - dtype=dtype, - nan_as_null=nan_as_null, - length=length, - ) - elif not any(element is na for na in (None, pd.NA, np.nan)): - # Might have NA + element like above, but short-circuit if - # an element pyarrow/pandas might be able to parse - break - try: - arbitrary = pa.array(arbitrary, from_pandas=from_pandas) - if ( - cudf.get_option("mode.pandas_compatible") - and pa.types.is_integer(arbitrary.type) - and arbitrary.null_count > 0 - ): - arbitrary = arbitrary.cast(pa.float64()) - if ( - cudf.get_option("default_integer_bitwidth") - and pa.types.is_integer(arbitrary.type) - ) or ( - cudf.get_option("default_float_bitwidth") - and pa.types.is_floating(arbitrary.type) - ): - dtype = _maybe_convert_to_default_type( - cudf.dtype(arbitrary.type.to_pandas_dtype()) - ) - except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError): - arbitrary = pd.Series(arbitrary) - if ( - cudf.get_option("default_integer_bitwidth") - and arbitrary.dtype.kind in set("iu") - ) or ( - cudf.get_option("default_float_bitwidth") - and arbitrary.dtype.kind == "f" - ): - dtype = _maybe_convert_to_default_type(arbitrary.dtype) - return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) - - -def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: - desc = cai_mask.__cuda_array_interface__ - typestr = desc["typestr"] - typecode = typestr[1] - if typecode == "t": - mask_size = bitmask_allocation_size_bytes(desc["shape"][0]) - return as_buffer(data=desc["data"][0], size=mask_size, owner=obj) - elif typecode == "b": - col = as_column(cai_mask) - return bools_to_mask(col) - else: - raise NotImplementedError(f"Cannot infer mask from typestr {typestr}") - - -def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: - """ - Return the headers and frames resulting - from serializing a list of Column - - Parameters - ---------- - columns : list - list of Columns to serialize - - Returns - ------- - headers : list - list of header metadata for each Column - frames : list - list of frames - """ - headers: list[dict[Any, Any]] = [] - frames = [] - - if len(columns) > 0: - header_columns = [c.serialize() for c in columns] - headers, column_frames = zip(*header_columns) - for f in column_frames: - frames.extend(f) - - return headers, frames - - -def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]: - """ - Construct a list of Columns from a list of headers - and frames. - """ - columns = [] - - for meta in headers: - col_frame_count = meta["frame_count"] - col_typ = pickle.loads(meta["type-serialized"]) - colobj = col_typ.deserialize(meta, frames[:col_frame_count]) - columns.append(colobj) - # Advance frames - frames = frames[col_frame_count:] - - return columns - - -def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: - """Concatenate a sequence of columns.""" - if len(objs) == 0: - dtype = cudf.dtype(None) - return column_empty(0, dtype=dtype, masked=True) - - # If all columns are `NumericalColumn` with different dtypes, - # we cast them to a common dtype. 
- # Notice, we can always cast pure null columns - not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] - if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M" - for dtype in not_null_col_dtypes - ): - common_dtype = find_common_type(not_null_col_dtypes) - # Cast all columns to the common dtype - objs = [obj.astype(common_dtype) for obj in objs] - - # Find the first non-null column: - head = next((obj for obj in objs if obj.null_count != len(obj)), objs[0]) - - for i, obj in enumerate(objs): - # Check that all columns are the same type: - if not is_dtype_equal(obj.dtype, head.dtype): - # if all null, cast to appropriate dtype - if obj.null_count == len(obj): - objs[i] = column_empty_like( - head, dtype=head.dtype, masked=True, newsize=len(obj) - ) - else: - raise ValueError("All columns must be the same type") - - # TODO: This logic should be generalized to a dispatch to - # ColumnBase._concat so that all subclasses can override necessary - # behavior. However, at the moment it's not clear what that API should look - # like, so CategoricalColumn simply implements a minimal working API. - if all(isinstance(o.dtype, CategoricalDtype) for o in objs): - return cudf.core.column.categorical.CategoricalColumn._concat( - cast( - MutableSequence[ - cudf.core.column.categorical.CategoricalColumn - ], - objs, - ) - ) - - newsize = sum(map(len, objs)) - if newsize > libcudf.MAX_COLUMN_SIZE: - raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.MAX_COLUMN_SIZE_STR}" - ) - elif newsize == 0: - return column_empty(0, head.dtype, masked=True) - - # Filter out inputs that have 0 length, then concatenate. - return libcudf.concat.concat_columns([o for o in objs if len(o)]) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py deleted file mode 100644 index d0ea4612a1b..00000000000 --- a/python/cudf/cudf/core/column/datetime.py +++ /dev/null @@ -1,983 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
- -from __future__ import annotations - -import calendar -import datetime -import functools -import locale -import re -from locale import nl_langinfo -from typing import TYPE_CHECKING, Literal, Sequence, cast - -import numpy as np -import pandas as pd -import pyarrow as pa - -import cudf -from cudf import _lib as libcudf -from cudf._lib.labeling import label_bins -from cudf._lib.search import search_sorted -from cudf.core._compat import PANDAS_GE_220 -from cudf.core._internals.timezones import ( - check_ambiguous_and_nonexistent, - get_compatible_timezone, - get_tz_data, -) -from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase, as_column, column, string -from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion -from cudf.utils.dtypes import _get_base_dtype -from cudf.utils.utils import _all_bools_with_nulls - -if TYPE_CHECKING: - from cudf._typing import ( - ColumnBinaryOperand, - DatetimeLikeScalar, - Dtype, - ScalarLike, - ) - from cudf.core.column.numerical import NumericalColumn - -if PANDAS_GE_220: - _guess_datetime_format = pd.tseries.api.guess_datetime_format -else: - _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format - -# nanoseconds per time_unit -_dtype_to_format_conversion = { - "datetime64[ns]": "%Y-%m-%d %H:%M:%S.%9f", - "datetime64[us]": "%Y-%m-%d %H:%M:%S.%6f", - "datetime64[ms]": "%Y-%m-%d %H:%M:%S.%3f", - "datetime64[s]": "%Y-%m-%d %H:%M:%S", -} - -_DATETIME_SPECIAL_FORMATS = { - "%b", - "%B", - "%A", - "%a", -} - -_DATETIME_NAMES = [ - nl_langinfo(locale.AM_STR), # type: ignore - nl_langinfo(locale.PM_STR), # type: ignore - nl_langinfo(locale.DAY_1), - nl_langinfo(locale.DAY_2), - nl_langinfo(locale.DAY_3), - nl_langinfo(locale.DAY_4), - nl_langinfo(locale.DAY_5), - nl_langinfo(locale.DAY_6), - nl_langinfo(locale.DAY_7), - nl_langinfo(locale.ABDAY_1), - nl_langinfo(locale.ABDAY_2), - nl_langinfo(locale.ABDAY_3), - nl_langinfo(locale.ABDAY_4), - nl_langinfo(locale.ABDAY_5), - nl_langinfo(locale.ABDAY_6), - nl_langinfo(locale.ABDAY_7), - nl_langinfo(locale.MON_1), - nl_langinfo(locale.MON_2), - nl_langinfo(locale.MON_3), - nl_langinfo(locale.MON_4), - nl_langinfo(locale.MON_5), - nl_langinfo(locale.MON_6), - nl_langinfo(locale.MON_7), - nl_langinfo(locale.MON_8), - nl_langinfo(locale.MON_9), - nl_langinfo(locale.MON_10), - nl_langinfo(locale.MON_11), - nl_langinfo(locale.MON_12), - nl_langinfo(locale.ABMON_1), - nl_langinfo(locale.ABMON_2), - nl_langinfo(locale.ABMON_3), - nl_langinfo(locale.ABMON_4), - nl_langinfo(locale.ABMON_5), - nl_langinfo(locale.ABMON_6), - nl_langinfo(locale.ABMON_7), - nl_langinfo(locale.ABMON_8), - nl_langinfo(locale.ABMON_9), - nl_langinfo(locale.ABMON_10), - nl_langinfo(locale.ABMON_11), - nl_langinfo(locale.ABMON_12), -] - - -def infer_format(element: str, **kwargs) -> str: - """ - Infers datetime format from a string, also takes cares for `ms` and `ns` - """ - if not cudf.get_option("mode.pandas_compatible"): - # We allow "Z" but don't localize it to datetime64[ns, UTC] type (yet) - element = element.replace("Z", "") - fmt = _guess_datetime_format(element, **kwargs) - - if fmt is not None: - if "%z" in fmt or "%Z" in fmt: - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) - if ".%f" not in fmt: - # For context read: - # https://github.com/pandas-dev/pandas/issues/52418 - # We cannot rely on format containing only %f - # c++/libcudf expects .%3f, .%6f, .%9f - # Logic below handles those cases well. 
- return fmt - - element_parts = element.split(".") - if len(element_parts) != 2: - raise ValueError("Given date string not likely a datetime.") - - # There is possibility that the element is of following format - # '00:00:03.333333 2016-01-01' - second_parts = re.split(r"(\D+)", element_parts[1], maxsplit=1) - subsecond_fmt = ".%" + str(len(second_parts[0])) + "f" - - first_part = _guess_datetime_format(element_parts[0], **kwargs) - # For the case where first_part is '00:00:03' - if first_part is None: - tmp = "1970-01-01 " + element_parts[0] - first_part = _guess_datetime_format(tmp, **kwargs).split(" ", 1)[1] - if first_part is None: - raise ValueError("Unable to infer the timestamp format from the data") - - if len(second_parts) > 1: - # We may have a non-digit, timezone-like component - # like Z, UTC-3, +01:00 - if any(re.search(r"\D", part) for part in second_parts): - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) - second_part = "".join(second_parts[1:]) - - if len(second_part) > 1: - # Only infer if second_parts is not an empty string. - second_part = _guess_datetime_format(second_part, **kwargs) - else: - second_part = "" - - try: - fmt = first_part + subsecond_fmt + second_part - except Exception: - raise ValueError("Unable to infer the timestamp format from the data") - - return fmt - - -def _resolve_mixed_dtypes( - lhs: ColumnBinaryOperand, rhs: ColumnBinaryOperand, base_type: str -) -> Dtype: - units = ["s", "ms", "us", "ns"] - lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs) - lhs_unit = units.index(lhs_time_unit) - rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) - rhs_unit = units.index(rhs_time_unit) - return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]") - - -class DatetimeColumn(column.ColumnBase): - """ - A Column implementation for Date-time types. - - Parameters - ---------- - data : Buffer - The datetime values - dtype : np.dtype - The data type - mask : Buffer; optional - The validity mask - """ - - _VALID_BINARY_OPERATIONS = { - "__eq__", - "__ne__", - "__lt__", - "__le__", - "__gt__", - "__ge__", - "__add__", - "__sub__", - "__radd__", - "__rsub__", - } - - def __init__( - self, - data: Buffer, - size: int | None, - dtype: np.dtype | pd.DatetimeTZDtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple = (), - ): - if not isinstance(data, Buffer): - raise ValueError("data must be a Buffer.") - dtype = self._validate_dtype_instance(dtype) - if data.size % dtype.itemsize: - raise ValueError("Buffer size must be divisible by element size") - if size is None: - size = data.size // dtype.itemsize - size = size - offset - if len(children) != 0: - raise ValueError(f"{type(self).__name__} must have no children.") - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - @staticmethod - def _validate_dtype_instance(dtype: np.dtype) -> np.dtype: - if not (isinstance(dtype, np.dtype) and dtype.kind == "M"): - raise ValueError("dtype must be a datetime, numpy dtype") - return dtype - - def __contains__(self, item: ScalarLike) -> bool: - try: - ts = pd.Timestamp(item).as_unit(self.time_unit) - except Exception: - # pandas can raise a variety of errors - # item cannot exist in self. 
- return False - if ts.tzinfo is None and isinstance(self.dtype, pd.DatetimeTZDtype): - return False - elif ts.tzinfo is not None: - ts = ts.tz_convert(None) - return ts.to_numpy().astype("int64") in cast( - "cudf.core.column.NumericalColumn", self.astype("int64") - ) - - @functools.cached_property - def time_unit(self) -> str: - return np.datetime_data(self.dtype)[0] - - @property - def quarter(self) -> ColumnBase: - return libcudf.datetime.extract_quarter(self) - - @property - def year(self) -> ColumnBase: - return self.get_dt_field("year") - - @property - def month(self) -> ColumnBase: - return self.get_dt_field("month") - - @property - def day(self) -> ColumnBase: - return self.get_dt_field("day") - - @property - def hour(self) -> ColumnBase: - return self.get_dt_field("hour") - - @property - def minute(self) -> ColumnBase: - return self.get_dt_field("minute") - - @property - def second(self) -> ColumnBase: - return self.get_dt_field("second") - - @property - def weekday(self) -> ColumnBase: - return self.get_dt_field("weekday") - - @property - def dayofyear(self) -> ColumnBase: - return self.get_dt_field("day_of_year") - - @property - def day_of_year(self) -> ColumnBase: - return self.get_dt_field("day_of_year") - - @property - def is_month_start(self) -> ColumnBase: - return (self.day == 1).fillna(False) - - @property - def is_month_end(self) -> ColumnBase: - last_day_col = libcudf.datetime.last_day_of_month(self) - return (self.day == last_day_col.day).fillna(False) - - @property - def is_quarter_end(self) -> ColumnBase: - last_month = self.month.isin([3, 6, 9, 12]) - return (self.is_month_end & last_month).fillna(False) - - @property - def is_quarter_start(self) -> ColumnBase: - first_month = self.month.isin([1, 4, 7, 10]) - return (self.is_month_start & first_month).fillna(False) - - @property - def is_year_end(self) -> ColumnBase: - day_of_year = self.day_of_year - leap_dates = self.is_leap_year - - leap = day_of_year == cudf.Scalar(366) - non_leap = day_of_year == cudf.Scalar(365) - return libcudf.copying.copy_if_else(leap, non_leap, leap_dates).fillna( - False - ) - - @property - def is_leap_year(self) -> ColumnBase: - return libcudf.datetime.is_leap_year(self) - - @property - def is_year_start(self) -> ColumnBase: - return (self.day_of_year == 1).fillna(False) - - @property - def days_in_month(self) -> ColumnBase: - return libcudf.datetime.days_in_month(self) - - @property - def day_of_week(self) -> ColumnBase: - raise NotImplementedError("day_of_week is currently not implemented.") - - @property - def is_normalized(self) -> bool: - raise NotImplementedError( - "is_normalized is currently not implemented." - ) - - def to_julian_date(self) -> ColumnBase: - raise NotImplementedError( - "to_julian_date is currently not implemented." - ) - - def normalize(self) -> ColumnBase: - raise NotImplementedError("normalize is currently not implemented.") - - @property - def values(self): - """ - Return a CuPy representation of the DateTimeColumn. 
- """ - raise NotImplementedError( - "DateTime Arrays is not yet implemented in cudf" - ) - - def element_indexing(self, index: int): - result = super().element_indexing(index) - if cudf.get_option("mode.pandas_compatible"): - return pd.Timestamp(result) - return result - - def get_dt_field(self, field: str) -> ColumnBase: - return libcudf.datetime.extract_datetime_component(self, field) - - def _get_field_names( - self, - field: Literal["month", "weekday"], - labels: list[str], - locale: str | None = None, - ) -> ColumnBase: - if locale is not None: - raise NotImplementedError( - "Setting a locale is currently not supported. " - "Results will be returned in your current locale." - ) - col_labels = as_column(labels) - indices = self.get_dt_field(field) - has_nulls = indices.has_nulls() - if has_nulls: - indices = indices.fillna(len(col_labels)) - return col_labels.take(indices, nullify=True, check_bounds=has_nulls) - - def get_day_names(self, locale: str | None = None) -> ColumnBase: - return self._get_field_names( - "weekday", list(calendar.day_name), locale=locale - ) - - def get_month_names(self, locale: str | None = None) -> ColumnBase: - return self._get_field_names( - "month", list(calendar.month_name), locale=locale - ) - - def ceil(self, freq: str) -> ColumnBase: - return libcudf.datetime.ceil_datetime(self, freq) - - def floor(self, freq: str) -> ColumnBase: - return libcudf.datetime.floor_datetime(self, freq) - - def round(self, freq: str) -> ColumnBase: - return libcudf.datetime.round_datetime(self, freq) - - def isocalendar(self) -> dict[str, ColumnBase]: - return { - field: self.strftime(format=directive).astype("uint32") - for field, directive in zip( - ["year", "week", "day"], ["%G", "%V", "%u"] - ) - } - - def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: - if isinstance(other, (cudf.Scalar, ColumnBase, cudf.DateOffset)): - return other - - tz_error_msg = ( - "Cannot perform binary operation on timezone-naive columns" - " and timezone-aware timestamps." 
- ) - if isinstance(other, pd.Timestamp): - if other.tz is not None: - raise NotImplementedError(tz_error_msg) - other = other.to_datetime64() - elif isinstance(other, pd.Timedelta): - other = other.to_timedelta64() - elif isinstance(other, datetime.datetime): - if other.tzinfo is not None: - raise NotImplementedError(tz_error_msg) - other = np.datetime64(other) - elif isinstance(other, datetime.timedelta): - other = np.timedelta64(other) - - if isinstance(other, np.datetime64): - if np.isnat(other): - other_time_unit = cudf.utils.dtypes.get_time_unit(other) - if other_time_unit not in {"s", "ms", "ns", "us"}: - other_time_unit = "ns" - - return cudf.Scalar( - None, dtype=f"datetime64[{other_time_unit}]" - ) - - other = other.astype(self.dtype) - return cudf.Scalar(other) - elif isinstance(other, np.timedelta64): - other_time_unit = cudf.utils.dtypes.get_time_unit(other) - - if np.isnat(other): - return cudf.Scalar( - None, - dtype="timedelta64[ns]" - if other_time_unit not in {"s", "ms", "ns", "us"} - else other.dtype, - ) - - if other_time_unit not in {"s", "ms", "ns", "us"}: - other = other.astype("timedelta64[s]") - - return cudf.Scalar(other) - elif isinstance(other, str): - try: - return cudf.Scalar(other, dtype=self.dtype) - except ValueError: - pass - - return NotImplemented - - def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: - if dtype == self.dtype: - return self - return libcudf.unary.cast(self, dtype=dtype) - - def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] - raise TypeError( - f"cannot astype a datetimelike from {self.dtype} to {dtype}" - ) - - def as_numerical_column( - self, dtype: Dtype - ) -> cudf.core.column.NumericalColumn: - col = cudf.core.column.NumericalColumn( - data=self.base_data, # type: ignore[arg-type] - dtype=np.dtype(np.int64), - mask=self.base_mask, - offset=self.offset, - size=self.size, - ) - return cast(cudf.core.column.NumericalColumn, col.astype(dtype)) - - def strftime(self, format: str) -> cudf.core.column.StringColumn: - if len(self) == 0: - return cast( - cudf.core.column.StringColumn, - column.column_empty(0, dtype="object", masked=False), - ) - if format in _DATETIME_SPECIAL_FORMATS: - names = as_column(_DATETIME_NAMES) - else: - names = cudf.core.column.column_empty( - 0, dtype="object", masked=False - ) - return string._datetime_to_str_typecast_functions[self.dtype]( - self, format, names - ) - - def as_string_column(self) -> cudf.core.column.StringColumn: - format = _dtype_to_format_conversion.get( - self.dtype.name, "%Y-%m-%d %H:%M:%S" - ) - if cudf.get_option("mode.pandas_compatible"): - if format.endswith("f"): - sub_second_res_len = 3 - else: - sub_second_res_len = 0 - - has_nanos = ( - self.time_unit in {"ns"} - and self.get_dt_field("nanosecond").any() - ) - has_micros = ( - self.time_unit in {"ns", "us"} - and self.get_dt_field("microsecond").any() - ) - has_millis = ( - self.time_unit in {"ns", "us", "ms"} - and self.get_dt_field("millisecond").any() - ) - has_seconds = self.get_dt_field("second").any() - has_minutes = self.get_dt_field("minute").any() - has_hours = self.get_dt_field("hour").any() - if sub_second_res_len: - if has_nanos: - # format should be intact and rest of the - # following conditions shouldn't execute. 
- pass - elif has_micros: - format = format[:-sub_second_res_len] + "%6f" - elif has_millis: - format = format[:-sub_second_res_len] + "%3f" - elif has_seconds or has_minutes or has_hours: - format = format[:-4] - else: - format = format.split(" ")[0] - elif not (has_seconds or has_minutes or has_hours): - format = format.split(" ")[0] - return self.strftime(format) - - def mean(self, skipna=None, min_count: int = 0) -> ScalarLike: - return pd.Timestamp( - cast( - "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna, min_count=min_count), - unit=self.time_unit, - ).as_unit(self.time_unit) - - def std( - self, - skipna: bool | None = None, - min_count: int = 0, - ddof: int = 1, - ) -> pd.Timedelta: - return pd.Timedelta( - cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, ddof=ddof - ) - * _unit_to_nanoseconds_conversion[self.time_unit], - ).as_unit(self.time_unit) - - def median(self, skipna: bool | None = None) -> pd.Timestamp: - return pd.Timestamp( - cast( - "cudf.core.column.NumericalColumn", self.astype("int64") - ).median(skipna=skipna), - unit=self.time_unit, - ).as_unit(self.time_unit) - - def cov(self, other: DatetimeColumn) -> float: - if not isinstance(other, DatetimeColumn): - raise TypeError( - f"cannot perform cov with types {self.dtype}, {other.dtype}" - ) - return cast( - "cudf.core.column.NumericalColumn", self.astype("int64") - ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) - - def corr(self, other: DatetimeColumn) -> float: - if not isinstance(other, DatetimeColumn): - raise TypeError( - f"cannot perform corr with types {self.dtype}, {other.dtype}" - ) - return cast( - "cudf.core.column.NumericalColumn", self.astype("int64") - ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) - - def quantile( - self, - q: np.ndarray, - interpolation: str, - exact: bool, - return_scalar: bool, - ) -> ColumnBase: - result = self.astype("int64").quantile( - q=q, - interpolation=interpolation, - exact=exact, - return_scalar=return_scalar, - ) - if return_scalar: - return pd.Timestamp(result, unit=self.time_unit).as_unit( - self.time_unit - ) - return result.astype(self.dtype) - - def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: - reflect, op = self._check_reflected_op(op) - other = self._wrap_binop_normalization(other) - if other is NotImplemented: - return NotImplemented - if isinstance(other, cudf.DateOffset): - return other._datetime_binop(self, op, reflect=reflect) - - # We check this on `other` before reflection since we already know the - # dtype of `self`. - other_is_timedelta = other.dtype.kind == "m" - other_is_datetime64 = other.dtype.kind == "M" - lhs, rhs = (other, self) if reflect else (self, other) - out_dtype = None - - if ( - op - in { - "__ne__", - "__lt__", - "__gt__", - "__le__", - "__ge__", - } - and other_is_datetime64 - ): - out_dtype = cudf.dtype(np.bool_) - elif op == "__add__" and other_is_timedelta: - # The only thing we can add to a datetime is a timedelta. This - # operation is symmetric, i.e. we allow `datetime + timedelta` or - # `timedelta + datetime`. Both result in DatetimeColumns. - out_dtype = _resolve_mixed_dtypes(lhs, rhs, "datetime64") - elif op == "__sub__": - # Subtracting a datetime from a datetime results in a timedelta. - if other_is_datetime64: - out_dtype = _resolve_mixed_dtypes(lhs, rhs, "timedelta64") - # We can subtract a timedelta from a datetime, but not vice versa. 
- # Not only is subtraction antisymmetric (as is normal), it is only - # well-defined if this operation was not invoked via reflection. - elif other_is_timedelta and not reflect: - out_dtype = _resolve_mixed_dtypes(lhs, rhs, "datetime64") - elif op in { - "__eq__", - "__ne__", - "NULL_EQUALS", - "NULL_NOT_EQUALS", - }: - out_dtype = cudf.dtype(np.bool_) - if isinstance(other, ColumnBase) and not isinstance( - other, DatetimeColumn - ): - fill_value = op in ("__ne__", "NULL_NOT_EQUALS") - result = _all_bools_with_nulls( - self, other, bool_fill_value=fill_value - ) - if cudf.get_option("mode.pandas_compatible"): - result = result.fillna(fill_value) - return result - - if out_dtype is None: - return NotImplemented - - result_col = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - if out_dtype != cudf.dtype(np.bool_) and op == "__add__": - return result_col - elif cudf.get_option( - "mode.pandas_compatible" - ) and out_dtype == cudf.dtype(np.bool_): - return result_col.fillna(op == "__ne__") - else: - return result_col - - def indices_of( - self, value: ScalarLike - ) -> cudf.core.column.NumericalColumn: - value = ( - pd.to_datetime(value).to_numpy().astype(self.dtype).astype("int64") - ) - return self.astype("int64").indices_of(value) - - @property - def is_unique(self) -> bool: - return self.astype("int64").is_unique - - def isin(self, values: Sequence) -> ColumnBase: - return cudf.core.tools.datetimes._isin_datetimelike(self, values) - - def can_cast_safely(self, to_dtype: Dtype) -> bool: - if to_dtype.kind == "M": # type: ignore[union-attr] - to_res, _ = np.datetime_data(to_dtype) - self_res, _ = np.datetime_data(self.dtype) - - max_int = np.iinfo(cudf.dtype("int64")).max - - max_dist = np.timedelta64( - self.max().astype(cudf.dtype("int64"), copy=False), self_res - ) - min_dist = np.timedelta64( - self.min().astype(cudf.dtype("int64"), copy=False), self_res - ) - - self_delta_dtype = np.timedelta64(0, self_res).dtype - - if max_dist <= np.timedelta64(max_int, to_res).astype( - self_delta_dtype - ) and min_dist <= np.timedelta64(max_int, to_res).astype( - self_delta_dtype - ): - return True - else: - return False - elif to_dtype == cudf.dtype("int64") or to_dtype == cudf.dtype("O"): - # can safely cast to representation, or string - return True - else: - return False - - def _with_type_metadata(self, dtype): - if isinstance(dtype, pd.DatetimeTZDtype): - return DatetimeTZColumn( - data=self.base_data, - dtype=dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - null_count=self.null_count, - ) - return self - - def _find_ambiguous_and_nonexistent( - self, zone_name: str - ) -> tuple[NumericalColumn, NumericalColumn] | tuple[bool, bool]: - """ - Recognize ambiguous and nonexistent timestamps for the given timezone. - - Returns a tuple of columns, both of "bool" dtype and of the same - size as `self`, that respectively indicate ambiguous and - nonexistent timestamps in `self` with the value `True`. - - Ambiguous and/or nonexistent timestamps are only possible if any - transitions occur in the time zone database for the given timezone. - If no transitions occur, the tuple `(False, False)` is returned. 
- """ - transition_times, offsets = get_tz_data(zone_name) - offsets = offsets.astype(f"timedelta64[{self.time_unit}]") # type: ignore[assignment] - - if len(offsets) == 1: # no transitions - return False, False - - transition_times, offsets, old_offsets = ( - transition_times.slice(1, len(transition_times)), - offsets.slice(1, len(offsets)), - offsets.slice(0, len(offsets) - 1), - ) - - # Assume we have two clocks at the moment of transition: - # - Clock 1 is turned forward or backwards correctly - # - Clock 2 makes no changes - clock_1 = transition_times + offsets - clock_2 = transition_times + old_offsets - - # At the start of an ambiguous time period, Clock 1 (which has - # been turned back) reads less than Clock 2: - cond = clock_1 < clock_2 - ambiguous_begin = clock_1.apply_boolean_mask(cond) - - # The end of an ambiguous time period is what Clock 2 reads at - # the moment of transition: - ambiguous_end = clock_2.apply_boolean_mask(cond) - ambiguous = label_bins( - self, - left_edges=ambiguous_begin, - left_inclusive=True, - right_edges=ambiguous_end, - right_inclusive=False, - ).notnull() - - # At the start of a non-existent time period, Clock 2 reads less - # than Clock 1 (which has been turned forward): - cond = clock_1 > clock_2 - nonexistent_begin = clock_2.apply_boolean_mask(cond) - - # The end of the non-existent time period is what Clock 1 reads - # at the moment of transition: - nonexistent_end = clock_1.apply_boolean_mask(cond) - nonexistent = label_bins( - self, - left_edges=nonexistent_begin, - left_inclusive=True, - right_edges=nonexistent_end, - right_inclusive=False, - ).notnull() - - return ambiguous, nonexistent - - def tz_localize( - self, - tz: str | None, - ambiguous: Literal["NaT"] = "NaT", - nonexistent: Literal["NaT"] = "NaT", - ): - if tz is None: - return self.copy() - ambiguous, nonexistent = check_ambiguous_and_nonexistent( - ambiguous, nonexistent - ) - dtype = get_compatible_timezone(pd.DatetimeTZDtype(self.time_unit, tz)) - tzname = dtype.tz.key - ambiguous_col, nonexistent_col = self._find_ambiguous_and_nonexistent( - tzname - ) - localized = self._scatter_by_column( - self.isnull() | (ambiguous_col | nonexistent_col), - cudf.Scalar(cudf.NaT, dtype=self.dtype), - ) - - transition_times, offsets = get_tz_data(tzname) - transition_times_local = (transition_times + offsets).astype( - localized.dtype - ) - indices = ( - search_sorted([transition_times_local], [localized], "right") - 1 - ) - offsets_to_utc = offsets.take(indices, nullify=True) - gmt_data = localized - offsets_to_utc - return DatetimeTZColumn( - data=gmt_data.base_data, - dtype=dtype, - mask=localized.base_mask, - size=gmt_data.size, - offset=gmt_data.offset, - ) - - def tz_convert(self, tz: str | None): - raise TypeError( - "Cannot convert tz-naive timestamps, use tz_localize to localize" - ) - - -class DatetimeTZColumn(DatetimeColumn): - def __init__( - self, - data: Buffer, - size: int | None, - dtype: pd.DatetimeTZDtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple = (), - ): - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - @staticmethod - def _validate_dtype_instance( - dtype: pd.DatetimeTZDtype, - ) -> pd.DatetimeTZDtype: - if not isinstance(dtype, pd.DatetimeTZDtype): - raise ValueError("dtype must be a pandas.DatetimeTZDtype") - return get_compatible_timezone(dtype) - - def to_pandas( - self, - *, - nullable: bool = False, - arrow_type: 
bool = False, - ) -> pd.Index: - if arrow_type or nullable: - return super().to_pandas(nullable=nullable, arrow_type=arrow_type) - else: - return self._local_time.to_pandas().tz_localize( - self.dtype.tz, ambiguous="NaT", nonexistent="NaT" - ) - - def to_arrow(self): - return pa.compute.assume_timezone( - self._local_time.to_arrow(), str(self.dtype.tz) - ) - - @functools.cached_property - def time_unit(self) -> str: - return self.dtype.unit - - @property - def _utc_time(self): - """Return UTC time as naive timestamps.""" - return DatetimeColumn( - data=self.base_data, - dtype=_get_base_dtype(self.dtype), - mask=self.base_mask, - size=self.size, - offset=self.offset, - null_count=self.null_count, - ) - - @property - def _local_time(self): - """Return the local time as naive timestamps.""" - transition_times, offsets = get_tz_data(str(self.dtype.tz)) - transition_times = transition_times.astype(_get_base_dtype(self.dtype)) - indices = search_sorted([transition_times], [self], "right") - 1 - offsets_from_utc = offsets.take(indices, nullify=True) - return self + offsets_from_utc - - def strftime(self, format: str) -> cudf.core.column.StringColumn: - return self._local_time.strftime(format) - - def as_string_column(self) -> cudf.core.column.StringColumn: - return self._local_time.as_string_column() - - def get_dt_field(self, field: str) -> ColumnBase: - return libcudf.datetime.extract_datetime_component( - self._local_time, field - ) - - def __repr__(self): - # Arrow prints the UTC timestamps, but we want to print the - # local timestamps: - arr = self._local_time.to_arrow().cast( - pa.timestamp(self.dtype.unit, str(self.dtype.tz)) - ) - return ( - f"{object.__repr__(self)}\n" - f"{arr.to_string()}\n" - f"dtype: {self.dtype}" - ) - - def tz_localize(self, tz: str | None, ambiguous="NaT", nonexistent="NaT"): - if tz is None: - return self._local_time - ambiguous, nonexistent = check_ambiguous_and_nonexistent( - ambiguous, nonexistent - ) - raise ValueError( - "Already localized. " - "Use `tz_convert` to convert between time zones." - ) - - def tz_convert(self, tz: str | None): - if tz is None: - return self._utc_time - elif tz == str(self.dtype.tz): - return self.copy() - utc_time = self._utc_time - return type(self)( - data=utc_time.base_data, - dtype=pd.DatetimeTZDtype(self.time_unit, tz), - mask=utc_time.base_mask, - size=utc_time.size, - offset=utc_time.offset, - ) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py deleted file mode 100644 index 8803ebd6791..00000000000 --- a/python/cudf/cudf/core/column/decimal.py +++ /dev/null @@ -1,527 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from __future__ import annotations - -import warnings -from decimal import Decimal -from typing import TYPE_CHECKING, Sequence, cast - -import cupy as cp -import numpy as np -import pyarrow as pa - -import cudf -from cudf import _lib as libcudf -from cudf._lib.strings.convert.convert_fixed_point import ( - from_decimal as cpp_from_decimal, -) -from cudf.api.types import is_scalar -from cudf.core.buffer import as_buffer -from cudf.core.column import ColumnBase -from cudf.core.dtypes import ( - Decimal32Dtype, - Decimal64Dtype, - Decimal128Dtype, - DecimalDtype, -) -from cudf.core.mixins import BinaryOperand -from cudf.utils.utils import pa_mask_buffer_to_mask - -from .numerical_base import NumericalBaseColumn - -if TYPE_CHECKING: - from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike - from cudf.core.buffer import Buffer - - -class DecimalBaseColumn(NumericalBaseColumn): - """Base column for decimal32, decimal64 or decimal128 columns""" - - _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS - - def __init__( - self, - data: Buffer, - size: int, - dtype: DecimalDtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple = (), - ): - if not isinstance(size, int): - raise ValueError("Must specify an integer size") - if not isinstance(dtype, DecimalDtype): - raise ValueError(f"{dtype=} must be a DecimalDtype instance") - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - @property - def __cuda_array_interface__(self): - raise NotImplementedError( - "Decimals are not yet supported via `__cuda_array_interface__`" - ) - - def as_decimal_column( - self, - dtype: Dtype, - ) -> "DecimalBaseColumn": - if ( - isinstance(dtype, cudf.core.dtypes.DecimalDtype) - and dtype.scale < self.dtype.scale - ): - warnings.warn( - "cuDF truncates when downcasting decimals to a lower scale. " - "To round, use Series.round() or DataFrame.round()." - ) - - if dtype == self.dtype: - return self - return libcudf.unary.cast(self, dtype) - - def as_string_column(self) -> cudf.core.column.StringColumn: - if len(self) > 0: - return cpp_from_decimal(self) - else: - return cast( - cudf.core.column.StringColumn, - cudf.core.column.column_empty(0, dtype="object"), - ) - - def __pow__(self, other): - if isinstance(other, int): - if other == 0: - res = cudf.core.column.as_column( - 1, dtype=self.dtype, length=len(self) - ) - if self.nullable: - res = res.set_mask(self.mask) - return res - elif other < 0: - raise TypeError("Power of negative integers not supported.") - res = self - for _ in range(other - 1): - res = self * res - return res - else: - raise NotImplementedError( - f"__pow__ of types {self.dtype} and {type(other)} is " - "not yet implemented." - ) - - # Decimals in libcudf don't support truediv, see - # https://github.com/rapidsai/cudf/pull/7435 for explanation. - def __truediv__(self, other): - return self._binaryop(other, "__div__") - - def __rtruediv__(self, other): - return self._binaryop(other, "__rdiv__") - - def _binaryop(self, other: ColumnBinaryOperand, op: str): - reflect, op = self._check_reflected_op(op) - other = self._wrap_binop_normalization(other) - if other is NotImplemented: - return NotImplemented - lhs, rhs = (other, self) if reflect else (self, other) - - # Binary Arithmetics between decimal columns. 
`Scale` and `precision` - # are computed outside of libcudf - if op in {"__add__", "__sub__", "__mul__", "__div__"}: - output_type = _get_decimal_type(lhs.dtype, rhs.dtype, op) - lhs = lhs.astype( - type(output_type)(lhs.dtype.precision, lhs.dtype.scale) - ) - rhs = rhs.astype( - type(output_type)(rhs.dtype.precision, rhs.dtype.scale) - ) - result = libcudf.binaryop.binaryop(lhs, rhs, op, output_type) - # libcudf doesn't support precision, so result.dtype doesn't - # maintain output_type.precision - result.dtype.precision = output_type.precision - elif op in { - "__eq__", - "__ne__", - "__lt__", - "__gt__", - "__le__", - "__ge__", - }: - result = libcudf.binaryop.binaryop(lhs, rhs, op, bool) - else: - raise TypeError( - f"{op} not supported for the following dtypes: " - f"{self.dtype}, {other.dtype}" - ) - - return result - - def _validate_fillna_value( - self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: - """Align fill_value for .fillna based on column type.""" - if isinstance(fill_value, (int, Decimal)): - return cudf.Scalar(fill_value, dtype=self.dtype) - elif isinstance(fill_value, ColumnBase) and ( - isinstance(self.dtype, DecimalDtype) or self.dtype.kind in "iu" - ): - return fill_value.astype(self.dtype) - raise TypeError( - "Decimal columns only support using fillna with decimal and " - "integer values" - ) - - def normalize_binop_value(self, other): - if isinstance(other, ColumnBase): - if isinstance(other, cudf.core.column.NumericalColumn): - if other.dtype.kind not in "iu": - raise TypeError( - "Decimal columns only support binary operations with " - "integer numerical columns." - ) - other = other.astype( - self.dtype.__class__(self.dtype.__class__.MAX_PRECISION, 0) - ) - elif not isinstance(other, DecimalBaseColumn): - return NotImplemented - elif not isinstance(self.dtype, other.dtype.__class__): - # This branch occurs if we have a DecimalBaseColumn of a - # different size (e.g. 64 instead of 32). - if _same_precision_and_scale(self.dtype, other.dtype): - other = other.astype(self.dtype) - return other - if isinstance(other, cudf.Scalar) and isinstance( - # TODO: Should it be possible to cast scalars of other numerical - # types to decimal? 
- other.dtype, - cudf.core.dtypes.DecimalDtype, - ): - if _same_precision_and_scale(self.dtype, other.dtype): - other = other.astype(self.dtype) - return other - elif is_scalar(other) and isinstance(other, (int, Decimal)): - other = Decimal(other) - metadata = other.as_tuple() - precision = max(len(metadata.digits), metadata.exponent) - scale = -metadata.exponent - return cudf.Scalar( - other, dtype=self.dtype.__class__(precision, scale) - ) - return NotImplemented - - def _decimal_quantile( - self, q: float | Sequence[float], interpolation: str, exact: bool - ) -> ColumnBase: - quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q - # get sorted indices and exclude nulls - indices = libcudf.sort.order_by( - [self], [True], "first", stable=True - ).slice(self.null_count, len(self)) - result = libcudf.quantiles.quantile( - self, quant, interpolation, indices, exact - ) - return result._with_type_metadata(self.dtype) - - def as_numerical_column( - self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": - return libcudf.unary.cast(self, dtype) - - -class Decimal32Column(DecimalBaseColumn): - def __init__( - self, - data: Buffer, - size: int, - dtype: Decimal32Dtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple = (), - ): - if not isinstance(dtype, Decimal32Dtype): - raise ValueError(f"{dtype=} must be a Decimal32Dtype instance") - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - @classmethod - def from_arrow(cls, data: pa.Array): - dtype = Decimal32Dtype.from_arrow(data.type) - mask_buf = data.buffers()[0] - mask = ( - mask_buf - if mask_buf is None - else pa_mask_buffer_to_mask(mask_buf, len(data)) - ) - data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int32")) - data_32 = data_128[::4].copy() - return cls( - data=as_buffer(data_32.view("uint8")), - size=len(data), - dtype=dtype, - offset=data.offset, - mask=mask, - ) - - def to_arrow(self): - data_buf_32 = np.array(self.base_data.memoryview()).view("int32") - data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32") - - # use striding to set the first 32 bits of each 128-bit chunk: - data_buf_128[::4] = data_buf_32 - # use striding again to set the remaining bits of each 128-bit chunk: - # 0 for non-negative values, -1 for negative values: - data_buf_128[1::4] = np.piecewise( - data_buf_32, [data_buf_32 < 0], [-1, 0] - ) - data_buf_128[2::4] = np.piecewise( - data_buf_32, [data_buf_32 < 0], [-1, 0] - ) - data_buf_128[3::4] = np.piecewise( - data_buf_32, [data_buf_32 < 0], [-1, 0] - ) - data_buf = pa.py_buffer(data_buf_128) - mask_buf = ( - self.base_mask - if self.base_mask is None - else pa.py_buffer(self.base_mask.memoryview()) - ) - return pa.Array.from_buffers( - type=self.dtype.to_arrow(), - offset=self._offset, - length=self.size, - buffers=[mask_buf, data_buf], - ) - - def _with_type_metadata( - self: "cudf.core.column.Decimal32Column", dtype: Dtype - ) -> "cudf.core.column.Decimal32Column": - if isinstance(dtype, Decimal32Dtype): - self.dtype.precision = dtype.precision - - return self - - -class Decimal128Column(DecimalBaseColumn): - def __init__( - self, - data: Buffer, - size: int, - dtype: Decimal128Dtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple = (), - ): - if not isinstance(dtype, Decimal128Dtype): - raise ValueError(f"{dtype=} must be a Decimal128Dtype instance") - super().__init__( - 
data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - @classmethod - def from_arrow(cls, data: pa.Array): - result = cast(Decimal128Dtype, super().from_arrow(data)) - result.dtype.precision = data.type.precision - return result - - def to_arrow(self): - return super().to_arrow().cast(self.dtype.to_arrow()) - - def _with_type_metadata( - self: "cudf.core.column.Decimal128Column", dtype: Dtype - ) -> "cudf.core.column.Decimal128Column": - if isinstance(dtype, Decimal128Dtype): - self.dtype.precision = dtype.precision - - return self - - -class Decimal64Column(DecimalBaseColumn): - def __init__( - self, - data: Buffer, - size: int, - dtype: Decimal64Dtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple = (), - ): - if not isinstance(dtype, Decimal64Dtype): - raise ValueError(f"{dtype=} must be a Decimal64Dtype instance") - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - def __setitem__(self, key, value): - if isinstance(value, np.integer): - value = int(value) - super().__setitem__(key, value) - - @classmethod - def from_arrow(cls, data: pa.Array): - dtype = Decimal64Dtype.from_arrow(data.type) - mask_buf = data.buffers()[0] - mask = ( - mask_buf - if mask_buf is None - else pa_mask_buffer_to_mask(mask_buf, len(data)) - ) - data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int64")) - data_64 = data_128[::2].copy() - return cls( - data=as_buffer(data_64.view("uint8")), - size=len(data), - dtype=dtype, - offset=data.offset, - mask=mask, - ) - - def to_arrow(self): - data_buf_64 = np.array(self.base_data.memoryview()).view("int64") - data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64") - - # use striding to set the first 64 bits of each 128-bit chunk: - data_buf_128[::2] = data_buf_64 - # use striding again to set the remaining bits of each 128-bit chunk: - # 0 for non-negative values, -1 for negative values: - data_buf_128[1::2] = np.piecewise( - data_buf_64, [data_buf_64 < 0], [-1, 0] - ) - data_buf = pa.py_buffer(data_buf_128) - mask_buf = ( - self.base_mask - if self.base_mask is None - else pa.py_buffer(self.base_mask.memoryview()) - ) - return pa.Array.from_buffers( - type=self.dtype.to_arrow(), - offset=self._offset, - length=self.size, - buffers=[mask_buf, data_buf], - ) - - def _with_type_metadata( - self: "cudf.core.column.Decimal64Column", dtype: Dtype - ) -> "cudf.core.column.Decimal64Column": - if isinstance(dtype, Decimal64Dtype): - self.dtype.precision = dtype.precision - - return self - - -def _get_decimal_type( - lhs_dtype: DecimalDtype, - rhs_dtype: DecimalDtype, - op: str, -) -> DecimalDtype: - """ - Returns the resulting decimal type after calculating - precision & scale when performing the binary operation - `op` for the given dtypes. 
- - For precision & scale calculations see : https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - """ # noqa: E501 - - # This should at some point be hooked up to libcudf's - # binary_operation_fixed_point_scale - # Note: libcudf decimal types don't have a concept of precision - - p1, p2 = lhs_dtype.precision, rhs_dtype.precision - s1, s2 = lhs_dtype.scale, rhs_dtype.scale - - if op in {"__add__", "__sub__"}: - scale = max(s1, s2) - precision = scale + max(p1 - s1, p2 - s2) + 1 - if precision > Decimal128Dtype.MAX_PRECISION: - precision = Decimal128Dtype.MAX_PRECISION - scale = Decimal128Dtype.MAX_PRECISION - max(p1 - s1, p2 - s2) - elif op in {"__mul__", "__div__"}: - if op == "__mul__": - scale = s1 + s2 - precision = p1 + p2 + 1 - else: - scale = max(6, s1 + p2 + 1) - precision = p1 - s1 + s2 + scale - if precision > Decimal128Dtype.MAX_PRECISION: - integral = precision - scale - if integral < 32: - scale = min(scale, Decimal128Dtype.MAX_PRECISION - integral) - elif scale > 6 and integral > 32: - scale = 6 - precision = Decimal128Dtype.MAX_PRECISION - else: - raise NotImplementedError() - - try: - if isinstance(lhs_dtype, type(rhs_dtype)): - # SCENARIO 1: If `lhs_dtype` & `rhs_dtype` are same, then try to - # see if `precision` & `scale` can be fit into this type. - return lhs_dtype.__class__(precision=precision, scale=scale) - else: - # SCENARIO 2: If `lhs_dtype` & `rhs_dtype` are of different dtypes, - # then try to see if `precision` & `scale` can be fit into the type - # with greater MAX_PRECISION (i.e., the bigger dtype). - if lhs_dtype.MAX_PRECISION >= rhs_dtype.MAX_PRECISION: - return lhs_dtype.__class__(precision=precision, scale=scale) - else: - return rhs_dtype.__class__(precision=precision, scale=scale) - except ValueError: - # Call to _validate fails, which means we need - # to goto SCENARIO 3. - pass - - # SCENARIO 3: If either of the above two scenarios fail, then get the - # MAX_PRECISION of `lhs_dtype` & `rhs_dtype` so that we can only check - # and return a dtype that is greater than or equal to input dtype that - # can fit `precision` & `scale`. - max_precision = max(lhs_dtype.MAX_PRECISION, rhs_dtype.MAX_PRECISION) - for decimal_type in ( - Decimal32Dtype, - Decimal64Dtype, - Decimal128Dtype, - ): - if decimal_type.MAX_PRECISION >= max_precision: - try: - return decimal_type(precision=precision, scale=scale) - except ValueError: - # Call to _validate fails, which means we need - # to try the next dtype - continue - - # if we've reached this point, we cannot create a decimal type without - # overflow; raise an informative error - raise ValueError( - f"Performing {op} between columns of type {repr(lhs_dtype)} and " - f"{repr(rhs_dtype)} would result in overflow" - ) - - -def _same_precision_and_scale(lhs: DecimalDtype, rhs: DecimalDtype) -> bool: - return lhs.precision == rhs.precision and lhs.scale == rhs.scale diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py deleted file mode 100644 index 9147270c289..00000000000 --- a/python/cudf/cudf/core/column/interval.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
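A worked example of the addition/subtraction rule in `_get_decimal_type` above (an editorial sketch, not part of either deleted file; the helper simply restates the formula from the code, before the MAX_PRECISION clamp is applied):

    def add_sub_result_type(p1: int, s1: int, p2: int, s2: int) -> tuple[int, int]:
        # widest scale, room for every integer digit on either side,
        # plus one carry digit
        scale = max(s1, s2)
        precision = scale + max(p1 - s1, p2 - s2) + 1
        return precision, scale

    # Adding a Decimal64Dtype(10, 2) column to a Decimal64Dtype(5, 4) column:
    # scale = max(2, 4) = 4, precision = 4 + max(8, 1) + 1 = 13
    assert add_sub_result_type(10, 2, 5, 4) == (13, 4)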
-from __future__ import annotations
-
-from typing import TYPE_CHECKING, Literal
-
-import pandas as pd
-import pyarrow as pa
-
-import cudf
-from cudf.core.column import StructColumn, as_column
-from cudf.core.dtypes import IntervalDtype
-
-if TYPE_CHECKING:
-    from typing_extensions import Self
-
-    from cudf._typing import ScalarLike
-    from cudf.core.buffer import Buffer
-    from cudf.core.column import ColumnBase
-
-
-class IntervalColumn(StructColumn):
-    def __init__(
-        self,
-        data: None,
-        size: int,
-        dtype: IntervalDtype,
-        mask: Buffer | None = None,
-        offset: int = 0,
-        null_count: int | None = None,
-        children: tuple[ColumnBase, ColumnBase] = (),  # type: ignore[assignment]
-    ):
-        if len(children) != 2:
-            raise ValueError(
-                "children must be a tuple of two columns (left edges, right edges)."
-            )
-        super().__init__(
-            data=data,
-            size=size,
-            dtype=dtype,
-            mask=mask,
-            offset=offset,
-            null_count=null_count,
-            children=children,
-        )
-
-    @staticmethod
-    def _validate_dtype_instance(dtype: IntervalDtype) -> IntervalDtype:
-        if not isinstance(dtype, IntervalDtype):
-            raise ValueError("dtype must be an IntervalDtype.")
-        return dtype
-
-    @classmethod
-    def from_arrow(cls, data: pa.Array) -> Self:
-        new_col = super().from_arrow(data.storage)
-        size = len(data)
-        dtype = IntervalDtype.from_arrow(data.type)
-        mask = data.buffers()[0]
-        if mask is not None:
-            mask = cudf.utils.utils.pa_mask_buffer_to_mask(mask, len(data))
-
-        offset = data.offset
-        null_count = data.null_count
-        children = new_col.children
-
-        return cls(
-            data=None,
-            size=size,
-            dtype=dtype,
-            mask=mask,
-            offset=offset,
-            null_count=null_count,
-            children=children,  # type: ignore[arg-type]
-        )
-
-    def to_arrow(self) -> pa.Array:
-        typ = self.dtype.to_arrow()
-        struct_arrow = super().to_arrow()
-        if len(struct_arrow) == 0:
-            # struct arrow is pa.struct array with null children types
-            # we need to make sure its children have non-null type
-            struct_arrow = pa.array([], typ.storage_type)
-        return pa.ExtensionArray.from_storage(typ, struct_arrow)
-
-    @classmethod
-    def from_struct_column(
-        cls,
-        struct_column: StructColumn,
-        closed: Literal["left", "right", "both", "neither"] = "right",
-    ) -> Self:
-        first_field_name = next(iter(struct_column.dtype.fields.keys()))
-        return cls(
-            data=None,
-            size=struct_column.size,
-            dtype=IntervalDtype(
-                struct_column.dtype.fields[first_field_name], closed
-            ),
-            mask=struct_column.base_mask,
-            offset=struct_column.offset,
-            null_count=struct_column.null_count,
-            children=struct_column.base_children,  # type: ignore[arg-type]
-        )
-
-    def copy(self, deep: bool = True) -> Self:
-        struct_copy = super().copy(deep=deep)
-        return IntervalColumn(  # type: ignore[return-value]
-            data=None,
-            size=struct_copy.size,
-            dtype=IntervalDtype(
-                struct_copy.dtype.fields["left"], self.dtype.closed
-            ),
-            mask=struct_copy.base_mask,
-            offset=struct_copy.offset,
-            null_count=struct_copy.null_count,
-            children=struct_copy.base_children,  # type: ignore[arg-type]
-        )
-
-    @property
-    def is_empty(self) -> ColumnBase:
-        left_equals_right = (self.right == self.left).fillna(False)
-        not_closed_both = as_column(
-            self.dtype.closed != "both", length=len(self)
-        )
-        return left_equals_right & not_closed_both
-
-    @property
-    def is_non_overlapping_monotonic(self) -> bool:
-        raise NotImplementedError(
-            "is_non_overlapping_monotonic is currently not implemented."
-        )
-
-    @property
-    def is_overlapping(self) -> bool:
-        raise NotImplementedError(
-            "is_overlapping is currently not implemented."
- ) - - @property - def length(self) -> ColumnBase: - return self.right - self.left - - @property - def left(self) -> ColumnBase: - return self.children[0] - - @property - def mid(self) -> ColumnBase: - try: - return 0.5 * (self.left + self.right) - except TypeError: - # datetime safe version - return self.left + 0.5 * self.length - - @property - def right(self) -> ColumnBase: - return self.children[1] - - def overlaps(other) -> ColumnBase: - raise NotImplementedError("overlaps is not currently implemented.") - - def set_closed( - self, closed: Literal["left", "right", "both", "neither"] - ) -> Self: - return IntervalColumn( # type: ignore[return-value] - data=None, - size=self.size, - dtype=IntervalDtype(self.dtype.fields["left"], closed), - mask=self.base_mask, - offset=self.offset, - null_count=self.null_count, - children=self.base_children, # type: ignore[arg-type] - ) - - def as_interval_column(self, dtype: IntervalDtype) -> Self: # type: ignore[override] - if isinstance(dtype, IntervalDtype): - return IntervalColumn( # type: ignore[return-value] - data=None, - size=self.size, - dtype=dtype, - mask=self.mask, - offset=self.offset, - null_count=self.null_count, - children=tuple( # type: ignore[arg-type] - child.astype(dtype.subtype) for child in self.children - ), - ) - else: - raise ValueError("dtype must be IntervalDtype") - - def to_pandas( - self, - *, - nullable: bool = False, - arrow_type: bool = False, - ) -> pd.Index: - # Note: This does not handle null values in the interval column. - # However, this exact sequence (calling __from_arrow__ on the output of - # self.to_arrow) is currently the best known way to convert interval - # types into pandas (trying to convert the underlying numerical columns - # directly is problematic), so we're stuck with this for now. - if nullable: - return super().to_pandas(nullable=nullable, arrow_type=arrow_type) - elif arrow_type: - raise NotImplementedError(f"{arrow_type=} is not implemented.") - - pd_type = self.dtype.to_pandas() - return pd.Index(pd_type.__from_arrow__(self.to_arrow()), dtype=pd_type) - - def element_indexing(self, index: int): - result = super().element_indexing(index) - if cudf.get_option("mode.pandas_compatible"): - return pd.Interval(**result, closed=self.dtype.closed) - return result - - def _reduce( - self, - op: str, - skipna: bool | None = None, - min_count: int = 0, - *args, - **kwargs, - ) -> ScalarLike: - result = super()._reduce(op, skipna, min_count, *args, **kwargs) - if cudf.get_option("mode.pandas_compatible"): - return pd.Interval(**result, closed=self.dtype.closed) - return result diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py deleted file mode 100644 index c6a39199e3b..00000000000 --- a/python/cudf/cudf/core/column/lists.py +++ /dev/null @@ -1,772 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
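A minimal usage sketch of the interval support implemented by `IntervalColumn` above (an editorial sketch, not part of the deleted file; it assumes a working cudf installation):

    import cudf

    idx = cudf.interval_range(start=0, end=3, closed="right")  # (0, 1], (1, 2], (2, 3]
    ser = cudf.Series([10, 20, 30], index=idx)
    pdf = ser.to_pandas()   # intervals round-trip through Arrow, per to_pandas above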
- -from __future__ import annotations - -from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast - -import numpy as np -import pandas as pd -import pyarrow as pa -from typing_extensions import Self - -import cudf -from cudf._lib.copying import segmented_gather -from cudf._lib.lists import ( - concatenate_list_elements, - concatenate_rows, - contains_scalar, - count_elements, - distinct, - extract_element_column, - extract_element_scalar, - index_of_column, - index_of_scalar, - sort_lists, -) -from cudf._lib.strings.convert.convert_lists import format_list_column -from cudf._lib.types import size_type_dtype -from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar -from cudf.core.column import ColumnBase, as_column, column -from cudf.core.column.methods import ColumnMethods, ParentType -from cudf.core.column.numerical import NumericalColumn -from cudf.core.dtypes import ListDtype -from cudf.core.missing import NA - -if TYPE_CHECKING: - from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike - from cudf.core.buffer import Buffer - - -class ListColumn(ColumnBase): - _VALID_BINARY_OPERATIONS = {"__add__", "__radd__"} - - def __init__( - self, - data: None, - size: int, - dtype: ListDtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple[NumericalColumn, ColumnBase] = (), # type: ignore[assignment] - ): - if data is not None: - raise ValueError("data must be None") - if not isinstance(dtype, ListDtype): - raise ValueError("dtype must be a cudf.ListDtype") - if not ( - len(children) == 2 - and isinstance(children[0], NumericalColumn) - # TODO: Enforce int32_t (size_type) used in libcudf? - and children[0].dtype.kind == "i" - and isinstance(children[1], ColumnBase) - ): - raise ValueError( - "children must a tuple of 2 columns of (signed integer offsets, list values)" - ) - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - @cached_property - def memory_usage(self): - n = 0 - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) - - child0_size = (self.size + 1) * self.base_children[0].dtype.itemsize - current_base_child = self.base_children[1] - current_offset = self.offset - n += child0_size - while type(current_base_child) is ListColumn: - child0_size = ( - current_base_child.size + 1 - current_offset - ) * current_base_child.base_children[0].dtype.itemsize - n += child0_size - current_offset_col = current_base_child.base_children[0] - if not len(current_offset_col): - # See https://github.com/rapidsai/cudf/issues/16164 why - # offset column can be uninitialized - break - current_offset = current_offset_col.element_indexing( - current_offset - ) - current_base_child = current_base_child.base_children[1] - - n += ( - current_base_child.size - current_offset - ) * current_base_child.dtype.itemsize - - if current_base_child.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes( - current_base_child.size - ) - return n - - def __setitem__(self, key, value): - if isinstance(value, list): - value = cudf.Scalar(value) - if isinstance(value, cudf.Scalar): - if value.dtype != self.dtype: - raise TypeError("list nesting level mismatch") - elif value is NA: - value = cudf.Scalar(value, dtype=self.dtype) - else: - raise ValueError(f"Can not set {value} into ListColumn") - super().__setitem__(key, value) - - @property - def base_size(self): - # in some cases, 
libcudf will return an empty ListColumn with no - # indices; in these cases, we must manually set the base_size to 0 to - # avoid it being negative - return max(0, len(self.base_children[0]) - 1) - - def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: - # Lists only support __add__, which concatenates lists. - reflect, op = self._check_reflected_op(op) - other = self._wrap_binop_normalization(other) - if other is NotImplemented: - return NotImplemented - if isinstance(other.dtype, ListDtype): - if op == "__add__": - return concatenate_rows([self, other]) - else: - raise NotImplementedError( - "Lists concatenation for this operation is not yet" - "supported" - ) - else: - raise TypeError("can only concatenate list to list") - - @property - def elements(self) -> ColumnBase: - """ - Column containing the elements of each list (may itself be a - ListColumn) - """ - return self.children[1] - - @property - def offsets(self) -> NumericalColumn: - """ - Integer offsets to elements specifying each row of the ListColumn - """ - return cast(NumericalColumn, self.children[0]) - - def to_arrow(self): - offsets = self.offsets.to_arrow() - elements = ( - pa.nulls(len(self.elements)) - if len(self.elements) == self.elements.null_count - else self.elements.to_arrow() - ) - pa_type = pa.list_(elements.type) - - if self.nullable: - nbuf = pa.py_buffer(self.mask.memoryview()) - buffers = (nbuf, offsets.buffers()[1]) - else: - buffers = offsets.buffers() - return pa.ListArray.from_buffers( - pa_type, len(self), buffers, children=[elements] - ) - - def set_base_data(self, value): - if value is not None: - raise RuntimeError( - "ListColumn's do not use data attribute of Column, use " - "`set_base_children` instead" - ) - else: - super().set_base_data(value) - - def set_base_children(self, value: tuple[NumericalColumn, ColumnBase]): # type: ignore[override] - super().set_base_children(value) - self._dtype = cudf.ListDtype(element_type=value[1].dtype) - - @property - def __cuda_array_interface__(self): - raise NotImplementedError( - "Lists are not yet supported via `__cuda_array_interface__`" - ) - - def normalize_binop_value(self, other): - if not isinstance(other, ListColumn): - return NotImplemented - return other - - def _with_type_metadata( - self: "cudf.core.column.ListColumn", dtype: Dtype - ) -> "cudf.core.column.ListColumn": - if isinstance(dtype, ListDtype): - elements = self.base_children[1]._with_type_metadata( - dtype.element_type - ) - return ListColumn( - data=None, - dtype=dtype, - mask=self.base_mask, - size=self.size, - offset=self.offset, - null_count=self.null_count, - children=(self.base_children[0], elements), # type: ignore[arg-type] - ) - - return self - - def copy(self, deep: bool = True): - # Since list columns are immutable, both deep and shallow copies share - # the underlying device data and mask. 
- return super().copy(deep=False) - - def leaves(self): - if isinstance(self.elements, ListColumn): - return self.elements.leaves() - else: - return self.elements - - @classmethod - def from_sequences( - cls, arbitrary: Sequence[ColumnLike] - ) -> "cudf.core.column.ListColumn": - """ - Create a list column for list of column-like sequences - """ - data_col = column.column_empty(0) - mask_col = [] - offset_vals = [0] - offset = 0 - - # Build Data, Mask & Offsets - for data in arbitrary: - if cudf._lib.scalar._is_null_host_scalar(data): - mask_col.append(False) - offset_vals.append(offset) - else: - mask_col.append(True) - data_col = data_col.append(as_column(data)) - offset += len(data) - offset_vals.append(offset) - - offset_col = cast( - NumericalColumn, - column.as_column(offset_vals, dtype=size_type_dtype), - ) - - # Build ListColumn - res = cls( - data=None, - size=len(arbitrary), - dtype=cudf.ListDtype(data_col.dtype), - mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)), - offset=0, - null_count=0, - children=(offset_col, data_col), - ) - return res - - def as_string_column(self) -> cudf.core.column.StringColumn: - """ - Create a strings column from a list column - """ - lc = self._transform_leaves(lambda col: col.as_string_column()) - - # Separator strings to match the Python format - separators = as_column([", ", "[", "]"]) - - # Call libcudf to format the list column - return format_list_column(lc, separators) - - def _transform_leaves(self, func, *args, **kwargs) -> Self: - # return a new list column with the same nested structure - # as ``self``, but with the leaf column transformed - # by applying ``func`` to it - - cc: list[ListColumn] = [] - c: ColumnBase = self - - while isinstance(c, ListColumn): - cc.insert(0, c) - c = c.children[1] - - lc = func(c, *args, **kwargs) - - # Rebuild the list column replacing just the leaf child - for c in cc: - o = c.children[0] - lc = cudf.core.column.ListColumn( # type: ignore - data=None, - size=c.size, - dtype=cudf.ListDtype(lc.dtype), - mask=c.mask, - offset=c.offset, - null_count=c.null_count, - children=(o, lc), # type: ignore[arg-type] - ) - return lc - - def to_pandas( - self, - *, - nullable: bool = False, - arrow_type: bool = False, - ) -> pd.Index: - if arrow_type or nullable: - return super().to_pandas(nullable=nullable, arrow_type=arrow_type) - else: - return pd.Index(self.to_arrow().tolist(), dtype="object") - - -class ListMethods(ColumnMethods): - """ - List methods for Series - """ - - _column: ListColumn - - def __init__(self, parent: ParentType): - if not isinstance(parent.dtype, ListDtype): - raise AttributeError( - "Can only use .list accessor with a 'list' dtype" - ) - super().__init__(parent=parent) - - def get( - self, - index: int | ColumnLike, - default: ScalarLike | ColumnLike | None = None, - ) -> ParentType: - """ - Extract element at the given index from each list in a Series of lists. - - ``index`` can be an integer or a sequence of integers. If - ``index`` is an integer, the element at position ``index`` is - extracted from each list. If ``index`` is a sequence, it must - be of the same length as the Series, and ``index[i]`` - specifies the position of the element to extract from the - ``i``-th list in the Series. - - If the index is out of bounds for any list, return or, if - provided, ``default``. Thus, this method never raises an - ``IndexError``. 
- - Parameters - ---------- - index : int or sequence of ints - default : scalar, optional - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]]) - >>> s.list.get(-1) - 0 3 - 1 5 - 2 6 - dtype: int64 - - >>> s = cudf.Series([[1, 2], [3, 4, 5], [4, 5, 6]]) - >>> s.list.get(2) - 0 - 1 5 - 2 6 - dtype: int64 - - >>> s.list.get(2, default=0) - 0 0 - 1 5 - 2 6 - dtype: int64 - - >>> s.list.get([0, 1, 2]) - 0 1 - 1 4 - 2 6 - dtype: int64 - """ - if is_scalar(index): - out = extract_element_scalar(self._column, cudf.Scalar(index)) - else: - index = as_column(index) - out = extract_element_column(self._column, as_column(index)) - - if not (default is None or default is NA): - # determine rows for which `index` is out-of-bounds - lengths = count_elements(self._column) - out_of_bounds_mask = (np.negative(index) > lengths) | ( - index >= lengths - ) - - # replace the value in those rows (should be NA) with `default` - if out_of_bounds_mask.any(): - out = out._scatter_by_column( - out_of_bounds_mask, cudf.Scalar(default) - ) - if out.dtype != self._column.dtype.element_type: - # libcudf doesn't maintain struct labels so we must transfer over - # manually from the input column if we lost some information - # somewhere. Not doing this unilaterally since the cost is - # non-zero.. - out = out._with_type_metadata(self._column.dtype.element_type) - return self._return_or_inplace(out) - - def contains(self, search_key: ScalarLike) -> ParentType: - """ - Returns boolean values indicating whether the specified scalar - is an element of each row. - - Parameters - ---------- - search_key : scalar - element being searched for in each row of the list column - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]]) - >>> s.list.contains(4) - Series([False, True, True]) - dtype: bool - """ - return self._return_or_inplace( - contains_scalar(self._column, cudf.Scalar(search_key)) - ) - - def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: - """ - Returns integers representing the index of the search key for each row. - - If ``search_key`` is a sequence, it must be the same length as the - Series and ``search_key[i]`` represents the search key for the - ``i``-th row of the Series. - - If the search key is not contained in a row, -1 is returned. If either - the row or the search key are null, is returned. If the search key - is contained multiple times, the smallest matching index is returned. - - Parameters - ---------- - search_key : scalar or sequence of scalars - Element or elements being searched for in each row of the list - column - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]]) - >>> s.list.index(4) - 0 -1 - 1 1 - 2 0 - dtype: int32 - - >>> s = cudf.Series([["a", "b", "c"], ["x", "y", "z"]]) - >>> s.list.index(["b", "z"]) - 0 1 - 1 2 - dtype: int32 - - >>> s = cudf.Series([[4, 5, 6], None, [-3, -2, -1]]) - >>> s.list.index([None, 3, -2]) - 0 - 1 - 2 1 - dtype: int32 - """ - - if is_scalar(search_key): - return self._return_or_inplace( - index_of_scalar(self._column, cudf.Scalar(search_key)) - ) - else: - return self._return_or_inplace( - index_of_column(self._column, as_column(search_key)) - ) - - @property - def leaves(self) -> ParentType: - """ - From a Series of (possibly nested) lists, obtain the elements from - the innermost lists as a flat Series (one value per row). 
- - Returns - ------- - Series or Index - - Examples - -------- - >>> a = cudf.Series([[[1, None], [3, 4]], None, [[5, 6]]]) - >>> a.list.leaves - 0 1 - 1 - 2 3 - 3 4 - 4 5 - 5 6 - dtype: int64 - """ - return self._return_or_inplace( - self._column.leaves(), retain_index=False - ) - - def len(self) -> ParentType: - """ - Computes the length of each element in the Series/Index. - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = cudf.Series([[1, 2, 3], None, [4, 5]]) - >>> s - 0 [1, 2, 3] - 1 None - 2 [4, 5] - dtype: list - >>> s.list.len() - 0 3 - 1 - 2 2 - dtype: int32 - """ - return self._return_or_inplace(count_elements(self._column)) - - def take(self, lists_indices: ColumnLike) -> ParentType: - """ - Collect list elements based on given indices. - - Parameters - ---------- - lists_indices: Series-like of lists - Specifies what to collect from each row - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = cudf.Series([[1, 2, 3], None, [4, 5]]) - >>> s - 0 [1, 2, 3] - 1 None - 2 [4, 5] - dtype: list - >>> s.list.take([[0, 1], [], []]) - 0 [1, 2] - 1 None - 2 [] - dtype: list - """ - - lists_indices_col = as_column(lists_indices) - if not isinstance(lists_indices_col, ListColumn): - raise ValueError("lists_indices should be list type array.") - if not lists_indices_col.size == self._column.size: - raise ValueError( - "lists_indices and list column is of different " "size." - ) - if ( - not _is_non_decimal_numeric_dtype( - lists_indices_col.children[1].dtype - ) - or lists_indices_col.children[1].dtype.kind not in "iu" - ): - raise TypeError( - "lists_indices should be column of values of index types." - ) - - return self._return_or_inplace( - segmented_gather(self._column, lists_indices_col) - ) - - def unique(self) -> ParentType: - """ - Returns the unique elements in each list. - The ordering of elements is not guaranteed. - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []]) - >>> s - 0 [1.0, 1.0, 2.0, nan, nan] - 1 None - 2 [4.0, 4.0] - 3 [] - dtype: list - >>> s.list.unique() # Order of list element is not guaranteed - 0 [1.0, 2.0, nan] - 1 None - 2 [4.0] - 3 [] - dtype: list - """ - - if isinstance(self._column.children[1].dtype, ListDtype): - raise NotImplementedError("Nested lists unique is not supported.") - - return self._return_or_inplace( - distinct(self._column, nulls_equal=True, nans_all_equal=True) - ) - - def sort_values( - self, - ascending: bool = True, - inplace: bool = False, - kind: str = "quicksort", - na_position: str = "last", - ignore_index: bool = False, - ) -> ParentType: - """ - Sort each list by the values. - - Sort the lists in ascending or descending order by some criterion. - - Parameters - ---------- - ascending : bool, default True - If True, sort values in ascending order, otherwise descending. - na_position : {'first', 'last'}, default 'last' - 'first' puts nulls at the beginning, 'last' puts nulls at the end. - ignore_index : bool, default False - If True, the resulting axis will be labeled 0, 1, ..., n - 1. - - Returns - ------- - Series or Index with each list sorted - - Examples - -------- - >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) - >>> s.list.sort_values(ascending=True, na_position="last") - 0 [2.0, 4.0, 9.0, nan] - 1 [2.0, 8.0, 8.0] - 2 [1.0, 2.0] - dtype: list - - .. 
pandas-compat:: - `pandas.Series.list.sort_values` - - This method does not exist in pandas but it can be run - as: - - >>> import pandas as pd - >>> s = pd.Series([[3, 2, 1], [2, 4, 3]]) - >>> print(s.apply(sorted)) - 0 [1, 2, 3] - 1 [2, 3, 4] - dtype: object - """ - if inplace: - raise NotImplementedError("`inplace` not currently implemented.") - if kind != "quicksort": - raise NotImplementedError("`kind` not currently implemented.") - if na_position not in {"first", "last"}: - raise ValueError(f"Unknown `na_position` value {na_position}") - if isinstance(self._column.children[1].dtype, ListDtype): - raise NotImplementedError("Nested lists sort is not supported.") - - return self._return_or_inplace( - sort_lists(self._column, ascending, na_position), - retain_index=not ignore_index, - ) - - def concat(self, dropna=True) -> ParentType: - """ - For a column with at least one level of nesting, concatenate the - lists in each row. - - Parameters - ---------- - dropna: bool, optional - If True (default), ignores top-level null elements in each row. - If False, and top-level null elements are present, the resulting - row in the output is null. - - Returns - ------- - Series or Index - - Examples - -------- - >>> s1 - 0 [[1.0, 2.0], [3.0, 4.0, 5.0]] - 1 [[6.0, None], [7.0], [8.0, 9.0]] - dtype: list - >>> s1.list.concat() - 0 [1.0, 2.0, 3.0, 4.0, 5.0] - 1 [6.0, None, 7.0, 8.0, 9.0] - dtype: list - - Null values at the top-level in each row are dropped by default: - - >>> s2 - 0 [[1.0, 2.0], None, [3.0, 4.0, 5.0]] - 1 [[6.0, None], [7.0], [8.0, 9.0]] - dtype: list - >>> s2.list.concat() - 0 [1.0, 2.0, 3.0, 4.0, 5.0] - 1 [6.0, None, 7.0, 8.0, 9.0] - dtype: list - - Use ``dropna=False`` to produce a null instead: - - >>> s2.list.concat(dropna=False) - 0 None - 1 [6.0, nan, 7.0, 8.0, 9.0] - dtype: list - """ - return self._return_or_inplace( - concatenate_list_elements(self._column, dropna=dropna) - ) - - def astype(self, dtype): - """ - Return a new list Series with the leaf values casted - to the specified data type. - - Parameters - ---------- - dtype: data type to cast leaves values to - - Returns - ------- - A new Series of lists - - Examples - -------- - >>> s = cudf.Series([[1, 2], [3, 4]]) - >>> s.dtype - ListDtype(int64) - >>> s2 = s.list.astype("float64") - >>> s2.dtype - ListDtype(float64) - """ - return self._return_or_inplace( - self._column._transform_leaves( - lambda col, dtype: col.astype(dtype), dtype - ) - ) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py deleted file mode 100644 index 05a0ab2e09a..00000000000 --- a/python/cudf/cudf/core/column/methods.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -from typing import Union, overload - -from typing_extensions import Literal - -import cudf -import cudf.core.column -import cudf.core.column_accessor -from cudf.utils.utils import NotIterable - -ParentType = Union["cudf.Series", "cudf.core.index.Index"] - - -class ColumnMethods(NotIterable): - _parent: ParentType - - def __init__(self, parent: ParentType): - self._parent = parent - self._column = self._parent._column - - @overload - def _return_or_inplace( - self, - new_col, - inplace: Literal[True], - expand: bool = False, - retain_index: bool = True, - ) -> None: ... - - @overload - def _return_or_inplace( - self, - new_col, - inplace: Literal[False], - expand: bool = False, - retain_index: bool = True, - ) -> ParentType: ... 
- - @overload - def _return_or_inplace( - self, - new_col, - expand: bool = False, - retain_index: bool = True, - ) -> ParentType: ... - - @overload - def _return_or_inplace( - self, - new_col, - inplace: bool = False, - expand: bool = False, - retain_index: bool = True, - ) -> ParentType | None: ... - - def _return_or_inplace( - self, new_col, inplace=False, expand=False, retain_index=True - ): - """ - Returns an object of the type of the column owner or updates the column - of the owner (Series or Index) to mimic an inplace operation - """ - if inplace: - self._parent._mimic_inplace( - type(self._parent)._from_column( - new_col, name=self._parent.name - ), - inplace=True, - ) - return None - else: - if expand: - # This branch indicates the passed as new_col - # is a Table - table = new_col - - if isinstance(self._parent, cudf.BaseIndex): - idx = self._parent._constructor_expanddim._from_data(table) - idx.names = None - return idx - else: - return self._parent._constructor_expanddim._from_data( - data=table, index=self._parent.index - ) - elif isinstance(self._parent, cudf.Series): - return cudf.Series._from_column( - new_col, - name=self._parent.name, - index=self._parent.index if retain_index else None, - ) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index._from_column(new_col, name=self._parent.name) - else: - return self._parent._mimic_inplace(new_col, inplace=False) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py deleted file mode 100644 index 78d2814ed26..00000000000 --- a/python/cudf/cudf/core/column/numerical.py +++ /dev/null @@ -1,789 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import functools -from typing import TYPE_CHECKING, Any, Sequence, cast - -import numpy as np -import pandas as pd -from typing_extensions import Self - -import pylibcudf - -import cudf -from cudf import _lib as libcudf -from cudf.api.types import is_integer, is_scalar -from cudf.core.column import ColumnBase, as_column, column, string -from cudf.core.dtypes import CategoricalDtype -from cudf.core.mixins import BinaryOperand -from cudf.errors import MixedTypeError -from cudf.utils.dtypes import ( - find_common_type, - min_column_type, - min_signed_type, - np_dtypes_to_pandas_dtypes, -) - -from .numerical_base import NumericalBaseColumn - -if TYPE_CHECKING: - from collections.abc import Callable - - from cudf._typing import ( - ColumnBinaryOperand, - ColumnLike, - Dtype, - DtypeObj, - ScalarLike, - ) - from cudf.core.buffer import Buffer - -_unaryop_map = { - "ASIN": "ARCSIN", - "ACOS": "ARCCOS", - "ATAN": "ARCTAN", - "INVERT": "BIT_INVERT", -} - - -class NumericalColumn(NumericalBaseColumn): - """ - A Column object for Numeric types. - - Parameters - ---------- - data : Buffer - dtype : np.dtype - The dtype associated with the data Buffer - mask : Buffer, optional - """ - - _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS - - def __init__( - self, - data: Buffer, - size: int | None, - dtype: np.dtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple = (), - ): - if not (isinstance(dtype, np.dtype) and dtype.kind in "iufb"): - raise ValueError( - "dtype must be a floating, integer or boolean numpy dtype." 
- ) - - if data.size % dtype.itemsize: - raise ValueError("Buffer size must be divisible by element size") - if size is None: - size = (data.size // dtype.itemsize) - offset - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - def _clear_cache(self): - super()._clear_cache() - try: - del self.nan_count - except AttributeError: - pass - - def __contains__(self, item: ScalarLike) -> bool: - """ - Returns True if column contains item, else False. - """ - # Handles improper item types - # Fails if item is of type None, so the handler. - try: - search_item = self.dtype.type(item) - if search_item != item and self.dtype.kind != "f": - return False - except (TypeError, ValueError): - return False - # TODO: Use `scalar`-based `contains` wrapper - return libcudf.search.contains( - self, column.as_column([search_item], dtype=self.dtype) - ).any() - - def indices_of(self, value: ScalarLike) -> NumericalColumn: - if isinstance(value, (bool, np.bool_)) and self.dtype.kind != "b": - raise ValueError( - f"Cannot use a {type(value).__name__} to find an index of " - f"a {self.dtype} Index." - ) - if ( - value is not None - and self.dtype.kind in {"c", "f"} - and np.isnan(value) - ): - nan_col = libcudf.unary.is_nan(self) - return nan_col.indices_of(True) - else: - return super().indices_of(value) - - def has_nulls(self, include_nan: bool = False) -> bool: - return bool(self.null_count != 0) or ( - include_nan and bool(self.nan_count != 0) - ) - - def __setitem__(self, key: Any, value: Any): - """ - Set the value of ``self[key]`` to ``value``. - - If ``value`` and ``self`` are of different types, ``value`` is coerced - to ``self.dtype``. - """ - - # Normalize value to scalar/column - device_value: cudf.Scalar | ColumnBase = ( - cudf.Scalar( - value, - dtype=self.dtype - if cudf._lib.scalar._is_null_host_scalar(value) - else None, - ) - if is_scalar(value) - else as_column(value) - ) - - if self.dtype.kind != "b" and device_value.dtype.kind == "b": - raise TypeError(f"Invalid value {value} for dtype {self.dtype}") - else: - device_value = device_value.astype(self.dtype) - - out: ColumnBase | None # If None, no need to perform mimic inplace. 
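    # (Editorial sketch, not part of the original file.) The two branches below
    # roughly correspond to, e.g.:
    #   ser[1:3] = 0      -> slice key, routed to _scatter_by_slice
    #   ser[[0, 2]] = 0   -> key coerced to a column, routed to _scatter_by_column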
- if isinstance(key, slice): - out = self._scatter_by_slice(key, device_value) - else: - key = as_column( - key, - dtype="float64" - if isinstance(key, list) and len(key) == 0 - else None, - ) - if not isinstance(key, cudf.core.column.NumericalColumn): - raise ValueError(f"Invalid scatter map type {key.dtype}.") - out = self._scatter_by_column(key, device_value) - - if out: - self._mimic_inplace(out, inplace=True) - - def unary_operator(self, unaryop: str | Callable) -> ColumnBase: - if callable(unaryop): - return libcudf.transform.transform(self, unaryop) - - unaryop = unaryop.upper() - unaryop = _unaryop_map.get(unaryop, unaryop) - unaryop = pylibcudf.unary.UnaryOperator[unaryop] - return libcudf.unary.unary_operation(self, unaryop) - - def __invert__(self): - if self.dtype.kind in "ui": - return self.unary_operator("invert") - elif self.dtype.kind == "b": - return self.unary_operator("not") - else: - return super().__invert__() - - def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: - int_float_dtype_mapping = { - np.int8: np.float32, - np.int16: np.float32, - np.int32: np.float32, - np.int64: np.float64, - np.uint8: np.float32, - np.uint16: np.float32, - np.uint32: np.float64, - np.uint64: np.float64, - np.bool_: np.float32, - } - - out_dtype = None - if op in {"__truediv__", "__rtruediv__"}: - # Division with integer types results in a suitable float. - if truediv_type := int_float_dtype_mapping.get(self.dtype.type): - return self.astype(truediv_type)._binaryop(other, op) - elif op in { - "__lt__", - "__gt__", - "__le__", - "__ge__", - "__eq__", - "__ne__", - }: - out_dtype = "bool" - - # If `other` is a Python integer and it is out-of-bounds - # promotion could fail but we can trivially define the result - # in terms of `notnull` or `NULL_NOT_EQUALS`. - if type(other) is int and self.dtype.kind in "iu": # noqa: E721 - truthiness = None - iinfo = np.iinfo(self.dtype) - if iinfo.min > other: - truthiness = op in {"__ne__", "__gt__", "__ge__"} - elif iinfo.max < other: - truthiness = op in {"__ne__", "__lt__", "__le__"} - - # Compare with minimum value so that the result is true/false - if truthiness is True: - other = iinfo.min - op = "__ge__" - elif truthiness is False: - other = iinfo.min - op = "__lt__" - - elif op in {"NULL_EQUALS", "NULL_NOT_EQUALS"}: - out_dtype = "bool" - - reflect, op = self._check_reflected_op(op) - if (other := self._wrap_binop_normalization(other)) is NotImplemented: - return NotImplemented - - if out_dtype is not None: - pass # out_dtype was already set to bool - if other is None: - # not a binary operator, so no need to promote - out_dtype = self.dtype - elif out_dtype is None: - out_dtype = np.result_type(self.dtype, other.dtype) - if op in {"__mod__", "__floordiv__"}: - tmp = self if reflect else other - # Guard against division by zero for integers. 
- if ( - tmp.dtype.type in int_float_dtype_mapping - and tmp.dtype.kind != "b" - ): - if isinstance(tmp, NumericalColumn) and 0 in tmp: - out_dtype = cudf.dtype("float64") - elif isinstance(tmp, cudf.Scalar): - if tmp.is_valid() and tmp == 0: - # tmp == 0 can return NA - out_dtype = cudf.dtype("float64") - elif is_scalar(tmp) and tmp == 0: - out_dtype = cudf.dtype("float64") - - if op in {"__and__", "__or__", "__xor__"}: - if self.dtype.kind == "f" or other.dtype.kind == "f": - raise TypeError( - f"Operation 'bitwise {op[2:-2]}' not supported between " - f"{self.dtype.type.__name__} and " - f"{other.dtype.type.__name__}" - ) - if self.dtype.kind == "b" or other.dtype.kind == "b": - out_dtype = "bool" - - elif ( - op == "__pow__" - and self.dtype.kind in "iu" - and (is_integer(other) or other.dtype.kind in "iu") - ): - op = "INT_POW" - - lhs, rhs = (other, self) if reflect else (self, other) - - return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - - def nans_to_nulls(self: Self) -> Self: - # Only floats can contain nan. - if self.dtype.kind != "f" or self.nan_count == 0: - return self - newmask = libcudf.transform.nans_to_nulls(self) - return self.set_mask(newmask) - - def normalize_binop_value( - self, other: ScalarLike - ) -> ColumnBase | cudf.Scalar: - if isinstance(other, ColumnBase): - if not isinstance(other, NumericalColumn): - return NotImplemented - return other - if isinstance(other, cudf.Scalar): - if self.dtype == other.dtype: - return other - - # expensive device-host transfer just to - # adjust the dtype - other = other.value - - # NumPy 2 needs a Python scalar to do weak promotion, but - # pandas forces weak promotion always - # TODO: We could use 0, 0.0, and 0j for promotion to avoid copies. - if other.dtype.kind in "ifc": - other = other.item() - elif not isinstance(other, (int, float, complex)): - # Go via NumPy to get the value - other = np.array(other) - if other.dtype.kind in "ifc": - other = other.item() - - # Try and match pandas and hence numpy. Deduce the common - # dtype via the _value_ of other, and the dtype of self on NumPy 1.x - # with NumPy 2, we force weak promotion even for our/NumPy scalars - # to match pandas 2.2. 
- # Weak promotion is not at all simple: - # np.result_type(0, np.uint8) - # => np.uint8 - # np.result_type(np.asarray([0], dtype=np.int64), np.uint8) - # => np.int64 - # np.promote_types(np.int64(0), np.uint8) - # => np.int64 - # np.promote_types(np.asarray([0], dtype=np.int64).dtype, np.uint8) - # => np.int64 - common_dtype = np.result_type(self.dtype, other) - if common_dtype.kind in {"b", "i", "u", "f"}: - if self.dtype.kind == "b": - common_dtype = min_signed_type(other) - return cudf.Scalar(other, dtype=common_dtype) - else: - return NotImplemented - - def int2ip(self) -> "cudf.core.column.StringColumn": - if self.dtype != cudf.dtype("uint32"): - raise TypeError("Only uint32 type can be converted to ip") - - return libcudf.string_casting.int2ip(self) - - def as_string_column(self) -> cudf.core.column.StringColumn: - if len(self) > 0: - return string._numeric_to_str_typecast_functions[ - cudf.dtype(self.dtype) - ](self) - else: - return cast( - cudf.core.column.StringColumn, - column.column_empty(0, dtype="object"), - ) - - def as_datetime_column( - self, dtype: Dtype - ) -> cudf.core.column.DatetimeColumn: - return cudf.core.column.DatetimeColumn( - data=self.astype("int64").base_data, # type: ignore[arg-type] - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, - ) - - def as_timedelta_column( - self, dtype: Dtype - ) -> cudf.core.column.TimeDeltaColumn: - return cudf.core.column.TimeDeltaColumn( - data=self.astype("int64").base_data, # type: ignore[arg-type] - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - size=self.size, - ) - - def as_decimal_column( - self, dtype: Dtype - ) -> "cudf.core.column.DecimalBaseColumn": - return libcudf.unary.cast(self, dtype) - - def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: - dtype = cudf.dtype(dtype) - if dtype == self.dtype: - return self - return libcudf.unary.cast(self, dtype) - - def all(self, skipna: bool = True) -> bool: - # If all entries are null the result is True, including when the column - # is empty. - result_col = self.nans_to_nulls() if skipna else self - - if result_col.null_count == result_col.size: - return True - - return libcudf.reduce.reduce("all", result_col) - - def any(self, skipna: bool = True) -> bool: - # Early exit for fast cases. - result_col = self.nans_to_nulls() if skipna else self - - if not skipna and result_col.has_nulls(): - return True - elif skipna and result_col.null_count == result_col.size: - return False - - return libcudf.reduce.reduce("any", result_col) - - @functools.cached_property - def nan_count(self) -> int: - if self.dtype.kind != "f": - return 0 - nan_col = libcudf.unary.is_nan(self) - return nan_col.sum() - - def _process_values_for_isin( - self, values: Sequence - ) -> tuple[ColumnBase, ColumnBase]: - lhs = cast("cudf.core.column.ColumnBase", self) - try: - rhs = as_column(values, nan_as_null=False) - except (MixedTypeError, TypeError) as e: - # There is a corner where `values` can be of `object` dtype - # but have values of homogeneous type. 
- inferred_dtype = cudf.api.types.infer_dtype(values) - if ( - self.dtype.kind in {"i", "u"} and inferred_dtype == "integer" - ) or ( - self.dtype.kind == "f" - and inferred_dtype in {"floating", "integer"} - ): - rhs = as_column(values, nan_as_null=False, dtype=self.dtype) - elif self.dtype.kind == "f" and inferred_dtype == "integer": - rhs = as_column(values, nan_as_null=False, dtype="int") - elif ( - self.dtype.kind in {"i", "u"} and inferred_dtype == "floating" - ): - rhs = as_column(values, nan_as_null=False, dtype="float") - else: - raise e - else: - if isinstance(rhs, NumericalColumn): - rhs = rhs.astype(dtype=self.dtype) - - if lhs.null_count == len(lhs): - lhs = lhs.astype(rhs.dtype) - elif rhs.null_count == len(rhs): - rhs = rhs.astype(lhs.dtype) - - return lhs, rhs - - def _can_return_nan(self, skipna: bool | None = None) -> bool: - return not skipna and self.has_nulls(include_nan=True) - - def _process_for_reduction( - self, skipna: bool | None = None, min_count: int = 0 - ) -> NumericalColumn | ScalarLike: - skipna = True if skipna is None else skipna - - if self._can_return_nan(skipna=skipna): - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - col = self.nans_to_nulls() if skipna else self - return super(NumericalColumn, col)._process_for_reduction( - skipna=skipna, min_count=min_count - ) - - def find_and_replace( - self, - to_replace: ColumnLike, - replacement: ColumnLike, - all_nan: bool = False, - ) -> NumericalColumn: - """ - Return col with *to_replace* replaced with *value*. - """ - - # If all of `to_replace`/`replacement` are `None`, - # dtype of `to_replace_col`/`replacement_col` - # is inferred as `string`, but this is a valid - # float64 column too, Hence we will need to type-cast - # to self.dtype. - to_replace_col = column.as_column(to_replace) - if to_replace_col.null_count == len(to_replace_col): - to_replace_col = to_replace_col.astype(self.dtype) - - replacement_col = column.as_column(replacement) - if replacement_col.null_count == len(replacement_col): - replacement_col = replacement_col.astype(self.dtype) - - if not isinstance(to_replace_col, type(replacement_col)): - raise TypeError( - f"to_replace and value should be of same types," - f"got to_replace dtype: {to_replace_col.dtype} and " - f"value dtype: {replacement_col.dtype}" - ) - - if not isinstance(to_replace_col, NumericalColumn) and not isinstance( - replacement_col, NumericalColumn - ): - return self.copy() - - to_replace_col = _normalize_find_and_replace_input( - self.dtype, to_replace - ) - if all_nan: - replacement_col = column.as_column(replacement, dtype=self.dtype) - else: - replacement_col = _normalize_find_and_replace_input( - self.dtype, replacement - ) - if len(replacement_col) == 1 and len(to_replace_col) > 1: - replacement_col = column.as_column( - replacement[0], length=len(to_replace_col), dtype=self.dtype - ) - elif len(replacement_col) == 1 and len(to_replace_col) == 0: - return self.copy() - common_type = find_common_type( - (to_replace_col.dtype, replacement_col.dtype, self.dtype) - ) - replaced = self.astype(common_type) - df = cudf.DataFrame._from_data( - { - "old": to_replace_col.astype(common_type), - "new": replacement_col.astype(common_type), - } - ) - df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) - if df._data["old"].null_count == 1: - replaced = replaced.fillna( - df._data["new"] - .apply_boolean_mask(df._data["old"].isnull()) - .element_indexing(0) - ) - df = df.dropna(subset=["old"]) - - return libcudf.replace.replace( - replaced, 
df._data["old"], df._data["new"] - ) - - def _validate_fillna_value( - self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: - """Align fill_value for .fillna based on column type.""" - if is_scalar(fill_value): - cudf_obj: cudf.Scalar | ColumnBase = cudf.Scalar(fill_value) - if not as_column(cudf_obj).can_cast_safely(self.dtype): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(fill_value).__name__} to {self.dtype.name}" - ) - else: - cudf_obj = as_column(fill_value, nan_as_null=False) - if not cudf_obj.can_cast_safely(self.dtype): # type: ignore[attr-defined] - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{cudf_obj.dtype.type.__name__} to " - f"{self.dtype.type.__name__}" - ) - return cudf_obj.astype(self.dtype) - - def can_cast_safely(self, to_dtype: DtypeObj) -> bool: - """ - Returns true if all the values in self can be - safely cast to dtype - """ - if self.dtype.kind == to_dtype.kind: - if self.dtype <= to_dtype: - return True - else: - # Kinds are the same but to_dtype is smaller - if "float" in to_dtype.name: - finfo = np.finfo(to_dtype) - lower_, upper_ = finfo.min, finfo.max - elif "int" in to_dtype.name: - iinfo = np.iinfo(to_dtype) - lower_, upper_ = iinfo.min, iinfo.max - - if self.dtype.kind == "f": - # Exclude 'np.inf', '-np.inf' - not_inf = (self != np.inf) & (self != -np.inf) - col = self.apply_boolean_mask(not_inf) - else: - col = self - - min_ = col.min() - # TODO: depending on implementation of cudf scalar and future - # refactor of min/max, change the test method - if np.isnan(min_): - # Column contains only infs - return True - - return (min_ >= lower_) and (col.max() < upper_) - - # want to cast int to uint - elif self.dtype.kind == "i" and to_dtype.kind == "u": - i_max_ = np.iinfo(self.dtype).max - u_max_ = np.iinfo(to_dtype).max - - return (self.min() >= 0) and ( - (i_max_ <= u_max_) or (self.max() < u_max_) - ) - - # want to cast uint to int - elif self.dtype.kind == "u" and to_dtype.kind == "i": - u_max_ = np.iinfo(self.dtype).max - i_max_ = np.iinfo(to_dtype).max - - return (u_max_ <= i_max_) or (self.max() < i_max_) - - # want to cast int to float - elif self.dtype.kind in {"i", "u"} and to_dtype.kind == "f": - info = np.finfo(to_dtype) - biggest_exact_int = 2 ** (info.nmant + 1) - if (self.min() >= -biggest_exact_int) and ( - self.max() <= biggest_exact_int - ): - return True - else: - filled = self.fillna(0) - return ( - filled.astype(to_dtype).astype(filled.dtype) == filled - ).all() - - # want to cast float to int: - elif self.dtype.kind == "f" and to_dtype.kind in {"i", "u"}: - if self.nan_count > 0: - return False - iinfo = np.iinfo(to_dtype) - min_, max_ = iinfo.min, iinfo.max - - # best we can do is hope to catch it here and avoid compare - # Use Python floats, which have precise comparison for float64. - # NOTE(seberg): it would make sense to limit to the mantissa range. 
-            if (float(self.min()) >= min_) and (float(self.max()) <= max_):
-                filled = self.fillna(0)
-                return (filled % 1 == 0).all()
-            else:
-                return False
-
-        return False
-
-    def _with_type_metadata(self: Self, dtype: Dtype) -> ColumnBase:
-        if isinstance(dtype, CategoricalDtype):
-            codes = cudf.core.column.categorical.as_unsigned_codes(
-                len(dtype.categories), self
-            )
-            return cudf.core.column.CategoricalColumn(
-                data=None,
-                size=self.size,
-                dtype=dtype,
-                mask=self.base_mask,
-                offset=self.offset,
-                null_count=self.null_count,
-                children=(codes,),
-            )
-        return self
-
-    def to_pandas(
-        self,
-        *,
-        nullable: bool = False,
-        arrow_type: bool = False,
-    ) -> pd.Index:
-        if arrow_type and nullable:
-            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
-        elif arrow_type:
-            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
-        elif (
-            nullable
-            and (
-                pandas_nullable_dtype := np_dtypes_to_pandas_dtypes.get(
-                    self.dtype
-                )
-            )
-            is not None
-        ):
-            arrow_array = self.to_arrow()
-            pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array)  # type: ignore[attr-defined]
-            return pd.Index(pandas_array, copy=False)
-        elif self.dtype.kind in set("iuf") and not self.has_nulls():
-            return pd.Index(self.values_host, copy=False)
-        else:
-            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
-
-    def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
-        if reduction_op in {"sum", "product"}:
-            if self.dtype.kind == "f":
-                return self.dtype
-            return np.dtype("int64")
-        elif reduction_op == "sum_of_squares":
-            return np.result_type(self.dtype, np.dtype("uint64"))
-        elif reduction_op in {"var", "std", "mean"}:
-            return np.dtype("float64")
-
-        return super()._reduction_result_dtype(reduction_op)
-
-
-def _normalize_find_and_replace_input(
-    input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list
-) -> ColumnBase:
-    normalized_column = column.as_column(
-        col_to_normalize,
-        dtype=input_column_dtype if len(col_to_normalize) <= 0 else None,
-    )
-    col_to_normalize_dtype = normalized_column.dtype
-    if isinstance(col_to_normalize, list):
-        if normalized_column.null_count == len(normalized_column):
-            normalized_column = normalized_column.astype(input_column_dtype)
-        col_to_normalize_dtype = min_column_type(
-            normalized_column, input_column_dtype
-        )
-        # Scalar case
-        if len(col_to_normalize) == 1:
-            if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]):
-                return normalized_column.astype(input_column_dtype)
-            if np.isinf(col_to_normalize[0]):
-                return normalized_column
-            col_to_normalize_casted = np.array(col_to_normalize[0]).astype(
-                input_column_dtype
-            )
-
-            if not np.isnan(col_to_normalize_casted) and (
-                col_to_normalize_casted != col_to_normalize[0]
-            ):
-                raise TypeError(
-                    f"Cannot safely cast non-equivalent "
-                    f"{col_to_normalize[0]} "
-                    f"to {input_column_dtype.name}"
-                )
-            else:
-                col_to_normalize_dtype = input_column_dtype
-    elif hasattr(col_to_normalize, "dtype"):
-        col_to_normalize_dtype = col_to_normalize.dtype
-    else:
-        raise TypeError(f"Type {type(col_to_normalize)} not supported")
-
-    if (
-        col_to_normalize_dtype.kind == "f"
-        and input_column_dtype.kind in {"i", "u"}
-    ) or (col_to_normalize_dtype.num > input_column_dtype.num):
-        raise TypeError(
-            f"Potentially unsafe cast for non-equivalent "
-            f"{col_to_normalize_dtype.name} "
-            f"to {input_column_dtype.name}"
-        )
-    return normalized_column.astype(input_column_dtype)
-
-
-def digitize(
-    column: ColumnBase, bins: np.ndarray, right: bool = False
-) -> ColumnBase:
-
"""Return the indices of the bins to which each value in column belongs. - - Parameters - ---------- - column : Column - Input column. - bins : Column-like - 1-D column-like object of bins with same type as `column`, should be - monotonically increasing. - right : bool - Indicates whether interval contains the right or left bin edge. - - Returns - ------- - A column containing the indices - """ - if not column.dtype == bins.dtype: - raise ValueError( - "Digitize() expects bins and input column have the same dtype." - ) - - bin_col = as_column(bins, dtype=bins.dtype) - if bin_col.nullable: - raise ValueError("`bins` cannot contain null entries.") - - return as_column(libcudf.sort.digitize([column], [bin_col], right)) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py deleted file mode 100644 index 3b8dd05c13a..00000000000 --- a/python/cudf/cudf/core/column/numerical_base.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. -"""Define an interface for columns that can perform numerical operations.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, cast - -import numpy as np - -import cudf -from cudf import _lib as libcudf -from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase -from cudf.core.missing import NA -from cudf.core.mixins import Scannable - -if TYPE_CHECKING: - from cudf._typing import ScalarLike - from cudf.core.column.decimal import DecimalDtype - - -class NumericalBaseColumn(ColumnBase, Scannable): - """ - A column composed of numerical (bool, integer, float, decimal) data. - - This class encodes a standard interface for different types of columns - containing numerical types of data. In particular, mathematical operations - that make sense whether a column is integral or real, fixed or floating - point, should be encoded here. 
- """ - - _VALID_REDUCTIONS = { - "sum", - "product", - "sum_of_squares", - "mean", - "var", - "std", - } - - _VALID_SCANS = { - "cumsum", - "cumprod", - "cummin", - "cummax", - } - - def __init__( - self, - data: Buffer, - size: int, - dtype: DecimalDtype | np.dtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple = (), - ): - if not isinstance(data, Buffer): - raise ValueError("data must be a Buffer instance.") - if len(children) != 0: - raise ValueError(f"{type(self).__name__} must have no children.") - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - def _can_return_nan(self, skipna: bool | None = None) -> bool: - return not skipna and self.has_nulls() - - def kurtosis(self, skipna: bool | None = None) -> float: - skipna = True if skipna is None else skipna - - if len(self) == 0 or self._can_return_nan(skipna=skipna): - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - self = self.nans_to_nulls().dropna() - - if len(self) < 4: - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - n = len(self) - miu = self.mean() - m4_numerator = ((self - miu) ** self.normalize_binop_value(4)).sum() - V = self.var() - - if V == 0: - return 0 - - term_one_section_one = (n * (n + 1)) / ((n - 1) * (n - 2) * (n - 3)) - term_one_section_two = m4_numerator / (V**2) - term_two = ((n - 1) ** 2) / ((n - 2) * (n - 3)) - kurt = term_one_section_one * term_one_section_two - 3 * term_two - return kurt - - def skew(self, skipna: bool | None = None) -> ScalarLike: - skipna = True if skipna is None else skipna - - if len(self) == 0 or self._can_return_nan(skipna=skipna): - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - self = self.nans_to_nulls().dropna() - - if len(self) < 3: - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - n = len(self) - miu = self.mean() - m3 = (((self - miu) ** self.normalize_binop_value(3)).sum()) / n - m2 = self.var(ddof=0) - - if m2 == 0: - return 0 - - unbiased_coef = ((n * (n - 1)) ** 0.5) / (n - 2) - skew = unbiased_coef * m3 / (m2 ** (3 / 2)) - return skew - - def quantile( - self, - q: np.ndarray, - interpolation: str, - exact: bool, - return_scalar: bool, - ) -> NumericalBaseColumn: - if np.logical_or(q < 0, q > 1).any(): - raise ValueError( - "percentiles should all be in the interval [0, 1]" - ) - # Beyond this point, q either being scalar or list-like - # will only have values in range [0, 1] - if len(self) == 0: - result = cast( - NumericalBaseColumn, - cudf.core.column.column_empty( - row_count=len(q), dtype=self.dtype, masked=True - ), - ) - else: - # get sorted indices and exclude nulls - indices = libcudf.sort.order_by( - [self], [True], "first", stable=True - ).slice(self.null_count, len(self)) - result = libcudf.quantiles.quantile( - self, q, interpolation, indices, exact - ) - if return_scalar: - scalar_result = result.element_indexing(0) - if interpolation in {"lower", "higher", "nearest"}: - try: - new_scalar = self.dtype.type(scalar_result) - scalar_result = ( - new_scalar - if new_scalar == scalar_result - else scalar_result - ) - except (TypeError, ValueError): - pass - return ( - cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - if scalar_result is NA - else scalar_result - ) - return result - - def mean( - self, - skipna: bool | None = None, - min_count: int = 0, - ): - return self._reduce("mean", skipna=skipna, min_count=min_count) - - def var( - self, - skipna: bool | None = None, 
- min_count: int = 0, - ddof=1, - ): - return self._reduce( - "var", skipna=skipna, min_count=min_count, ddof=ddof - ) - - def std( - self, - skipna: bool | None = None, - min_count: int = 0, - ddof=1, - ): - return self._reduce( - "std", skipna=skipna, min_count=min_count, ddof=ddof - ) - - def median(self, skipna: bool | None = None) -> NumericalBaseColumn: - skipna = True if skipna is None else skipna - - if self._can_return_nan(skipna=skipna): - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - # enforce linear in case the default ever changes - return self.quantile( - np.array([0.5]), - interpolation="linear", - exact=True, - return_scalar=True, - ) - - def cov(self, other: NumericalBaseColumn) -> float: - if ( - len(self) == 0 - or len(other) == 0 - or (len(self) == 1 and len(other) == 1) - ): - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - result = (self - self.mean()) * (other - other.mean()) - cov_sample = result.sum() / (len(self) - 1) - return cov_sample - - def corr(self, other: NumericalBaseColumn) -> float: - if len(self) == 0 or len(other) == 0: - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - cov = self.cov(other) - lhs_std, rhs_std = self.std(), other.std() - - if not cov or lhs_std == 0 or rhs_std == 0: - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - return cov / lhs_std / rhs_std - - def round( - self, decimals: int = 0, how: str = "half_even" - ) -> NumericalBaseColumn: - if not cudf.api.types.is_integer(decimals): - raise TypeError("Values in decimals must be integers") - """Round the values in the Column to the given number of decimals.""" - return libcudf.round.round(self, decimal_places=decimals, how=how) - - def _scan(self, op: str) -> ColumnBase: - return libcudf.reduce.scan( - op.replace("cum", ""), self, True - )._with_type_metadata(self.dtype) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py deleted file mode 100644 index 4463e3280df..00000000000 --- a/python/cudf/cudf/core/column/string.py +++ /dev/null @@ -1,6062 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
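# The NumericalBaseColumn.cov / .corr methods removed just above implement the
# standard sample covariance (ddof=1) and Pearson correlation. A minimal NumPy
# sketch of the same arithmetic, assuming equal-length, null-free inputs
# (the function names here are illustrative only, not cudf API):
import numpy as np

def sample_cov(x: np.ndarray, y: np.ndarray) -> float:
    # Mirrors `result.sum() / (len(self) - 1)` in the deleted cov()
    return float(((x - x.mean()) * (y - y.mean())).sum() / (len(x) - 1))

def pearson_corr(x: np.ndarray, y: np.ndarray) -> float:
    # Mirrors `cov / lhs_std / rhs_std` in the deleted corr()
    return sample_cov(x, y) / (x.std(ddof=1) * y.std(ddof=1))

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.0, 4.0, 6.0, 8.0])
assert abs(pearson_corr(x, y) - 1.0) < 1e-12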
- -from __future__ import annotations - -import re -import warnings -from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast, overload - -import numpy as np -import pandas as pd -import pyarrow as pa - -import cudf -import cudf.api.types -from cudf import _lib as libcudf -from cudf._lib import string_casting as str_cast, strings as libstrings -from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype -from cudf.api.types import is_integer, is_scalar, is_string_dtype -from cudf.core.column import column, datetime -from cudf.core.column.column import ColumnBase -from cudf.core.column.methods import ColumnMethods -from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column - - -def str_to_boolean(column: StringColumn): - """Takes in string column and returns boolean column""" - return ( - libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8") - ).fillna(False) - - -if TYPE_CHECKING: - import cupy - import numba.cuda - - from cudf._typing import ( - ColumnBinaryOperand, - ColumnLike, - Dtype, - ScalarLike, - SeriesOrIndex, - ) - from cudf.core.buffer import Buffer - - -_str_to_numeric_typecast_functions = { - cudf.api.types.dtype("int8"): str_cast.stoi8, - cudf.api.types.dtype("int16"): str_cast.stoi16, - cudf.api.types.dtype("int32"): str_cast.stoi, - cudf.api.types.dtype("int64"): str_cast.stol, - cudf.api.types.dtype("uint8"): str_cast.stoui8, - cudf.api.types.dtype("uint16"): str_cast.stoui16, - cudf.api.types.dtype("uint32"): str_cast.stoui, - cudf.api.types.dtype("uint64"): str_cast.stoul, - cudf.api.types.dtype("float32"): str_cast.stof, - cudf.api.types.dtype("float64"): str_cast.stod, - cudf.api.types.dtype("bool"): str_to_boolean, -} - -_numeric_to_str_typecast_functions = { - cudf.api.types.dtype("int8"): str_cast.i8tos, - cudf.api.types.dtype("int16"): str_cast.i16tos, - cudf.api.types.dtype("int32"): str_cast.itos, - cudf.api.types.dtype("int64"): str_cast.ltos, - cudf.api.types.dtype("uint8"): str_cast.ui8tos, - cudf.api.types.dtype("uint16"): str_cast.ui16tos, - cudf.api.types.dtype("uint32"): str_cast.uitos, - cudf.api.types.dtype("uint64"): str_cast.ultos, - cudf.api.types.dtype("float32"): str_cast.ftos, - cudf.api.types.dtype("float64"): str_cast.dtos, - cudf.api.types.dtype("bool"): str_cast.from_booleans, -} - -_datetime_to_str_typecast_functions = { - # TODO: support Date32 UNIX days - # cudf.api.types.dtype("datetime64[D]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[s]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[ms]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[us]"): str_cast.int2timestamp, - cudf.api.types.dtype("datetime64[ns]"): str_cast.int2timestamp, -} - -_timedelta_to_str_typecast_functions = { - cudf.api.types.dtype("timedelta64[s]"): str_cast.int2timedelta, - cudf.api.types.dtype("timedelta64[ms]"): str_cast.int2timedelta, - cudf.api.types.dtype("timedelta64[us]"): str_cast.int2timedelta, - cudf.api.types.dtype("timedelta64[ns]"): str_cast.int2timedelta, -} - - -def _is_supported_regex_flags(flags): - return flags == 0 or ( - (flags & (re.MULTILINE | re.DOTALL) != 0) - and (flags & ~(re.MULTILINE | re.DOTALL) == 0) - ) - - -class StringMethods(ColumnMethods): - """ - Vectorized string functions for Series and Index. - - This mimics pandas ``df.str`` interface. nulls stay null - unless handled otherwise by a particular method. 
- Patterned after Python's string methods, with some - inspiration from R's stringr package. - """ - - _column: StringColumn - - def __init__(self, parent): - value_type = ( - parent.dtype.leaf_type - if isinstance(parent.dtype, cudf.ListDtype) - else parent.dtype - ) - if not is_string_dtype(value_type): - raise AttributeError( - "Can only use .str accessor with string values" - ) - super().__init__(parent=parent) - - def htoi(self) -> SeriesOrIndex: - """ - Returns integer value represented by each hex string. - String is interpreted to have hex (base-16) characters. - - Returns - ------- - Series/Index of str dtype - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["1234", "ABCDEF", "1A2", "cafe"]) - >>> s.str.htoi() - 0 4660 - 1 11259375 - 2 418 - 3 51966 - dtype: int64 - """ - - out = str_cast.htoi(self._column) - - return self._return_or_inplace(out, inplace=False) - - hex_to_int = htoi - - def ip2int(self) -> SeriesOrIndex: - """ - This converts ip strings to integers - - Returns - ------- - Series/Index of str dtype - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["12.168.1.1", "10.0.0.1"]) - >>> s.str.ip2int() - 0 212336897 - 1 167772161 - dtype: int64 - - Returns 0's if any string is not an IP. - - >>> s = cudf.Series(["12.168.1.1", "10.0.0.1", "abc"]) - >>> s.str.ip2int() - 0 212336897 - 1 167772161 - 2 0 - dtype: int64 - """ - - out = str_cast.ip2int(self._column) - - return self._return_or_inplace(out, inplace=False) - - ip_to_int = ip2int - - def __getitem__(self, key): - if isinstance(key, slice): - return self.slice(start=key.start, stop=key.stop, step=key.step) - else: - return self.get(key) - - def len(self) -> SeriesOrIndex: - r""" - Computes the length of each element in the Series/Index. - - Returns - ------- - Series or Index of int - A Series or Index of integer values - indicating the length of each element in the Series or Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["dog", "", "\n", None]) - >>> s.str.len() - 0 3 - 1 0 - 2 1 - 3 - dtype: int32 - """ - - return self._return_or_inplace( - libstrings.count_characters(self._column) - ) - - def byte_count(self) -> SeriesOrIndex: - """ - Computes the number of bytes of each string in the Series/Index. - - Returns - ------- - Series or Index of int - A Series or Index of integer values - indicating the number of bytes of each strings in the - Series or Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["abc","d","ef"]) - >>> s.str.byte_count() - 0 3 - 1 1 - 2 2 - dtype: int32 - >>> s = cudf.Series(["Hello", "Bye", "Thanks 😊"]) - >>> s.str.byte_count() - 0 5 - 1 3 - 2 11 - dtype: int32 - """ - return self._return_or_inplace( - libstrings.count_bytes(self._column), - ) - - @overload - def cat( - self, sep: str | None = None, na_rep: str | None = None - ) -> str: ... - - @overload - def cat( - self, others, sep: str | None = None, na_rep: str | None = None - ) -> SeriesOrIndex | "cudf.core.column.string.StringColumn": ... - - def cat(self, others=None, sep=None, na_rep=None): - """ - Concatenate strings in the Series/Index with given separator. - - If ``others`` is specified, this function concatenates the Series/Index - and elements of others element-wise. If others is not passed, then all - values in the Series/Index are concatenated into a single string with - a given sep. - - Parameters - ---------- - others : Series or List of str - Strings to be appended. - The number of strings must match ``size()`` of this instance. 
- This must be either a Series of string dtype or a Python - list of strings. - - sep : str - If specified, this separator will be appended to each string - before appending the others. - - na_rep : str - This character will take the place of any null strings - (not empty strings) in either list. - - - If ``na_rep`` is ``None``, and ``others`` is ``None``, - missing values in the Series/Index are - omitted from the result. - - - If ``na_rep`` is ``None``, and ``others`` is - not ``None``, a row containing a missing value - in any of the columns (before concatenation) - will have a missing value in the result. - - Returns - ------- - concat : str or Series/Index of str dtype - If ``others`` is ``None``, ``str`` is returned, - otherwise a ``Series/Index`` (same type as caller) - of str dtype is returned. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['a', 'b', None, 'd']) - >>> s.str.cat(sep=' ') - 'a b d' - - By default, NA values in the Series are ignored. Using na_rep, they - can be given a representation: - - >>> s.str.cat(sep=' ', na_rep='?') - 'a b ? d' - - If others is specified, corresponding values are concatenated with - the separator. Result will be a Series of strings. - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') - 0 a,A - 1 b,B - 2 - 3 d,D - dtype: object - - Missing values will remain missing in the result, but can again be - represented using na_rep - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') - 0 a,A - 1 b,B - 2 -,C - 3 d,D - dtype: object - - If sep is not specified, the values are concatenated without - separation. - - >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') - 0 aA - 1 bB - 2 -C - 3 dD - dtype: object - """ - if sep is None: - sep = "" - - if others is None: - data = libstrings.join( - self._column, - cudf.Scalar(sep), - cudf.Scalar(na_rep, "str"), - ) - else: - other_cols = _get_cols_list(self._parent, others) - all_cols = [self._column] + other_cols - data = libstrings.concatenate( - all_cols, - cudf.Scalar(sep), - cudf.Scalar(na_rep, "str"), - ) - - if len(data) == 1 and data.null_count == 1: - data = cudf.core.column.as_column("", length=len(data)) - # We only want to keep the index if we are adding something to each - # row, not if we are joining all the rows into a single string. - out = self._return_or_inplace(data, retain_index=others is not None) - if len(out) == 1 and others is None: - if isinstance(out, cudf.Series): - out = out.iloc[0] - else: - out = out[0] - return out - - def join( - self, sep=None, string_na_rep=None, sep_na_rep=None - ) -> SeriesOrIndex: - """ - Join lists contained as elements in the Series/Index with passed - delimiter. - - If the elements of a Series are lists themselves, join the content of - these lists using the delimiter passed to the function. - This function is an equivalent to :meth:`str.join`. - In the special case that the lists in the Series contain only ``None``, - a ``/`None` value will always be returned. - - Parameters - ---------- - sep : str or array-like - If str, the delimiter is used between list entries. - If array-like, the string at a position is used as a - delimiter for corresponding row of the list entries. - string_na_rep : str, default None - This character will take the place of null strings - (not empty strings) in the Series but will be considered - only if the Series contains list elements and those lists have - at least one non-null string. If ``string_na_rep`` is ``None``, - it defaults to empty space "". 
- sep_na_rep : str, default None - This character will take the place of any null strings - (not empty strings) in `sep`. This parameter can be used - only if `sep` is array-like. If ``sep_na_rep`` is ``None``, - it defaults to empty space "". - - Returns - ------- - Series/Index: object - The list entries concatenated by intervening occurrences of - the delimiter. - - Raises - ------ - ValueError - - If ``sep_na_rep`` is supplied when ``sep`` is str. - - If ``sep`` is array-like and not of equal length with Series/Index. - TypeError - - If ``string_na_rep`` or ``sep_na_rep`` are not scalar values. - - If ``sep`` is not of following types: str or array-like. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([['a', 'b', 'c'], ['d', 'e'], ['f'], ['g', ' ', 'h']]) - >>> ser - 0 [a, b, c] - 1 [d, e] - 2 [f] - 3 [g, , h] - dtype: list - >>> ser.str.join(sep='-') - 0 a-b-c - 1 d-e - 2 f - 3 g- -h - dtype: object - - ``sep`` can an array-like input: - - >>> ser.str.join(sep=['-', '+', '.', '=']) - 0 a-b-c - 1 d+e - 2 f - 3 g= =h - dtype: object - - If the actual series doesn't have lists, each character is joined - by `sep`: - - >>> ser = cudf.Series(['abc', 'def', 'ghi']) - >>> ser - 0 abc - 1 def - 2 ghi - dtype: object - >>> ser.str.join(sep='_') - 0 a_b_c - 1 d_e_f - 2 g_h_i - dtype: object - - We can replace ``/`None` values present in lists using - ``string_na_rep`` if the lists contain at least one valid string - (lists containing all `None` will result in a ``/`None` value): - - >>> ser = cudf.Series([['a', 'b', None], [None, None, None], None, ['c', 'd']]) - >>> ser - 0 [a, b, None] - 1 [None, None, None] - 2 None - 3 [c, d] - dtype: list - >>> ser.str.join(sep='_', string_na_rep='k') - 0 a_b_k - 1 - 2 - 3 c_d - dtype: object - - We can replace ``/`None` values present in lists of ``sep`` - using ``sep_na_rep``: - - >>> ser.str.join(sep=[None, '^', '.', '-'], sep_na_rep='+') - 0 a+b+ - 1 - 2 - 3 c-d - dtype: object - """ # noqa E501 - if sep is None: - sep = "" - - if string_na_rep is None: - string_na_rep = "" - - if is_scalar(sep) and sep_na_rep: - raise ValueError( - "sep_na_rep cannot be defined when `sep` is scalar." - ) - - if sep_na_rep is None: - sep_na_rep = "" - - if not is_scalar(string_na_rep): - raise TypeError( - f"string_na_rep should be a string scalar, got {string_na_rep}" - f" of type : {type(string_na_rep)}" - ) - - if isinstance(self._column, cudf.core.column.ListColumn): - strings_column = self._column - else: - # If self._column is not a ListColumn, we will have to - # split each row by character and create a ListColumn out of it. 
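# A plain-Python sketch of the fallback described in the comment above: when
# the accessor's column is not a list column, nulls are first sanitized to
# empty strings and each row is tokenized into its individual characters, so
# the join kernels always see one list of strings per row (the helper name
# below is hypothetical, not cudf API):
def _split_each_row_by_character(rows: list) -> list:
    return [list(row) if row is not None else [] for row in rows]

assert _split_each_row_by_character(["abc", None]) == [["a", "b", "c"], []]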
- strings_column = self._split_by_character() - - if is_scalar(sep): - data = libstrings.join_lists_with_scalar( - strings_column, cudf.Scalar(sep), cudf.Scalar(string_na_rep) - ) - elif can_convert_to_column(sep): - sep_column = column.as_column(sep) - if len(sep_column) != len(strings_column): - raise ValueError( - f"sep should be of similar size to the series, " - f"got: {len(sep_column)}, expected: {len(strings_column)}" - ) - if not is_scalar(sep_na_rep): - raise TypeError( - f"sep_na_rep should be a string scalar, got {sep_na_rep} " - f"of type: {type(sep_na_rep)}" - ) - - data = libstrings.join_lists_with_column( - strings_column, - sep_column, - cudf.Scalar(string_na_rep), - cudf.Scalar(sep_na_rep), - ) - else: - raise TypeError( - f"sep should be an str, array-like or Series object, " - f"found {type(sep)}" - ) - - return self._return_or_inplace(data) - - def _split_by_character(self): - col = self._column.fillna("") # sanitize nulls - result_col = libstrings.character_tokenize(col) - - offset_col = col.children[0] - - return cudf.core.column.ListColumn( - data=None, - size=len(col), - dtype=cudf.ListDtype(col.dtype), - mask=col.mask, - offset=0, - null_count=0, - children=(offset_col, result_col), - ) - - def extract( - self, pat: str, flags: int = 0, expand: bool = True - ) -> SeriesOrIndex: - r""" - Extract capture groups in the regex `pat` as columns in a DataFrame. - - For each subject string in the Series, extract groups from the first - match of regular expression `pat`. - - Parameters - ---------- - pat : str - Regular expression pattern with capturing groups. - flags : int, default 0 (no flags) - Flags to pass through to the regex engine (e.g. re.MULTILINE) - expand : bool, default True - If True, return DataFrame with one column per capture group. - If False, return a Series/Index if there is one capture group or - DataFrame if there are multiple capture groups. - - Returns - ------- - DataFrame or Series/Index - A DataFrame with one row for each subject string, and one column - for each group. If `expand=False` and `pat` has only one capture - group, then return a Series/Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['a1', 'b2', 'c3']) - >>> s.str.extract(r'([ab])(\d)') - 0 1 - 0 a 1 - 1 b 2 - 2 - - A pattern with one group will return a DataFrame with one - column if expand=True. - - >>> s.str.extract(r'[ab](\d)', expand=True) - 0 - 0 1 - 1 2 - 2 - - A pattern with one group will return a Series if expand=False. - - >>> s.str.extract(r'[ab](\d)', expand=False) - 0 1 - 1 2 - 2 - dtype: object - - .. pandas-compat:: - :meth:`pandas.Series.str.extract` - - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - """ # noqa W605 - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) - - data = libstrings.extract(self._column, pat, flags) - if len(data) == 1 and expand is False: - _, data = data.popitem() - return self._return_or_inplace(data, expand=expand) - - def contains( - self, - pat: str | Sequence, - case: bool = True, - flags: int = 0, - na=np.nan, - regex: bool = True, - ) -> SeriesOrIndex: - r""" - Test if pattern or regex is contained within a string of a Series or - Index. - - Return boolean Series or Index based on whether a given pattern or - regex is contained within a string of a Series or Index. - - Parameters - ---------- - pat : str or list-like - Character sequence or regular expression. 
- If ``pat`` is list-like then regular expressions are not - accepted. - flags : int, default 0 (no flags) - Flags to pass through to the regex engine (e.g. re.MULTILINE) - regex : bool, default True - If True, assumes the pattern is a regular expression. - If False, treats the pattern as a literal string. - - Returns - ------- - Series/Index of bool dtype - A Series/Index of boolean dtype indicating whether the given - pattern is contained within the string of each element of the - Series/Index. - - Examples - -------- - >>> import cudf - >>> s1 = cudf.Series(['Mouse', 'dog', 'house and parrot', '23', None]) - >>> s1 - 0 Mouse - 1 dog - 2 house and parrot - 3 23 - 4 - dtype: object - >>> s1.str.contains('og', regex=False) - 0 False - 1 True - 2 False - 3 False - 4 - dtype: bool - - Returning an Index of booleans using only a literal pattern. - - >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.nan] - >>> idx = cudf.Index(data) - >>> idx - Index(['Mouse', 'dog', 'house and parrot', '23.0', None], dtype='object') - >>> idx.str.contains('23', regex=False) - Index([False, False, False, True, ], dtype='bool') - - Returning 'house' or 'dog' when either expression occurs in a string. - - >>> s1.str.contains('house|dog', regex=True) - 0 False - 1 True - 2 True - 3 False - 4 - dtype: bool - - Returning any digit using regular expression. - - >>> s1.str.contains('\d', regex=True) - 0 False - 1 False - 2 False - 3 True - 4 - dtype: bool - - Ensure ``pat`` is a not a literal pattern when ``regex`` is set - to True. Note in the following example one might expect - only `s2[1]` and `s2[3]` to return True. However, - '.0' as a regex matches any character followed by a 0. - - >>> s2 = cudf.Series(['40', '40.0', '41', '41.0', '35']) - >>> s2.str.contains('.0', regex=True) - 0 True - 1 True - 2 False - 3 True - 4 False - dtype: bool - - The ``pat`` may also be a sequence of strings in which case - the individual strings are searched in corresponding rows. - - >>> s2 = cudf.Series(['house', 'dog', 'and', '', '']) - >>> s1.str.contains(s2) - 0 False - 1 True - 2 True - 3 True - 4 - dtype: bool - - .. pandas-compat:: - :meth:`pandas.Series.str.contains` - - The parameters `case` and `na` are not yet supported and will - raise a NotImplementedError if anything other than the default - value is set. - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. 
- """ # noqa W605 - if na is not np.nan: - raise NotImplementedError("`na` parameter is not yet supported") - if regex and isinstance(pat, re.Pattern): - flags = pat.flags & ~re.U - pat = pat.pattern - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) - if regex and not case: - raise NotImplementedError( - "`case=False` only supported when `regex=False`" - ) - - if is_scalar(pat): - if regex: - result_col = libstrings.contains_re(self._column, pat, flags) - else: - if case is False: - input_column = libstrings.to_lower(self._column) - pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore - else: - input_column = self._column - pat = cudf.Scalar(pat, dtype="str") # type: ignore - result_col = libstrings.contains(input_column, pat) - else: - # TODO: we silently ignore the `regex=` flag here - if case is False: - input_column = libstrings.to_lower(self._column) - col_pat = libstrings.to_lower( - column.as_column(pat, dtype="str") - ) - else: - input_column = self._column - col_pat = column.as_column(pat, dtype="str") - result_col = libstrings.contains_multiple(input_column, col_pat) - return self._return_or_inplace(result_col) - - def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: - """ - Test if a like pattern matches a string of a Series or Index. - - Return boolean Series or Index based on whether a given pattern - matches strings in a Series or Index. - - Parameters - ---------- - pat : str - Pattern for matching. Use '%' for any number of any character - including no characters. Use '_' for any single character. - - esc : str - Character to use if escape is necessary to match '%' or '_' - literals. - - Returns - ------- - Series/Index of bool dtype - A Series/Index of boolean dtype indicating whether the given - pattern matches the string of each element of the Series/Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['abc', 'a', 'b' ,'ddbc', '%bb']) - >>> s.str.like('%b_') - 0 False - 1 False - 2 False - 3 True - 4 True - dtype: boolean - - Parameter `esc` can be used to match a wildcard literal. - - >>> s.str.like('/%b_', esc='/' ) - 0 False - 1 False - 2 False - 3 False - 4 True - dtype: boolean - """ - if not isinstance(pat, str): - raise TypeError( - f"expected a string object, not {type(pat).__name__}" - ) - - if esc is None: - esc = "" - - if not isinstance(esc, str): - raise TypeError( - f"expected a string object, not {type(esc).__name__}" - ) - - if len(esc) > 1: - raise ValueError( - "expected esc to contain less than or equal to 1 characters" - ) - - result_col = libstrings.like( - self._column, cudf.Scalar(pat, "str"), cudf.Scalar(esc, "str") - ) - - return self._return_or_inplace(result_col) - - def repeat( - self, - repeats: int | Sequence, - ) -> SeriesOrIndex: - """ - Duplicate each string in the Series or Index. - Equivalent to `str.repeat() - `_. - - Parameters - ---------- - repeats : int or sequence of int - Same value for all (int) or different value per (sequence). - - Returns - ------- - Series or Index of object - Series or Index of repeated string objects specified by - input parameter repeats. 
- - Examples - -------- - >>> s = cudf.Series(['a', 'b', 'c']) - >>> s - 0 a - 1 b - 2 c - dtype: object - - Single int repeats string in Series - - >>> s.str.repeat(repeats=2) - 0 aa - 1 bb - 2 cc - dtype: object - - Sequence of int repeats corresponding string in Series - - >>> s.str.repeat(repeats=[1, 2, 3]) - 0 a - 1 bb - 2 ccc - dtype: object - """ - if can_convert_to_column(repeats): - return self._return_or_inplace( - libstrings.repeat_sequence( - self._column, - column.as_column(repeats, dtype="int"), - ), - ) - - return self._return_or_inplace( - libstrings.repeat_scalar(self._column, repeats) - ) - - def replace( - self, - pat: str | Sequence, - repl: str | Sequence, - n: int = -1, - case=None, - flags: int = 0, - regex: bool = True, - ) -> SeriesOrIndex: - """ - Replace occurrences of pattern/regex in the Series/Index with some - other string. Equivalent to `str.replace() - `_ - or `re.sub() - `_. - - Parameters - ---------- - pat : str or list-like - String(s) to be replaced as a character sequence or regular - expression. - repl : str or list-like - String(s) to be used as replacement. - n : int, default -1 (all) - Number of replacements to make from the start. - regex : bool, default True - If True, assumes the pattern is a regular expression. - If False, treats the pattern as a literal string. - - Returns - ------- - Series/Index of str dtype - A copy of the object with all matching occurrences of pat replaced - by repl. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['foo', 'fuz', None]) - >>> s - 0 foo - 1 fuz - 2 - dtype: object - - When pat is a string and regex is True (the default), the given pat - is compiled as a regex. When repl is a string, it replaces matching - regex patterns as with ``re.sub()``. NaN value(s) in the Series - are left as is: - - >>> s.str.replace('f.', 'ba', regex=True) - 0 bao - 1 baz - 2 - dtype: object - - When pat is a string and `regex` is False, every pat is replaced - with repl as with ``str.replace()``: - - >>> s.str.replace('f.', 'ba', regex=False) - 0 foo - 1 fuz - 2 - dtype: object - - .. pandas-compat:: - :meth:`pandas.Series.str.replace` - - The parameters `case` and `flags` are not yet supported and will - raise a `NotImplementedError` if anything other than the default - value is set. 
- """ - if case is not None: - raise NotImplementedError("`case` parameter is not yet supported") - if flags != 0: - raise NotImplementedError("`flags` parameter is not yet supported") - - if can_convert_to_column(pat) and can_convert_to_column(repl): - if n != -1: - warnings.warn( - "`n` parameter is not supported when " - "`pat` and `repl` are list-like inputs" - ) - - return self._return_or_inplace( - libstrings.replace_multi_re( - self._column, - pat, - column.as_column(repl, dtype="str"), - ) - if regex - else libstrings.replace_multi( - self._column, - column.as_column(pat, dtype="str"), - column.as_column(repl, dtype="str"), - ), - ) - # Pandas treats 0 as all - if n == 0: - n = -1 - - # If 'pat' is re.Pattern then get the pattern string from it - if regex and isinstance(pat, re.Pattern): - pat = pat.pattern - - # Pandas forces non-regex replace when pat is a single-character - return self._return_or_inplace( - libstrings.replace_re( - self._column, pat, cudf.Scalar(repl, "str"), n - ) - if regex is True and len(pat) > 1 - else libstrings.replace( - self._column, - cudf.Scalar(pat, "str"), - cudf.Scalar(repl, "str"), - n, - ), - ) - - def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: - r""" - Use the ``repl`` back-ref template to create a new string - with the extracted elements found using the ``pat`` expression. - - Parameters - ---------- - pat : str or compiled regex - Regex with groupings to identify extract sections. - This should not be a compiled regex. - repl : str - String template containing back-reference indicators. - - Returns - ------- - Series/Index of str dtype - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["A543","Z756"]) - >>> s.str.replace_with_backrefs('(\\d)(\\d)', 'V\\2\\1') - 0 AV453 - 1 ZV576 - dtype: object - """ - - # If 'pat' is re.Pattern then get the pattern string from it - if isinstance(pat, re.Pattern): - pat = pat.pattern - - return self._return_or_inplace( - libstrings.replace_with_backrefs(self._column, pat, repl) - ) - - def slice( - self, - start: int | None = None, - stop: int | None = None, - step: int | None = None, - ) -> SeriesOrIndex: - """ - Slice substrings from each element in the Series or Index. - - Parameters - ---------- - start : int, optional - Start position for slice operation. - stop : int, optional - Stop position for slice operation. - step : int, optional - Step size for slice operation. - - Returns - ------- - Series/Index of str dtype - Series or Index from sliced substring from - original string object. - - See Also - -------- - slice_replace - Replace a slice with a string. - - get - Return element at position. Equivalent - to ``Series.str.slice(start=i, stop=i+1)`` - with ``i`` being the position. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["koala", "fox", "chameleon"]) - >>> s - 0 koala - 1 fox - 2 chameleon - dtype: object - >>> s.str.slice(start=1) - 0 oala - 1 ox - 2 hameleon - dtype: object - >>> s.str.slice(start=-1) - 0 a - 1 x - 2 n - dtype: object - >>> s.str.slice(stop=2) - 0 ko - 1 fo - 2 ch - dtype: object - >>> s.str.slice(step=2) - 0 kaa - 1 fx - 2 caeen - dtype: object - >>> s.str.slice(start=0, stop=5, step=3) - 0 kl - 1 f - 2 cm - dtype: object - """ - - return self._return_or_inplace( - libstrings.slice_strings(self._column, start, stop, step), - ) - - def isinteger(self) -> SeriesOrIndex: - """ - Check whether all characters in each string form integer. - - If a string has zero characters, False is returned for - that check. 
- - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same - length as the original Series/Index. - - See Also - -------- - isalnum - Check whether all characters are alphanumeric. - - isalpha - Check whether all characters are alphabetic. - - isdecimal - Check whether all characters are decimal. - - isdigit - Check whether all characters are digits. - - isnumeric - Check whether all characters are numeric. - - isfloat - Check whether all characters are float. - - islower - Check whether all characters are lowercase. - - isspace - Check whether all characters are whitespace. - - isupper - Check whether all characters are uppercase. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["1", "0.1", "+100", "-15", "abc"]) - >>> s.str.isinteger() - 0 True - 1 False - 2 True - 3 True - 4 False - dtype: bool - >>> s = cudf.Series(["this is plan text", "", "10 10"]) - >>> s.str.isinteger() - 0 False - 1 False - 2 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_integer(self._column)) - - def ishex(self) -> SeriesOrIndex: - """ - Check whether all characters in each string form a hex integer. - - If a string has zero characters, False is returned for - that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same - length as the original Series/Index. - - See Also - -------- - isdecimal - Check whether all characters are decimal. - - isdigit - Check whether all characters are digits. - - isnumeric - Check whether all characters are numeric. - - isfloat - Check whether all characters are float. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["", "123DEF", "0x2D3", "-15", "abc"]) - >>> s.str.ishex() - 0 False - 1 True - 2 True - 3 False - 4 True - dtype: bool - """ - return self._return_or_inplace(str_cast.is_hex(self._column)) - - def istimestamp(self, format: str) -> SeriesOrIndex: - """ - Check whether all characters in each string can be converted to - a timestamp using the given format. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same - length as the original Series/Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["20201101", "192011", "18200111", "2120-11-01"]) - >>> s.str.istimestamp("%Y%m%d") - 0 True - 1 False - 2 True - 3 False - dtype: bool - """ - return self._return_or_inplace( - str_cast.istimestamp(self._column, format) - ) - - def isfloat(self) -> SeriesOrIndex: - r""" - Check whether all characters in each string form floating value. - - If a string has zero characters, False is returned for - that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same - length as the original Series/Index. - - See Also - -------- - isalnum - Check whether all characters are alphanumeric. - - isalpha - Check whether all characters are alphabetic. - - isdecimal - Check whether all characters are decimal. - - isdigit - Check whether all characters are digits. - - isinteger - Check whether all characters are integer. - - isnumeric - Check whether all characters are numeric. - - islower - Check whether all characters are lowercase. - - isspace - Check whether all characters are whitespace. - - isupper - Check whether all characters are uppercase. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["1.1", "0.123213", "+0.123", "-100.0001", "234", - ... 
"3-"]) - >>> s.str.isfloat() - 0 True - 1 True - 2 True - 3 True - 4 True - 5 False - dtype: bool - >>> s = cudf.Series(["this is plain text", "\t\n", "9.9", "9.9.9"]) - >>> s.str.isfloat() - 0 False - 1 False - 2 True - 3 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_float(self._column)) - - def isdecimal(self) -> SeriesOrIndex: - """ - Check whether all characters in each string are decimal. - - This is equivalent to running the Python string method - `str.isdecimal() - `_ - for each element of the Series/Index. - If a string has zero characters, False is returned for - that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same - length as the original Series/Index. - - See Also - -------- - isalnum - Check whether all characters are alphanumeric. - - isalpha - Check whether all characters are alphabetic. - - isdigit - Check whether all characters are digits. - - isinteger - Check whether all characters are integer. - - isnumeric - Check whether all characters are numeric. - - isfloat - Check whether all characters are float. - - islower - Check whether all characters are lowercase. - - isspace - Check whether all characters are whitespace. - - isupper - Check whether all characters are uppercase. - - Examples - -------- - >>> import cudf - >>> s3 = cudf.Series(['23', '³', '⅕', '']) - - The s3.str.isdecimal method checks for characters used to form - numbers in base 10. - - >>> s3.str.isdecimal() - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_decimal(self._column)) - - def isalnum(self) -> SeriesOrIndex: - """ - Check whether all characters in each string are alphanumeric. - - This is equivalent to running the Python string method - `str.isalnum() - `_ - for each element of the Series/Index. If a string has zero - characters, False is returned for that check. - - Equivalent to: ``isalpha() or isdigit() or isnumeric() or isdecimal()`` - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the - same length as the original Series/Index. - - See Also - -------- - isalpha - Check whether all characters are alphabetic. - - isdecimal - Check whether all characters are decimal. - - isdigit - Check whether all characters are digits. - - isinteger - Check whether all characters are integer. - - isnumeric - Check whether all characters are numeric. - - isfloat - Check whether all characters are float. - - islower - Check whether all characters are lowercase. - - isspace - Check whether all characters are whitespace. - - isupper - Check whether all characters are uppercase. - - Examples - -------- - >>> import cudf - >>> s1 = cudf.Series(['one', 'one1', '1', '']) - >>> s1.str.isalnum() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - Note that checks against characters mixed with - any additional punctuation or whitespace will - evaluate to false for an alphanumeric check. - - >>> s2 = cudf.Series(['A B', '1.5', '3,000']) - >>> s2.str.isalnum() - 0 False - 1 False - 2 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_alnum(self._column)) - - def isalpha(self) -> SeriesOrIndex: - """ - Check whether all characters in each string are alphabetic. - - This is equivalent to running the Python string method - `str.isalpha() - `_ - for each element of the Series/Index. - If a string has zero characters, False is returned for that check. 
- - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same length - as the original Series/Index. - - See Also - -------- - isalnum - Check whether all characters are alphanumeric. - - isdecimal - Check whether all characters are decimal. - - isdigit - Check whether all characters are digits. - - isinteger - Check whether all characters are integer. - - isnumeric - Check whether all characters are numeric. - - isfloat - Check whether all characters are float. - - islower - Check whether all characters are lowercase. - - isspace - Check whether all characters are whitespace. - - isupper - Check whether all characters are uppercase. - - Examples - -------- - >>> import cudf - >>> s1 = cudf.Series(['one', 'one1', '1', '']) - >>> s1.str.isalpha() - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_alpha(self._column)) - - def isdigit(self) -> SeriesOrIndex: - """ - Check whether all characters in each string are digits. - - This is equivalent to running the Python string method - `str.isdigit() - `_ - for each element of the Series/Index. - If a string has zero characters, False is returned - for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same - length as the original Series/Index. - - See Also - -------- - isalnum - Check whether all characters are alphanumeric. - - isalpha - Check whether all characters are alphabetic. - - isdecimal - Check whether all characters are decimal. - - isinteger - Check whether all characters are integer. - - isnumeric - Check whether all characters are numeric. - - isfloat - Check whether all characters are float. - - islower - Check whether all characters are lowercase. - - isspace - Check whether all characters are whitespace. - - isupper - Check whether all characters are uppercase. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['23', '³', '⅕', '']) - - The ``s.str.isdigit`` method is the same as ``s.str.isdecimal`` but - also includes special digits, like superscripted and - subscripted digits in unicode. - - >>> s.str.isdigit() - 0 True - 1 True - 2 False - 3 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_digit(self._column)) - - def isnumeric(self) -> SeriesOrIndex: - """ - Check whether all characters in each string are numeric. - - This is equivalent to running the Python string method - `str.isnumeric() - `_ - for each element of the Series/Index. If a - string has zero characters, False is returned for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same - length as the original Series/Index. - - See Also - -------- - isalnum - Check whether all characters are alphanumeric. - - isalpha - Check whether all characters are alphabetic. - - isdecimal - Check whether all characters are decimal. - - isdigit - Check whether all characters are digits. - - isinteger - Check whether all characters are integer. - - isfloat - Check whether all characters are float. - - islower - Check whether all characters are lowercase. - - isspace - Check whether all characters are whitespace. - - isupper - Check whether all characters are uppercase. 
- - Examples - -------- - >>> import cudf - >>> s1 = cudf.Series(['one', 'one1', '1', '']) - >>> s1.str.isnumeric() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - The ``s1.str.isnumeric`` method is the same as ``s2.str.isdigit`` but - also includes other characters that can represent - quantities such as unicode fractions. - - >>> s2 = pd.Series(['23', '³', '⅕', ''], dtype='str') - >>> s2.str.isnumeric() - 0 True - 1 True - 2 True - 3 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_numeric(self._column)) - - def isupper(self) -> SeriesOrIndex: - """ - Check whether all characters in each string are uppercase. - - This is equivalent to running the Python string method - `str.isupper() - `_ - for each element of the Series/Index. - If a string has zero characters, False is returned - for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same - length as the original Series/Index. - - See Also - -------- - isalnum - Check whether all characters are alphanumeric. - - isalpha - Check whether all characters are alphabetic. - - isdecimal - Check whether all characters are decimal. - - isdigit - Check whether all characters are digits. - - isinteger - Check whether all characters are integer. - - isnumeric - Check whether all characters are numeric. - - isfloat - Check whether all characters are float. - - islower - Check whether all characters are lowercase. - - isspace - Check whether all characters are whitespace. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - >>> s.str.isupper() - 0 False - 1 False - 2 True - 3 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_upper(self._column)) - - def islower(self) -> SeriesOrIndex: - """ - Check whether all characters in each string are lowercase. - - This is equivalent to running the Python string method - `str.islower() - `_ - for each element of the Series/Index. - If a string has zero characters, False is returned - for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same - length as the original Series/Index. - - See Also - -------- - isalnum - Check whether all characters are alphanumeric. - - isalpha - Check whether all characters are alphabetic. - - isdecimal - Check whether all characters are decimal. - - isdigit - Check whether all characters are digits. - - isinteger - Check whether all characters are integer. - - isnumeric - Check whether all characters are numeric. - - isfloat - Check whether all characters are float. - - isspace - Check whether all characters are whitespace. - - isupper - Check whether all characters are uppercase. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - >>> s.str.islower() - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_lower(self._column)) - - def isipv4(self) -> SeriesOrIndex: - """ - Check whether all characters in each string form an IPv4 address. - - If a string has zero characters, False is returned for - that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same - length as the original Series/Index. 
- - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["", "127.0.0.1", "255.255.255.255", "123.456"]) - >>> s.str.isipv4() - 0 False - 1 True - 2 True - 3 False - dtype: bool - """ - return self._return_or_inplace(str_cast.is_ipv4(self._column)) - - def lower(self) -> SeriesOrIndex: - """ - Converts all characters to lowercase. - - Equivalent to `str.lower() - `_. - - Returns - ------- - Series or Index of object - A copy of the object with all strings converted to lowercase. - - See Also - -------- - upper - Converts all characters to uppercase. - - title - Converts first character of each word to uppercase and remaining - to lowercase. - - capitalize - Converts first character to uppercase and remaining to lowercase. - - swapcase - Converts uppercase to lowercase and lowercase to uppercase. - - Examples - -------- - >>> import cudf - >>> data = ['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'] - >>> s = cudf.Series(data) - >>> s.str.lower() - 0 lower - 1 capitals - 2 this is a sentence - 3 swapcase - dtype: object - """ - return self._return_or_inplace(libstrings.to_lower(self._column)) - - def upper(self) -> SeriesOrIndex: - """ - Convert each string to uppercase. - This only applies to ASCII characters at this time. - - Equivalent to `str.upper() - `_. - - Returns - ------- - Series or Index of object - - See Also - -------- - lower - Converts all characters to lowercase. - - upper - Converts all characters to uppercase. - - title - Converts first character of each word to uppercase and - remaining to lowercase. - - capitalize - Converts first character to uppercase and remaining to - lowercase. - - swapcase - Converts uppercase to lowercase and lowercase to uppercase. - - Examples - -------- - >>> import cudf - >>> data = ['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'] - >>> s = cudf.Series(data) - >>> s - 0 lower - 1 CAPITALS - 2 this is a sentence - 3 SwApCaSe - dtype: object - >>> s.str.upper() - 0 LOWER - 1 CAPITALS - 2 THIS IS A SENTENCE - 3 SWAPCASE - dtype: object - """ - return self._return_or_inplace(libstrings.to_upper(self._column)) - - def capitalize(self) -> SeriesOrIndex: - """ - Convert strings in the Series/Index to be capitalized. - This only applies to ASCII characters at this time. - - Returns - ------- - Series or Index of object - - Examples - -------- - >>> import cudf - >>> data = ['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'] - >>> s = cudf.Series(data) - >>> s.str.capitalize() - 0 Lower - 1 Capitals - 2 This is a sentence - 3 Swapcase - dtype: object - >>> s = cudf.Series(["hello, friend","goodbye, friend"]) - >>> s.str.capitalize() - 0 Hello, friend - 1 Goodbye, friend - dtype: object - """ - return self._return_or_inplace(libstrings.capitalize(self._column)) - - def swapcase(self) -> SeriesOrIndex: - """ - Change each lowercase character to uppercase and vice versa. - This only applies to ASCII characters at this time. - - Equivalent to `str.swapcase() - `_. - - Returns - ------- - Series or Index of object - - See Also - -------- - lower - Converts all characters to lowercase. - - upper - Converts all characters to uppercase. - - title - Converts first character of each word to uppercase and remaining - to lowercase. - - capitalize - Converts first character to uppercase and remaining to lowercase. 
- - Examples - -------- - >>> import cudf - >>> data = ['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'] - >>> s = cudf.Series(data) - >>> s - 0 lower - 1 CAPITALS - 2 this is a sentence - 3 SwApCaSe - dtype: object - >>> s.str.swapcase() - 0 LOWER - 1 capitals - 2 THIS IS A SENTENCE - 3 sWaPcAsE - dtype: object - """ - return self._return_or_inplace(libstrings.swapcase(self._column)) - - def title(self) -> SeriesOrIndex: - """ - Uppercase the first letter of each letter after a space - and lowercase the rest. - This only applies to ASCII characters at this time. - - Equivalent to `str.title() - `_. - - Returns - ------- - Series or Index of object - - See Also - -------- - lower - Converts all characters to lowercase. - - upper - Converts all characters to uppercase. - - capitalize - Converts first character to uppercase and remaining to lowercase. - - swapcase - Converts uppercase to lowercase and lowercase to uppercase. - - Examples - -------- - >>> import cudf - >>> data = ['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) - >>> s = cudf.Series(data) - >>> s - 0 lower - 1 CAPITALS - 2 this is a sentence - 3 SwApCaSe - dtype: object - >>> s.str.title() - 0 Lower - 1 Capitals - 2 This Is A Sentence - 3 Swapcase - dtype: object - """ - return self._return_or_inplace(libstrings.title(self._column)) - - def istitle(self) -> SeriesOrIndex: - """ - Check whether each string is title formatted. - The first letter of each word should be uppercase and the rest - should be lowercase. - - Equivalent to :meth:`str.istitle`. - - Returns - ------- - Series or Index of object - - Examples - -------- - >>> import cudf - >>> data = ['leopard', 'Golden Eagle', 'SNAKE', '']) - >>> s = cudf.Series(data) - >>> s.str.istitle() - 0 False - 1 True - 2 False - 3 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_title(self._column)) - - def filter_alphanum( - self, repl: str | None = None, keep: bool = True - ) -> SeriesOrIndex: - """ - Remove non-alphanumeric characters from strings in this column. - - Parameters - ---------- - repl : str - Optional string to use in place of removed characters. - keep : bool - Set to False to remove all alphanumeric characters instead - of keeping them. - - Returns - ------- - Series/Index of str dtype - Strings with only alphanumeric characters. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["pears £12", "plums $34", "Temp 72℉", "100K℧"]) - >>> s.str.filter_alphanum(" ") - 0 pears 12 - 1 plums 34 - 2 Temp 72 - 3 100K - dtype: object - """ - if repl is None: - repl = "" - - return self._return_or_inplace( - libstrings.filter_alphanum( - self._column, cudf.Scalar(repl, "str"), keep - ), - ) - - def slice_from( - self, starts: "cudf.Series", stops: "cudf.Series" - ) -> SeriesOrIndex: - """ - Return substring of each string using positions for each string. - - The starts and stops parameters are of Column type. - - Parameters - ---------- - starts : Series - Beginning position of each the string to extract. - Default is beginning of the each string. - stops : Series - Ending position of the each string to extract. - Default is end of each string. - Use -1 to specify to the end of that string. - - Returns - ------- - Series/Index of str dtype - A substring of each string using positions for each string. 
- - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["hello","there"]) - >>> s - 0 hello - 1 there - dtype: object - >>> starts = cudf.Series([1, 3]) - >>> stops = cudf.Series([5, 5]) - >>> s.str.slice_from(starts, stops) - 0 ello - 1 re - dtype: object - """ - - return self._return_or_inplace( - libstrings.slice_from( - self._column, - column.as_column(starts), - column.as_column(stops), - ), - ) - - def slice_replace( - self, - start: int | None = None, - stop: int | None = None, - repl: str | None = None, - ) -> SeriesOrIndex: - """ - Replace the specified section of each string with a new string. - - Parameters - ---------- - start : int, optional - Beginning position of the string to replace. - Default is beginning of the each string. - stop : int, optional - Ending position of the string to replace. - Default is end of each string. - repl : str, optional - String to insert into the specified position values. - - Returns - ------- - Series/Index of str dtype - A new string with the specified section of the string - replaced with `repl` string. - - See Also - -------- - slice - Just slicing without replacement. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) - >>> s - 0 a - 1 ab - 2 abc - 3 abdc - 4 abcde - dtype: object - - Specify just `start`, meaning replace `start` until the `end` of - the string with `repl`. - - >>> s.str.slice_replace(1, repl='X') - 0 aX - 1 aX - 2 aX - 3 aX - 4 aX - dtype: object - - Specify just `stop`, meaning the `start` of the string to `stop` - is replaced with `repl`, and the rest of the string is included. - - >>> s.str.slice_replace(stop=2, repl='X') - 0 X - 1 X - 2 Xc - 3 Xdc - 4 Xcde - dtype: object - - Specify `start` and `stop`, meaning the slice from `start` - to `stop` is replaced with `repl`. Everything before or - after `start` and `stop` is included as is. - - >>> s.str.slice_replace(start=1, stop=3, repl='X') - 0 aX - 1 aX - 2 aX - 3 aXc - 4 aXde - dtype: object - """ - if start is None: - start = 0 - - if stop is None: - stop = -1 - - if repl is None: - repl = "" - - return self._return_or_inplace( - libstrings.slice_replace( - self._column, start, stop, cudf.Scalar(repl, "str") - ), - ) - - def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: - """ - Insert the specified string into each string in the specified - position. - - Parameters - ---------- - start : int - Beginning position of the string to replace. - Default is beginning of the each string. - Specify -1 to insert at the end of each string. - repl : str - String to insert into the specified position value. - - Returns - ------- - Series/Index of str dtype - A new string series with the specified string - inserted at the specified position. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["abcdefghij", "0123456789"]) - >>> s.str.insert(2, '_') - 0 ab_cdefghij - 1 01_23456789 - dtype: object - - When no `repl` is passed, nothing is inserted. - - >>> s.str.insert(2) - 0 abcdefghij - 1 0123456789 - dtype: object - - Negative values are also supported for `start`. - - >>> s.str.insert(-1,'_') - 0 abcdefghij_ - 1 0123456789_ - dtype: object - """ - if repl is None: - repl = "" - - return self._return_or_inplace( - libstrings.insert(self._column, start, cudf.Scalar(repl, "str")), - ) - - def get(self, i: int = 0) -> SeriesOrIndex: - """ - Extract element from each component at specified position. - - Parameters - ---------- - i : int - Position of element to extract. 
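The three slice_replace() modes described above are easiest to see side by side; a minimal sketch (same data as the docstring, assuming cudf with a GPU available):

    import cudf

    s = cudf.Series(["a", "ab", "abc", "abdc", "abcde"])

    # Only `start`: everything from `start` to the end is replaced.
    s.str.slice_replace(1, repl="X")                # ["aX", "aX", "aX", "aX", "aX"]

    # Only `stop`: the prefix up to `stop` is replaced, the rest is kept.
    s.str.slice_replace(stop=2, repl="X")           # ["X", "X", "Xc", "Xdc", "Xcde"]

    # Both: just the slice from `start` to `stop` is swapped out.
    s.str.slice_replace(start=1, stop=3, repl="X")  # ["aX", "aX", "aX", "aXc", "aXde"]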
- - Returns - ------- - Series/Index of str dtype - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["hello world", "rapids", "cudf"]) - >>> s - 0 hello world - 1 rapids - 2 cudf - dtype: object - >>> s.str.get(10) - 0 d - 1 - 2 - dtype: object - >>> s.str.get(1) - 0 e - 1 a - 2 u - dtype: object - - ``get`` also accepts negative index number. - - >>> s.str.get(-1) - 0 d - 1 s - 2 f - dtype: object - """ - - return self._return_or_inplace(libstrings.get(self._column, i)) - - def get_json_object( - self, - json_path, - *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False, - ): - r""" - Applies a JSONPath string to an input strings column - where each row in the column is a valid json string - - Parameters - ---------- - json_path : str - The JSONPath string to be applied to each row - of the input column - allow_single_quotes : bool, default False - If True, representing strings with single - quotes is allowed. - If False, strings must only be represented - with double quotes. - strip_quotes_from_single_strings : bool, default True - If True, strip the quotes from the return value of - a given row if it is a string. - If False, values returned for a given row include - quotes if they are strings. - missing_fields_as_nulls : bool, default False - If True, when an object is queried for a field - it does not contain, "null" is returned. - If False, when an object is queried for a field - it does not contain, None is returned. - - Returns - ------- - Column: New strings column containing the retrieved json object strings - - Examples - -------- - >>> import cudf - >>> s = cudf.Series( - [ - \"\"\" - { - "store":{ - "book":[ - { - "category":"reference", - "author":"Nigel Rees", - "title":"Sayings of the Century", - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - \"\"\" - ]) - >>> s - 0 {"store": {\n "book": [\n { "cat... - dtype: object - >>> s.str.get_json_object("$.store.book") - 0 [\n { "category": "reference",\n ... - dtype: object - """ - - options = libstrings.GetJsonObjectOptions( - allow_single_quotes=allow_single_quotes, - strip_quotes_from_single_strings=( - strip_quotes_from_single_strings - ), - missing_fields_as_nulls=missing_fields_as_nulls, - ) - return self._return_or_inplace( - libstrings.get_json_object( - self._column, cudf.Scalar(json_path, "str"), options - ) - ) - - def split( - self, - pat: str | None = None, - n: int = -1, - expand: bool = False, - regex: bool | None = None, - ) -> SeriesOrIndex: - """ - Split strings around given separator/delimiter. - - Splits the string in the Series/Index from the beginning, at the - specified delimiter string. Similar to `str.split() - `_. - - Parameters - ---------- - pat : str, default None - String or regular expression to split on. If not specified, split - on whitespace. - n : int, default -1 (all) - Limit number of splits in output. `None`, 0, and -1 will all be - interpreted as "all splits". - expand : bool, default False - Expand the split strings into separate columns. - - * If ``True``, return DataFrame/MultiIndex expanding - dimensionality. - * If ``False``, return Series/Index, containing lists - of strings. - regex : bool, default None - Determines if the passed-in pattern is a regular expression: - - * If ``True``, assumes the passed-in pattern is a regular - expression - * If ``False``, treats the pattern as a literal string. 
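A hedged sketch of get_json_object() on a cut-down version of the docstring's JSON document (the one-row input below is an illustrative assumption, not the original example; assumes cudf with a GPU):

    import cudf

    # One row holding a JSON document; the JSONPath pulls out the nested
    # "book" array as its JSON text.
    s = cudf.Series(['{"store": {"book": [{"title": "Sayings of the Century"}]}}'])
    books = s.str.get_json_object("$.store.book")

    # With the defaults, strings must use double quotes, returned string
    # values are unquoted, and a missing field yields null rather than "null"
    # (see allow_single_quotes / strip_quotes_from_single_strings /
    # missing_fields_as_nulls above).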
- * If pat length is 1, treats pat as a literal string. - - Returns - ------- - Series, Index, DataFrame or MultiIndex - Type matches caller unless ``expand=True`` (see Notes). - - See Also - -------- - rsplit - Splits string around given separator/delimiter, starting from - the right. - - str.split - Standard library version for split. - - str.rsplit - Standard library version for rsplit. - - Notes - ----- - The handling of the n keyword depends on the number - of found splits: - - - If found splits > n, make first n splits only - - If found splits <= n, make all splits - - If for a certain row the number of found - splits < n, append None for padding up to n - if ``expand=True``. - - If using ``expand=True``, Series and Index callers return - DataFrame and MultiIndex objects, respectively. - - Examples - -------- - >>> import cudf - >>> data = ["this is a regular sentence", - ... "https://docs.python.org/index.html", None] - >>> s = cudf.Series(data) - >>> s - 0 this is a regular sentence - 1 https://docs.python.org/index.html - 2 - dtype: object - - In the default setting, the string is split by whitespace. - - >>> s.str.split() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/index.html] - 2 None - dtype: list - - Without the ``n`` parameter, the outputs of ``rsplit`` - and ``split`` are identical. - - >>> s.str.rsplit() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/index.html] - 2 None - dtype: list - - The `n` parameter can be used to limit the number of - splits on the delimiter. - - >>> s.str.split(n=2) - 0 [this, is, a regular sentence] - 1 [https://docs.python.org/index.html] - 2 None - dtype: list - - The `pat` parameter can be used to split by other characters. - - >>> s.str.split(pat="/") - 0 [this is a regular sentence] - 1 [https:, , docs.python.org, index.html] - 2 None - dtype: list - - When using ``expand=True``, the split elements will expand out - into separate columns. If ```` value is present, it is propagated - throughout the columns during the split. - - >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is a regular sentence - 1 https://docs.python.org/index.html - 2 - """ - - if expand not in (True, False): - raise ValueError( - f"expand parameter accepts only : [True, False], " - f"got {expand}" - ) - - # Pandas treats 0 as all - if n is None or n == 0: - n = -1 - - if pat is None: - pat = "" - - if regex and isinstance(pat, re.Pattern): - pat = pat.pattern - - if len(str(pat)) <= 1: - regex = False - - if expand: - if self._column.null_count == len(self._column): - result_table = {0: self._column.copy()} - else: - if regex is True: - data, _ = libstrings.split_re(self._column, pat, n) - else: - data, _ = libstrings.split( - self._column, cudf.Scalar(pat, "str"), n - ) - if len(data) == 1 and data[0].null_count == len(self._column): - result_table = {} - else: - result_table = data - else: - if regex is True: - result_table = libstrings.split_record_re(self._column, pat, n) - else: - result_table = libstrings.split_record( - self._column, cudf.Scalar(pat, "str"), n - ) - - return self._return_or_inplace(result_table, expand=expand) - - def rsplit( - self, - pat: str | None = None, - n: int = -1, - expand: bool = False, - regex: bool | None = None, - ) -> SeriesOrIndex: - """ - Split strings around given separator/delimiter. - - Splits the string in the Series/Index from the end, at the - specified delimiter string. Similar to `str.rsplit() - `_. 
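A compact sketch of the split() variants described above (data mirrors the docstring; assumes a cudf install with a working GPU):

    import cudf

    s = cudf.Series(["this is a regular sentence",
                     "https://docs.python.org/index.html", None])

    # expand=False (the default) keeps each row's pieces in a list column.
    s.str.split()

    # n limits the number of splits; None, 0 and -1 all mean "all splits".
    s.str.split(n=2)          # ["this", "is", "a regular sentence"], ...

    # expand=True widens the result into a DataFrame (or MultiIndex for an
    # Index caller), padding short rows with nulls.
    s.str.split(expand=True)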
- - Parameters - ---------- - pat : str, default ' ' (space) - String to split on, does not yet support regular expressions. - n : int, default -1 (all) - Limit number of splits in output. `None`, 0, and -1 will all be - interpreted as "all splits". - expand : bool, default False - Expand the split strings into separate columns. - - * If ``True``, return DataFrame/MultiIndex expanding - dimensionality. - * If ``False``, return Series/Index, containing lists - of strings. - regex : bool, default None - Determines if the passed-in pattern is a regular expression: - - * If ``True``, assumes the passed-in pattern is a regular - expression - * If ``False``, treats the pattern as a literal string. - * If pat length is 1, treats pat as a literal string. - - Returns - ------- - Series, Index, DataFrame or MultiIndex - Type matches caller unless ``expand=True`` (see Notes). - - See Also - -------- - split - Split strings around given separator/delimiter. - - str.split - Standard library version for split. - - str.rsplit - Standard library version for rsplit. - - Notes - ----- - The handling of the n keyword depends on the number of - found splits: - - - If found splits > n, make first n splits only - - If found splits <= n, make all splits - - If for a certain row the number of found splits < n, - append None for padding up to n if ``expand=True``. - - If using ``expand=True``, Series and Index callers return - DataFrame and MultiIndex objects, respectively. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series( - ... [ - ... "this is a regular sentence", - ... "https://docs.python.org/3/tutorial/index.html", - ... None - ... ] - ... ) - >>> s - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html - 2 - dtype: object - - In the default setting, the string is split by whitespace. - - >>> s.str.rsplit() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 None - dtype: list - - Without the ``n`` parameter, the outputs of ``rsplit`` - and ``split`` are identical. - - >>> s.str.split() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 None - dtype: list - - The n parameter can be used to limit the number of - splits on the delimiter. The outputs of split and rsplit are different. - - >>> s.str.rsplit(n=2) - 0 [this is a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 None - dtype: list - >>> s.str.split(n=2) - 0 [this, is, a regular sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 None - dtype: list - - When using ``expand=True``, the split elements will expand - out into separate columns. If ```` value is present, - it is propagated throughout the columns during the split. - - >>> s.str.rsplit(n=2, expand=True) - 0 1 2 - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html - 2 - - For slightly more complex use cases like splitting the - html document name from a url, a combination of parameter - settings can be used. 
- - >>> s.str.rsplit("/", n=1, expand=True) - 0 1 - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial index.html - 2 - """ - - if expand not in (True, False): - raise ValueError( - f"expand parameter accepts only : [True, False], " - f"got {expand}" - ) - - # Pandas treats 0 as all - if n == 0: - n = -1 - - if pat is None: - pat = "" - - if regex and isinstance(pat, re.Pattern): - pat = pat.pattern - - if expand: - if self._column.null_count == len(self._column): - result_table = {0: self._column.copy()} - else: - if regex is True: - data, _ = libstrings.rsplit_re(self._column, pat, n) - else: - data, _ = libstrings.rsplit( - self._column, cudf.Scalar(pat, "str"), n - ) - if len(data) == 1 and data[0].null_count == len(self._column): - result_table = {} - else: - result_table = data - else: - if regex is True: - result_table = libstrings.rsplit_record_re( - self._column, pat, n - ) - else: - result_table = libstrings.rsplit_record( - self._column, cudf.Scalar(pat, "str"), n - ) - - return self._return_or_inplace(result_table, expand=expand) - - def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: - """ - Split the string at the first occurrence of sep. - - This method splits the string at the first occurrence - of sep, and returns 3 elements containing the part - before the separator, the separator itself, and the - part after the separator. If the separator is not found, - return 3 elements containing the string itself, followed - by two empty strings. - - Parameters - ---------- - sep : str, default ' ' (whitespace) - String to split on. - - Returns - ------- - DataFrame or MultiIndex - Returns a DataFrame / MultiIndex - - See Also - -------- - rpartition - Split the string at the last occurrence of sep. - - split - Split strings around given separators. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['Linda van der Berg', 'George Pitt-Rivers']) - >>> s - 0 Linda van der Berg - 1 George Pitt-Rivers - dtype: object - - >>> s.str.partition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by something different than a space: - - >>> s.str.partition('-') - 0 1 2 - 0 Linda van der Berg - 1 George Pitt - Rivers - - Also available on indices: - - >>> idx = cudf.Index(['X 123', 'Y 999']) - >>> idx - Index(['X 123', 'Y 999'], dtype='object') - - Which will create a MultiIndex: - - >>> idx.str.partition() - MultiIndex([('X', ' ', '123'), - ('Y', ' ', '999')], - ) - - .. pandas-compat:: - :meth:`pandas.Series.str.partition` - - The parameter `expand` is not yet supported and will raise a - `NotImplementedError` if anything other than the default - value is set. - - """ - if expand is not True: - raise NotImplementedError( - "`expand=False` is currently not supported" - ) - - if sep is None: - sep = " " - - return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0], - expand=expand, - ) - - def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: - """ - Split the string at the last occurrence of sep. - - This method splits the string at the last occurrence - of sep, and returns 3 elements containing the part - before the separator, the separator itself, and the - part after the separator. If the separator is not - found, return 3 elements containing two empty strings, - followed by the string itself. - - Parameters - ---------- - sep : str, default ' ' (whitespace) - String to split on. 
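For the URL use case mentioned above, a brief sketch comparing rsplit() with partition()/rpartition() (assumes cudf with a GPU; values follow the docstrings):

    import cudf

    urls = cudf.Series(["https://docs.python.org/3/tutorial/index.html"])

    # rsplit("/", n=1, expand=True) keeps everything before the last "/"
    # in one column and the document name in the other.
    urls.str.rsplit("/", n=1, expand=True)

    # partition()/rpartition() always produce three columns: the text before
    # the separator, the separator itself, and the remainder.
    names = cudf.Series(["Linda van der Berg", "George Pitt-Rivers"])
    names.str.partition()    # split at the first space
    names.str.rpartition()   # split at the last space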
- - Returns - ------- - DataFrame or MultiIndex - Returns a DataFrame / MultiIndex - - Notes - ----- - The parameter `expand` is not yet supported and will raise a - `NotImplementedError` if anything other than the default value is set. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['Linda van der Berg', 'George Pitt-Rivers']) - >>> s - 0 Linda van der Berg - 1 George Pitt-Rivers - dtype: object - >>> s.str.rpartition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - Also available on indices: - - >>> idx = cudf.Index(['X 123', 'Y 999']) - >>> idx - Index(['X 123', 'Y 999'], dtype='object') - - Which will create a MultiIndex: - - >>> idx.str.rpartition() - MultiIndex([('X', ' ', '123'), - ('Y', ' ', '999')], - ) - """ - if expand is not True: - raise NotImplementedError( - "`expand=False` is currently not supported" - ) - - if sep is None: - sep = " " - - return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0], - expand=expand, - ) - - def pad( - self, width: int, side: str = "left", fillchar: str = " " - ) -> SeriesOrIndex: - """ - Pad strings in the Series/Index up to width. - - Parameters - ---------- - width : int - Minimum width of resulting string; - additional characters will be filled with - character defined in fillchar. - - side : {'left', 'right', 'both'}, default 'left' - Side from which to fill resulting string. - - fillchar : str, default ' ' (whitespace) - Additional character for filling, default is whitespace. - - Returns - ------- - Series/Index of object - Returns Series or Index with minimum number - of char in object. - - See Also - -------- - rjust - Fills the left side of strings with an arbitrary character. - Equivalent to ``Series.str.pad(side='left')``. - - ljust - Fills the right side of strings with an arbitrary character. - Equivalent to ``Series.str.pad(side='right')``. - - center - Fills both sides of strings with an arbitrary character. - Equivalent to ``Series.str.pad(side='both')``. - - zfill - Pad strings in the Series/Index by prepending '0' character. - Equivalent to ``Series.str.pad(side='left', fillchar='0')``. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["caribou", "tiger"]) - - >>> s.str.pad(width=10) - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10, side='right', fillchar='-') - 0 caribou--- - 1 tiger----- - dtype: object - - >>> s.str.pad(width=10, side='both', fillchar='-') - 0 -caribou-- - 1 --tiger--- - dtype: object - """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - try: - side = libstrings.SideType[side.upper()] - except KeyError: - raise ValueError( - "side has to be either one of {'left', 'right', 'both'}" - ) - - return self._return_or_inplace( - libstrings.pad(self._column, width, fillchar, side) - ) - - def zfill(self, width: int) -> SeriesOrIndex: - """ - Pad strings in the Series/Index by prepending '0' characters. - - Strings in the Series/Index are padded with '0' characters - on the left of the string to reach a total string length - width. Strings in the Series/Index with length greater - or equal to width are unchanged. - - The sign character is preserved if it appears in the first - position of the string. 
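A short sketch tying pad() to its zfill() shortcut, as described in the See Also entries above (assumes cudf with a working GPU; values follow the docstrings):

    import cudf

    s = cudf.Series(["caribou", "tiger"])

    # pad() fills up to `width` on the chosen side(s) with `fillchar`.
    s.str.pad(width=10)                              # left-pad with spaces
    s.str.pad(width=10, side="right", fillchar="-")  # ["caribou---", "tiger-----"]
    s.str.pad(width=10, side="both", fillchar="-")   # ["-caribou--", "--tiger---"]

    # zfill() is the numeric-style variant: left-pad with "0", preserving a
    # leading sign character.
    cudf.Series(["-1", "1", "1000"]).str.zfill(3)    # ["-01", "001", "1000"]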
- - Parameters - ---------- - width : int - Minimum length of resulting string; - strings with length less than width - be prepended with '0' characters. - - Returns - ------- - Series/Index of str dtype - Returns Series or Index with prepended '0' characters. - - See Also - -------- - rjust - Fills the left side of strings with an arbitrary character. - - ljust - Fills the right side of strings with an arbitrary character. - - pad - Fills the specified sides of strings with an arbitrary character. - - center - Fills both sides of strings with an arbitrary character. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['-1', '1', '1000', None]) - >>> s - 0 -1 - 1 1 - 2 1000 - 3 - dtype: object - - Note that ``None`` is not string, therefore it is converted - to ``None``. ``1000`` remains unchanged as - it is longer than width. - - >>> s.str.zfill(3) - 0 -01 - 1 001 - 2 1000 - 3 - dtype: object - """ - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - return self._return_or_inplace(libstrings.zfill(self._column, width)) - - def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: - """ - Filling left and right side of strings in the Series/Index with an - additional character. - - Parameters - ---------- - width : int - Minimum width of resulting string; - additional characters will be filled - with fillchar. - - fillchar : str, default is ' ' (whitespace) - Additional character for filling. - - Returns - ------- - Series/Index of str dtype - Returns Series or Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['a', 'b', None, 'd']) - >>> s.str.center(1) - 0 a - 1 b - 2 - 3 d - dtype: object - >>> s.str.center(1, fillchar='-') - 0 a - 1 b - 2 - 3 d - dtype: object - >>> s.str.center(2, fillchar='-') - 0 a- - 1 b- - 2 - 3 d- - dtype: object - >>> s.str.center(5, fillchar='-') - 0 --a-- - 1 --b-- - 2 - 3 --d-- - dtype: object - >>> s.str.center(6, fillchar='-') - 0 --a--- - 1 --b--- - 2 - 3 --d--- - dtype: object - """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - return self._return_or_inplace( - libstrings.center(self._column, width, fillchar) - ) - - def ljust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: - """ - Filling right side of strings in the Series/Index with an additional - character. Equivalent to `str.ljust() - `_. - - Parameters - ---------- - width : int - Minimum width of resulting string; - additional characters will be filled - with ``fillchar``. - - fillchar : str, default ' ' (whitespace) - Additional character for filling, default is whitespace. - - Returns - ------- - Series/Index of str dtype - Returns Series or Index. 
- - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["hello world", "rapids ai"]) - >>> s.str.ljust(10, fillchar="_") - 0 hello world - 1 rapids ai_ - dtype: object - >>> s = cudf.Series(["a", "", "ab", "__"]) - >>> s.str.ljust(1, fillchar="-") - 0 a - 1 - - 2 ab - 3 __ - dtype: object - """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - return self._return_or_inplace( - libstrings.ljust(self._column, width, fillchar) - ) - - def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: - """ - Filling left side of strings in the Series/Index with an additional - character. Equivalent to `str.rjust() - `_. - - Parameters - ---------- - width : int - Minimum width of resulting string; - additional characters will be filled - with fillchar. - - fillchar : str, default ' ' (whitespace) - Additional character for filling, default is whitespace. - - Returns - ------- - Series/Index of str dtype - Returns Series or Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["hello world", "rapids ai"]) - >>> s.str.rjust(20, fillchar="_") - 0 _________hello world - 1 ___________rapids ai - dtype: object - >>> s = cudf.Series(["a", "", "ab", "__"]) - >>> s.str.rjust(1, fillchar="-") - 0 a - 1 - - 2 ab - 3 __ - dtype: object - """ - if not isinstance(fillchar, str): - msg = ( - f"fillchar must be a character, not {type(fillchar).__name__}" - ) - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - return self._return_or_inplace( - libstrings.rjust(self._column, width, fillchar) - ) - - def strip(self, to_strip: str | None = None) -> SeriesOrIndex: - r""" - Remove leading and trailing characters. - - Strip whitespaces (including newlines) or a set of - specified characters from each string in the Series/Index - from left and right sides. Equivalent to `str.strip() - `_. - - Parameters - ---------- - to_strip : str or None, default None - Specifying the set of characters to be removed. - All combinations of this set of characters - will be stripped. If None then whitespaces are removed. - - Returns - ------- - Series/Index of str dtype - Returns Series or Index. - - See Also - -------- - lstrip - Remove leading characters in Series/Index. - - rstrip - Remove trailing characters in Series/Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', None]) - >>> s - 0 1. Ant. - 1 2. Bee!\n - 2 3. Cat?\t - 3 - dtype: object - >>> s.str.strip() - 0 1. Ant. - 1 2. Bee! - 2 3. Cat? - 3 - dtype: object - >>> s.str.strip('123.!? \n\t') - 0 Ant - 1 Bee - 2 Cat - 3 - dtype: object - """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.strip(self._column, cudf.Scalar(to_strip, "str")) - ) - - def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex: - r""" - Remove leading and trailing characters. - - Strip whitespaces (including newlines) - or a set of specified characters from - each string in the Series/Index from left side. - Equivalent to `str.lstrip() - `_. 
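The strip family above differs only in which side it trims; a minimal sketch (docstring data; assumes cudf with a GPU):

    import cudf

    s = cudf.Series(["1. Ant.  ", "2. Bee!\n", "3. Cat?\t", None])

    # No argument: leading and trailing whitespace (spaces, tabs, newlines)
    # is removed; null rows stay null.
    s.str.strip()

    # With `to_strip`, any run of those characters is removed from both ends;
    # lstrip()/rstrip() do the same from one side only.
    s.str.strip("123.!? \n\t")   # ["Ant", "Bee", "Cat", None]
    s.str.lstrip("123.")         # removes only leading '1', '2', '3' and '.' characters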
- - Parameters - ---------- - to_strip : str or None, default None - Specifying the set of characters to be removed. - All combinations of this set of characters will - be stripped. If None then whitespaces are removed. - - Returns - ------- - Series or Index of object - - See Also - -------- - strip - Remove leading and trailing characters in Series/Index. - - rstrip - Remove trailing characters in Series/Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', None]) - >>> s.str.lstrip('123.') - 0 Ant. - 1 Bee!\n - 2 Cat?\t - 3 - dtype: object - """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.lstrip(self._column, cudf.Scalar(to_strip, "str")) - ) - - def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex: - r""" - Remove leading and trailing characters. - - Strip whitespaces (including newlines) - or a set of specified characters from each - string in the Series/Index from right side. - Equivalent to `str.rstrip() - `_. - - Parameters - ---------- - to_strip : str or None, default None - Specifying the set of characters to - be removed. All combinations of this - set of characters will be stripped. - If None then whitespaces are removed. - - Returns - ------- - Series/Index of str dtype - Returns Series or Index. - - See Also - -------- - strip - Remove leading and trailing characters in Series/Index. - - lstrip - Remove leading characters in Series/Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', None]) - >>> s - 0 1. Ant. - 1 2. Bee!\n - 2 3. Cat?\t - 3 - dtype: object - >>> s.str.rstrip('.!? \n\t') - 0 1. Ant - 1 2. Bee - 2 3. Cat - 3 - dtype: object - """ - if to_strip is None: - to_strip = "" - - return self._return_or_inplace( - libstrings.rstrip(self._column, cudf.Scalar(to_strip, "str")) - ) - - def wrap(self, width: int, **kwargs) -> SeriesOrIndex: - r""" - Wrap long strings in the Series/Index to be formatted in - paragraphs with length less than a given width. - - Parameters - ---------- - width : int - Maximum line width. - - Returns - ------- - Series or Index - - Notes - ----- - The parameters `expand_tabsbool`, `replace_whitespace`, - `drop_whitespace`, `break_long_words`, `break_on_hyphens`, - `expand_tabsbool` are not yet supported and will raise a - NotImplementedError if they are set to any value. 
- - This method currently achieves behavior matching R's - stringr library ``str_wrap`` function, the equivalent - pandas implementation can be obtained using the - following parameter setting: - - expand_tabs = False - - replace_whitespace = True - - drop_whitespace = True - - break_long_words = False - - break_on_hyphens = False - - Examples - -------- - >>> import cudf - >>> data = ['line to be wrapped', 'another line to be wrapped'] - >>> s = cudf.Series(data) - >>> s.str.wrap(12) - 0 line to be\nwrapped - 1 another line\nto be\nwrapped - dtype: object - """ - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - expand_tabs = kwargs.get("expand_tabs", None) - if expand_tabs is True: - raise NotImplementedError("`expand_tabs=True` is not supported") - elif expand_tabs is None: - warnings.warn( - "wrap current implementation defaults to `expand_tabs`=False" - ) - - replace_whitespace = kwargs.get("replace_whitespace", True) - if not replace_whitespace: - raise NotImplementedError( - "`replace_whitespace=False` is not supported" - ) - - drop_whitespace = kwargs.get("drop_whitespace", True) - if not drop_whitespace: - raise NotImplementedError( - "`drop_whitespace=False` is not supported" - ) - - break_long_words = kwargs.get("break_long_words", None) - if break_long_words is True: - raise NotImplementedError( - "`break_long_words=True` is not supported" - ) - elif break_long_words is None: - warnings.warn( - "wrap current implementation defaults to " - "`break_long_words`=False" - ) - - break_on_hyphens = kwargs.get("break_on_hyphens", None) - if break_long_words is True: - raise NotImplementedError( - "`break_on_hyphens=True` is not supported" - ) - elif break_on_hyphens is None: - warnings.warn( - "wrap current implementation defaults to " - "`break_on_hyphens`=False" - ) - - return self._return_or_inplace(libstrings.wrap(self._column, width)) - - def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: - r""" - Count occurrences of pattern in each string of the Series/Index. - - This function is used to count the number of times a particular - regex pattern is repeated in each of the string elements of the Series. - - Parameters - ---------- - pat : str or compiled regex - Valid regular expression. - flags : int, default 0 (no flags) - Flags to pass through to the regex engine (e.g. re.MULTILINE) - - Returns - ------- - Series or Index - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['A', 'B', 'Aaba', 'Baca', None, 'CABA', 'cat']) - >>> s.str.count('a') - 0 0 - 1 0 - 2 2 - 3 2 - 4 - 5 0 - 6 1 - dtype: int32 - - Escape ``'$'`` to find the literal dollar sign. - - >>> s = cudf.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\$') - 0 1 - 1 0 - 2 1 - 3 2 - 4 2 - 5 0 - dtype: int32 - - This is also available on Index. - - >>> index = cudf.Index(['A', 'A', 'Aaba', 'cat']) - >>> index.str.count('a') - Index([0, 0, 2, 1], dtype='int64') - - .. pandas-compat:: - :meth:`pandas.Series.str.count` - - - `flags` parameter currently only supports re.DOTALL - and re.MULTILINE. - - Some characters need to be escaped when passing - in pat. e.g. ``'$'`` has a special meaning in regex - and must be escaped when finding this literal character. 
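Since count() treats its pattern as a regular expression, characters such as '$' must be escaped to be matched literally, as the docstring above stresses; a small sketch (assumes cudf with a GPU):

    import cudf

    s = cudf.Series(["$", "B", "Aab$", "$$ca", "C$B$", "cat"])

    # Escape "$" (the end-of-string anchor) to count literal dollar signs.
    s.str.count(r"\$")                                 # [1, 0, 1, 2, 2, 0]

    # Plain patterns count non-overlapping occurrences per row.
    cudf.Series(["A", "Aaba", "cat"]).str.count("a")   # [0, 2, 1]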
- """ # noqa W605 - if isinstance(pat, re.Pattern): - flags = pat.flags & ~re.U - pat = pat.pattern - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) - - return self._return_or_inplace( - libstrings.count_re(self._column, pat, flags) - ) - - def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: - """ - Find all occurrences of pattern or regular expression in the - Series/Index. - - Parameters - ---------- - pat : str - Pattern or regular expression. - flags : int, default 0 (no flags) - Flags to pass through to the regex engine (e.g. re.MULTILINE) - - Returns - ------- - DataFrame - All non-overlapping matches of pattern or - regular expression in each string of this Series/Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['Lion', 'Monkey', 'Rabbit']) - - The search for the pattern 'Monkey' returns one match: - - >>> s.str.findall('Monkey') - 0 [] - 1 [Monkey] - 2 [] - dtype: list - - When the pattern matches more than one string - in the Series, all matches are returned: - - >>> s.str.findall('on') - 0 [on] - 1 [on] - 2 [] - dtype: list - - Regular expressions are supported too. For instance, - the search for all the strings ending with - the word 'on' is shown next: - - >>> s.str.findall('on$') - 0 [on] - 1 [] - 2 [] - dtype: list - - If the pattern is found more than once in the same - string, then multiple strings are returned: - - >>> s.str.findall('b') - 0 [] - 1 [] - 2 [b, b] - dtype: list - - .. pandas-compat:: - :meth:`pandas.Series.str.findall` - - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - """ - if isinstance(pat, re.Pattern): - flags = pat.flags & ~re.U - pat = pat.pattern - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) - - data = libstrings.findall(self._column, pat, flags) - return self._return_or_inplace(data) - - def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: - """ - Find all first occurrences of patterns in the Series/Index. - - Parameters - ---------- - patterns : array-like, Sequence or Series - Patterns to search for in the given Series/Index. - - Returns - ------- - Series - A Series with a list of indices of each pattern's first occurrence. - If a pattern is not found, -1 is returned for that index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["strings", "to", "search", "in"]) - >>> s - 0 strings - 1 to - 2 search - 3 in - dtype: object - >>> t = cudf.Series(["a", "string", "g", "inn", "o", "r", "sea"]) - >>> t - 0 a - 1 string - 2 g - 3 inn - 4 o - 5 r - 6 sea - dtype: object - >>> s.str.find_multiple(t) - 0 [-1, 0, 5, -1, -1, 2, -1] - 1 [-1, -1, -1, -1, 1, -1, -1] - 2 [2, -1, -1, -1, -1, 3, 0] - 3 [-1, -1, -1, -1, -1, -1, -1] - dtype: list - """ - if can_convert_to_column(patterns): - patterns_column = column.as_column(patterns) - else: - raise TypeError( - "patterns should be an array-like or a Series object, " - f"found {type(patterns)}" - ) - - if not isinstance(patterns_column, StringColumn): - raise TypeError( - "patterns can only be of 'string' dtype, " - f"got: {patterns_column.dtype}" - ) - - return cudf.Series._from_column( - libstrings.find_multiple(self._column, patterns_column), - name=self._parent.name, - index=self._parent.index - if isinstance(self._parent, cudf.Series) - else self._parent, - ) - - def isempty(self) -> SeriesOrIndex: - """ - Check whether each string is an empty string. 
- - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same length as - the original Series/Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["1", "abc", "", " ", None]) - >>> s.str.isempty() - 0 False - 1 False - 2 True - 3 False - 4 False - dtype: bool - """ - return self._return_or_inplace( - # mypy can't deduce that the return value of - # StringColumn.__eq__ is ColumnBase because the binops are - # dynamically added by a mixin class - cast(ColumnBase, self._column == "").fillna(False) - ) - - def isspace(self) -> SeriesOrIndex: - r""" - Check whether all characters in each string are whitespace. - - This is equivalent to running the Python string method - `str.isspace() - `_ - for each element of the Series/Index. - If a string has zero characters, False is returned - for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same length as - the original Series/Index. - - See Also - -------- - isalnum - Check whether all characters are alphanumeric. - - isalpha - Check whether all characters are alphabetic. - - isdecimal - Check whether all characters are decimal. - - isdigit - Check whether all characters are digits. - - isinteger - Check whether all characters are integer. - - isnumeric - Check whether all characters are numeric. - - isfloat - Check whether all characters are float. - - islower - Check whether all characters are lowercase. - - isupper - Check whether all characters are uppercase. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([' ', '\t\r\n ', '']) - >>> s.str.isspace() - 0 True - 1 True - 2 False - dtype: bool - """ - return self._return_or_inplace(libstrings.is_space(self._column)) - - def endswith(self, pat: str) -> SeriesOrIndex: - """ - Test if the end of each string element matches a pattern. - - Parameters - ---------- - pat : str or list-like - If `str` is an `str`, evaluates whether each string of - series ends with `pat`. - If `pat` is a list-like, evaluates whether `self[i]` - ends with `pat[i]`. - Regular expressions are not accepted. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given - pattern matches the end of each string element. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['bat', 'bear', 'caT', None]) - >>> s - 0 bat - 1 bear - 2 caT - 3 - dtype: object - >>> s.str.endswith('t') - 0 True - 1 False - 2 False - 3 - dtype: bool - - .. pandas-compat:: - :meth:`pandas.Series.str.endswith` - - `na` parameter is not yet supported, as cudf uses - native strings instead of Python objects. - """ - if pat is None: - raise TypeError( - f"expected a string or a sequence-like object, not " - f"{type(pat).__name__}" - ) - elif is_scalar(pat): - result_col = libstrings.endswith( - self._column, cudf.Scalar(pat, "str") - ) - else: - result_col = libstrings.endswith_multiple( - self._column, column.as_column(pat, dtype="str") - ) - - return self._return_or_inplace(result_col) - - def startswith(self, pat: str | Sequence) -> SeriesOrIndex: - """ - Test if the start of each string element matches a pattern. - - Equivalent to `str.startswith() - `_. - - Parameters - ---------- - pat : str or list-like - If `str` is an `str`, evaluates whether each string of - series starts with `pat`. - If `pat` is a list-like, evaluates whether `self[i]` - starts with `pat[i]`. - Regular expressions are not accepted. 
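A brief sketch of startswith()/endswith() on the docstring data, plus the per-row list-like form mentioned in the parameter description (the list-like call is an assumption based on that description; assumes cudf with a GPU):

    import cudf

    s = cudf.Series(["bat", "Bear", "caT", None])

    # Plain strings only (no regex); null rows propagate as null.
    s.str.startswith("b")   # [True, False, False, <NA>]
    s.str.endswith("t")     # [True, False, False, <NA>]

    # A list-like of the same length tests each row against its own pattern.
    s.str.endswith(["t", "r", "T", "x"])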
- - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given - pattern matches the start of each string element. - - See Also - -------- - endswith - Same as startswith, but tests the end of string. - - contains - Tests if string element contains a pattern. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['bat', 'Bear', 'cat', None]) - >>> s - 0 bat - 1 Bear - 2 cat - 3 - dtype: object - >>> s.str.startswith('b') - 0 True - 1 False - 2 False - 3 - dtype: bool - """ - if pat is None: - raise TypeError( - f"expected a string or a sequence-like object, not " - f"{type(pat).__name__}" - ) - elif is_scalar(pat): - result_col = libstrings.startswith( - self._column, cudf.Scalar(pat, "str") - ) - else: - result_col = libstrings.startswith_multiple( - self._column, column.as_column(pat, dtype="str") - ) - - return self._return_or_inplace(result_col) - - def removesuffix(self, suffix: str) -> SeriesOrIndex: - """ - Remove a suffix from an object series. - - If the suffix is not present, the original string will be returned. - - Parameters - ---------- - suffix : str - Remove the suffix of the string. - - Returns - ------- - Series/Index: object - The Series or Index with given suffix removed. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["foo_str", "bar_str", "no_suffix"]) - >>> s - 0 foo_str - 1 bar_str - 2 no_suffix - dtype: object - >>> s.str.removesuffix("_str") - 0 foo - 1 bar - 2 no_suffix - dtype: object - """ - if suffix is None or len(suffix) == 0: - return self._return_or_inplace(self._column) - ends_column = libstrings.endswith( - self._column, cudf.Scalar(suffix, "str") - ) - removed_column = libstrings.slice_strings( - self._column, 0, -len(suffix), None - ) - result = cudf._lib.copying.copy_if_else( - removed_column, self._column, ends_column - ) - return self._return_or_inplace(result) - - def removeprefix(self, prefix: str) -> SeriesOrIndex: - """ - Remove a prefix from an object series. - - If the prefix is not present, the original string will be returned. - - Parameters - ---------- - prefix : str - Remove the prefix of the string. - - Returns - ------- - Series/Index: object - The Series or Index with given prefix removed. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["str_foo", "str_bar", "no_prefix"]) - >>> s - 0 str_foo - 1 str_bar - 2 no_prefix - dtype: object - >>> s.str.removeprefix("str_") - 0 foo - 1 bar - 2 no_prefix - dtype: object - """ - if prefix is None or len(prefix) == 0: - return self._return_or_inplace(self._column) - starts_column = libstrings.startswith( - self._column, cudf.Scalar(prefix, "str") - ) - removed_column = libstrings.slice_strings( - self._column, len(prefix), None, None - ) - result = cudf._lib.copying.copy_if_else( - removed_column, self._column, starts_column - ) - return self._return_or_inplace(result) - - def find( - self, sub: str, start: int = 0, end: int | None = None - ) -> SeriesOrIndex: - """ - Return lowest indexes in each strings in the Series/Index - where the substring is fully contained between ``[start:end]``. - Return -1 on failure. - - Parameters - ---------- - sub : str - Substring being searched. - - start : int - Left edge index. - - end : int - Right edge index. - - Returns - ------- - Series or Index of int - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['abc', 'a','b' ,'ddb']) - >>> s.str.find('b') - 0 1 - 1 -1 - 2 0 - 3 2 - dtype: int32 - - Parameters such as `start` and `end` can also be used. 
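removeprefix()/removesuffix() leave non-matching rows untouched, as noted above; a minimal sketch (docstring data; assumes cudf with a GPU):

    import cudf

    s = cudf.Series(["str_foo", "str_bar", "no_prefix"])
    s.str.removeprefix("str_")   # ["foo", "bar", "no_prefix"]

    t = cudf.Series(["foo_str", "bar_str", "no_suffix"])
    t.str.removesuffix("_str")   # ["foo", "bar", "no_suffix"]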
- - >>> s.str.find('b', start=1, end=5) - 0 1 - 1 -1 - 2 -1 - 3 2 - dtype: int32 - """ - if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) - - if end is None: - end = -1 - - result_col = libstrings.find( - self._column, cudf.Scalar(sub, "str"), start, end - ) - - return self._return_or_inplace(result_col) - - def rfind( - self, sub: str, start: int = 0, end: int | None = None - ) -> SeriesOrIndex: - """ - Return highest indexes in each strings in the Series/Index - where the substring is fully contained between ``[start:end]``. - Return -1 on failure. Equivalent to standard `str.rfind() - `_. - - Parameters - ---------- - sub : str - Substring being searched. - - start : int - Left edge index. - - end : int - Right edge index. - - Returns - ------- - Series or Index of int - - See Also - -------- - find - Return lowest indexes in each strings. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["abc", "hello world", "rapids ai"]) - >>> s.str.rfind('a') - 0 0 - 1 -1 - 2 7 - dtype: int32 - - Using `start` and `end` parameters. - - >>> s.str.rfind('a', start=2, end=5) - 0 -1 - 1 -1 - 2 -1 - dtype: int32 - """ - if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) - - if end is None: - end = -1 - - result_col = libstrings.rfind( - self._column, cudf.Scalar(sub, "str"), start, end - ) - - return self._return_or_inplace(result_col) - - def index( - self, sub: str, start: int = 0, end: int | None = None - ) -> SeriesOrIndex: - """ - Return lowest indexes in each strings where the substring - is fully contained between ``[start:end]``. This is the same - as str.find except instead of returning -1, it raises a ValueError - when the substring is not found. - - Parameters - ---------- - sub : str - Substring being searched. - - start : int - Left edge index. - - end : int - Right edge index. - - Returns - ------- - Series or Index of object - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['abc', 'a','b' ,'ddb']) - >>> s.str.index('b') - Traceback (most recent call last): - File "", line 1, in - ValueError: substring not found - - Parameters such as `start` and `end` can also be used. - - >>> s = cudf.Series(['abc', 'abb','ab' ,'ddb']) - >>> s.str.index('b', start=1, end=5) - 0 1 - 1 1 - 2 1 - 3 2 - dtype: int32 - """ - if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) - - if end is None: - end = -1 - - result_col = libstrings.find( - self._column, cudf.Scalar(sub, "str"), start, end - ) - - result = self._return_or_inplace(result_col) - - if (result == -1).any(): - raise ValueError("substring not found") - else: - return result - - def rindex( - self, sub: str, start: int = 0, end: int | None = None - ) -> SeriesOrIndex: - """ - Return highest indexes in each strings where the substring - is fully contained between ``[start:end]``. This is the same - as ``str.rfind`` except instead of returning -1, it raises a - ``ValueError`` when the substring is not found. - - Parameters - ---------- - sub : str - Substring being searched. - - start : int - Left edge index. - - end : int - Right edge index. - - Returns - ------- - Series or Index of object - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['abc', 'a','b' ,'ddb']) - >>> s.str.rindex('b') - Traceback (most recent call last): - File "", line 1, in - ValueError: substring not found - - Parameters such as `start` and `end` can also be used. 
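find() and index() differ only in how a miss is reported; a short sketch of that contrast (docstring data; assumes cudf with a GPU):

    import cudf

    s = cudf.Series(["abc", "a", "b", "ddb"])

    # find() returns -1 when the substring is absent ...
    s.str.find("b")       # [1, -1, 0, 2]

    # ... while index() raises ValueError instead, mirroring str.index().
    try:
        s.str.index("b")
    except ValueError:
        pass  # "substring not found" because row 1 contains no "b"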
- - >>> s = cudf.Series(['abc', 'abb','ab' ,'ddb']) - >>> s.str.rindex('b', start=1, end=5) - 0 1 - 1 2 - 2 1 - 3 2 - dtype: int32 - """ - if not isinstance(sub, str): - raise TypeError( - f"expected a string object, not {type(sub).__name__}" - ) - - if end is None: - end = -1 - - result_col = libstrings.rfind( - self._column, cudf.Scalar(sub, "str"), start, end - ) - - result = self._return_or_inplace(result_col) - - if (result == -1).any(): - raise ValueError("substring not found") - else: - return result - - def match( - self, pat: str, case: bool = True, flags: int = 0 - ) -> SeriesOrIndex: - """ - Determine if each string matches a regular expression. - - Parameters - ---------- - pat : str or compiled regex - Character sequence or regular expression. - flags : int, default 0 (no flags) - Flags to pass through to the regex engine (e.g. re.MULTILINE) - - Returns - ------- - Series or Index of boolean values. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["rapids", "ai", "cudf"]) - - Checking for strings starting with `a`. - - >>> s.str.match('a') - 0 False - 1 True - 2 False - dtype: bool - - Checking for strings starting with any of `a` or `c`. - - >>> s.str.match('[ac]') - 0 False - 1 True - 2 True - dtype: bool - - .. pandas-compat:: - :meth:`pandas.Series.str.match` - - Parameters `case` and `na` are currently not supported. - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - """ - if case is not True: - raise NotImplementedError("`case` parameter is not yet supported") - if isinstance(pat, re.Pattern): - flags = pat.flags & ~re.U - pat = pat.pattern - if not _is_supported_regex_flags(flags): - raise NotImplementedError( - "unsupported value for `flags` parameter" - ) - - return self._return_or_inplace( - libstrings.match_re(self._column, pat, flags) - ) - - def url_decode(self) -> SeriesOrIndex: - """ - Returns a URL-decoded format of each string. - No format checking is performed. All characters - are expected to be encoded as UTF-8 hex values. - - Returns - ------- - Series or Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['A%2FB-C%2FD', 'e%20f.g', '4-5%2C6']) - >>> s.str.url_decode() - 0 A/B-C/D - 1 e f.g - 2 4-5,6 - dtype: object - >>> data = ["https%3A%2F%2Frapids.ai%2Fstart.html", - ... "https%3A%2F%2Fmedium.com%2Frapids-ai"] - >>> s = cudf.Series(data) - >>> s.str.url_decode() - 0 https://rapids.ai/start.html - 1 https://medium.com/rapids-ai - dtype: object - """ - - return self._return_or_inplace(libstrings.url_decode(self._column)) - - def url_encode(self) -> SeriesOrIndex: - """ - Returns a URL-encoded format of each string. - No format checking is performed. - All characters are encoded except for ASCII letters, - digits, and these characters: ``'.','_','-','~'``. - Encoding converts to hex using UTF-8 encoded bytes. - - Returns - ------- - Series or Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['A/B-C/D', 'e f.g', '4-5,6']) - >>> s.str.url_encode() - 0 A%2FB-C%2FD - 1 e%20f.g - 2 4-5%2C6 - dtype: object - >>> data = ["https://rapids.ai/start.html", - ... "https://medium.com/rapids-ai"] - >>> s = cudf.Series(data) - >>> s.str.url_encode() - 0 https%3A%2F%2Frapids.ai%2Fstart.html - 1 https%3A%2F%2Fmedium.com%2Frapids-ai - dtype: object - """ - return self._return_or_inplace(libstrings.url_encode(self._column)) - - def code_points(self) -> SeriesOrIndex: - """ - Returns an array by filling it with the UTF-8 code point - values for each character of each string. 
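A quick round-trip sketch for url_encode()/url_decode() (docstring data; assumes cudf with a GPU):

    import cudf

    urls = cudf.Series(["https://rapids.ai/start.html",
                        "https://medium.com/rapids-ai"])

    # url_encode() hex-escapes everything except ASCII letters, digits and
    # '.', '_', '-', '~'; url_decode() reverses it.
    encoded = urls.str.url_encode()   # "https%3A%2F%2Frapids.ai%2Fstart.html", ...
    decoded = encoded.str.url_decode()
    # `decoded` should match the original `urls` values.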
- This function uses the ``len()`` method to determine - the size of each sub-array of integers. - - Returns - ------- - Series or Index. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["a","xyz", "éee"]) - >>> s.str.code_points() - 0 97 - 1 120 - 2 121 - 3 122 - 4 50089 - 5 101 - 6 101 - dtype: int32 - >>> s = cudf.Series(["abc"]) - >>> s.str.code_points() - 0 97 - 1 98 - 2 99 - dtype: int32 - """ - return self._return_or_inplace( - libstrings.code_points(self._column), retain_index=False - ) - - def translate(self, table: dict) -> SeriesOrIndex: - """ - Map all characters in the string through the given - mapping table. - - Equivalent to standard `str.translate() - `_. - - Parameters - ---------- - table : dict - Table is a mapping of Unicode ordinals to Unicode - ordinals, strings, or None. - Unmapped characters are left untouched. - `str.maketrans() - `_ - is a helper function for making translation tables. - - Returns - ------- - Series or Index. - - Examples - -------- - >>> import cudf - >>> data = ['lower', 'CAPITALS', 'this is a sentence','SwApCaSe'] - >>> s = cudf.Series(data) - >>> s.str.translate({'a': "1"}) - 0 lower - 1 CAPITALS - 2 this is 1 sentence - 3 SwApC1Se - dtype: object - >>> s.str.translate({'a': "1", "e":"#"}) - 0 low#r - 1 CAPITALS - 2 this is 1 s#nt#nc# - 3 SwApC1S# - dtype: object - """ - table = str.maketrans(table) - return self._return_or_inplace( - libstrings.translate(self._column, table) - ) - - def filter_characters( - self, table: dict, keep: bool = True, repl: str | None = None - ) -> SeriesOrIndex: - """ - Remove characters from each string using the character ranges - in the given mapping table. - - Parameters - ---------- - table : dict - This table is a range of Unicode ordinals to filter. - The minimum value is the key and the maximum value is the value. - You can use `str.maketrans() - `_ - as a helper function for making the filter table. - Overlapping ranges will cause undefined results. - Range values are inclusive. - keep : boolean - If False, the character ranges in the ``table`` are removed. - If True, the character ranges not in the ``table`` are removed. - Default is True. - repl : str - Optional replacement string to use in place of removed characters. - - Returns - ------- - Series or Index. - - Examples - -------- - >>> import cudf - >>> data = ['aeiou', 'AEIOU', '0123456789'] - >>> s = cudf.Series(data) - >>> s.str.filter_characters({'a':'l', 'M':'Z', '4':'6'}) - 0 aei - 1 OU - 2 456 - dtype: object - >>> s.str.filter_characters({'a':'l', 'M':'Z', '4':'6'}, False, "_") - 0 ___ou - 1 AEI__ - 2 0123___789 - dtype: object - """ - if repl is None: - repl = "" - table = str.maketrans(table) - return self._return_or_inplace( - libstrings.filter_characters( - self._column, table, keep, cudf.Scalar(repl, "str") - ), - ) - - def normalize_spaces(self) -> SeriesOrIndex: - r""" - Remove extra whitespace between tokens and trim whitespace - from the beginning and the end of each string. - - Returns - ------- - Series or Index of object. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series(["hello \\t world"," test string "]) - >>> ser.str.normalize_spaces() - 0 hello world - 1 test string - dtype: object - """ - return self._return_or_inplace( - libstrings.normalize_spaces(self._column) - ) - - def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: - r""" - Normalizes strings characters for tokenizing. 
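translate() and filter_characters() both take str.maketrans()-style dicts but use them differently, as described above; a minimal sketch (docstring data; assumes cudf with a GPU):

    import cudf

    s = cudf.Series(["lower", "CAPITALS", "this is a sentence", "SwApCaSe"])

    # translate(): per-character substitution; unmapped characters pass through.
    s.str.translate({"a": "1", "e": "#"})   # "low#r", ..., "this is 1 s#nt#nc#"

    # filter_characters(): the dict maps the low end of an inclusive character
    # range to its high end; keep=False drops those ranges instead.
    data = cudf.Series(["aeiou", "AEIOU", "0123456789"])
    data.str.filter_characters({"a": "l", "M": "Z", "4": "6"})              # ["aei", "OU", "456"]
    data.str.filter_characters({"a": "l", "M": "Z", "4": "6"}, False, "_")  # ["___ou", "AEI__", "0123___789"]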
- - This uses the normalizer that is built into the - subword_tokenize function which includes: - - - adding padding around punctuation (unicode category starts with - "P") as well as certain ASCII symbols like "^" and "$" - - adding padding around the CJK Unicode block characters - - changing whitespace (e.g. ``\t``, ``\n``, ``\r``) to space - - removing control characters (unicode categories "Cc" and "Cf") - - If `do_lower_case = true`, lower-casing also removes the accents. - The accents cannot be removed from upper-case characters without - lower-casing and lower-casing cannot be performed without also - removing accents. However, if the accented character is already - lower-case, then only the accent is removed. - - Parameters - ---------- - do_lower : bool, Default is True - If set to True, characters will be lower-cased and accents - will be removed. If False, accented and upper-case characters - are not transformed. - - Returns - ------- - Series or Index of object. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series(["héllo, \tworld","ĂĆCĖÑTED","$99"]) - >>> ser.str.normalize_characters() - 0 hello , world - 1 accented - 2 $ 99 - dtype: object - >>> ser.str.normalize_characters(do_lower=False) - 0 héllo , world - 1 ĂĆCĖÑTED - 2 $ 99 - dtype: object - """ - return self._return_or_inplace( - libstrings.normalize_characters(self._column, do_lower) - ) - - def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: - """ - Each string is split into tokens using the provided delimiter(s). - The sequence returned contains the tokens in the order - they were found. - - Parameters - ---------- - delimiter : str or list of strs, Default is whitespace. - The string used to locate the split points of each string. - - Returns - ------- - Series or Index of object. - - Examples - -------- - >>> import cudf - >>> data = ["hello world", "goodbye world", "hello goodbye"] - >>> ser = cudf.Series(data) - >>> ser.str.tokenize() - 0 hello - 0 world - 1 goodbye - 1 world - 2 hello - 2 goodbye - dtype: object - """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - - if isinstance(delimiter, Column): - result = self._return_or_inplace( - libstrings._tokenize_column(self._column, delimiter), - retain_index=False, - ) - elif isinstance(delimiter, cudf.Scalar): - result = self._return_or_inplace( - libstrings._tokenize_scalar(self._column, delimiter), - retain_index=False, - ) - else: - raise TypeError( - f"Expected a Scalar or Column\ - for delimiters, but got {type(delimiter)}" - ) - if isinstance(self._parent, cudf.Series): - result.index = self._parent.index.repeat( # type: ignore - self.token_count(delimiter=delimiter) - ) - return result - - def detokenize( - self, indices: "cudf.Series", separator: str = " " - ) -> SeriesOrIndex: - """ - Combines tokens into strings by concatenating them in the order - in which they appear in the ``indices`` column. The ``separator`` is - concatenated between each token. - - Parameters - ---------- - indices : Series - Each value identifies the output row for the corresponding token. - separator : str - The string concatenated between each token in an output row. - Default is space. - - Returns - ------- - Series or Index of object. 
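tokenize() and token_count() are usually used together; a brief sketch of that pairing (assumes cudf with a GPU; behavior follows the docstrings above):

    import cudf

    ser = cudf.Series(["hello world", "goodbye world", "hello goodbye"])

    # tokenize() emits one token per output row; for a Series input the index
    # repeats so each token can be traced back to its source row.
    tokens = ser.str.tokenize()

    # token_count() reports how many tokens each input row produced.
    ser.str.token_count()   # [2, 2, 2]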
- - Examples - -------- - >>> import cudf - >>> strs = cudf.Series(["hello", "world", "one", "two", "three"]) - >>> indices = cudf.Series([0, 0, 1, 1, 2]) - >>> strs.str.detokenize(indices) - 0 hello world - 1 one two - 2 three - dtype: object - """ - separator = _massage_string_arg(separator, "separator") - return self._return_or_inplace( - libstrings.detokenize(self._column, indices._column, separator), - retain_index=False, - ) - - def character_tokenize(self) -> SeriesOrIndex: - """ - Each string is split into individual characters. - The sequence returned contains each character as an individual string. - - Returns - ------- - Series or Index of object. - - Examples - -------- - >>> import cudf - >>> data = ["hello world", None, "goodbye, thank you."] - >>> ser = cudf.Series(data) - >>> ser.str.character_tokenize() - 0 h - 0 e - 0 l - 0 l - 0 o - 0 - 0 w - 0 o - 0 r - 0 l - 0 d - 2 g - 2 o - 2 o - 2 d - 2 b - 2 y - 2 e - 2 , - 2 - 2 t - 2 h - 2 a - 2 n - 2 k - 2 - 2 y - 2 o - 2 u - 2 . - dtype: object - """ - result_col = libstrings.character_tokenize(self._column) - if isinstance(self._parent, cudf.Series): - lengths = self.len().fillna(0) - index = self._parent.index.repeat(lengths) - return cudf.Series._from_column( - result_col, name=self._parent.name, index=index - ) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index._from_column(result_col, name=self._parent.name) - else: - return result_col - - def token_count(self, delimiter: str = " ") -> SeriesOrIndex: - """ - Each string is split into tokens using the provided delimiter. - The returned integer sequence is the number of tokens in each string. - - Parameters - ---------- - delimiter : str or list of strs, Default is whitespace. - The characters or strings used to locate the - split points of each string. - - Returns - ------- - Series or Index. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series(["hello world","goodbye",""]) - >>> ser.str.token_count() - 0 2 - 1 1 - 2 0 - dtype: int32 - """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): - return self._return_or_inplace( - libstrings._count_tokens_column(self._column, delimiter) - ) - - elif isinstance(delimiter, cudf.Scalar): - return self._return_or_inplace( - libstrings._count_tokens_scalar(self._column, delimiter) - ) - else: - raise TypeError( - f"Expected a Scalar or Column\ - for delimiters, but got {type(delimiter)}" - ) - - def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: - """ - Generate the n-grams from a set of tokens, each record - in series is treated a token. - - You can generate tokens from a Series instance using - the ``Series.str.tokenize()`` function. - - Parameters - ---------- - n : int - The degree of the n-gram (number of consecutive tokens). - Default of 2 for bigrams. - separator : str - The separator to use between within an n-gram. - Default is '_'. 
- - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> str_series.str.ngrams(2, "_") - 0 this is my_favorite book - dtype: object - >>> str_series = cudf.Series(['abc','def','xyz','hhh']) - >>> str_series.str.ngrams(2, "_") - 0 abc_def - 1 def_xyz - 2 xyz_hhh - dtype: object - """ - separator = _massage_string_arg(separator, "separator") - return self._return_or_inplace( - libstrings.generate_ngrams(self._column, n, separator), - retain_index=False, - ) - - def character_ngrams( - self, n: int = 2, as_list: bool = False - ) -> SeriesOrIndex: - """ - Generate the n-grams from characters in a column of strings. - - Parameters - ---------- - n : int - The degree of the n-gram (number of consecutive characters). - Default of 2 for bigrams. - as_list : bool - Set to True to return ngrams in a list column where each - list element is the ngrams for each string. - - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['abcd','efgh','xyz']) - >>> str_series.str.character_ngrams(2) - 0 ab - 0 bc - 0 cd - 1 ef - 1 fg - 1 gh - 2 xy - 2 yz - dtype: object - >>> str_series.str.character_ngrams(3) - 0 abc - 0 bcd - 1 efg - 1 fgh - 2 xyz - dtype: object - >>> str_series.str.character_ngrams(3,True) - 0 [abc, bcd] - 1 [efg, fgh] - 2 [xyz] - dtype: list - """ - result = self._return_or_inplace( - libstrings.generate_character_ngrams(self._column, n), - retain_index=True, - ) - if isinstance(result, cudf.Series) and not as_list: - # before exploding, removes those lists which have 0 length - result = result[result.list.len() > 0] - return result.explode() # type: ignore - return result - - def hash_character_ngrams( - self, n: int = 5, as_list: bool = False - ) -> SeriesOrIndex: - """ - Generate hashes of n-grams from characters in a column of strings. - The MurmurHash32 algorithm is used to produce the hash results. - - Parameters - ---------- - n : int - The degree of the n-gram (number of consecutive characters). - Default is 5. - as_list : bool - Set to True to return the hashes in a list column where each - list element is the hashes for each string. - - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['abcdefg','stuvwxyz']) - >>> str_series.str.hash_character_ngrams(5, True) - 0 [3902511862, 570445242, 4202475763] - 1 [556054766, 3166857694, 3760633458, 192452857] - dtype: list - >>> str_series.str.hash_character_ngrams(5) - 0 3902511862 - 0 570445242 - 0 4202475763 - 1 556054766 - 1 3166857694 - 1 3760633458 - 1 192452857 - dtype: uint32 - """ - - result = self._return_or_inplace( - libstrings.hash_character_ngrams(self._column, n), - retain_index=True, - ) - if isinstance(result, cudf.Series) and not as_list: - return result.explode() - return result - - def ngrams_tokenize( - self, n: int = 2, delimiter: str = " ", separator: str = "_" - ) -> SeriesOrIndex: - """ - Generate the n-grams using tokens from each string. - This will tokenize each string and then generate ngrams for each - string. - - Parameters - ---------- - n : int, Default 2. - The degree of the n-gram (number of consecutive tokens). - delimiter : str, Default is white-space. - The character used to locate the split points of each string. - sep : str, Default is '_'. - The separator to use between tokens within an n-gram. - - Returns - ------- - Series or Index of object. 
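For context, the per-string output of character_ngrams() above can be sketched in pure Python; the real work happens on the GPU in libcudf, so this is an illustration only:

    def char_ngrams(s: str, n: int = 2) -> list[str]:
        # every contiguous window of n characters, in order
        return [s[i : i + n] for i in range(len(s) - n + 1)]

    char_ngrams("abcd", 2)  # ['ab', 'bc', 'cd'], matching the docstring example above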
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series(['this is the', 'best book']) - >>> ser.str.ngrams_tokenize(n=2, sep='_') - 0 this_is - 1 is_the - 2 best_book - dtype: object - """ - delimiter = _massage_string_arg(delimiter, "delimiter") - separator = _massage_string_arg(separator, "separator") - return self._return_or_inplace( - libstrings.ngrams_tokenize(self._column, n, delimiter, separator), - retain_index=False, - ) - - def replace_tokens( - self, targets, replacements, delimiter: str | None = None - ) -> SeriesOrIndex: - """ - The targets tokens are searched for within each string in the series - and replaced with the corresponding replacements if found. - Tokens are identified by the delimiter character provided. - - Parameters - ---------- - targets : array-like, Sequence or Series - The tokens to search for inside each string. - - replacements : array-like, Sequence, Series or str - The strings to replace for each found target token found. - Alternately, this can be a single str instance and would be - used as replacement for each string found. - - delimiter : str - The character used to locate the tokens of each string. - Default is whitespace. - - Returns - ------- - Series or Index of object. - - Examples - -------- - >>> import cudf - >>> sr = cudf.Series(["this is me", "theme music", ""]) - >>> targets = cudf.Series(["is", "me"]) - >>> sr.str.replace_tokens(targets=targets, replacements="_") - 0 this _ _ - 1 theme music - 2 - dtype: object - >>> sr = cudf.Series(["this;is;me", "theme;music", ""]) - >>> sr.str.replace_tokens(targets=targets, replacements=":") - 0 this;is;me - 1 theme;music - 2 - dtype: object - """ - if can_convert_to_column(targets): - targets_column = column.as_column(targets) - else: - raise TypeError( - f"targets should be an array-like or a Series object, " - f"found {type(targets)}" - ) - - if is_scalar(replacements): - replacements_column = column.as_column([replacements]) - elif can_convert_to_column(replacements): - replacements_column = column.as_column(replacements) - if len(targets_column) != len(replacements_column): - raise ValueError( - "targets and replacements should be same size" - " sequences unless replacements is a string." - ) - else: - raise TypeError( - f"replacements should be an str, array-like or Series object, " - f"found {type(replacements)}" - ) - - if delimiter is None: - delimiter = "" - elif not is_scalar(delimiter): - raise TypeError( - f"Type of delimiter should be a string," - f" found {type(delimiter)}" - ) - - return self._return_or_inplace( - libstrings.replace_tokens( - self._column, - targets_column, - replacements_column, - cudf.Scalar(delimiter, dtype="str"), - ), - ) - - def filter_tokens( - self, - min_token_length: int, - replacement: str | None = None, - delimiter: str | None = None, - ) -> SeriesOrIndex: - """ - Remove tokens from within each string in the series that are - smaller than min_token_length and optionally replace them - with the replacement string. - Tokens are identified by the delimiter character provided. - - Parameters - ---------- - min_token_length: int - Minimum number of characters for a token to be retained - in the output string. - - replacement : str - String used in place of removed tokens. - - delimiter : str - The character(s) used to locate the tokens of each string. - Default is whitespace. - - Returns - ------- - Series or Index of object. 
- - Examples - -------- - >>> import cudf - >>> sr = cudf.Series(["this is me", "theme music", ""]) - >>> sr.str.filter_tokens(3, replacement="_") - 0 this _ _ - 1 theme music - 2 - dtype: object - >>> sr = cudf.Series(["this;is;me", "theme;music", ""]) - >>> sr.str.filter_tokens(5,None,";") - 0 ;; - 1 theme;music - 2 - dtype: object - """ - - if replacement is None: - replacement = "" - elif not is_scalar(replacement): - raise TypeError( - f"Type of replacement should be a string," - f" found {type(replacement)}" - ) - - if delimiter is None: - delimiter = "" - elif not is_scalar(delimiter): - raise TypeError( - f"Type of delimiter should be a string," - f" found {type(delimiter)}" - ) - - return self._return_or_inplace( - libstrings.filter_tokens( - self._column, - min_token_length, - cudf.Scalar(replacement, dtype="str"), - cudf.Scalar(delimiter, dtype="str"), - ), - ) - - def porter_stemmer_measure(self) -> SeriesOrIndex: - """ - Compute the Porter Stemmer measure for each string. - The Porter Stemmer algorithm is described `here - `_. - - Returns - ------- - Series or Index of object. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series(["hello", "super"]) - >>> ser.str.porter_stemmer_measure() - 0 1 - 1 2 - dtype: int32 - """ - return self._return_or_inplace( - libstrings.porter_stemmer_measure(self._column) - ) - - def is_consonant(self, position) -> SeriesOrIndex: - """ - Return true for strings where the character at ``position`` is a - consonant. The ``position`` parameter may also be a list of integers - to check different characters per string. - If the ``position`` is larger than the string length, False is - returned for that string. - - Parameters - ---------- - position: int or list-like - The character position to check within each string. - - Returns - ------- - Series or Index of bool dtype. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series(["toy", "trouble"]) - >>> ser.str.is_consonant(1) - 0 False - 1 True - dtype: bool - >>> positions = cudf.Series([2, 3]) - >>> ser.str.is_consonant(positions) - 0 True - 1 False - dtype: bool - """ - ltype = libstrings.LetterType.CONSONANT - - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - - return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) - ) - - def is_vowel(self, position) -> SeriesOrIndex: - """ - Return true for strings where the character at ``position`` is a - vowel -- not a consonant. The ``position`` parameter may also be - a list of integers to check different characters per string. - If the ``position`` is larger than the string length, False is - returned for that string. - - Parameters - ---------- - position: int or list-like - The character position to check within each string. - - Returns - ------- - Series or Index of bool dtype. 
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series(["toy", "trouble"]) - >>> ser.str.is_vowel(1) - 0 True - 1 False - dtype: bool - >>> positions = cudf.Series([2, 3]) - >>> ser.str.is_vowel(positions) - 0 False - 1 True - dtype: bool - """ - ltype = libstrings.LetterType.VOWEL - - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - - return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) - ) - - def edit_distance(self, targets) -> SeriesOrIndex: - """ - The ``targets`` strings are measured against the strings in this - instance using the Levenshtein edit distance algorithm. - https://www.cuelogic.com/blog/the-levenshtein-algorithm - - The ``targets`` parameter may also be a single string in which - case the edit distance is computed for all the strings against - that single string. - - Parameters - ---------- - targets : array-like, Sequence or Series or str - The string(s) to measure against each string. - - Returns - ------- - Series or Index of int32. - - Examples - -------- - >>> import cudf - >>> sr = cudf.Series(["puppy", "doggy", "kitty"]) - >>> targets = cudf.Series(["pup", "dogie", "kitten"]) - >>> sr.str.edit_distance(targets=targets) - 0 2 - 1 2 - 2 2 - dtype: int32 - >>> sr.str.edit_distance("puppy") - 0 0 - 1 4 - 2 4 - dtype: int32 - """ - if is_scalar(targets): - targets_column = column.as_column([targets]) - elif can_convert_to_column(targets): - targets_column = column.as_column(targets) - else: - raise TypeError( - f"targets should be an str, array-like or Series object, " - f"found {type(targets)}" - ) - - return self._return_or_inplace( - libstrings.edit_distance(self._column, targets_column) - ) - - def edit_distance_matrix(self) -> SeriesOrIndex: - """Computes the edit distance between strings in the series. - - The series to compute the matrix should have more than 2 strings and - should not contain nulls. - - Edit distance is measured based on the `Levenshtein edit distance - algorithm `_. - - Returns - ------- - Series of ListDtype(int64) - Assume ``N`` is the length of this series. The return series - contains ``N`` lists of size ``N``, where the ``j`` th number in - the ``i`` th row of the series tells the edit distance between the - ``i`` th string and the ``j`` th string of this series. The matrix - is symmetric. Diagonal elements are 0. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['abc', 'bc', 'cba']) - >>> s.str.edit_distance_matrix() - 0 [0, 1, 2] - 1 [1, 0, 2] - 2 [2, 2, 0] - dtype: list - """ - if self._column.size < 2: - raise ValueError( - "Require size >= 2 to compute edit distance matrix." - ) - if self._column.has_nulls(): - raise ValueError( - "Cannot compute edit distance between null strings. " - "Consider removing them using `dropna` or fill with `fillna`." - ) - return self._return_or_inplace( - libstrings.edit_distance_matrix(self._column) - ) - - def minhash( - self, seeds: ColumnLike | None = None, width: int = 4 - ) -> SeriesOrIndex: - """ - Compute the minhash of a strings column. - This uses the MurmurHash3_x86_32 algorithm for the hash function. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint32. - width : int - The width of the substring to hash. - Default is 4 characters. 
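edit_distance() above computes the classic Levenshtein distance on the GPU. A minimal CPU sketch of the same recurrence, for reference only:

    def levenshtein(a: str, b: str) -> int:
        # dynamic programming over a single rolling row
        prev = list(range(len(b) + 1))
        for i, ca in enumerate(a, 1):
            cur = [i]
            for j, cb in enumerate(b, 1):
                cur.append(min(prev[j] + 1,                # deletion
                               cur[j - 1] + 1,             # insertion
                               prev[j - 1] + (ca != cb)))  # substitution
            prev = cur
        return prev[-1]

    levenshtein("kitty", "kitten")  # 2, matching the docstring example above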
- - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> seeds = cudf.Series([0], dtype=np.uint32) - >>> str_series.str.minhash(seeds) - 0 [21141582] - 1 [962346254] - dtype: list - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - >>> str_series.str.minhash(seeds) - 0 [21141582, 403093213, 1258052021] - 1 [962346254, 677440381, 122618762] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint32, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint32: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.minhash(self._column, seeds_column, width) - ) - - def minhash64( - self, seeds: ColumnLike | None = None, width: int = 4 - ) -> SeriesOrIndex: - """ - Compute the minhash of a strings column. - This uses the MurmurHash3_x64_128 algorithm for the hash function. - This function generates 2 uint64 values but only the first - uint64 value is used. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint64. - width : int - The width of the substring to hash. - Default is 4 characters. - - Examples - -------- - >>> import cudf - >>> str_series = cudf.Series(['this is my', 'favorite book']) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - >>> str_series.str.minhash64(seeds) - 0 [3232308021562742685, 4445611509348165860, 586435843695903598] - 1 [23008204270530356, 1281229757012344693, 153762819128779913] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint64, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint64: - raise ValueError( - f"Expecting a Series with dtype uint64, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.minhash64(self._column, seeds_column, width) - ) - - def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: - """ - Compute the minhash of a list column of strings. - This uses the MurmurHash3_x86_32 algorithm for the hash function. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint32. - - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - >>> ls.str.word_minhash(seeds=seeds) - 0 [21141582, 1232889953, 1268336794] - 1 [962346254, 2321233602, 1354839212] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint32, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint32: - raise ValueError( - f"Expecting a Series with dtype uint32, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.word_minhash(self._column, seeds_column) - ) - - def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: - """ - Compute the minhash of a list column of strings. - This uses the MurmurHash3_x64_128 algorithm for the hash function. - This function generates 2 uint64 values but only the first - uint64 value is used. - - Parameters - ---------- - seeds : ColumnLike - The seeds used for the hash algorithm. - Must be of type uint64. 
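The minhash(), minhash64(), word_minhash() and word_minhash64() methods above follow the usual MinHash recipe: for each seed, hash every width-character substring (or every word, in the list variants) and keep the minimum hash. A rough pure-Python sketch of that semantics; the hash used here is illustrative, not the MurmurHash3 variants used by libcudf, so the values will not match:

    def minhash_sketch(s: str, seeds: list[int], width: int = 4) -> list[int]:
        # one minimum per seed over all width-character substrings
        subs = [s[i : i + width] for i in range(len(s) - width + 1)] or [s]
        return [min(hash((seed, sub)) & 0xFFFFFFFF for sub in subs) for seed in seeds]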
- - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]]) - >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - >>> ls.str.word_minhash64(seeds) - 0 [2603139454418834912, 8644371945174847701, 5541030711534384340] - 1 [5240044617220523711, 5847101123925041457, 153762819128779913] - dtype: list - """ - if seeds is None: - seeds_column = column.as_column(0, dtype=np.uint64, length=1) - else: - seeds_column = column.as_column(seeds) - if seeds_column.dtype != np.uint64: - raise ValueError( - f"Expecting a Series with dtype uint64, got {type(seeds)}" - ) - return self._return_or_inplace( - libstrings.word_minhash64(self._column, seeds_column) - ) - - def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: - """ - Compute the Jaccard index between this column and the given - input strings column. - - Parameters - ---------- - input : Series - The input strings column to compute the Jaccard index against. - Must have the same number of strings as this column. - width : int - The number of characters for the sliding window calculation. - - Examples - -------- - >>> import cudf - >>> str1 = cudf.Series(["the brown dog", "jumped about"]) - >>> str2 = cudf.Series(["the black cat", "jumped around"]) - >>> str1.str.jaccard_index(str2, 5) - 0 0.058824 - 1 0.307692 - dtype: float32 - """ - - return self._return_or_inplace( - libstrings.jaccard_index(self._column, input._column, width), - ) - - -def _massage_string_arg(value, name, allow_col=False): - if isinstance(value, cudf.Scalar): - return value - - if isinstance(value, str): - return cudf.Scalar(value, dtype="str") - - allowed_types = ["Scalar"] - - if allow_col: - if isinstance(value, list): - return column.as_column(value, dtype="str") - - if isinstance(value, Column) and is_string_dtype(value.dtype): - return value - - allowed_types.append("Column") - - raise ValueError( - f"Expected {_expected_types_format(allowed_types)} " - f"for {name} but got {type(value)}" - ) - - -def _expected_types_format(types): - if len(types) == 1: - return types[0] - - return ", ".join(types[:-1]) + ", or " + types[-1] - - -class StringColumn(column.ColumnBase): - """ - Implements operations for Columns of String type - - Parameters - ---------- - mask : Buffer - The validity mask - offset : int - Data offset - children : Tuple[Column] - Two non-null columns containing the string data and offsets - respectively - """ - - _start_offset: int | None - _end_offset: int | None - - _VALID_BINARY_OPERATIONS = { - "__eq__", - "__ne__", - "__lt__", - "__le__", - "__gt__", - "__ge__", - "__add__", - "__radd__", - # These operators aren't actually supported, they only exist to allow - # empty column binops with scalars of arbitrary other dtypes. See - # the _binaryop method for more information. - "__sub__", - "__mul__", - "__mod__", - "__pow__", - "__truediv__", - "__floordiv__", - } - - def __init__( - self, - data: Buffer | None = None, - mask: Buffer | None = None, - size: int | None = None, # TODO: make non-optional - offset: int = 0, - null_count: int | None = None, - children: tuple["column.ColumnBase", ...] 
= (), - ): - dtype = cudf.api.types.dtype("object") - - if size is None: - for child in children: - assert child.offset == 0 - - if len(children) == 0: - size = 0 - elif children[0].size == 0: - size = 0 - else: - # one less because the last element of offsets is the number of - # bytes in the data buffer - size = children[0].size - 1 - size = size - offset - - if len(children) == 0 and size != 0: - # all nulls-column: - offsets = column.as_column( - 0, length=size + 1, dtype=size_type_dtype - ) - - children = (offsets,) - - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - self._start_offset = None - self._end_offset = None - - def copy(self, deep: bool = True): - # Since string columns are immutable, both deep - # and shallow copies share the underlying device data and mask. - return super().copy(deep=False) - - @property - def start_offset(self) -> int: - if self._start_offset is None: - if ( - len(self.base_children) == 1 - and self.offset < self.base_children[0].size - ): - self._start_offset = int( - self.base_children[0].element_indexing(self.offset) - ) - else: - self._start_offset = 0 - - return self._start_offset - - @property - def end_offset(self) -> int: - if self._end_offset is None: - if ( - len(self.base_children) == 1 - and (self.offset + self.size) < self.base_children[0].size - ): - self._end_offset = int( - self.base_children[0].element_indexing( - self.offset + self.size - ) - ) - else: - self._end_offset = 0 - - return self._end_offset - - @cached_property - def memory_usage(self) -> int: - n = 0 - if self.data is not None: - n += self.data.size - if len(self.base_children) == 1: - child0_size = (self.size + 1) * self.base_children[ - 0 - ].dtype.itemsize - - n += child0_size - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) - return n - - @property - def base_size(self) -> int: - if len(self.base_children) == 0: - return 0 - else: - return self.base_children[0].size - 1 - - # override for string column - @property - def data(self): - if self.base_data is None: - return None - if self._data is None: - if ( - self.offset == 0 - and len(self.base_children) > 0 - and self.size == self.base_children[0].size - 1 - ): - self._data = self.base_data - else: - self._data = self.base_data[ - self.start_offset : self.end_offset - ] - return self._data - - def all(self, skipna: bool = True) -> bool: - if skipna and self.null_count == self.size: - return True - elif not skipna and self.has_nulls(): - raise TypeError("boolean value of NA is ambiguous") - raise NotImplementedError("`all` not implemented for `StringColumn`") - - def any(self, skipna: bool = True) -> bool: - if not skipna and self.has_nulls(): - raise TypeError("boolean value of NA is ambiguous") - elif skipna and self.null_count == self.size: - return False - - raise NotImplementedError("`any` not implemented for `StringColumn`") - - def data_array_view( - self, *, mode="write" - ) -> numba.cuda.devicearray.DeviceNDArray: - raise ValueError("Cannot get an array view of a StringColumn") - - @property - def __cuda_array_interface__(self): - raise NotImplementedError( - f"dtype {self.dtype} is not yet supported via " - "`__cuda_array_interface__`" - ) - - def to_arrow(self) -> pa.Array: - """Convert to PyArrow Array - - Examples - -------- - >>> import cudf - >>> col = cudf.core.column.as_column([1, 2, 3, 4]) - >>> col.to_arrow() - - [ - 1, - 2, - 3, - 4 - ] - """ - if self.null_count == 
len(self): - return pa.NullArray.from_buffers( - pa.null(), len(self), [pa.py_buffer(b"")] - ) - else: - return super().to_arrow() - - def sum( - self, - skipna: bool | None = None, - dtype: Dtype | None = None, - min_count: int = 0, - ): - result_col = self._process_for_reduction( - skipna=skipna, min_count=min_count - ) - if isinstance(result_col, type(self)): - return libstrings.join( - result_col, - sep=cudf.Scalar(""), - na_rep=cudf.Scalar(None, "str"), - ).element_indexing(0) - else: - return result_col - - def __contains__(self, item: ScalarLike) -> bool: - if is_scalar(item): - return True in libcudf.search.contains( - self, column.as_column([item], dtype=self.dtype) - ) - else: - return True in libcudf.search.contains( - self, column.as_column(item, dtype=self.dtype) - ) - - def as_numerical_column( - self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": - out_dtype = cudf.api.types.dtype(dtype) - string_col = self - if out_dtype.kind in {"i", "u"}: - if not libstrings.is_integer(string_col).all(): - raise ValueError( - "Could not convert strings to integer " - "type due to presence of non-integer values." - ) - elif out_dtype.kind == "f": - if not libstrings.is_float(string_col).all(): - raise ValueError( - "Could not convert strings to float " - "type due to presence of non-floating values." - ) - - result_col = _str_to_numeric_typecast_functions[out_dtype](string_col) - return result_col - - def strptime( - self, dtype: Dtype, format: str - ) -> cudf.core.column.DatetimeColumn | cudf.core.column.TimeDeltaColumn: - if dtype.kind not in "Mm": # type: ignore[union-attr] - raise ValueError( - f"dtype must be datetime or timedelta type, not {dtype}" - ) - elif self.null_count == len(self): - return column.column_empty(len(self), dtype=dtype, masked=True) # type: ignore[return-value] - elif (self == "None").any(): - raise ValueError( - "Cannot convert `None` value to datetime or timedelta." 
- ) - elif dtype.kind == "M": # type: ignore[union-attr] - if format.endswith("%z"): - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) - is_nat = self == "NaT" - without_nat = self.apply_boolean_mask(is_nat.unary_operator("not")) - all_same_length = ( - libstrings.count_characters(without_nat).distinct_count( - dropna=True - ) - == 1 - ) - if not all_same_length: - # Unfortunately disables OK cases like: - # ["2020-01-01", "2020-01-01 00:00:00"] - # But currently incorrect for cases like (drops 10): - # ["2020-01-01", "2020-01-01 10:00:00"] - raise NotImplementedError( - "Cannot parse date-like strings with different formats" - ) - valid_ts = str_cast.istimestamp(self, format) - valid = valid_ts | is_nat - if not valid.all(): - raise ValueError(f"Column contains invalid data for {format=}") - - casting_func = str_cast.timestamp2int - add_back_nat = is_nat.any() - elif dtype.kind == "m": # type: ignore[union-attr] - casting_func = str_cast.timedelta2int - add_back_nat = False - - result_col = casting_func(self, dtype, format) - - if add_back_nat: - result_col[is_nat] = None - - return result_col - - def as_datetime_column( - self, dtype: Dtype - ) -> cudf.core.column.DatetimeColumn: - not_null = self.apply_boolean_mask(self.notnull()) - if len(not_null) == 0: - # We should hit the self.null_count == len(self) condition - # so format doesn't matter - format = "" - else: - # infer on host from the first not na element - format = datetime.infer_format(not_null.element_indexing(0)) - return self.strptime(dtype, format) # type: ignore[return-value] - - def as_timedelta_column( - self, dtype: Dtype - ) -> cudf.core.column.TimeDeltaColumn: - return self.strptime(dtype, "%D days %H:%M:%S") # type: ignore[return-value] - - def as_decimal_column( - self, dtype: Dtype - ) -> "cudf.core.column.DecimalBaseColumn": - return libstrings.to_decimal(self, dtype) - - def as_string_column(self) -> StringColumn: - return self - - @property - def values_host(self) -> np.ndarray: - """ - Return a numpy representation of the StringColumn. - """ - return self.to_pandas().values - - @property - def values(self) -> cupy.ndarray: - """ - Return a CuPy representation of the StringColumn. 
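One practical consequence of the same-length check in strptime() above (the count_characters(...).distinct_count(...) == 1 test): string-to-datetime casts only succeed when every non-NaT string has the same length, as the in-code comment notes. A usage sketch, assuming this implementation:

    import cudf

    cudf.Series(["2020-01-01", "2020-01-02"]).astype("datetime64[ns]")  # one format: OK
    # Mixed lengths such as ["2020-01-01", "2020-01-01 10:00:00"] raise
    # NotImplementedError("Cannot parse date-like strings with different formats").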
- """ - raise TypeError("String Arrays is not yet implemented in cudf") - - def to_pandas( - self, - *, - nullable: bool = False, - arrow_type: bool = False, - ) -> pd.Index: - if nullable and not arrow_type: - pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow()) - return pd.Index(pandas_array, copy=False) - else: - return super().to_pandas(nullable=nullable, arrow_type=arrow_type) - - def can_cast_safely(self, to_dtype: Dtype) -> bool: - to_dtype = cudf.api.types.dtype(to_dtype) - - if self.dtype == to_dtype: - return True - elif ( - to_dtype.kind in {"i", "u"} - and not libstrings.is_integer(self).all() - ): - return False - elif to_dtype.kind == "f" and not libstrings.is_float(self).all(): - return False - else: - return True - - def find_and_replace( - self, - to_replace: ColumnLike, - replacement: ColumnLike, - all_nan: bool = False, - ) -> StringColumn: - """ - Return col with *to_replace* replaced with *value* - """ - - to_replace_col = column.as_column(to_replace) - replacement_col = column.as_column(replacement) - - if type(to_replace_col) != type(replacement_col): - raise TypeError( - f"to_replace and value should be of same types," - f"got to_replace dtype: {to_replace_col.dtype} and " - f"value dtype: {replacement_col.dtype}" - ) - - if ( - to_replace_col.dtype != self.dtype - and replacement_col.dtype != self.dtype - ): - return self.copy() - df = cudf.DataFrame._from_data( - {"old": to_replace_col, "new": replacement_col} - ) - df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) - if df._data["old"].null_count == 1: - res = self.fillna( - df._data["new"] - .apply_boolean_mask(df._data["old"].isnull()) - .element_indexing(0) - ) - df = df.dropna(subset=["old"]) - else: - res = self - return libcudf.replace.replace(res, df._data["old"], df._data["new"]) - - def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar: - if ( - isinstance(other, (column.ColumnBase, cudf.Scalar)) - and other.dtype == "object" - ): - return other - if is_scalar(other): - return cudf.Scalar(other) - return NotImplemented - - def _binaryop( - self, other: ColumnBinaryOperand, op: str - ) -> "column.ColumnBase": - reflect, op = self._check_reflected_op(op) - # Due to https://github.com/pandas-dev/pandas/issues/46332 we need to - # support binary operations between empty or all null string columns - # and columns of other dtypes, even if those operations would otherwise - # be invalid. For example, you cannot divide strings, but pandas allows - # division between an empty string column and a (nonempty) integer - # column. Ideally we would disable these operators entirely, but until - # the above issue is resolved we cannot avoid this problem. 
- if self.null_count == len(self): - if op in { - "__add__", - "__sub__", - "__mul__", - "__mod__", - "__pow__", - "__truediv__", - "__floordiv__", - }: - return self - elif op in {"__eq__", "__lt__", "__le__", "__gt__", "__ge__"}: - return self.notnull() - elif op == "__ne__": - return self.isnull() - - other = self._wrap_binop_normalization(other) - if other is NotImplemented: - return NotImplemented - - if isinstance(other, (StringColumn, str, cudf.Scalar)): - if isinstance(other, cudf.Scalar) and other.dtype != "O": - if op in { - "__eq__", - "__ne__", - }: - return column.as_column( - op == "__ne__", length=len(self), dtype="bool" - ).set_mask(self.mask) - else: - return NotImplemented - - if op == "__add__": - if isinstance(other, cudf.Scalar): - other = cast( - StringColumn, - column.as_column( - other, length=len(self), dtype="object" - ), - ) - - # Explicit types are necessary because mypy infers ColumnBase - # rather than StringColumn and sometimes forgets Scalar. - lhs: cudf.Scalar | StringColumn - rhs: cudf.Scalar | StringColumn - lhs, rhs = (other, self) if reflect else (self, other) - - return cast( - "column.ColumnBase", - libstrings.concatenate( - [lhs, rhs], - sep=cudf.Scalar(""), - na_rep=cudf.Scalar(None, "str"), - ), - ) - elif op in { - "__eq__", - "__ne__", - "__gt__", - "__lt__", - "__ge__", - "__le__", - "NULL_EQUALS", - "NULL_NOT_EQUALS", - }: - lhs, rhs = (other, self) if reflect else (self, other) - return libcudf.binaryop.binaryop( - lhs=lhs, rhs=rhs, op=op, dtype="bool" - ) - return NotImplemented - - @copy_docstring(column.ColumnBase.view) - def view(self, dtype) -> "cudf.core.column.ColumnBase": - if self.null_count > 0: - raise ValueError( - "Can not produce a view of a string column with nulls" - ) - dtype = cudf.api.types.dtype(dtype) - str_byte_offset = self.base_children[0].element_indexing(self.offset) - str_end_byte_offset = self.base_children[0].element_indexing( - self.offset + self.size - ) - - n_bytes_to_view = str_end_byte_offset - str_byte_offset - - to_view = cudf.core.column.NumericalColumn( - self.base_data, # type: ignore[arg-type] - dtype=np.dtype(np.int8), - offset=str_byte_offset, - size=n_bytes_to_view, - ) - - return to_view.view(dtype) - - -def _get_cols_list(parent_obj, others): - parent_index = ( - parent_obj.index if isinstance(parent_obj, cudf.Series) else parent_obj - ) - - if ( - can_convert_to_column(others) - and len(others) > 0 - and ( - can_convert_to_column( - others.iloc[0] - if isinstance(others, cudf.Series) - else others[0] - ) - ) - ): - """ - If others is a list-like object (in our case lists & tuples) - just another Series/Index, great go ahead with concatenation. 
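The "__add__" branch of _binaryop above concatenates element-wise through libstrings.concatenate with an empty separator, matching pandas string addition. A small usage illustration:

    import cudf

    cudf.Series(["ab", "cd"]) + cudf.Series(["X", "Y"])
    # 0    abX
    # 1    cdY
    # dtype: object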
- """ - cols_list = [ - column.as_column(frame.reindex(parent_index), dtype="str") - if ( - parent_index is not None - and isinstance(frame, cudf.Series) - and not frame.index.equals(parent_index) - ) - else column.as_column(frame, dtype="str") - for frame in others - ] - - return cols_list - elif others is not None and not isinstance(others, StringMethods): - if ( - parent_index is not None - and isinstance(others, cudf.Series) - and not others.index.equals(parent_index) - ): - others = others.reindex(parent_index) - - return [column.as_column(others, dtype="str")] - else: - raise TypeError( - "others must be Series, Index, DataFrame, np.ndarrary " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py deleted file mode 100644 index 2fda3b2c434..00000000000 --- a/python/cudf/cudf/core/column/struct.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -from functools import cached_property -from typing import TYPE_CHECKING - -import pandas as pd -import pyarrow as pa - -import cudf -from cudf.core.column import ColumnBase -from cudf.core.column.methods import ColumnMethods -from cudf.core.dtypes import StructDtype -from cudf.core.missing import NA - -if TYPE_CHECKING: - from typing_extensions import Self - - from cudf._typing import Dtype - from cudf.core.buffer import Buffer - - -class StructColumn(ColumnBase): - """ - Column that stores fields of values. - - Every column has n children, where n is - the number of fields in the Struct Dtype. - """ - - def __init__( - self, - data: None, - size: int, - dtype: StructDtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple[ColumnBase, ...] = (), - ): - if data is not None: - raise ValueError("data must be None.") - dtype = self._validate_dtype_instance(dtype) - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - @staticmethod - def _validate_dtype_instance(dtype: StructDtype) -> StructDtype: - # IntervalDtype is a subclass of StructDtype, so compare types exactly - if type(dtype) is not StructDtype: - raise ValueError( - f"{type(dtype).__name__} must be a StructDtype exactly." 
- ) - return dtype - - @property - def base_size(self): - if self.base_children: - return len(self.base_children[0]) - else: - return self.size + self.offset - - def to_arrow(self) -> pa.Array: - children = [ - pa.nulls(len(child)) - if len(child) == child.null_count - else child.to_arrow() - for child in self.children - ] - - pa_type = pa.struct( - { - field: child.type - for field, child in zip(self.dtype.fields, children) - } - ) - - if self.mask is not None: - buffers = (pa.py_buffer(self.mask.memoryview()),) - else: - buffers = (None,) - - return pa.StructArray.from_buffers( - pa_type, len(self), buffers, children=children - ) - - def to_pandas( - self, - *, - nullable: bool = False, - arrow_type: bool = False, - ) -> pd.Index: - # We cannot go via Arrow's `to_pandas` because of the following issue: - # https://issues.apache.org/jira/browse/ARROW-12680 - if arrow_type or nullable: - return super().to_pandas(nullable=nullable, arrow_type=arrow_type) - else: - return pd.Index(self.to_arrow().tolist(), dtype="object") - - @cached_property - def memory_usage(self) -> int: - n = 0 - if self.nullable: - n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size) - - for child in self.children: - n += child.memory_usage - - return n - - def element_indexing(self, index: int): - result = super().element_indexing(index) - return { - field: value - for field, value in zip(self.dtype.fields, result.values()) - } - - def __setitem__(self, key, value): - if isinstance(value, dict): - # filling in fields not in dict - for field in self.dtype.fields: - value[field] = value.get(field, NA) - - value = cudf.Scalar(value, self.dtype) - super().__setitem__(key, value) - - def copy(self, deep: bool = True) -> Self: - # Since struct columns are immutable, both deep and - # shallow copies share the underlying device data and mask. - result = super().copy(deep=False) - if deep: - result = result._rename_fields(self.dtype.fields.keys()) - return result - - def _rename_fields(self, names) -> Self: - """ - Return a StructColumn with the same field values as this StructColumn, - but with the field names equal to `names`. 
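StructColumn.__setitem__ above back-fills any field missing from an assigned dict with NA before building the device scalar. Intended effect, as a hypothetical usage sketch:

    import cudf

    s = cudf.Series([{"a": 1, "b": 2}, {"a": 3, "b": 4}])
    s[0] = {"a": 10}  # "b" is filled in as cudf.NA, i.e. {"a": 10, "b": <NA>}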
- """ - dtype = StructDtype( - {name: col.dtype for name, col in zip(names, self.children)} - ) - return StructColumn( # type: ignore[return-value] - data=None, - size=self.size, - dtype=dtype, - mask=self.base_mask, - offset=self.offset, - null_count=self.null_count, - children=self.base_children, - ) - - @property - def __cuda_array_interface__(self): - raise NotImplementedError( - "Structs are not yet supported via `__cuda_array_interface__`" - ) - - def _with_type_metadata(self: StructColumn, dtype: Dtype) -> StructColumn: - from cudf.core.column import IntervalColumn - from cudf.core.dtypes import IntervalDtype - - # Check IntervalDtype first because it's a subclass of StructDtype - if isinstance(dtype, IntervalDtype): - return IntervalColumn.from_struct_column(self, closed=dtype.closed) - elif isinstance(dtype, StructDtype): - return StructColumn( - data=None, - dtype=dtype, - children=tuple( - self.base_children[i]._with_type_metadata(dtype.fields[f]) - for i, f in enumerate(dtype.fields.keys()) - ), - mask=self.base_mask, - size=self.size, - offset=self.offset, - null_count=self.null_count, - ) - - return self - - -class StructMethods(ColumnMethods): - """ - Struct methods for Series - """ - - _column: StructColumn - - def __init__(self, parent=None): - if not isinstance(parent.dtype, StructDtype): - raise AttributeError( - "Can only use .struct accessor with a 'struct' dtype" - ) - super().__init__(parent=parent) - - def field(self, key): - """ - Extract children of the specified struct column - in the Series - - Parameters - ---------- - key: int or str - index/position or field name of the respective - struct column - - Returns - ------- - Series - - Examples - -------- - >>> s = cudf.Series([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]) - >>> s.struct.field(0) - 0 1 - 1 3 - dtype: int64 - >>> s.struct.field('a') - 0 1 - 1 3 - dtype: int64 - """ - fields = list(self._column.dtype.fields.keys()) - if key in fields: - pos = fields.index(key) - return self._return_or_inplace(self._column.children[pos]) - else: - if isinstance(key, int): - try: - return self._return_or_inplace(self._column.children[key]) - except IndexError: - raise IndexError(f"Index {key} out of range") - else: - raise KeyError( - f"Field '{key}' is not found in the set of existing keys." - ) - - def explode(self): - """ - Return a DataFrame whose columns are the fields of this struct Series. - - Notes - ----- - Note that a copy of the columns is made. - - Examples - -------- - >>> s - 0 {'a': 1, 'b': 'x'} - 1 {'a': 2, 'b': 'y'} - 2 {'a': 3, 'b': 'z'} - 3 {'a': 4, 'b': 'a'} - dtype: struct - - >>> s.struct.explode() - a b - 0 1 x - 1 2 y - 2 3 z - 3 4 a - """ - return cudf.DataFrame._from_data( - cudf.core.column_accessor.ColumnAccessor( - { - name: col.copy(deep=True) - for name, col in zip( - self._column.dtype.fields, self._column.children - ) - } - ) - ) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py deleted file mode 100644 index 6b6f3e517a8..00000000000 --- a/python/cudf/cudf/core/column/timedelta.py +++ /dev/null @@ -1,585 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from __future__ import annotations - -import datetime -import functools -from typing import TYPE_CHECKING, Sequence, cast - -import numpy as np -import pandas as pd -import pyarrow as pa - -import cudf -from cudf import _lib as libcudf -from cudf.api.types import is_scalar -from cudf.core.buffer import Buffer, acquire_spill_lock -from cudf.core.column import ColumnBase, column, string -from cudf.utils.dtypes import np_to_pa_dtype -from cudf.utils.utils import _all_bools_with_nulls - -if TYPE_CHECKING: - from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype - -_unit_to_nanoseconds_conversion = { - "ns": 1, - "us": 1_000, - "ms": 1_000_000, - "s": 1_000_000_000, - "m": 60_000_000_000, - "h": 3_600_000_000_000, - "D": 86_400_000_000_000, -} - - -class TimeDeltaColumn(ColumnBase): - """ - Parameters - ---------- - data : Buffer - The Timedelta values - dtype : np.dtype - The data type - size : int - Size of memory allocation. - mask : Buffer; optional - The validity mask - offset : int - Data offset - null_count : int, optional - The number of null values. - If None, it is calculated automatically. - """ - - _VALID_BINARY_OPERATIONS = { - "__eq__", - "__ne__", - "__lt__", - "__le__", - "__gt__", - "__ge__", - "__add__", - "__sub__", - "__mul__", - "__mod__", - "__truediv__", - "__floordiv__", - "__radd__", - "__rsub__", - "__rmul__", - "__rmod__", - "__rtruediv__", - "__rfloordiv__", - } - - def __init__( - self, - data: Buffer, - size: int | None, - dtype: np.dtype, - mask: Buffer | None = None, - offset: int = 0, - null_count: int | None = None, - children: tuple = (), - ): - if not isinstance(data, Buffer): - raise ValueError("data must be a Buffer.") - if not (isinstance(dtype, np.dtype) and dtype.kind == "m"): - raise ValueError("dtype must be a timedelta numpy dtype.") - - if data.size % dtype.itemsize: - raise ValueError("Buffer size must be divisible by element size") - if size is None: - size = data.size // dtype.itemsize - size = size - offset - if len(children) != 0: - raise ValueError("TimedeltaColumn must have no children.") - super().__init__( - data=data, - size=size, - dtype=dtype, - mask=mask, - offset=offset, - null_count=null_count, - children=children, - ) - - def __contains__(self, item: DatetimeLikeScalar) -> bool: - try: - item = np.timedelta64(item, self.time_unit) - except ValueError: - # If item cannot be converted to duration type - # np.timedelta64 raises ValueError, hence `item` - # cannot exist in `self`. - return False - return item.view("int64") in cast( - "cudf.core.column.NumericalColumn", self.astype("int64") - ) - - @property - def values(self): - """ - Return a CuPy representation of the TimeDeltaColumn. 
- """ - raise NotImplementedError( - "TimeDelta Arrays is not yet implemented in cudf" - ) - - def element_indexing(self, index: int): - result = super().element_indexing(index) - if cudf.get_option("mode.pandas_compatible"): - return pd.Timedelta(result) - return result - - @acquire_spill_lock() - def to_arrow(self) -> pa.Array: - mask = None - if self.nullable: - mask = pa.py_buffer( - self.mask_array_view(mode="read").copy_to_host() - ) - data = pa.py_buffer( - self.astype("int64").data_array_view(mode="read").copy_to_host() - ) - pa_dtype = np_to_pa_dtype(self.dtype) - return pa.Array.from_buffers( - type=pa_dtype, - length=len(self), - buffers=[mask, data], - null_count=self.null_count, - ) - - def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: - reflect, op = self._check_reflected_op(op) - other = self._wrap_binop_normalization(other) - if other is NotImplemented: - return NotImplemented - - this: ColumnBinaryOperand = self - out_dtype = None - - if other.dtype.kind == "m": - # TODO: pandas will allow these operators to work but return false - # when comparing to non-timedelta dtypes. We should do the same. - if op in { - "__eq__", - "__ne__", - "__lt__", - "__gt__", - "__le__", - "__ge__", - "NULL_EQUALS", - "NULL_NOT_EQUALS", - }: - out_dtype = cudf.dtype(np.bool_) - elif op == "__mod__": - out_dtype = determine_out_dtype(self.dtype, other.dtype) - elif op in {"__truediv__", "__floordiv__"}: - common_dtype = determine_out_dtype(self.dtype, other.dtype) - out_dtype = np.float64 if op == "__truediv__" else np.int64 - this = self.astype(common_dtype).astype(out_dtype) - if isinstance(other, cudf.Scalar): - if other.is_valid(): - other = other.value.astype(common_dtype).astype( - out_dtype - ) - else: - other = cudf.Scalar(None, out_dtype) - else: - other = other.astype(common_dtype).astype(out_dtype) - elif op in {"__add__", "__sub__"}: - out_dtype = determine_out_dtype(self.dtype, other.dtype) - elif other.dtype.kind in {"f", "i", "u"}: - if op in {"__mul__", "__mod__", "__truediv__", "__floordiv__"}: - out_dtype = self.dtype - elif op in {"__eq__", "__ne__", "NULL_EQUALS", "NULL_NOT_EQUALS"}: - if isinstance(other, ColumnBase) and not isinstance( - other, TimeDeltaColumn - ): - fill_value = op in ("__ne__", "NULL_NOT_EQUALS") - result = _all_bools_with_nulls( - self, - other, - bool_fill_value=fill_value, - ) - if cudf.get_option("mode.pandas_compatible"): - result = result.fillna(fill_value) - return result - - if out_dtype is None: - return NotImplemented - - lhs, rhs = (other, this) if reflect else (this, other) - - result = libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) - if cudf.get_option( - "mode.pandas_compatible" - ) and out_dtype == cudf.dtype(np.bool_): - result = result.fillna(op == "__ne__") - return result - - def normalize_binop_value(self, other) -> ColumnBinaryOperand: - if isinstance(other, (ColumnBase, cudf.Scalar)): - return other - - tz_error_msg = ( - "Cannot perform binary operation on timezone-naive columns" - " and timezone-aware timestamps." 
- ) - if isinstance(other, datetime.datetime): - if other.tzinfo is not None: - raise NotImplementedError(tz_error_msg) - other = pd.Timestamp(other).to_datetime64() - elif isinstance(other, datetime.timedelta): - other = pd.Timedelta(other).to_timedelta64() - - if isinstance(other, np.timedelta64): - other_time_unit = cudf.utils.dtypes.get_time_unit(other) - if np.isnat(other): - return cudf.Scalar( - None, - dtype="timedelta64[ns]" - if other_time_unit not in {"s", "ms", "ns", "us"} - else self.dtype, - ) - - if other_time_unit not in {"s", "ms", "ns", "us"}: - common_dtype = "timedelta64[s]" - else: - common_dtype = determine_out_dtype(self.dtype, other.dtype) - return cudf.Scalar(other.astype(common_dtype)) - elif is_scalar(other): - return cudf.Scalar(other) - return NotImplemented - - @functools.cached_property - def time_unit(self) -> str: - return np.datetime_data(self.dtype)[0] - - def total_seconds(self) -> ColumnBase: - raise NotImplementedError("total_seconds is currently not implemented") - - def ceil(self, freq: str) -> ColumnBase: - raise NotImplementedError("ceil is currently not implemented") - - def floor(self, freq: str) -> ColumnBase: - raise NotImplementedError("floor is currently not implemented") - - def round(self, freq: str) -> ColumnBase: - raise NotImplementedError("round is currently not implemented") - - def as_numerical_column( - self, dtype: Dtype - ) -> cudf.core.column.NumericalColumn: - col = cudf.core.column.NumericalColumn( - data=self.base_data, # type: ignore[arg-type] - dtype=np.dtype(np.int64), - mask=self.base_mask, - offset=self.offset, - size=self.size, - ) - return cast("cudf.core.column.NumericalColumn", col.astype(dtype)) - - def as_datetime_column(self, dtype: Dtype) -> None: # type: ignore[override] - raise TypeError( - f"cannot astype a timedelta from {self.dtype} to {dtype}" - ) - - def strftime(self, format: str) -> cudf.core.column.StringColumn: - if len(self) == 0: - return cast( - cudf.core.column.StringColumn, - column.column_empty(0, dtype="object", masked=False), - ) - else: - return string._timedelta_to_str_typecast_functions[self.dtype]( - self, format=format - ) - - def as_string_column(self) -> cudf.core.column.StringColumn: - return self.strftime("%D days %H:%M:%S") - - def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: - if dtype == self.dtype: - return self - return libcudf.unary.cast(self, dtype=dtype) - - def mean(self, skipna=None) -> pd.Timedelta: - return pd.Timedelta( - cast( - "cudf.core.column.NumericalColumn", self.astype("int64") - ).mean(skipna=skipna), - unit=self.time_unit, - ).as_unit(self.time_unit) - - def median(self, skipna: bool | None = None) -> pd.Timedelta: - return pd.Timedelta( - cast( - "cudf.core.column.NumericalColumn", self.astype("int64") - ).median(skipna=skipna), - unit=self.time_unit, - ).as_unit(self.time_unit) - - def isin(self, values: Sequence) -> ColumnBase: - return cudf.core.tools.datetimes._isin_datetimelike(self, values) - - def quantile( - self, - q: np.ndarray, - interpolation: str, - exact: bool, - return_scalar: bool, - ) -> ColumnBase: - result = self.astype("int64").quantile( - q=q, - interpolation=interpolation, - exact=exact, - return_scalar=return_scalar, - ) - if return_scalar: - return pd.Timedelta(result, unit=self.time_unit).as_unit( - self.time_unit - ) - return result.astype(self.dtype) - - def sum( - self, - skipna: bool | None = None, - min_count: int = 0, - dtype: Dtype | None = None, - ) -> pd.Timedelta: - return pd.Timedelta( - # Since sum isn't 
overridden in Numerical[Base]Column, mypy only - # sees the signature from Reducible (which doesn't have the extra - # parameters from ColumnBase._reduce) so we have to ignore this. - self.astype("int64").sum( # type: ignore - skipna=skipna, min_count=min_count, dtype=dtype - ), - unit=self.time_unit, - ).as_unit(self.time_unit) - - def std( - self, - skipna: bool | None = None, - min_count: int = 0, - ddof: int = 1, - ) -> pd.Timedelta: - return pd.Timedelta( - cast("cudf.core.column.NumericalColumn", self.astype("int64")).std( - skipna=skipna, min_count=min_count, ddof=ddof - ), - unit=self.time_unit, - ).as_unit(self.time_unit) - - def cov(self, other: TimeDeltaColumn) -> float: - if not isinstance(other, TimeDeltaColumn): - raise TypeError( - f"cannot perform cov with types {self.dtype}, {other.dtype}" - ) - return cast( - "cudf.core.column.NumericalColumn", self.astype("int64") - ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) - - def corr(self, other: TimeDeltaColumn) -> float: - if not isinstance(other, TimeDeltaColumn): - raise TypeError( - f"cannot perform corr with types {self.dtype}, {other.dtype}" - ) - return cast( - "cudf.core.column.NumericalColumn", self.astype("int64") - ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64"))) - - def components(self) -> dict[str, ColumnBase]: - """ - Return a Dataframe of the components of the Timedeltas. - - Returns - ------- - DataFrame - - Examples - -------- - >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='s')) - >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, - ... 3244334234], dtype='timedelta64[ms]') - >>> s - 0 141 days 13:35:12.123 - 1 14 days 06:00:31.231 - 2 13000 days 10:12:48.712 - 3 0 days 00:35:35.656 - 4 37 days 13:12:14.234 - dtype: timedelta64[ms] - >>> s.dt.components - days hours minutes seconds milliseconds microseconds nanoseconds - 0 141 13 35 12 123 0 0 - 1 14 6 0 31 231 0 0 - 2 13000 10 12 48 712 0 0 - 3 0 0 35 35 656 0 0 - 4 37 13 12 14 234 0 0 - """ # noqa: E501 - - date_meta = { - "seconds": ["m", "s"], - "milliseconds": ["s", "ms"], - "microseconds": ["ms", "us"], - "nanoseconds": ["us", "ns"], - } - data = { - "days": self - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ), - "hours": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ) - ) - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ).astype(self.dtype) - ), - "minutes": ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["h"], "ns" - ).astype(self.dtype) - ) - ) - // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["m"], "ns" - ).astype(self.dtype) - ), - } - keys_list = iter(date_meta.keys()) - for name in keys_list: - value = date_meta[name] - data[name] = ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion[value[0]], "ns" - ).astype(self.dtype) - ) - ) // cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion[value[1]], "ns" - ).astype(self.dtype) - ) - if self.time_unit == value[1]: - break - - for name in keys_list: - res_col = column.as_column(0, length=len(self), dtype="int64") - if self.nullable: - res_col = res_col.set_mask(self.mask) - data[name] = res_col - return data - - @property - def days(self) -> "cudf.core.column.NumericalColumn": - """ - Number of days for each element. 
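components() above peels each unit off with a floor-division by the larger unit and a modulo by the next one. A quick CPU check of that decomposition using the first value from its docstring example (12231312123 with millisecond resolution):

    ms = 12231312123                       # timedelta64[ms] value from the example
    days, rem = divmod(ms, 86_400_000)     # 141 days
    hours, rem = divmod(rem, 3_600_000)    # 13 hours
    minutes, rem = divmod(rem, 60_000)     # 35 minutes
    seconds, millis = divmod(rem, 1_000)   # 12 s, 123 ms -> "141 days 13:35:12.123"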
- - Returns - ------- - NumericalColumn - """ - return self // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["D"], "ns").astype( - self.dtype - ) - ) - - @property - def seconds(self) -> "cudf.core.column.NumericalColumn": - """ - Number of seconds (>= 0 and less than 1 day). - - Returns - ------- - NumericalColumn - """ - # This property must return the number of seconds (>= 0 and - # less than 1 day) for each element, hence first performing - # mod operation to remove the number of days and then performing - # division operation to extract the number of seconds. - - return ( - self - % cudf.Scalar( - np.timedelta64( - _unit_to_nanoseconds_conversion["D"], "ns" - ).astype(self.dtype) - ) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["s"], "ns") - ) - - @property - def microseconds(self) -> "cudf.core.column.NumericalColumn": - """ - Number of microseconds (>= 0 and less than 1 second). - - Returns - ------- - NumericalColumn - """ - # This property must return the number of microseconds (>= 0 and - # less than 1 second) for each element, hence first performing - # mod operation to remove the number of seconds and then performing - # division operation to extract the number of microseconds. - - return ( - self - % np.timedelta64( - _unit_to_nanoseconds_conversion["s"], "ns" - ).astype(self.dtype) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ) - - @property - def nanoseconds(self) -> "cudf.core.column.NumericalColumn": - """ - Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. - - Returns - ------- - NumericalColumn - """ - # This property must return the number of nanoseconds (>= 0 and - # less than 1 microsecond) for each element, hence first performing - # mod operation to remove the number of microseconds and then - # performing division operation to extract the number - # of nanoseconds. - - if self.time_unit != "ns": - res_col = column.as_column(0, length=len(self), dtype="int64") - if self.nullable: - res_col = res_col.set_mask(self.mask) - return cast("cudf.core.column.NumericalColumn", res_col) - return ( - self - % cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["us"], "ns") - ) - ) // cudf.Scalar( - np.timedelta64(_unit_to_nanoseconds_conversion["ns"], "ns") - ) - - -def determine_out_dtype(lhs_dtype: Dtype, rhs_dtype: Dtype) -> Dtype: - if np.can_cast(np.dtype(lhs_dtype), np.dtype(rhs_dtype)): - return rhs_dtype - elif np.can_cast(np.dtype(rhs_dtype), np.dtype(lhs_dtype)): - return lhs_dtype - else: - raise TypeError(f"Cannot type-cast {lhs_dtype} and {rhs_dtype}") diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py deleted file mode 100644 index bc093fdaa9a..00000000000 --- a/python/cudf/cudf/core/column_accessor.py +++ /dev/null @@ -1,794 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import itertools -import sys -from collections import abc -from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Mapping, cast - -import numpy as np -import pandas as pd -from pandas.api.types import is_bool - -import cudf -from cudf.core import column - -if TYPE_CHECKING: - from typing_extensions import Self - - from cudf._typing import Dtype - from cudf.core.column import ColumnBase - - -class _NestedGetItemDict(dict): - """A dictionary whose __getitem__ method accesses nested dicts. 
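determine_out_dtype() above relies on np.can_cast to keep whichever operand's dtype the other can be safely cast to, which for timedeltas means the finer resolution wins. For instance:

    import numpy as np

    np.can_cast(np.dtype("timedelta64[s]"), np.dtype("timedelta64[ms]"))  # True  -> result is ms
    np.can_cast(np.dtype("timedelta64[ms]"), np.dtype("timedelta64[s]"))  # False -> would lose resolution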
- - This class directly subclasses dict for performance, so there are a number - of gotchas: 1) the only safe accessor for nested elements is - `__getitem__` (all other accessors will fail to perform nested lookups), 2) - nested mappings will not exhibit the same behavior (they will be raw - dictionaries unless explicitly created to be of this class), and 3) to - construct this class you _must_ use `from_zip` to get appropriate treatment - of tuple keys. - """ - - @classmethod - def from_zip(cls, data: abc.Iterator): - """Create from zip, specialized factory for nesting.""" - obj = cls() - for key, value in data: - d = obj - for k in key[:-1]: - d = d.setdefault(k, {}) - d[key[-1]] = value - return obj - - def __getitem__(self, key): - """Recursively apply dict.__getitem__ for nested elements.""" - # As described in the pandas docs - # https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#advanced-indexing-with-hierarchical-index # noqa: E501 - # accessing nested elements of a multiindex must be done using a tuple. - # Lists and other sequences are treated as accessing multiple elements - # at the top level of the index. - if isinstance(key, tuple): - return reduce(dict.__getitem__, key, self) - return super().__getitem__(key) - - -def _to_flat_dict_inner(d: dict, parents: tuple = ()): - for k, v in d.items(): - if not isinstance(v, d.__class__): - if parents: - k = parents + (k,) - yield (k, v) - else: - yield from _to_flat_dict_inner(d=v, parents=parents + (k,)) - - -class ColumnAccessor(abc.MutableMapping): - """ - Parameters - ---------- - data : mapping - Mapping of keys to column values. - multiindex : bool, optional - Whether tuple keys represent a hierarchical - index with multiple "levels" (default=False). - level_names : tuple, optional - Tuple containing names for each of the levels. - For a non-hierarchical index, a tuple of size 1 - may be passe. - rangeindex : bool, optional - Whether the keys should be returned as a RangeIndex - in `to_pandas_index` (default=False). - label_dtype : Dtype, optional - What dtype should be returned in `to_pandas_index` - (default=None). - verify : bool, optional - For non ColumnAccessor inputs, whether to verify - column length and data.values() are all Columns - """ - - _data: dict[abc.Hashable, ColumnBase] - _level_names: tuple[abc.Hashable, ...] - - def __init__( - self, - data: abc.MutableMapping[abc.Hashable, ColumnBase] | Self, - multiindex: bool = False, - level_names=None, - rangeindex: bool = False, - label_dtype: Dtype | None = None, - verify: bool = True, - ) -> None: - if isinstance(data, ColumnAccessor): - self._data = data._data - self._level_names = data.level_names - self.multiindex: bool = data.multiindex - self.rangeindex: bool = data.rangeindex - self.label_dtype: Dtype | None = data.label_dtype - elif isinstance(data, abc.MutableMapping): - # This code path is performance-critical for copies and should be - # modified with care. 
- if data and verify: - # Faster than next(iter(data.values())) - column_length = len(data[next(iter(data))]) - # TODO: we should validate the keys of `data` - for col in data.values(): - if not isinstance(col, column.ColumnBase): - raise ValueError( - f"All data.values() must be Column, not {type(col).__name__}" - ) - if len(col) != column_length: - raise ValueError("All columns must be of equal length") - - if not isinstance(data, dict): - data = dict(data) - self._data = data - - if rangeindex and multiindex: - raise ValueError( - f"{rangeindex=} and {multiindex=} cannot both be True." - ) - self.rangeindex = rangeindex - self.multiindex = multiindex - self.label_dtype = label_dtype - self._level_names = level_names - else: - raise ValueError( - f"data must be a ColumnAccessor or MutableMapping, not {type(data).__name__}" - ) - - def __iter__(self) -> abc.Iterator: - return iter(self._data) - - def __getitem__(self, key: abc.Hashable) -> ColumnBase: - return self._data[key] - - def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None: - self.set_by_label(key, value) - - def __delitem__(self, key: abc.Hashable) -> None: - old_ncols = len(self) - del self._data[key] - new_ncols = len(self) - self._clear_cache(old_ncols, new_ncols) - - def __len__(self) -> int: - return len(self._data) - - def __repr__(self) -> str: - type_info = ( - f"{self.__class__.__name__}(" - f"multiindex={self.multiindex}, " - f"level_names={self.level_names}, " - f"rangeindex={self.rangeindex}, " - f"label_dtype={self.label_dtype})" - ) - column_info = "\n".join( - [f"{name}: {col.dtype}" for name, col in self.items()] - ) - return f"{type_info}\n{column_info}" - - def _from_columns_like_self( - self, columns: abc.Iterable[ColumnBase], verify: bool = True - ) -> Self: - """ - Return a new ColumnAccessor with columns and the properties of self. - - Parameters - ---------- - columns : iterable of Columns - New columns for the ColumnAccessor. - verify : bool, optional - Whether to verify column length and type. - """ - if sys.version_info.major >= 3 and sys.version_info.minor >= 10: - data = zip(self.names, columns, strict=True) # type: ignore[call-overload] - else: - columns = list(columns) - if len(columns) != len(self.names): - raise ValueError( - f"The number of columns ({len(columns)}) must match " - f"the number of existing column labels ({len(self.names)})." - ) - data = zip(self.names, columns) - return type(self)( - data=dict(data), - multiindex=self.multiindex, - level_names=self.level_names, - rangeindex=self.rangeindex, - label_dtype=self.label_dtype, - verify=verify, - ) - - @property - def level_names(self) -> tuple[abc.Hashable, ...]: - if self._level_names is None or len(self._level_names) == 0: - return tuple((None,) * max(1, self.nlevels)) - else: - return self._level_names - - @property - def nlevels(self) -> int: - if len(self) == 0: - return 0 - if not self.multiindex: - return 1 - else: - return len(next(iter(self.keys()))) - - @property - def name(self) -> abc.Hashable: - return self.level_names[-1] - - @cached_property - def nrows(self) -> int: - if len(self) == 0: - return 0 - else: - return len(next(iter(self.values()))) - - @cached_property - def names(self) -> tuple[abc.Hashable, ...]: - return tuple(self.keys()) - - @cached_property - def columns(self) -> tuple[ColumnBase, ...]: - return tuple(self.values()) - - @cached_property - def _grouped_data(self) -> abc.MutableMapping: - """ - If self.multiindex is True, - return the underlying mapping as a nested mapping. 
- """ - if self.multiindex: - return _NestedGetItemDict.from_zip(zip(self.names, self.columns)) - else: - return self._data - - def _clear_cache(self, old_ncols: int, new_ncols: int) -> None: - """ - Clear cached attributes. - - Parameters - ---------- - old_ncols: int - len(self) before self._data was modified - new_ncols: int - len(self) after self._data was modified - """ - cached_properties = ("columns", "names", "_grouped_data") - for attr in cached_properties: - try: - self.__delattr__(attr) - except AttributeError: - pass - - # nrows should only be cleared if empty before/after the op. - if (old_ncols == 0) ^ (new_ncols == 0): - try: - del self.nrows - except AttributeError: - pass - - def to_pandas_index(self) -> pd.Index: - """Convert the keys of the ColumnAccessor to a Pandas Index object.""" - if self.multiindex and len(self.level_names) > 0: - result = pd.MultiIndex.from_tuples( - self.names, - names=self.level_names, - ) - else: - # Determine if we can return a RangeIndex - if self.rangeindex: - if not self.names: - return pd.RangeIndex( - start=0, stop=0, step=1, name=self.name - ) - elif cudf.api.types.infer_dtype(self.names) == "integer": - if len(self.names) == 1: - start = cast(int, self.names[0]) - return pd.RangeIndex( - start=start, stop=start + 1, step=1, name=self.name - ) - uniques = np.unique(np.diff(np.array(self.names))) - if len(uniques) == 1 and uniques[0] != 0: - diff = uniques[0] - new_range = range( - cast(int, self.names[0]), - cast(int, self.names[-1]) + diff, - diff, - ) - return pd.RangeIndex(new_range, name=self.name) - result = pd.Index( - self.names, - name=self.name, - tupleize_cols=False, - dtype=self.label_dtype, - ) - return result - - def insert( - self, name: abc.Hashable, value: ColumnBase, loc: int = -1 - ) -> None: - """ - Insert column into the ColumnAccessor at the specified location. - - Parameters - ---------- - name : Name corresponding to the new column - value : ColumnBase - loc : int, optional - The location to insert the new value at. - Must be (0 <= loc <= ncols). By default, the column is added - to the end. - - Returns - ------- - None, this function operates in-place. - """ - name = self._pad_key(name) - if name in self._data: - raise ValueError(f"Cannot insert '{name}', already exists") - - old_ncols = len(self) - if loc == -1: - loc = old_ncols - elif not (0 <= loc <= old_ncols): - raise ValueError( - f"insert: loc out of bounds: must be 0 <= loc <= {old_ncols}" - ) - - if not isinstance(value, column.ColumnBase): - raise ValueError("value must be a Column") - elif old_ncols > 0 and len(value) != self.nrows: - raise ValueError("All columns must be of equal length") - - # TODO: we should move all insert logic here - if loc == old_ncols: - self._data[name] = value - else: - new_keys = self.names[:loc] + (name,) + self.names[loc:] - new_values = self.columns[:loc] + (value,) + self.columns[loc:] - self._data = dict(zip(new_keys, new_values)) - self._clear_cache(old_ncols, old_ncols + 1) - if old_ncols == 0: - # The type(name) may no longer match the prior label_dtype - self.label_dtype = None - - def copy(self, deep: bool = False) -> Self: - """ - Make a copy of this ColumnAccessor. 
- """ - if deep or cudf.get_option("copy_on_write"): - data = {k: v.copy(deep=deep) for k, v in self._data.items()} - else: - data = self._data.copy() - return self.__class__( - data=data, - multiindex=self.multiindex, - level_names=self.level_names, - rangeindex=self.rangeindex, - label_dtype=self.label_dtype, - verify=False, - ) - - def select_by_label(self, key: Any) -> Self: - """ - Return a subset of this column accessor, - composed of the keys specified by `key`. - - Parameters - ---------- - key : slice, list-like, tuple or scalar - - Returns - ------- - ColumnAccessor - """ - if isinstance(key, slice): - return self._select_by_label_slice(key) - elif pd.api.types.is_list_like(key) and not isinstance(key, tuple): - return self._select_by_label_list_like(tuple(key)) - else: - if isinstance(key, tuple): - if any(isinstance(k, slice) for k in key): - return self._select_by_label_with_wildcard(key) - return self._select_by_label_grouped(key) - - def get_labels_by_index(self, index: Any) -> tuple: - """Get the labels corresponding to the provided column indices. - - Parameters - ---------- - index : integer, integer slice, boolean mask, - or list-like of integers - The column indexes. - - Returns - ------- - tuple - """ - if isinstance(index, slice): - start, stop, step = index.indices(len(self)) - return self.names[start:stop:step] - elif pd.api.types.is_integer(index): - return (self.names[index],) - elif (bn := len(index)) > 0 and all(map(is_bool, index)): - if bn != (n := len(self.names)): - raise IndexError( - f"Boolean mask has wrong length: {bn} not {n}" - ) - if isinstance(index, (pd.Series, cudf.Series)): - # Don't allow iloc indexing with series - raise NotImplementedError( - "Cannot use Series object for mask iloc indexing" - ) - # TODO: Doesn't handle on-device columns - return tuple(n for n, keep in zip(self.names, index) if keep) - else: - if len(set(index)) != len(index): - raise NotImplementedError( - "Selecting duplicate column labels is not supported." - ) - return tuple(self.names[i] for i in index) - - def select_by_index(self, index: Any) -> Self: - """ - Return a ColumnAccessor composed of the columns - specified by index. - - Parameters - ---------- - key : integer, integer slice, boolean mask, - or list-like of integers - - Returns - ------- - ColumnAccessor - """ - keys = self.get_labels_by_index(index) - data = {k: self._data[k] for k in keys} - return type(self)( - data, - multiindex=self.multiindex, - level_names=self.level_names, - label_dtype=self.label_dtype, - verify=False, - ) - - def swaplevel(self, i: abc.Hashable = -2, j: abc.Hashable = -1) -> Self: - """ - Swap level i with level j. - Calling this method does not change the ordering of the values. - - Parameters - ---------- - i : int or str, default -2 - First level of index to be swapped. - j : int or str, default -1 - Second level of index to be swapped. 
- - Returns - ------- - ColumnAccessor - """ - if not self.multiindex: - raise ValueError( - "swaplevel is only valid for self.multiindex=True" - ) - - i = _get_level(i, self.nlevels, self.level_names) - j = _get_level(j, self.nlevels, self.level_names) - - new_keys = [list(row) for row in self] - new_dict = {} - - # swap old keys for i and j - for n, row in enumerate(self.names): - new_keys[n][i], new_keys[n][j] = row[j], row[i] # type: ignore[call-overload, index] - new_dict.update({row: tuple(new_keys[n])}) - - # TODO: Change to deep=False when copy-on-write is default - new_data = {new_dict[k]: v.copy(deep=True) for k, v in self.items()} - - # swap level_names for i and j - new_names = list(self.level_names) - new_names[i], new_names[j] = new_names[j], new_names[i] # type: ignore[call-overload] - - return type(self)( - new_data, # type: ignore[arg-type] - multiindex=self.multiindex, - level_names=new_names, - rangeindex=self.rangeindex, - label_dtype=self.label_dtype, - verify=False, - ) - - def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None: - """ - Add (or modify) column by name. - - Parameters - ---------- - key - name of the column - value : Column - The value to insert into the column. - """ - key = self._pad_key(key) - if not isinstance(value, column.ColumnBase): - raise ValueError("value must be a Column") - if len(self) > 0 and len(value) != self.nrows: - raise ValueError("All columns must be of equal length") - - old_ncols = len(self) - self._data[key] = value - new_ncols = len(self) - self._clear_cache(old_ncols, new_ncols) - - def _select_by_label_list_like(self, key: tuple) -> Self: - # Special-casing for boolean mask - if (bn := len(key)) > 0 and all(map(is_bool, key)): - if bn != (n := len(self.names)): - raise IndexError( - f"Boolean mask has wrong length: {bn} not {n}" - ) - data = dict( - item - for item, keep in zip(self._grouped_data.items(), key) - if keep - ) - else: - data = {k: self._grouped_data[k] for k in key} - if len(data) != len(key): - raise ValueError( - "Selecting duplicate column labels is not supported." 
- ) - if self.multiindex: - data = dict(_to_flat_dict_inner(data)) - return type(self)( - data, - multiindex=self.multiindex, - level_names=self.level_names, - label_dtype=self.label_dtype, - verify=False, - ) - - def _select_by_label_grouped(self, key: abc.Hashable) -> Self: - result = self._grouped_data[key] - if isinstance(result, column.ColumnBase): - # self._grouped_data[key] = self._data[key] so skip validation - return type(self)( - data={key: result}, - multiindex=self.multiindex, - label_dtype=self.label_dtype, - verify=False, - ) - else: - if self.multiindex: - result = dict(_to_flat_dict_inner(result)) - if not isinstance(key, tuple): - key = (key,) - return self.__class__( - result, - multiindex=self.nlevels - len(key) > 1, - level_names=self.level_names[len(key) :], - verify=False, - ) - - def _select_by_label_slice(self, key: slice) -> Self: - start, stop = key.start, key.stop - if key.step is not None: - raise TypeError("Label slicing with step is not supported") - - if start is None: - start = self.names[0] - if stop is None: - stop = self.names[-1] - start = self._pad_key(start, slice(None)) - stop = self._pad_key(stop, slice(None)) - for idx, name in enumerate(self.names): - if _keys_equal(name, start): - start_idx = idx - break - for idx, name in enumerate(reversed(self.names)): - if _keys_equal(name, stop): - stop_idx = len(self.names) - idx - break - keys = self.names[start_idx:stop_idx] - return type(self)( - {k: self._data[k] for k in keys}, - multiindex=self.multiindex, - level_names=self.level_names, - label_dtype=self.label_dtype, - verify=False, - ) - - def _select_by_label_with_wildcard(self, key: tuple) -> Self: - pad_key = self._pad_key(key, slice(None)) - data = { - k: self._data[k] - for k in self.names - if _keys_equal(k, pad_key) # type: ignore[arg-type] - } - return type(self)( - data, - multiindex=self.multiindex, - level_names=self.level_names, - label_dtype=self.label_dtype, - verify=False, - ) - - def _pad_key( - self, key: abc.Hashable, pad_value: str | slice = "" - ) -> abc.Hashable: - """ - Pad the provided key to a length equal to the number - of levels. - """ - if not self.multiindex: - return key - if not isinstance(key, tuple): - key = (key,) - return key + (pad_value,) * (self.nlevels - len(key)) - - def rename_levels( - self, - mapper: Mapping[abc.Hashable, abc.Hashable] | abc.Callable, - level: int | None = None, - ) -> Self: - """ - Rename the specified levels of the given ColumnAccessor - - Parameters - ---------- - self : ColumnAccessor of a given dataframe - - mapper : dict-like or function transformations to apply to - the column label values depending on selected ``level``. - - If dict-like, only replace the specified level of the - ColumnAccessor's keys (that match the mapper's keys) with - mapper's values - - If callable, the function is applied only to the specified level - of the ColumnAccessor's keys. - - level : int - In case of RangeIndex, only supported level is [0, None]. - In case of a MultiColumn, only the column labels in the specified - level of the ColumnAccessor's keys will be transformed. - - Returns - ------- - A new ColumnAccessor with values in the keys replaced according - to the given mapper and level. 
- - """ - new_col_names: abc.Iterable - if self.multiindex: - - def rename_column(x): - x = list(x) - if isinstance(mapper, Mapping): - x[level] = mapper.get(x[level], x[level]) - else: - x[level] = mapper(x[level]) - x = tuple(x) - return x - - if level is None: - level = 0 - new_col_names = (rename_column(k) for k in self.keys()) - - else: - if level is None: - level = 0 - if level != 0: - raise IndexError( - f"Too many levels: Index has only 1 level, not {level+1}" - ) - - if isinstance(mapper, Mapping): - new_col_names = [ - mapper.get(col_name, col_name) for col_name in self.keys() - ] - else: - new_col_names = [mapper(col_name) for col_name in self.keys()] - - if len(new_col_names) != len(set(new_col_names)): - raise ValueError("Duplicate column names are not allowed") - - data = dict(zip(new_col_names, self.values())) - return type(self)( - data=data, - level_names=self.level_names, - multiindex=self.multiindex, - label_dtype=self.label_dtype, - verify=False, - ) - - def droplevel(self, level: int) -> None: - # drop the nth level - if level < 0: - level += self.nlevels - - old_ncols = len(self) - self._data = { - _remove_key_level(key, level): value # type: ignore[arg-type] - for key, value in self._data.items() - } - new_ncols = len(self) - self._level_names = ( - self._level_names[:level] + self._level_names[level + 1 :] - ) - - if len(self._level_names) == 1: - # can't use nlevels, as it depends on multiindex - self.multiindex = False - self._clear_cache(old_ncols, new_ncols) - - -def _keys_equal(target: abc.Hashable, key: abc.Iterable) -> bool: - """ - Compare `key` to `target`. - - Return True if each value in `key` == corresponding value in `target`. - If any value in `key` is slice(None), it is considered equal - to the corresponding value in `target`. - """ - if not isinstance(target, tuple): - return target == key - for k1, k2 in itertools.zip_longest(target, key, fillvalue=None): - if k2 == slice(None): - continue - if k1 != k2: - return False - return True - - -def _remove_key_level(key: tuple, level: int) -> abc.Hashable: - """ - Remove a level from key. If detupleize is True, and if only a - single level remains, convert the tuple to a scalar. - """ - result = key[:level] + key[level + 1 :] - if len(result) == 1: - return result[0] - return result - - -def _get_level( - x: abc.Hashable, nlevels: int, level_names: tuple[abc.Hashable, ...] -) -> abc.Hashable: - """Get the level index from a level number or name. - - If given an integer, this function will handle wraparound for - negative values. If given a string (the level name), this function - will extract the index of that level from `level_names`. - - Parameters - ---------- - x - The level number to validate - nlevels - The total available levels in the MultiIndex - level_names - The names of the levels. - """ - if isinstance(x, int): - if x < 0: - x += nlevels - if x >= nlevels: - raise IndexError( - f"Level {x} out of bounds. Index has {nlevels} levels." - ) - return x - else: - x = level_names.index(x) - return x diff --git a/python/cudf/cudf/core/common.py b/python/cudf/cudf/core/common.py deleted file mode 100644 index 5276cd518e5..00000000000 --- a/python/cudf/cudf/core/common.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. 
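# A minimal, standalone sketch of the wildcard label matching used by the
# removed ColumnAccessor above (see _pad_key and _keys_equal): tuple keys
# shorter than the number of levels are padded with slice(None), and
# slice(None) entries match any value. The helper names here (pad_key,
# keys_equal) are illustrative only, not cudf API.
import itertools


def pad_key(key, nlevels, pad_value=slice(None)):
    # Normalize a label to a tuple of length `nlevels`.
    if not isinstance(key, tuple):
        key = (key,)
    return key + (pad_value,) * (nlevels - len(key))


def keys_equal(target, key):
    # True when every entry of `key` equals the corresponding entry of
    # `target`, treating slice(None) as a wildcard.
    for k1, k2 in itertools.zip_longest(target, key, fillvalue=None):
        if k2 == slice(None):
            continue
        if k1 != k2:
            return False
    return True


names = [("a", "x"), ("a", "y"), ("b", "x")]
wanted = pad_key("a", 2)
print([n for n in names if keys_equal(n, wanted)])  # [('a', 'x'), ('a', 'y')]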
- - -def pipe(obj, func, *args, **kwargs): - """ - Apply a function ``func`` to object ``obj`` either by passing obj as the - first argument to the function or, in the case that the func is a tuple, - interpret the first element of the tuple as a function and pass the obj to - that function as a keyword argument whose key is the value of the second - element of the tuple. - - Parameters - ---------- - func : callable or tuple of (callable, str) - Function to apply to this object or, alternatively, a - ``(callable, data_keyword)`` tuple where ``data_keyword`` is a - string indicating the keyword of `callable`` that expects the - object. - *args : iterable, optional - Positional arguments passed into ``func``. - **kwargs : dict, optional - A dictionary of keyword arguments passed into ``func``. - - Returns - ------- - object : the return type of ``func``. - """ - if isinstance(func, tuple): - func, target = func - if target in kwargs: - raise ValueError( - f"{target} is both the pipe target and a keyword argument" - ) - kwargs[target] = obj - return func(*args, **kwargs) - else: - return func(obj, *args, **kwargs) diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py deleted file mode 100644 index 16d8964f083..00000000000 --- a/python/cudf/cudf/core/copy_types.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, cast - -from typing_extensions import Self - -import cudf -import cudf._lib as libcudf -from cudf._lib.types import size_type_dtype - -if TYPE_CHECKING: - from cudf.core.column import NumericalColumn - - -@dataclass -class GatherMap: - """A representation of a column as a gather map. - - This object augments the column with the information that it - is valid as a gather map for the specified number of rows with - the given nullification flag. - - Parameters - ---------- - column - The data to turn into a column and then verify - nrows - The number of rows to verify against - nullify - Will the gather map be used nullifying out of bounds - accesses? - - Returns - ------- - GatherMap - New object wrapping the column bearing witness to its - suitability as a gather map for columns with nrows. - - Raises - ------ - TypeError - If the column is of unsuitable dtype - IndexError - If the map is not in bounds. - """ - - #: The number of rows the gather map has been validated for - nrows: int - #: Was the validation for nullify=True? - nullify: bool - - def __init__(self, column: Any, nrows: int, *, nullify: bool): - #: The gather map - self.column = cast( - cudf.core.column.NumericalColumn, - cudf.core.column.as_column(column), - ) - self.nrows = nrows - self.nullify = nullify - if len(self.column) == 0: - # Any empty column is valid as a gather map - # This is necessary because as_column([]) defaults to float64 - # TODO: we should fix this further up. - # Alternately we can have an Optional[Column] and handle None - # specially in _gather. 
- self.column = cast( - "NumericalColumn", self.column.astype(size_type_dtype) - ) - else: - if self.column.dtype.kind not in {"i", "u"}: - raise TypeError("Gather map must have integer dtype") - if not nullify: - lo, hi = libcudf.reduce.minmax(self.column) - if lo.value < -nrows or hi.value >= nrows: - raise IndexError( - f"Gather map is out of bounds for [0, {nrows})" - ) - - @classmethod - def from_column_unchecked( - cls, column: "NumericalColumn", nrows: int, *, nullify: bool - ) -> Self: - """Construct a new GatherMap from a column without checks. - - Parameters - ---------- - column - The column that will be used as a gather map - nrows - The number of rows the gather map will be used for - nullify - Will the gather map be used nullifying out of bounds - accesses? - - Returns - ------- - GatherMap - - Notes - ----- - This method asserts, by fiat, that the column is valid. - Behaviour is undefined if it is not. - """ - self = cls.__new__(cls) - self.column = column - self.nrows = nrows - self.nullify = nullify - return self - - -@dataclass -class BooleanMask: - """A representation of a column as a boolean mask. - - This augments the column with information that it is valid as a - boolean mask for columns with a given number of rows - - Parameters - ---------- - column - The data to turn into a column to then verify - nrows - the number of rows to verify against - - Returns - ------- - BooleanMask - New object wrapping the column bearing witness to its - suitability as a boolean mask for columns with matching - row count. - - Raises - ------ - TypeError - If the column is of unsuitable dtype - IndexError - If the mask has the wrong number of rows - """ - - def __init__(self, column: Any, nrows: int): - #: The boolean mask - self.column = cast( - cudf.core.column.NumericalColumn, - cudf.core.column.as_column(column), - ) - if self.column.dtype.kind != "b": - raise TypeError("Boolean mask must have bool dtype") - if len(column) != nrows: - raise IndexError( - f"Column with {len(column)} rows not suitable " - f"as a boolean mask for {nrows} rows" - ) - - @classmethod - def from_column_unchecked(cls, column: "NumericalColumn") -> Self: - """Construct a new BooleanMask from a column without checks. - - Parameters - ---------- - column - The column that will be used as a boolean mask - - Returns - ------- - BooleanMask - - Notes - ----- - This method asserts, by fiat, that the column is valid. - Behaviour is undefined if it is not. - """ - self = cls.__new__(cls) - self.column = column - return self diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py deleted file mode 100644 index c9b1fa2669c..00000000000 --- a/python/cudf/cudf/core/cut.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from collections import abc - -import cupy -import numpy as np -import pandas as pd - -import cudf -from cudf.api.types import is_list_like -from cudf.core.column import as_column -from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes -from cudf.core.index import IntervalIndex, interval_range - - -def cut( - x, - bins, - right: bool = True, - labels=None, - retbins: bool = False, - precision: int = 3, - include_lowest: bool = False, - duplicates: str = "raise", - ordered: bool = True, -): - """Bin values into discrete intervals. - - Use cut when you need to segment and sort data values into bins. This - function is also useful for going from a continuous variable to a - categorical variable. 
- - Parameters - ---------- - x : array-like - The input array to be binned. Must be 1-dimensional. - bins : int, sequence of scalars, or IntervalIndex - The criteria to bin by. - - * int : Defines the number of equal-width bins in the range of `x`. The - range of `x` is extended by .1% on each side to include the minimum - and maximum values of `x`. - * sequence of scalars : Defines the bin edges allowing for non-uniform - width. No extension of the range of `x` is done. - * IntervalIndex : Defines the exact bins to be used. Note that - IntervalIndex for `bins` must be non-overlapping. - - right : bool, default True - Indicates whether bins includes the rightmost edge or not. - labels : array or False, default None - Specifies the labels for the returned bins. Must be the same - length as the resulting bins. If False, returns only integer - indicators of the bins. If True,raises an error. When ordered=False, - labels must be provided. - retbins : bool, default False - Whether to return the bins or not. - precision : int, default 3 - The precision at which to store and display the bins labels. - include_lowest : bool, default False - Whether the first interval should be left-inclusive or not. - duplicates : {default 'raise', 'drop'}, optional - If bin edges are not unique, raise ValueError or drop non-uniques. - ordered : bool, default True - Whether the labels are ordered or not. Applies to returned types - Categorical and Series (with Categorical dtype). If True, - the resulting categorical will be ordered. If False, the resulting - categorical will be unordered (labels must be provided). - - Returns - ------- - out : CategoricalIndex - An array-like object representing the respective bin for each value - of x. The type depends on the value of labels. - bins : numpy.ndarray or IntervalIndex. - The computed or specified bins. Only returned when retbins=True. - For scalar or sequence bins, this is an ndarray with the computed - bins. If set duplicates=drop, bins will drop non-unique bin. For - an IntervalIndex bins, this is equal to bins. - - Examples - -------- - Discretize into three equal-sized bins. - - >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3) - CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], - (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0], - (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category') - - >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) - (CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], - (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0], - (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category'), - array([0.994, 3. , 5. , 7. ])) - - >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), - ... 3, labels=["bad", "medium", "good"]) - CategoricalIndex(['bad', 'good', 'medium', 'medium', 'good', 'bad'], - categories=['bad', 'medium', 'good'],ordered=True, - dtype='category') - - >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, - ... labels=["B", "A", "B"], ordered=False) - CategoricalIndex(['B', 'B', 'A', 'A', 'B', 'B'], categories=['A', 'B'], - ordered=False, dtype='category') - - >>> cudf.cut([0, 1, 1, 2], bins=4, labels=False) - array([0, 1, 1, 3], dtype=int32) - - Passing a Series as an input returns a Series with categorical dtype: - - >>> s = cudf.Series(np.array([2, 4, 6, 8, 10]), - ... 
index=['a', 'b', 'c', 'd', 'e']) - >>> cudf.cut(s, 3) - """ - left_inclusive = False - right_inclusive = True - # saving the original input x for use in case its a series - orig_x = x - old_bins = bins - - if not ordered and labels is None: - raise ValueError("'labels' must be provided if 'ordered = False'") - - if duplicates not in ["raise", "drop"]: - raise ValueError( - "invalid value for 'duplicates' parameter, valid options are: " - "raise, drop" - ) - - if labels is not False: - if not (labels is None or is_list_like(labels)): - raise ValueError( - "Bin labels must either be False, None or passed in as a " - "list-like argument" - ) - if ordered and labels is not None: - if len(set(labels)) != len(labels): - raise ValueError( - "labels must be unique if ordered=True;" - "pass ordered=False for duplicate labels" - ) - - # bins can either be an int, sequence of scalars or an intervalIndex - if isinstance(bins, abc.Sequence): - if len(set(bins)) is not len(bins): - if duplicates == "raise": - raise ValueError( - f"Bin edges must be unique: {repr(bins)}.\n" - f"You can drop duplicate edges by setting the 'duplicates'" - "kwarg" - ) - elif duplicates == "drop": - # get unique values but maintain list dtype - bins = list(dict.fromkeys(bins)) - - # if bins is an intervalIndex we ignore the value of right - elif isinstance(bins, (pd.IntervalIndex, cudf.IntervalIndex)): - right = bins.closed == "right" - - # create bins if given an int or single scalar - if not isinstance(bins, pd.IntervalIndex): - if not isinstance(bins, (abc.Sequence)): - if isinstance( - x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray) - ): - mn = x.min() - mx = x.max() - else: - mn = min(x) - mx = max(x) - bins = np.linspace(mn, mx, bins + 1, endpoint=True) - adj = (mx - mn) * 0.001 - if right: - bins[0] -= adj - else: - bins[-1] += adj - - # if right and include lowest we adjust the first - # bin edge to make sure it is included - if right and include_lowest: - bins[0] = bins[0] - 10 ** (-precision) - - # if right is false the last bin edge is not included - if not right: - right_edge = bins[-1] - x = cupy.asarray(x) - x[x == right_edge] = right_edge + 1 - - # adjust bin edges decimal precision - int_label_bins = np.around(bins, precision) - - # checking for the correct inclusivity values - if right: - closed = "right" - else: - closed = "left" - left_inclusive = True - - if isinstance(bins, pd.IntervalIndex): - interval_labels = bins - elif labels is None: - if duplicates == "drop" and len(bins) == 1 and len(old_bins) != 1: - if right and include_lowest: - old_bins[0] = old_bins[0] - 10 ** (-precision) - interval_labels = interval_range( - old_bins[0], old_bins[1], periods=1, closed=closed - ) - else: - interval_labels = IntervalIndex.from_breaks( - old_bins, closed=closed - ) - else: - # get labels for categories - interval_labels = IntervalIndex.from_breaks( - int_label_bins, closed=closed - ) - elif labels is not False: - if not (is_list_like(labels)): - raise ValueError( - "Bin labels must either be False, None or passed in as a " - "list-like argument" - ) - if ordered and len(set(labels)) != len(labels): - raise ValueError( - "labels must be unique if ordered=True; " - "pass ordered=False for" - "duplicate labels" - ) - - if len(labels) != len(bins) - 1: - raise ValueError( - "Bin labels must be one fewer than the number of bin edges" - ) - if not ordered and len(set(labels)) != len(labels): - interval_labels = cudf.CategoricalIndex( - labels, categories=None, ordered=False - ) - else: - interval_labels = ( - 
labels if len(set(labels)) == len(labels) else None - ) - - # the inputs is a column of the values in the array x - input_arr = as_column(x) - - if isinstance(bins, pd.IntervalIndex): - # get the left and right edges of the bins as columns - # we cannot typecast an IntervalIndex, so we need to - # make the edges the same type as the input array - left_edges = as_column(bins.left).astype(input_arr.dtype) - right_edges = as_column(bins.right).astype(input_arr.dtype) - else: - # get the left and right edges of the bins as columns - left_edges = as_column(bins[:-1:], dtype="float64") - right_edges = as_column(bins[+1::], dtype="float64") - # the input arr must be changed to the same type as the edges - input_arr = input_arr.astype(left_edges.dtype) - # get the indexes for the appropriate number - index_labels = cudf._lib.labeling.label_bins( - input_arr, left_edges, left_inclusive, right_edges, right_inclusive - ) - - if labels is False: - # if labels is false we return the index labels, we return them - # as a series if we have a series input - if isinstance(orig_x, (pd.Series, cudf.Series)): - # need to run more tests but looks like in this case pandas - # always returns a float64 dtype - indx_arr_series = cudf.Series(index_labels, dtype="float64") - # if retbins we return the bins as well - if retbins: - return indx_arr_series, bins - else: - return indx_arr_series - elif retbins: - return index_labels.values, bins - else: - return index_labels.values - - if labels is not None: - if labels is not ordered and len(set(labels)) != len(labels): - # when we have duplicate labels and ordered is False, we - # should allow duplicate categories. - return interval_labels[index_labels] - - index_labels = as_unsigned_codes(len(interval_labels), index_labels) - - col = CategoricalColumn( - data=None, - size=index_labels.size, - dtype=cudf.CategoricalDtype( - categories=interval_labels, ordered=ordered - ), - mask=index_labels.base_mask, - offset=index_labels.offset, - children=(index_labels,), - ) - - # we return a categorical index, as we don't have a Categorical method - categorical_index = cudf.CategoricalIndex._from_column(col) - - if isinstance(orig_x, (pd.Series, cudf.Series)): - # if we have a series input we return a series output - res_series = cudf.Series(categorical_index, index=orig_x.index) - if retbins: - return res_series, bins - else: - return res_series - elif retbins: - # if retbins is true we return the bins as well - return categorical_index, bins - else: - return categorical_index diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py deleted file mode 100644 index 79ed5a0e187..00000000000 --- a/python/cudf/cudf/core/dataframe.py +++ /dev/null @@ -1,8513 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
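# A minimal NumPy sketch of the equal-width binning path of the removed
# cut() above (integer `bins`, right=True): edges come from np.linspace over
# [min, max], the first edge is nudged down by 0.1% of the range so the
# minimum lands in the first bin, and values are mapped to right-closed
# intervals. `equal_width_bin` is an illustrative helper, not cudf API.
import numpy as np


def equal_width_bin(x, nbins):
    x = np.asarray(x, dtype="float64")
    edges = np.linspace(x.min(), x.max(), nbins + 1)
    edges[0] -= (x.max() - x.min()) * 0.001  # mimic the .1% range extension
    # searchsorted(side="left") puts values equal to an edge into the bin
    # that the edge closes, i.e. right-closed (lo, hi] intervals.
    return np.searchsorted(edges, x, side="left") - 1, edges


codes, edges = equal_width_bin([1, 7, 5, 4, 6, 3], 3)
print(codes)  # [0 2 1 1 2 0], matching the (0.994, 3.0], (3.0, 5.0], (5.0, 7.0] bins
print(edges)  # approximately [0.994, 3.0, 5.0, 7.0]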
- -from __future__ import annotations - -import functools -import inspect -import itertools -import numbers -import os -import pickle -import re -import sys -import textwrap -import warnings -from collections import abc, defaultdict -from collections.abc import Callable, Iterator -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast - -import cupy -import numba -import numpy as np -import pandas as pd -import pyarrow as pa -from nvtx import annotate -from pandas.io.formats import console -from pandas.io.formats.printing import pprint_thing -from typing_extensions import Self, assert_never - -import cudf -import cudf.core.common -from cudf import _lib as libcudf -from cudf.api.extensions import no_default -from cudf.api.types import ( - _is_scalar_or_zero_d_array, - is_dict_like, - is_dtype_equal, - is_list_like, - is_numeric_dtype, - is_object_dtype, - is_scalar, - is_string_dtype, -) -from cudf.core import column, df_protocol, indexing_utils, reshape -from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable -from cudf.core.column import ( - CategoricalColumn, - ColumnBase, - StructColumn, - as_column, - column_empty, - concat_columns, -) -from cudf.core.column.categorical import as_unsigned_codes -from cudf.core.column_accessor import ColumnAccessor -from cudf.core.copy_types import BooleanMask -from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template -from cudf.core.index import ( - BaseIndex, - RangeIndex, - _index_from_data, - ensure_index, -) -from cudf.core.indexed_frame import ( - IndexedFrame, - _FrameIndexer, - _get_label_range_or_mask, - _indices_from_labels, - doc_reset_index_template, -) -from cudf.core.join import Merge, MergeSemi -from cudf.core.missing import NA -from cudf.core.multiindex import MultiIndex -from cudf.core.resample import DataFrameResampler -from cudf.core.series import Series -from cudf.core.udf.row_function import _get_row_kernel -from cudf.errors import MixedTypeError -from cudf.utils import applyutils, docutils, ioutils, queryutils -from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import ( - can_convert_to_column, - cudf_dtype_from_pydata_dtype, - find_common_type, - is_column_like, - min_signed_type, -) -from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api - -if TYPE_CHECKING: - from cudf._typing import ColumnLike, Dtype, NotImplementedType - -_cupy_nan_methods_map = { - "min": "nanmin", - "max": "nanmax", - "sum": "nansum", - "prod": "nanprod", - "product": "nanprod", - "mean": "nanmean", - "std": "nanstd", - "var": "nanvar", -} - - -def _shape_mismatch_error(x, y): - raise ValueError( - f"shape mismatch: value array of shape {x} " - f"could not be broadcast to indexing result of " - f"shape {y}" - ) - - -class _DataFrameIndexer(_FrameIndexer): - def __getitem__(self, arg): - if ( - isinstance(self._frame.index, MultiIndex) - or self._frame._data.multiindex - ): - # This try/except block allows the use of pandas-like - # tuple arguments into MultiIndex dataframes. 
- try: - return self._getitem_tuple_arg(arg) - except (TypeError, KeyError, IndexError, ValueError): - return self._getitem_tuple_arg((arg, slice(None))) - else: - if not isinstance(arg, tuple): - arg = (arg, slice(None)) - return self._getitem_tuple_arg(arg) - - def __setitem__(self, key, value): - if not isinstance(key, tuple): - key = (key, slice(None)) - return self._setitem_tuple_arg(key, value) - - @_performance_tracking - def _can_downcast_to_series(self, df, arg): - """ - This method encapsulates the logic used - to determine whether or not the result of a loc/iloc - operation should be "downcasted" from a DataFrame to a - Series - """ - if isinstance(df, cudf.Series): - return False - nrows, ncols = df.shape - if nrows == 1: - if type(arg[0]) is slice: - if not is_scalar(arg[1]): - return False - elif (is_list_like(arg[0]) or is_column_like(arg[0])) and ( - is_list_like(arg[1]) - or is_column_like(arg[0]) - or type(arg[1]) is slice - ): - return False - else: - if as_column(arg[0]).dtype.kind == "b" and not isinstance( - arg[1], slice - ): - return True - dtypes = df.dtypes.values.tolist() - all_numeric = all(is_numeric_dtype(t) for t in dtypes) - if all_numeric or ( - len(dtypes) and all(t == dtypes[0] for t in dtypes) - ): - return True - if isinstance(arg[1], tuple): - return True - if ncols == 1: - if type(arg[1]) is slice: - return False - if isinstance(arg[1], tuple): - return len(arg[1]) == df._data.nlevels - if not (is_list_like(arg[1]) or is_column_like(arg[1])): - return True - return False - - @_performance_tracking - def _downcast_to_series(self, df: DataFrame, arg): - """ - "Downcast" from a DataFrame to a Series - based on Pandas indexing rules - """ - nrows, ncols = df.shape - # determine the axis along which the Series is taken: - if nrows == 1 and ncols == 1: - if is_scalar(arg[0]) and ( - is_scalar(arg[1]) - or (df._data.multiindex and arg[1] in df._column_names) - ): - return df[df._column_names[0]].iloc[0] - elif not is_scalar(arg[0]): - axis = 1 - else: - axis = 0 - - elif nrows == 1: - axis = 0 - elif ncols == 1: - axis = 1 - else: - raise ValueError("Cannot downcast DataFrame selection to Series") - - # take series along the axis: - if axis == 1: - return df[df._column_names[0]] - else: - if df._num_columns > 0: - dtypes = df.dtypes.values.tolist() - normalized_dtype = np.result_type(*dtypes) - for name, col in df._column_labels_and_values: - df[name] = col.astype(normalized_dtype) - - sr = df.T - return sr[sr._column_names[0]] - - -class _DataFrameLocIndexer(_DataFrameIndexer): - """ - For selection by label. 
- """ - - @_performance_tracking - def _getitem_scalar(self, arg): - return self._frame[arg[1]].loc[arg[0]] - - @_performance_tracking - def _getitem_tuple_arg(self, arg): - from uuid import uuid4 - - # Step 1: Gather columns - if isinstance(arg, tuple): - columns_df = self._frame._get_columns_by_label(arg[1]) - columns_df.index = self._frame.index - else: - columns_df = self._frame - - # Step 2: Gather rows - if isinstance(columns_df.index, MultiIndex): - if isinstance(arg, (MultiIndex, pd.MultiIndex)): - if isinstance(arg, pd.MultiIndex): - arg = MultiIndex.from_pandas(arg) - - indices = _indices_from_labels(columns_df, arg) - return columns_df.take(indices) - - else: - if isinstance(arg, tuple): - row_arg = arg[0] - elif is_scalar(arg): - row_arg = (arg,) - else: - row_arg = arg - result = columns_df.index._get_row_major(columns_df, row_arg) - if ( - len(result) == 1 - and isinstance(arg, tuple) - and len(arg) > 1 - and is_scalar(arg[1]) - ): - return result._columns[0].element_indexing(0) - return result - else: - if isinstance(arg[0], slice): - out = _get_label_range_or_mask( - columns_df.index, arg[0].start, arg[0].stop, arg[0].step - ) - if isinstance(out, slice): - df = columns_df._slice(out) - else: - df = columns_df._apply_boolean_mask( - BooleanMask.from_column_unchecked( - cudf.core.column.as_column(out) - ) - ) - else: - tmp_arg = arg - if is_scalar(arg[0]): - # If a scalar, there is possibility of having duplicates. - # Join would get all the duplicates. So, converting it to - # an array kind. - if cudf.get_option("mode.pandas_compatible"): - if any( - c.dtype != columns_df._columns[0].dtype - for c in columns_df._columns - ): - raise TypeError( - "All columns need to be of same type, please " - "typecast to common dtype." - ) - tmp_arg = ([tmp_arg[0]], tmp_arg[1]) - if len(tmp_arg[0]) == 0: - return columns_df._empty_like(keep_index=True) - tmp_arg = ( - as_column( - tmp_arg[0], - dtype=self._frame.index.dtype - if isinstance( - self._frame.index.dtype, cudf.CategoricalDtype - ) - else None, - ), - tmp_arg[1], - ) - - if tmp_arg[0].dtype.kind == "b": - df = columns_df._apply_boolean_mask( - BooleanMask(tmp_arg[0], len(columns_df)) - ) - else: - tmp_col_name = str(uuid4()) - cantor_name = "_" + "_".join( - map(str, columns_df._column_names) - ) - if columns_df._data.multiindex: - # column names must be appropriate length tuples - extra = tuple( - "" for _ in range(columns_df._data.nlevels - 1) - ) - tmp_col_name = (tmp_col_name, *extra) - cantor_name = (cantor_name, *extra) - other_df = DataFrame( - { - tmp_col_name: column.as_column( - range(len(tmp_arg[0])) - ) - }, - index=cudf.Index._from_column(tmp_arg[0]), - ) - columns_df[cantor_name] = column.as_column( - range(len(columns_df)) - ) - df = other_df.join(columns_df, how="inner") - # as join is not assigning any names to index, - # update it over here - df.index.name = columns_df.index.name - if not isinstance( - df.index, MultiIndex - ) and is_numeric_dtype(df.index.dtype): - # Preserve the original index type. 
- df.index = df.index.astype(self._frame.index.dtype) - df = df.sort_values(by=[tmp_col_name, cantor_name]) - df.drop(columns=[tmp_col_name, cantor_name], inplace=True) - # There were no indices found - if len(df) == 0: - raise KeyError(arg) - - # Step 3: Downcast - if self._can_downcast_to_series(df, arg): - return self._downcast_to_series(df, arg) - return df - - @_performance_tracking - def _setitem_tuple_arg(self, key, value): - if ( - isinstance(self._frame.index, MultiIndex) - or self._frame._data.multiindex - ): - raise NotImplementedError( - "Setting values using df.loc[] not supported on " - "DataFrames with a MultiIndex" - ) - - try: - columns_df = self._frame._get_columns_by_label(key[1]) - except KeyError: - if not self._frame.empty and isinstance(key[0], slice): - pos_range = _get_label_range_or_mask( - self._frame.index, key[0].start, key[0].stop, key[0].step - ) - idx = self._frame.index[pos_range] - elif self._frame.empty and isinstance(key[0], slice): - idx = None - else: - if is_scalar(key[0]): - arr = [key[0]] - else: - arr = key[0] - idx = cudf.Index(arr) - if is_scalar(value): - length = len(idx) if idx is not None else 1 - value = as_column(value, length=length) - - if isinstance(value, ColumnBase): - new_ser = cudf.Series._from_column(value, index=idx) - else: - new_ser = cudf.Series(value, index=idx) - if len(self._frame.index) != 0: - new_ser = new_ser._align_to_index( - self._frame.index, how="right" - ) - - if len(self._frame.index) == 0: - self._frame.index = ( - idx if idx is not None else cudf.RangeIndex(len(new_ser)) - ) - self._frame._data.insert(key[1], new_ser._column) - else: - if is_scalar(value): - for col in columns_df._column_names: - self._frame[col].loc[key[0]] = value - - elif isinstance(value, cudf.DataFrame): - if value.shape != self._frame.loc[key[0]].shape: - _shape_mismatch_error( - value.shape, - self._frame.loc[key[0]].shape, - ) - value_column_names = set(value._column_names) - scatter_map = _indices_from_labels(self._frame, key[0]) - for col in columns_df._column_names: - columns_df[col][scatter_map] = ( - value._data[col] if col in value_column_names else NA - ) - - else: - if not is_column_like(value): - value = cupy.asarray(value) - if getattr(value, "ndim", 1) == 2: - # If the inner dimension is 1, it's broadcastable to - # all columns of the dataframe. - indexed_shape = columns_df.loc[key[0]].shape - if value.shape[1] == 1: - if value.shape[0] != indexed_shape[0]: - _shape_mismatch_error(value.shape, indexed_shape) - for i, col in enumerate(columns_df._column_names): - self._frame[col].loc[key[0]] = value[:, 0] - else: - if value.shape != indexed_shape: - _shape_mismatch_error(value.shape, indexed_shape) - for i, col in enumerate(columns_df._column_names): - self._frame[col].loc[key[0]] = value[:, i] - else: - # handle cases where value is 1d object: - # If the key on column axis is a scalar, we indexed - # a single column; The 1d value should assign along - # the columns. - if is_scalar(key[1]): - for col in columns_df._column_names: - self._frame[col].loc[key[0]] = value - # Otherwise, there are two situations. The key on row axis - # can be a scalar or 1d. In either of the situation, the - # ith element in value corresponds to the ith row in - # the indexed object. - # If the key is 1d, a broadcast will happen. 
- else: - for i, col in enumerate(columns_df._column_names): - self._frame[col].loc[key[0]] = value[i] - - -class _DataFrameAtIndexer(_DataFrameLocIndexer): - pass - - -class _DataFrameIlocIndexer(_DataFrameIndexer): - """ - For selection by index. - """ - - _frame: DataFrame - - def __getitem__(self, arg): - ( - row_key, - ( - col_is_scalar, - column_names, - ), - ) = indexing_utils.destructure_dataframe_iloc_indexer(arg, self._frame) - row_spec = indexing_utils.parse_row_iloc_indexer( - row_key, len(self._frame) - ) - ca = self._frame._data - index = self._frame.index - if col_is_scalar: - name = column_names[0] - s = Series._from_column(ca._data[name], name=name, index=index) - return s._getitem_preprocessed(row_spec) - if column_names != list(self._frame._column_names): - frame = self._frame._from_data( - data=ColumnAccessor( - {key: ca._data[key] for key in column_names}, - multiindex=ca.multiindex, - level_names=ca.level_names, - verify=False, - ), - index=index, - ) - else: - frame = self._frame - if isinstance(row_spec, indexing_utils.MapIndexer): - return frame._gather(row_spec.key, keep_index=True) - elif isinstance(row_spec, indexing_utils.MaskIndexer): - return frame._apply_boolean_mask(row_spec.key, keep_index=True) - elif isinstance(row_spec, indexing_utils.SliceIndexer): - return frame._slice(row_spec.key) - elif isinstance(row_spec, indexing_utils.ScalarIndexer): - result = frame._gather(row_spec.key, keep_index=True) - new_name = result.index[0] - new_index = ensure_index(result.keys()) - # Attempt to turn into series. - if len(column_names) == 0: - return Series([], index=new_index, name=new_name) - else: - try: - # Behaviour difference from pandas, which will merrily - # turn any heterogeneous set of columns into a series if - # you only ask for one row. - ser = Series._concat( - [result[name] for name in column_names], - ) - except TypeError as err: - # Couldn't find a common type, Hence: - # Raise in pandas compatibility mode, - # or just return a 1xN dataframe otherwise - if cudf.get_option("mode.pandas_compatible"): - raise TypeError( - "All columns need to be of same type, please " - "typecast to common dtype." 
- ) from err - return result - else: - ser.index = new_index - ser.name = new_name - return ser - elif isinstance(row_spec, indexing_utils.EmptyIndexer): - return frame._empty_like(keep_index=True) - assert_never(row_spec) - - @_performance_tracking - def _setitem_tuple_arg(self, key, value): - columns_df = self._frame._from_data( - self._frame._data.select_by_index(key[1]), self._frame.index - ) - - if is_scalar(value): - for col in columns_df._column_names: - self._frame[col].iloc[key[0]] = value - - elif isinstance(value, cudf.DataFrame): - if value.shape != self._frame.iloc[key[0]].shape: - _shape_mismatch_error( - value.shape, - self._frame.loc[key[0]].shape, - ) - value_column_names = set(value._column_names) - for col in columns_df._column_names: - columns_df[col][key[0]] = ( - value._data[col] if col in value_column_names else NA - ) - - else: - # TODO: consolidate code path with identical counterpart - # in `_DataFrameLocIndexer._setitem_tuple_arg` - if not is_column_like(value): - value = cupy.asarray(value) - if getattr(value, "ndim", 1) == 2: - indexed_shape = columns_df.iloc[key[0]].shape - if value.shape[1] == 1: - if value.shape[0] != indexed_shape[0]: - _shape_mismatch_error(value.shape, indexed_shape) - for i, col in enumerate(columns_df._column_names): - self._frame[col].iloc[key[0]] = value[:, 0] - else: - if value.shape != indexed_shape: - _shape_mismatch_error(value.shape, indexed_shape) - for i, col in enumerate(columns_df._column_names): - self._frame._data[col][key[0]] = value[:, i] - else: - if is_scalar(key[1]): - for col in columns_df._column_names: - self._frame[col].iloc[key[0]] = value - else: - for i, col in enumerate(columns_df._column_names): - self._frame[col].iloc[key[0]] = value[i] - - -class _DataFrameiAtIndexer(_DataFrameIlocIndexer): - pass - - -class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): - """ - A GPU Dataframe object. - - Parameters - ---------- - data : array-like, Iterable, dict, or DataFrame. - Dict can contain Series, arrays, constants, or list-like objects. - index : Index or array-like - Index to use for resulting frame. Will default to - RangeIndex if no indexing information part of input data and - no index provided. - columns : Index or array-like - Column labels to use for resulting frame. - Will default to RangeIndex (0, 1, 2, …, n) if no column - labels are provided. - dtype : dtype, default None - Data type to force. Only a single dtype is allowed. - If None, infer. - copy : bool or None, default None - Copy data from inputs. - Currently not implemented. - nan_as_null : bool, Default True - If ``None``/``True``, converts ``np.nan`` values to - ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - - Examples - -------- - Build dataframe with ``__setitem__``: - - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df - key val - 0 0 10.0 - 1 1 11.0 - 2 2 12.0 - 3 3 13.0 - 4 4 14.0 - - Build DataFrame via dict of columns: - - >>> import numpy as np - >>> from datetime import datetime, timedelta - >>> t0 = datetime.strptime('2018-10-07 12:00:00', '%Y-%m-%d %H:%M:%S') - >>> n = 5 - >>> df = cudf.DataFrame({ - ... 'id': np.arange(n), - ... 'datetimes': np.array( - ... [(t0+ timedelta(seconds=x)) for x in range(n)]) - ... 
}) - >>> df - id datetimes - 0 0 2018-10-07 12:00:00 - 1 1 2018-10-07 12:00:01 - 2 2 2018-10-07 12:00:02 - 3 3 2018-10-07 12:00:03 - 4 4 2018-10-07 12:00:04 - - Build DataFrame via list of rows as tuples: - - >>> df = cudf.DataFrame([ - ... (5, "cats", "jump", np.nan), - ... (2, "dogs", "dig", 7.5), - ... (3, "cows", "moo", -2.1, "occasionally"), - ... ]) - >>> df - 0 1 2 3 4 - 0 5 cats jump - 1 2 dogs dig 7.5 - 2 3 cows moo -2.1 occasionally - - Convert from a Pandas DataFrame: - - >>> import pandas as pd - >>> pdf = pd.DataFrame({'a': [0, 1, 2, 3],'b': [0.1, 0.2, None, 0.3]}) - >>> pdf - a b - 0 0 0.1 - 1 1 0.2 - 2 2 NaN - 3 3 0.3 - >>> df = cudf.from_pandas(pdf) - >>> df - a b - 0 0 0.1 - 1 1 0.2 - 2 2 - 3 3 0.3 - """ - - _PROTECTED_KEYS = frozenset( - ("_data", "_index", "_ipython_canary_method_should_not_exist_") - ) - _accessors: set[Any] = set() - _loc_indexer_type = _DataFrameLocIndexer - _iloc_indexer_type = _DataFrameIlocIndexer - _groupby = DataFrameGroupBy - _resampler = DataFrameResampler - - @_performance_tracking - def __init__( - self, - data=None, - index=None, - columns=None, - dtype=None, - copy=None, - nan_as_null=no_default, - ): - if copy is not None: - raise NotImplementedError("copy is not currently implemented.") - super().__init__({}, index=cudf.Index([])) - if nan_as_null is no_default: - nan_as_null = not cudf.get_option("mode.pandas_compatible") - - if isinstance(columns, (Series, cudf.BaseIndex)): - columns = columns.to_pandas() - - if isinstance(data, (DataFrame, pd.DataFrame)): - if isinstance(data, pd.DataFrame): - data = self.from_pandas(data, nan_as_null=nan_as_null) - - if index is not None: - if not data.index.equals(index): - data = data.reindex(index) - index = data.index - else: - index = ensure_index(index) - else: - index = data.index - - self._index = index - - if columns is not None: - self._data = data._data - self._reindex( - column_names=columns, index=index, deep=False, inplace=True - ) - if isinstance( - columns, (range, pd.RangeIndex, cudf.RangeIndex) - ): - self._data.rangeindex = True - else: - self._data = data._data - self._data.rangeindex = True - elif isinstance(data, (cudf.Series, pd.Series)): - if isinstance(data, pd.Series): - data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null) - - # Series.name is not None and Series.name in columns - # -> align - # Series.name is not None and Series.name not in columns - # -> return empty DataFrame - # Series.name is None and no columns - # -> return 1 column DataFrame - # Series.name is None and columns - # -> return 1 column DataFrame if len(columns) in {0, 1} - if data.name is None and columns is not None: - if len(columns) > 1: - raise ValueError( - "Length of columns must be less than 2 if " - f"{type(data).__name__}.name is None." 
- ) - name = columns[0] - else: - name = data.name or 0 - self._init_from_dict_like( - {name: data}, - index=index, - columns=columns, - nan_as_null=nan_as_null, - ) - elif data is None: - if index is None: - self._index = RangeIndex(0) - else: - self._index = ensure_index(index) - if columns is not None: - rangeindex = isinstance( - columns, (range, pd.RangeIndex, cudf.RangeIndex) - ) - label_dtype = getattr(columns, "dtype", None) - self._data = ColumnAccessor( - { - k: column.column_empty( - len(self), dtype="object", masked=True - ) - for k in columns - }, - level_names=tuple(columns.names) - if isinstance(columns, pd.Index) - else None, - rangeindex=rangeindex, - label_dtype=label_dtype, - verify=False, - ) - elif isinstance(data, ColumnAccessor): - raise TypeError( - "Use cudf.Series._from_data for constructing a Series from " - "ColumnAccessor" - ) - elif hasattr(data, "__cuda_array_interface__"): - arr_interface = data.__cuda_array_interface__ - # descr is an optional field of the _cuda_ary_iface_ - if "descr" in arr_interface: - if len(arr_interface["descr"]) == 1: - new_df = self._from_arrays( - data, index=index, columns=columns - ) - else: - new_df = self.from_records( - data, index=index, columns=columns - ) - else: - new_df = self._from_arrays(data, index=index, columns=columns) - - self._data = new_df._data - self._index = new_df._index - self._check_data_index_length_match() - elif hasattr(data, "__array_interface__"): - arr_interface = data.__array_interface__ - if len(arr_interface["descr"]) == 1: - # not record arrays - new_df = self._from_arrays(data, index=index, columns=columns) - else: - new_df = self.from_records(data, index=index, columns=columns) - self._data = new_df._data - self._index = new_df._index - self._check_data_index_length_match() - else: - if isinstance(data, Iterator): - data = list(data) - if is_list_like(data): - if len(data) > 0 and is_scalar(data[0]): - if columns is not None: - label_dtype = getattr(columns, "dtype", None) - data = dict(zip(columns, [data])) - rangeindex = isinstance( - columns, (range, pd.RangeIndex, cudf.RangeIndex) - ) - else: - data = dict(enumerate([data])) - rangeindex = True - label_dtype = None - new_df = DataFrame(data=data, index=index) - - self._data = new_df._data - self._index = new_df._index - self._data._level_names = ( - tuple(columns.names) - if isinstance(columns, pd.Index) - else self._data._level_names - ) - self._data.rangeindex = rangeindex - self._data.label_dtype = ( - cudf.dtype(label_dtype) - if label_dtype is not None - else None - ) - elif len(data) > 0 and isinstance(data[0], Series): - self._init_from_series_list( - data=data, columns=columns, index=index - ) - else: - self._init_from_list_like( - data, index=index, columns=columns - ) - self._check_data_index_length_match() - else: - if not is_dict_like(data): - raise TypeError("data must be list or dict-like") - - self._init_from_dict_like( - data, index=index, columns=columns, nan_as_null=nan_as_null - ) - self._check_data_index_length_match() - - if dtype: - self._data = self.astype(dtype)._data - - self._data.multiindex = self._data.multiindex or isinstance( - columns, pd.MultiIndex - ) - - @_performance_tracking - def _init_from_series_list(self, data, columns, index): - if index is None: - # When `index` is `None`, the final index of - # resulting dataframe will be union of - # all Series's names. 
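For readers skimming this removal, a minimal sketch (not part of the patch) of the behaviour the list-of-Series branch above implements, with illustrative values and assuming a working cudf installation:

```python
import cudf

# Two Series with overlapping but unequal indexes and explicit names.
s1 = cudf.Series([1, 2], index=["a", "b"], name="x")
s2 = cudf.Series([3, 4], index=["b", "c"], name="y")

# With index=None the row labels are taken from the Series names and the
# columns are the union of the Series indexes; cells with no source value
# come back as nulls.
df = cudf.DataFrame([s1, s2])
print(df)
```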
- final_index = cudf.Index(_get_union_of_series_names(data)) - else: - # When an `index` is passed, the final index of - # resulting dataframe will be whatever - # index passed, but will need - # shape validations - explained below - data_length = len(data) - index_length = len(index) - if data_length != index_length: - # If the passed `index` length doesn't match - # length of Series objects in `data`, we must - # check if `data` can be duplicated/expanded - # to match the length of index. For that we - # check if the length of index is a factor - # of length of data. - # - # 1. If yes, we extend data - # until length of data is equal to length of index. - # 2. If no, we throw an error stating the - # shape of resulting `data` and `index` - - # Simple example - # >>> import pandas as pd - # >>> s = pd.Series([1, 2, 3]) - # >>> pd.DataFrame([s], index=['a', 'b']) - # 0 1 2 - # a 1 2 3 - # b 1 2 3 - # >>> pd.DataFrame([s], index=['a', 'b', 'c']) - # 0 1 2 - # a 1 2 3 - # b 1 2 3 - # c 1 2 3 - if index_length % data_length == 0: - initial_data = data - data = [] - for _ in range(int(index_length / data_length)): - data.extend([o for o in initial_data]) - else: - raise ValueError( - f"Length of values ({data_length}) does " - f"not match length of index ({index_length})" - ) - - final_index = ensure_index(index) - - series_lengths = list(map(len, data)) - common_dtype = find_common_type([obj.dtype for obj in data]) - data = [obj.astype(common_dtype) for obj in data] - if series_lengths.count(series_lengths[0]) == len(series_lengths): - # Calculating the final dataframe columns by - # getting union of all `index` of the Series objects. - final_columns = _get_union_of_indices([d.index for d in data]) - if isinstance(final_columns, cudf.RangeIndex): - self._data.rangeindex = True - - for idx, series in enumerate(data): - if not series.index.is_unique: - raise ValueError( - "Reindexing only valid with uniquely valued Index " - "objects" - ) - if not series.index.equals(final_columns): - series = series.reindex(final_columns) - self._data[idx] = series._column - - # Setting `final_columns` to self._index so - # that the resulting `transpose` will be have - # columns set to `final_columns` - self._index = cudf.Index(final_columns) - - transpose = self.T - else: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - concat_df = cudf.concat(data, axis=1) - - cols = concat_df._data.to_pandas_index() - if cols.dtype == "object": - concat_df.columns = cols.astype("str") - - transpose = concat_df.T - - transpose._index = final_index - self._data = transpose._data - self._index = transpose._index - - # If `columns` is passed, the result dataframe - # contain a dataframe with only the - # specified `columns` in the same order. 
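A companion sketch (again not from the patch) of the `columns=` filtering described in the comment above; the label "z" is an illustrative name that appears in no input Series:

```python
import cudf

s1 = cudf.Series([1, 2], index=["a", "b"], name="x")
s2 = cudf.Series([3, 4], index=["a", "b"], name="y")

# Only the requested labels are kept, in the requested order; a label that
# no Series provides ("z") is materialised as an all-null column.
df = cudf.DataFrame([s1, s2], columns=["b", "z"])
print(df)
```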
- if columns is not None: - for col_name in columns: - if col_name not in self._data: - self._data[col_name] = column.column_empty( - row_count=len(self), dtype=None, masked=True - ) - self._data._level_names = ( - tuple(columns.names) - if isinstance(columns, pd.Index) - else self._data._level_names - ) - self._data = self._data.select_by_label(columns) - self._data.rangeindex = isinstance( - columns, (range, cudf.RangeIndex, pd.RangeIndex) - ) - self._data.label_dtype = pd.Index(columns).dtype - else: - self._data.rangeindex = True - - @_performance_tracking - def _init_from_list_like(self, data, index=None, columns=None): - if index is None: - index = RangeIndex(start=0, stop=len(data)) - else: - index = ensure_index(index) - - self._index = index - # list-of-dicts case - if len(data) > 0 and isinstance(data[0], dict): - data = DataFrame.from_pandas(pd.DataFrame(data)) - self._data = data._data - # interval in a list - elif len(data) > 0 and isinstance(data[0], pd.Interval): - data = DataFrame.from_pandas(pd.DataFrame(data)) - self._data = data._data - elif any( - not isinstance(col, (abc.Iterable, abc.Sequence)) for col in data - ): - raise TypeError("Inputs should be an iterable or sequence.") - elif len(data) > 0 and not can_convert_to_column(data[0]): - raise ValueError("Must pass 2-d input.") - else: - if ( - len(data) > 0 - and columns is None - and isinstance(data[0], tuple) - and hasattr(data[0], "_fields") - ): - # pandas behavior is to use the fields from the first - # namedtuple as the column names - columns = data[0]._fields - - data = list(itertools.zip_longest(*data)) - - if columns is not None and len(data) == 0: - data = [ - cudf.core.column.column_empty(row_count=0, dtype=None) - for _ in columns - ] - - for col_name, col in enumerate(data): - self._data[col_name] = column.as_column(col) - self._data.rangeindex = True - - if columns is not None: - if len(columns) != len(data): - raise ValueError( - f"Shape of passed values is ({len(index)}, {len(data)}), " - f"indices imply ({len(index)}, {len(columns)})." - ) - - self.columns = columns - self._data.rangeindex = isinstance( - columns, (range, pd.RangeIndex, cudf.RangeIndex) - ) - self._data.label_dtype = getattr(columns, "dtype", None) - - @_performance_tracking - def _init_from_dict_like( - self, data, index=None, columns=None, nan_as_null=None - ): - label_dtype = None - if columns is not None: - label_dtype = getattr(columns, "dtype", None) - # remove all entries in data that are not in columns, - # inserting new empty columns for entries in columns that - # are not in data - if any(c in data for c in columns): - # Let the downstream logic determine the length of the - # empty columns here - empty_column = lambda: None # noqa: E731 - else: - # If keys is empty, none of the data keys match the - # columns, so we need to create an empty DataFrame. To - # match pandas, the size of the dataframe must match - # the provided index, so we need to return a masked - # array of nulls if an index is given. 
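The comment above covers the dict path where none of the requested columns exist in `data`. A small sketch of that corner case (illustrative values, not from the patch):

```python
import cudf

# "a" is dropped because it is not in `columns`; "x" and "y" are created as
# all-null columns whose length matches the explicit index.
df = cudf.DataFrame({"a": [1, 2, 3]}, columns=["x", "y"], index=[10, 20, 30])
print(df)
```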
- empty_column = functools.partial( - cudf.core.column.column_empty, - row_count=(0 if index is None else len(index)), - masked=index is not None, - ) - - data = { - c: data[c] if c in data else empty_column() for c in columns - } - - data, index = self._align_input_series_indices(data, index=index) - - if index is None: - num_rows = 0 - if data: - keys, values, lengths = zip( - *( - (k, v, 1) - if is_scalar(v) - else ( - k, - vc := as_column(v, nan_as_null=nan_as_null), - len(vc), - ) - for k, v in data.items() - ) - ) - data = dict(zip(keys, values)) - try: - (num_rows,) = (set(lengths) - {1}) or {1} - except ValueError: - raise ValueError("All arrays must be the same length") - - self._index = RangeIndex(0, num_rows) - else: - self._index = ensure_index(index) - - if len(data): - self._data.multiindex = True - for i, col_name in enumerate(data): - self._data.multiindex = self._data.multiindex and isinstance( - col_name, tuple - ) - self._insert( - i, - col_name, - data[col_name], - nan_as_null=nan_as_null, - ) - self._data._level_names = ( - tuple(columns.names) - if isinstance(columns, pd.Index) - else self._data._level_names - ) - self._data.label_dtype = label_dtype - - @classmethod - def _from_data( - cls, - data: MutableMapping, - index: BaseIndex | None = None, - columns: Any = None, - ) -> DataFrame: - out = super()._from_data(data=data, index=index) - if columns is not None: - out.columns = columns - return out - - @staticmethod - @_performance_tracking - def _align_input_series_indices(data, index): - input_series = [ - Series(val) - for val in data.values() - if isinstance(val, (pd.Series, Series, dict)) - ] - - if input_series: - if index is not None: - aligned_input_series = [ - sr._align_to_index(index, how="right", sort=False) - for sr in input_series - ] - - else: - aligned_input_series = cudf.core.series._align_indices( - input_series - ) - index = aligned_input_series[0].index - - data = data.copy() - for name, val in data.items(): - if isinstance(val, (pd.Series, Series, dict)): - data[name] = aligned_input_series.pop(0) - - return data, index - - # The `constructor*` properties are used by `dask` (and `dask_cudf`) - @property - def _constructor(self): - return DataFrame - - @property - def _constructor_sliced(self): - return Series - - @property - def _constructor_expanddim(self): - raise NotImplementedError( - "_constructor_expanddim not supported for DataFrames!" - ) - - def serialize(self): - header, frames = super().serialize() - - header["index"], index_frames = self.index.serialize() - header["index_frame_count"] = len(index_frames) - # For backwards compatibility with older versions of cuDF, index - # columns are placed before data columns. - frames = index_frames + frames - - return header, frames - - @classmethod - def deserialize(cls, header, frames): - index_nframes = header["index_frame_count"] - obj = super().deserialize( - header, frames[header["index_frame_count"] :] - ) - - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) - obj.index = index - - return obj - - @property - @_performance_tracking - def shape(self): - """Returns a tuple representing the dimensionality of the DataFrame.""" - return self._num_rows, self._num_columns - - @property - def dtypes(self): - """ - Return the dtypes in this object. - - Returns - ------- - pandas.Series - The data type of each column. 
- - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> df = cudf.DataFrame({'float': [1.0], - ... 'int': [1], - ... 'datetime': [pd.Timestamp('20180310')], - ... 'string': ['foo']}) - >>> df - float int datetime string - 0 1.0 1 2018-03-10 foo - >>> df.dtypes - float float64 - int int64 - datetime datetime64[ns] - string object - dtype: object - """ - return pd.Series(dict(self._dtypes), dtype="object") - - @property - def ndim(self) -> int: - """Dimension of the data. DataFrame ndim is always 2.""" - return 2 - - def __dir__(self): - # Add the columns of the DataFrame to the dir output. - o = set(dir(type(self))) - o.update(self.__dict__) - o.update( - c - for c in self._column_names - if isinstance(c, str) and c.isidentifier() - ) - return list(o) - - def __setattr__(self, key, col): - try: - # Preexisting attributes may be set. We cannot rely on checking the - # `_PROTECTED_KEYS` because we must also allow for settable - # properties, and we must call object.__getattribute__ to bypass - # the `__getitem__` behavior inherited from `GetAttrGetItemMixin`. - object.__getattribute__(self, key) - except AttributeError: - if key not in self._PROTECTED_KEYS: - try: - # Check key existence. - self[key] - # If a column already exists, set it. - self[key] = col - return - except KeyError: - pass - - # Set a new attribute that is not already a column. - super().__setattr__(key, col) - - except RuntimeError as e: - # TODO: This allows setting properties that are marked as forbidden - # for internal usage. It is necessary because the __getattribute__ - # call in the try block will trigger the error. We should see if - # setting these variables can also always be disabled - if "External-only API" not in str(e): - raise - super().__setattr__(key, col) - else: - super().__setattr__(key, col) - - @_performance_tracking - def __getitem__(self, arg): - """ - If *arg* is a ``str`` or ``int`` type, return the column Series. - If *arg* is a ``slice``, return a new DataFrame with all columns - sliced to the specified range. - If *arg* is an ``array`` containing column names, return a new - DataFrame with the corresponding columns. - If *arg* is a ``dtype.bool array``, return the rows marked True - - Examples - -------- - >>> df = cudf.DataFrame({ - ... 'a': list(range(10)), - ... 'b': list(range(10)), - ... 'c': list(range(10)), - ... }) - - Get first 4 rows of all columns. - - >>> df[:4] - a b c - 0 0 0 0 - 1 1 1 1 - 2 2 2 2 - 3 3 3 3 - - Get last 5 rows of all columns. - - >>> df[-5:] - a b c - 5 5 5 5 - 6 6 6 6 - 7 7 7 7 - 8 8 8 8 - 9 9 9 9 - - Get columns a and c. - - >>> df[['a', 'c']] - a c - 0 0 0 - 1 1 1 - 2 2 2 - 3 3 3 - 4 4 4 - 5 5 5 - 6 6 6 - 7 7 7 - 8 8 8 - 9 9 9 - - Return the rows specified in the boolean mask. - - >>> df[[True, False, True, False, True, - ... 
False, True, False, True, False]] - a b c - 0 0 0 0 - 2 2 2 2 - 4 4 4 4 - 6 6 6 6 - 8 8 8 8 - """ - if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple): - out = self._get_columns_by_label(arg) - if is_scalar(arg): - nlevels = 1 - elif isinstance(arg, tuple): - nlevels = len(arg) - if self._data.multiindex is False or nlevels == self._data.nlevels: - out = self._constructor_sliced._from_data(out._data) - out.index = self.index - out.name = arg - return out - - elif isinstance(arg, slice): - return self._slice(arg) - - elif can_convert_to_column(arg): - mask = arg - if is_list_like(mask): - dtype = None - mask = pd.Series(mask, dtype=dtype) - if mask.dtype == "bool": - return self._apply_boolean_mask(BooleanMask(mask, len(self))) - else: - return self._get_columns_by_label(mask) - elif isinstance(arg, DataFrame): - return self.where(arg) - else: - raise TypeError( - f"__getitem__ on type {type(arg)} is not supported" - ) - - @_performance_tracking - def __setitem__(self, arg, value): - """Add/set column by *arg or DataFrame*""" - if isinstance(arg, DataFrame): - # not handling set_item where arg = df & value = df - if isinstance(value, DataFrame): - raise TypeError( - f"__setitem__ with arg = {type(value)} and " - f"value = {type(arg)} is not supported" - ) - else: - for col_name in self._data: - scatter_map = arg._data[col_name] - if is_scalar(value): - self._data[col_name][scatter_map] = value - else: - self._data[col_name][scatter_map] = column.as_column( - value - )[scatter_map] - elif is_scalar(arg) or isinstance(arg, tuple): - if isinstance(value, DataFrame): - _setitem_with_dataframe( - input_df=self, - replace_df=value, - input_cols=[arg], - mask=None, - ) - else: - if arg in self._data: - if not is_scalar(value) and len(self) == 0: - value = column.as_column(value) - length = len(value) - new_columns = ( - value - if key == arg - else column.column_empty_like( - col, masked=True, newsize=length - ) - for key, col in self._column_labels_and_values - ) - self._data = self._data._from_columns_like_self( - new_columns, verify=False - ) - if isinstance(value, (pd.Series, Series)): - self._index = cudf.Index(value.index) - elif len(value) > 0: - self._index = RangeIndex(length) - return - elif isinstance(value, (pd.Series, Series)): - value = Series(value)._align_to_index( - self.index, - how="right", - sort=False, - allow_non_unique=True, - ) - if is_scalar(value): - self._data[arg] = as_column(value, length=len(self)) - else: - value = as_column(value) - self._data[arg] = value - else: - # disc. 
with pandas here - # pandas raises key error here - self.insert(self._num_columns, arg, value) - - elif can_convert_to_column(arg): - mask = arg - if is_list_like(mask): - mask = np.array(mask) - - if mask.dtype == "bool": - mask = column.as_column(arg) - - if isinstance(value, DataFrame): - _setitem_with_dataframe( - input_df=self, - replace_df=value, - input_cols=None, - mask=mask, - ) - else: - if not is_scalar(value): - value = column.as_column(value)[mask] - for col_name in self._data: - self._data[col_name][mask] = value - else: - if isinstance(value, (cupy.ndarray, np.ndarray)): - _setitem_with_dataframe( - input_df=self, - replace_df=cudf.DataFrame(value), - input_cols=arg, - mask=None, - ignore_index=True, - ) - elif isinstance(value, DataFrame): - _setitem_with_dataframe( - input_df=self, - replace_df=value, - input_cols=arg, - mask=None, - ) - else: - for col in arg: - if is_scalar(value): - self._data[col] = as_column( - value, length=len(self) - ) - else: - self._data[col] = column.as_column(value) - - else: - raise TypeError( - f"__setitem__ on type {type(arg)} is not supported" - ) - - def __delitem__(self, name): - self._drop_column(name) - - @_performance_tracking - def memory_usage(self, index=True, deep=False) -> cudf.Series: - mem_usage = [col.memory_usage for col in self._columns] - names = [str(name) for name in self._column_names] - if index: - mem_usage.append(self.index.memory_usage()) - names.append("Index") - return Series._from_column( - as_column(mem_usage), - index=cudf.Index(names), - ) - - @_performance_tracking - def __array_function__(self, func, types, args, kwargs): - if "out" in kwargs or not all( - issubclass(t, (Series, DataFrame)) for t in types - ): - return NotImplemented - - try: - if func.__name__ in {"any", "all"}: - # NumPy default for `axis` is - # different from `cudf`/`pandas` - # hence need this special handling. - kwargs.setdefault("axis", None) - if cudf_func := getattr(self.__class__, func.__name__, None): - out = cudf_func(*args, **kwargs) - # The dot product of two DataFrames returns an array in pandas. - if ( - func is np.dot - and isinstance(args[0], (DataFrame, pd.DataFrame)) - and isinstance(args[1], (DataFrame, pd.DataFrame)) - ): - return out.values - return out - except Exception: - # The rare instance where a "silent" failure is preferable. Except - # in the (highly unlikely) case that some other library - # interoperates with cudf objects, the result will be that numpy - # raises a TypeError indicating that the operation is not - # implemented, which is much friendlier than an arbitrary internal - # cudf error. - pass - return NotImplemented - - def __arrow_c_stream__(self, requested_schema=None): - """ - Export the cudf DataFrame as an Arrow C stream PyCapsule. - - Parameters - ---------- - requested_schema : PyCapsule, default None - The schema to which the dataframe should be casted, passed as a - PyCapsule containing a C ArrowSchema representation of the - requested schema. Currently not implemented. - - Returns - ------- - PyCapsule - """ - if requested_schema is not None: - raise NotImplementedError("requested_schema is not supported") - return self.to_arrow().__arrow_c_stream__() - - # The _get_numeric_data method is necessary for dask compatibility. 
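For context on the `__arrow_c_stream__` method being removed above, a hedged sketch of how a consumer would use it. It assumes a recent pyarrow (roughly 14+) that understands the Arrow PyCapsule protocol; if that assumption does not hold, `df.to_arrow()` is the explicit fallback.

```python
import cudf
import pyarrow as pa

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# pa.table() looks for __arrow_c_stream__ on unfamiliar objects and pulls
# the data through the exported capsule, so no manual conversion is needed.
tbl = pa.table(df)
print(tbl.schema)
```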
- @_performance_tracking - def _get_numeric_data(self): - """Return a dataframe with only numeric data types""" - columns = [ - c - for c, dt in self.dtypes.items() - if dt != object and not isinstance(dt, cudf.CategoricalDtype) - ] - return self[columns] - - @_performance_tracking - def assign(self, **kwargs: Callable[[Self], Any] | Any): - """ - Assign columns to DataFrame from keyword arguments. - - Parameters - ---------- - **kwargs: dict mapping string column names to values - The value for each key can either be a literal column (or - something that can be converted to a column), or - a callable of one argument that will be given the - dataframe as an argument and should return the new column - (without modifying the input argument). - Columns are added in-order, so callables can refer to - column names constructed in the assignment. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df = df.assign(a=[0, 1, 2], b=[3, 4, 5]) - >>> df - a b - 0 0 3 - 1 1 4 - 2 2 5 - """ - new_df = self.copy(deep=False) - for k, v in kwargs.items(): - new_df[k] = v(new_df) if callable(v) else v - return new_df - - @classmethod - @_performance_tracking - def _concat( - cls, objs, axis=0, join="outer", ignore_index=False, sort=False - ): - # flag to indicate at least one empty input frame also has an index - empty_has_index = False - # length of output frame's RangeIndex if all input frames are empty, - # and at least one has an index - result_index_length = 0 - # the number of empty input frames - num_empty_input_frames = 0 - - # flag to indicate if all DataFrame's have - # RangeIndex as their index - are_all_range_index = False - - for i, obj in enumerate(objs): - # shallow-copy the input DFs in case the same DF instance - # is concatenated with itself - objs[i] = obj.copy(deep=False) - - # If ignore_index is true, determine if - # all or some objs are empty(and have index). - # 1. If all objects are empty(and have index), we - # should set the index separately using RangeIndex. - # 2. If some objects are empty(and have index), we - # create empty columns later while populating `columns` - # variable. Detailed explanation of second case before - # allocation of `columns` variable below. - if ignore_index and obj.empty: - num_empty_input_frames += 1 - result_index_length += len(obj) - empty_has_index = empty_has_index or len(obj) > 0 - - are_all_range_index = ( - True if i == 0 else are_all_range_index - ) and isinstance(obj.index, cudf.RangeIndex) - - if join == "inner": - sets_of_column_names = [set(obj._column_names) for obj in objs] - - intersecting_columns = functools.reduce( - set.intersection, sets_of_column_names - ) - union_of_columns = functools.reduce( - set.union, sets_of_column_names - ) - non_intersecting_columns = union_of_columns.symmetric_difference( - intersecting_columns - ) - - # Get an ordered list of the intersecting columns to preserve input - # order, which is promised by pandas for inner joins. - ordered_intersecting_columns = [ - name - for obj in objs - for name in obj._column_names - if name in intersecting_columns - ] - - names = dict.fromkeys(ordered_intersecting_columns).keys() - - if axis == 0: - if ignore_index and ( - num_empty_input_frames > 0 - or len(intersecting_columns) == 0 - ): - # When ignore_index is True and if there is - # at least 1 empty dataframe and no - # intersecting columns are present, an empty dataframe - # needs to be returned just with an Index. 
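The inner-join bookkeeping above is what backs `cudf.concat(..., join="inner")`. A short sketch (not from the patch) of the promised column ordering:

```python
import cudf

left = cudf.DataFrame({"a": [1], "b": [2], "c": [3]})
right = cudf.DataFrame({"c": [4], "a": [5]})

# Only the intersecting columns survive, and they keep the order in which
# they first appear across the inputs: "a" before "c", while "b" is dropped.
out = cudf.concat([left, right], join="inner")
print(out)
```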
- empty_has_index = True - num_empty_input_frames = len(objs) - result_index_length = sum(len(obj) for obj in objs) - - # remove columns not present in all objs - for obj in objs: - obj.drop( - columns=non_intersecting_columns, - inplace=True, - errors="ignore", - ) - elif join == "outer": - # Get a list of the unique table column names - names = [name for f in objs for name in f._column_names] - names = dict.fromkeys(names).keys() - - else: - raise ValueError( - "Only can inner (intersect) or outer (union) when joining" - "the other axis" - ) - - if sort: - try: - # Sorted always returns a list, but will fail to sort if names - # include different types that are not comparable. - names = sorted(names) - except TypeError: - # For pandas compatibility, we also try to handle the case - # where some column names are strings and others are ints. Just - # assume that everything that isn't a str is numerical, we - # can't sort anything else. - try: - str_names = sorted(n for n in names if isinstance(n, str)) - non_str_names = sorted( - n for n in names if not isinstance(n, str) - ) - names = non_str_names + str_names - except TypeError: - names = list(names) - else: - names = list(names) - - # Combine the index and table columns for each Frame into a list of - # [...index_cols, ...table_cols]. - # - # If any of the input frames have a non-empty index, include these - # columns in the list of columns to concatenate, even if the input - # frames are empty and `ignore_index=True`. - columns = [ - ( - [] - if are_all_range_index - or (ignore_index and not empty_has_index) - else list(f.index._columns) - ) - + [f._data[name] if name in f._data else None for name in names] - for f in objs - ] - - # Get a list of the combined index and table column indices - indices = list(range(functools.reduce(max, map(len, columns)))) - # The position of the first table column in each - # combined index + table columns list - first_data_column_position = len(indices) - len(names) - - # Get the non-null columns and their dtypes - non_null_cols, dtypes = _get_non_null_cols_and_dtypes(indices, columns) - - # Infer common dtypes between numeric columns - # and combine CategoricalColumn categories - categories = _find_common_dtypes_and_categories(non_null_cols, dtypes) - - # Cast all columns to a common dtype, assign combined categories, - # and back-fill missing columns with all-null columns - _cast_cols_to_common_dtypes(indices, columns, dtypes, categories) - - # Construct input tables with the index and data columns in the same - # order. This strips the given index/column names and replaces the - # names with their integer positions in the `cols` list - tables = [] - for cols in columns: - table_index = None - if 1 == first_data_column_position: - table_index = cudf.Index._from_column(cols[0]) - elif first_data_column_position > 1: - table_index = cudf.MultiIndex._from_data( - data=dict( - zip( - indices[:first_data_column_position], - cols[:first_data_column_position], - ) - ) - ) - tables.append( - DataFrame._from_data( - data=dict( - zip( - indices[first_data_column_position:], - cols[first_data_column_position:], - ) - ), - index=table_index, - ) - ) - - # Concatenate the Tables - out = cls._from_data( - *libcudf.concat.concat_tables( - tables, ignore_index=ignore_index or are_all_range_index - ) - ) - - # If ignore_index is True, all input frames are empty, and at - # least one input frame has an index, assign a new RangeIndex - # to the result frame. 
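A sketch (not from the patch) of the all-empty, `ignore_index=True` corner case the comment above describes; the input frames carry an index but no columns:

```python
import cudf

a = cudf.DataFrame(index=[0, 1, 2])  # no columns, but a real index
b = cudf.DataFrame(index=[3, 4])

# With ignore_index=True the result is still column-less, and its index is
# replaced by a fresh RangeIndex spanning the combined length (5 here).
out = cudf.concat([a, b], ignore_index=True)
print(len(out), type(out.index).__name__)
```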
- if empty_has_index and num_empty_input_frames == len(objs): - out.index = cudf.RangeIndex(result_index_length) - elif are_all_range_index and not ignore_index: - out.index = cudf.core.index.Index._concat([o.index for o in objs]) - - # Reassign the categories for any categorical table cols - _reassign_categories( - categories, out._data, indices[first_data_column_position:] - ) - - # Reassign the categories for any categorical index cols - if not isinstance(out.index, cudf.RangeIndex): - _reassign_categories( - categories, - out.index._data, - indices[:first_data_column_position], - ) - if not isinstance(out.index, MultiIndex) and isinstance( - out.index.dtype, cudf.CategoricalDtype - ): - out = out.set_index(out.index) - for name, col in out._column_labels_and_values: - out._data[name] = col._with_type_metadata( - tables[0]._data[name].dtype - ) - - # Reassign index and column names - if objs[0]._data.multiindex: - out._set_columns_like(objs[0]._data) - else: - out.columns = names - if not ignore_index: - out.index.name = objs[0].index.name - out.index.names = objs[0].index.names - - return out - - def astype( - self, - dtype, - copy: bool = False, - errors: Literal["raise", "ignore"] = "raise", - ): - if is_dict_like(dtype): - if len(set(dtype.keys()) - set(self._column_names)) > 0: - raise KeyError( - "Only a column name can be used for the " - "key in a dtype mappings argument." - ) - else: - dtype = {cc: dtype for cc in self._column_names} - return super().astype(dtype, copy, errors) - - def _clean_renderable_dataframe(self, output): - """ - This method takes in partial/preprocessed dataframe - and returns correct representation of it with correct - dimensions (rows x columns) - """ - - max_rows = pd.get_option("display.max_rows") - min_rows = pd.get_option("display.min_rows") - max_cols = pd.get_option("display.max_columns") - max_colwidth = pd.get_option("display.max_colwidth") - show_dimensions = pd.get_option("display.show_dimensions") - if pd.get_option("display.expand_frame_repr"): - width, _ = console.get_console_size() - else: - width = None - - output = output.to_pandas().to_string( - max_rows=max_rows, - min_rows=min_rows, - max_cols=max_cols, - line_width=width, - max_colwidth=max_colwidth, - show_dimensions=show_dimensions, - ) - - lines = output.split("\n") - - if lines[-1].startswith("["): - lines = lines[:-1] - lines.append( - "[%d rows x %d columns]" % (len(self), self._num_columns) - ) - return "\n".join(lines) - - def _clean_nulls_from_dataframe(self, df): - """ - This function converts all ``null`` values to ```` for - representation as a string in `__repr__`. - - Since we utilize Pandas `__repr__` at all places in our code - for formatting purposes, we convert columns to `str` dtype for - filling with `` values. - """ - for col in df._data: - if isinstance( - df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype) - ): - # TODO we need to handle this - pass - elif df._data[col].has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance( - df._data[col], - ( - cudf.core.column.DatetimeColumn, - cudf.core.column.TimeDeltaColumn, - ), - ) - else str(cudf.NA) - ) - - df[col] = df._data[col].astype("str").fillna(fill_value) - else: - df[col] = df._data[col] - - return df - - def _get_renderable_dataframe(self): - """ - Takes rows and columns from pandas settings or estimation from size. - pulls quadrants based off of some known parameters then style for - multiindex as well producing an efficient representative string - for printing with the dataframe. 
- """ - max_rows = pd.options.display.max_rows - nrows = np.max([len(self) if max_rows is None else max_rows, 1]) - if pd.options.display.max_rows == 0: - nrows = len(self) - ncols = ( - pd.options.display.max_columns - if pd.options.display.max_columns - else pd.options.display.width / 2 - ) - - if len(self) <= nrows and self._num_columns <= ncols: - output = self.copy(deep=False) - elif self.empty and len(self.index) > 0: - max_seq_items = pd.options.display.max_seq_items - # In case of Empty DataFrame with index, Pandas prints - # first `pd.options.display.max_seq_items` index values - # followed by ... To obtain ... at the end of index list, - # adding 1 extra value. - # If `pd.options.display.max_seq_items` is None, - # entire sequence/Index is to be printed. - # Note : Pandas truncates the dimensions at the end of - # the resulting dataframe when `display.show_dimensions` - # is set to truncate. Hence to display the dimensions we - # need to extract maximum of `max_seq_items` and `nrows` - # and have 1 extra value for ... to show up in the output - # string. - if max_seq_items is not None: - output = self.head(max(max_seq_items, nrows) + 1) - else: - output = self.copy(deep=False) - else: - left_cols = self._num_columns - right_cols = 0 - upper_rows = len(self) - lower_rows = 0 - if len(self) > nrows and nrows > 0: - upper_rows = int(nrows / 2.0) + 1 - lower_rows = upper_rows + (nrows % 2) - if left_cols > ncols: - right_cols = left_cols - int(ncols / 2.0) - # adjust right columns for output if multiindex. - right_cols = ( - right_cols - 1 - if isinstance(self.index, MultiIndex) - else right_cols - ) - left_cols = int(ncols / 2.0) + 1 - if right_cols > 0: - # Pick ncols - left_cols number of columns - # from the right side/from the end. - right_cols = -(int(ncols) - left_cols + 1) - else: - # If right_cols is 0 or negative, it means - # self has lesser number of columns than ncols. - # Hence assign self._num_columns which - # will result in empty `*_right` quadrants. - # This is because `*_left` quadrants will - # contain all columns. - right_cols = self._num_columns - - upper_left = self.head(upper_rows).iloc[:, :left_cols] - upper_right = self.head(upper_rows).iloc[:, right_cols:] - lower_left = self.tail(lower_rows).iloc[:, :left_cols] - lower_right = self.tail(lower_rows).iloc[:, right_cols:] - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - upper = cudf.concat([upper_left, upper_right], axis=1) - lower = cudf.concat([lower_left, lower_right], axis=1) - output = cudf.concat([upper, lower]) - - output = self._clean_nulls_from_dataframe(output) - output.index = output.index._clean_nulls_from_index() - - return output - - @_performance_tracking - def __repr__(self): - output = self._get_renderable_dataframe() - return self._clean_renderable_dataframe(output) - - @_performance_tracking - def _repr_html_(self): - lines = ( - self._get_renderable_dataframe() - .to_pandas() - ._repr_html_() - .split("\n") - ) - if lines[-2].startswith("
<p>"): - lines = lines[:-2] - lines.append( - "<p>%d rows × %d columns</p>
" % (len(self), self._num_columns) - ) - lines.append("") - return "\n".join(lines) - - @_performance_tracking - def _repr_latex_(self): - return self._get_renderable_dataframe().to_pandas()._repr_latex_() - - def _make_operands_and_index_for_binop( - self, - other: Any, - fn: str, - fill_value: Any = None, - reflect: bool = False, - can_reindex: bool = False, - ) -> tuple[ - dict[str | None, tuple[ColumnBase, Any, bool, Any]] - | NotImplementedType, - BaseIndex | None, - bool, - ]: - lhs, rhs = self._data, other - index = self.index - fill_requires_key = False - left_default: Any = False - equal_columns = False - can_use_self_column_name = True - - if _is_scalar_or_zero_d_array(other): - rhs = {name: other for name in self._data} - equal_columns = True - elif isinstance(other, Series): - if ( - not (self_pd_columns := self._data.to_pandas_index()).equals( - other_pd_index := other.index.to_pandas() - ) - and not can_reindex - and fn in cudf.utils.utils._EQUALITY_OPS - ): - raise ValueError( - "Can only compare DataFrame & Series objects " - "whose columns & index are same respectively, " - "please reindex." - ) - rhs = dict(zip(other_pd_index, other.values_host)) - # For keys in right but not left, perform binops between NaN (not - # NULL!) and the right value (result is NaN). - left_default = as_column(np.nan, length=len(self)) - equal_columns = other_pd_index.equals(self_pd_columns) - can_use_self_column_name = ( - equal_columns or other_pd_index.names == self_pd_columns.names - ) - elif isinstance(other, DataFrame): - if ( - not can_reindex - and fn in cudf.utils.utils._EQUALITY_OPS - and ( - not self.index.equals(other.index) - or not self._data.to_pandas_index().equals( - other._data.to_pandas_index() - ) - ) - ): - raise ValueError( - "Can only compare identically-labeled DataFrame objects" - ) - new_lhs, new_rhs = _align_indices(self, other) - index = new_lhs.index - lhs, rhs = new_lhs._data, new_rhs._data - fill_requires_key = True - # For DataFrame-DataFrame ops, always default to operating against - # the fill value. - left_default = fill_value - equal_columns = self._column_names == other._column_names - can_use_self_column_name = ( - equal_columns - or self._data._level_names == other._data._level_names - ) - elif isinstance(other, (dict, abc.Mapping)): - # Need to fail early on host mapping types because we ultimately - # convert everything to a dict. 
- return NotImplemented, None, True - - if not isinstance(rhs, (dict, abc.Mapping)): - return NotImplemented, None, True - - operands = { - k: ( - v, - rhs.get(k, fill_value), - reflect, - fill_value if (not fill_requires_key or k in rhs) else None, - ) - for k, v in lhs.items() - } - - if left_default is not False: - for k, v in rhs.items(): - if k not in lhs: - operands[k] = (left_default, v, reflect, None) - - if not equal_columns: - if isinstance(other, DataFrame): - column_names_list = self._data.to_pandas_index().join( - other._data.to_pandas_index(), how="outer" - ) - elif isinstance(other, Series): - column_names_list = self._data.to_pandas_index().join( - other.index.to_pandas(), how="outer" - ) - else: - raise ValueError("other must be a DataFrame or Series.") - - sorted_dict = {key: operands[key] for key in column_names_list} - return sorted_dict, index, can_use_self_column_name - return operands, index, can_use_self_column_name - - @classmethod - @_performance_tracking - def from_dict( - cls, - data: dict, - orient: str = "columns", - dtype: Dtype | None = None, - columns: list | None = None, - ) -> DataFrame: - """ - Construct DataFrame from dict of array-like or dicts. - Creates DataFrame object from dictionary by columns or by index - allowing dtype specification. - - Parameters - ---------- - data : dict - Of the form {field : array-like} or {field : dict}. - orient : {'columns', 'index', 'tight'}, default 'columns' - The "orientation" of the data. If the keys of the passed dict - should be the columns of the resulting DataFrame, pass 'columns' - (default). Otherwise if the keys should be rows, pass 'index'. - If 'tight', assume a dict with keys ['index', 'columns', 'data', - 'index_names', 'column_names']. - dtype : dtype, default None - Data type to force, otherwise infer. - columns : list, default None - Column labels to use when ``orient='index'``. Raises a ``ValueError`` - if used with ``orient='columns'`` or ``orient='tight'``. - - Returns - ------- - DataFrame - - See Also - -------- - DataFrame.from_records : DataFrame from structured ndarray, sequence - of tuples or dicts, or DataFrame. - DataFrame : DataFrame object creation using constructor. - DataFrame.to_dict : Convert the DataFrame to a dictionary. - - Examples - -------- - By default the keys of the dict become the DataFrame columns: - - >>> import cudf - >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} - >>> cudf.DataFrame.from_dict(data) - col_1 col_2 - 0 3 a - 1 2 b - 2 1 c - 3 0 d - - Specify ``orient='index'`` to create the DataFrame using dictionary - keys as rows: - - >>> data = {'row_1': [3, 2, 1, 0], 'row_2': [10, 11, 12, 13]} - >>> cudf.DataFrame.from_dict(data, orient='index') - 0 1 2 3 - row_1 3 2 1 0 - row_2 10 11 12 13 - - When using the 'index' orientation, the column names can be - specified manually: - - >>> cudf.DataFrame.from_dict(data, orient='index', - ... columns=['A', 'B', 'C', 'D']) - A B C D - row_1 3 2 1 0 - row_2 10 11 12 13 - - Specify ``orient='tight'`` to create the DataFrame using a 'tight' - format: - - >>> data = {'index': [('a', 'b'), ('a', 'c')], - ... 'columns': [('x', 1), ('y', 2)], - ... 'data': [[1, 3], [2, 4]], - ... 'index_names': ['n1', 'n2'], - ... 
'column_names': ['z1', 'z2']} - >>> cudf.DataFrame.from_dict(data, orient='tight') - z1 x y - z2 1 2 - n1 n2 - a b 1 3 - c 2 4 - """ # noqa: E501 - - orient = orient.lower() - if orient == "index": - if isinstance( - next(iter(data.values()), None), (cudf.Series, cupy.ndarray) - ): - result = cls(data).T - result.columns = ( - columns - if columns is not None - else range(len(result._data)) - ) - if dtype is not None: - result = result.astype(dtype) - return result - else: - return cls.from_pandas( - pd.DataFrame.from_dict( - data=data, - orient=orient, - dtype=dtype, - columns=columns, - ) - ) - elif orient == "columns": - if columns is not None: - raise ValueError( - "Cannot use columns parameter with orient='columns'" - ) - return cls(data, columns=None, dtype=dtype) - elif orient == "tight": - if columns is not None: - raise ValueError( - "Cannot use columns parameter with orient='right'" - ) - - index = _from_dict_create_index( - data["index"], data["index_names"], cudf - ) - columns = _from_dict_create_index( - data["columns"], data["column_names"], pd - ) - return cls(data["data"], index=index, columns=columns, dtype=dtype) - else: - raise ValueError( - "Expected 'index', 'columns' or 'tight' for orient " - f"parameter. Got '{orient}' instead" - ) - - @_performance_tracking - def to_dict( - self, - orient: str = "dict", - into: type[dict] = dict, - index: bool = True, - ) -> dict | list[dict]: - """ - Convert the DataFrame to a dictionary. - - The type of the key-value pairs can be customized with the parameters - (see below). - - Parameters - ---------- - orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'} - Determines the type of the values of the dictionary. - - - 'dict' (default) : dict like {column -> {index -> value}} - - 'list' : dict like {column -> [values]} - - 'series' : dict like {column -> Series(values)} - - 'split' : dict like - {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} - - 'tight' : dict like - {'index' -> [index], 'columns' -> [columns], 'data' -> [values], - 'index_names' -> [index.names], 'column_names' -> [column.names]} - - 'records' : list like - [{column -> value}, ... , {column -> value}] - - 'index' : dict like {index -> {column -> value}} - - Abbreviations are allowed. `s` indicates `series` and `sp` - indicates `split`. - - into : class, default dict - The collections.abc.Mapping subclass used for all Mappings - in the return value. Can be the actual class or an empty - instance of the mapping type you want. If you want a - collections.defaultdict, you must pass it initialized. - - index : bool, default True - Whether to include the index item (and index_names item if `orient` - is 'tight') in the returned dictionary. Can only be ``False`` - when `orient` is 'split' or 'tight'. Note that when `orient` is - 'records', this parameter does not take effect (index item always - not included). - - Returns - ------- - dict, list or collections.abc.Mapping - Return a collections.abc.Mapping object representing the DataFrame. - The resulting transformation depends on the `orient` parameter. - - See Also - -------- - DataFrame.from_dict: Create a DataFrame from a dictionary. - DataFrame.to_json: Convert a DataFrame to JSON format. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'col1': [1, 2], - ... 'col2': [0.5, 0.75]}, - ... 
index=['row1', 'row2']) - >>> df - col1 col2 - row1 1 0.50 - row2 2 0.75 - >>> df.to_dict() - {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} - - You can specify the return orientation. - - >>> df.to_dict('series') - {'col1': row1 1 - row2 2 - Name: col1, dtype: int64, - 'col2': row1 0.50 - row2 0.75 - Name: col2, dtype: float64} - - >>> df.to_dict('split') - {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], - 'data': [[1, 0.5], [2, 0.75]]} - - >>> df.to_dict('records') - [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] - - >>> df.to_dict('index') - {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} - - >>> df.to_dict('tight') - {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], - 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} - - You can also specify the mapping type. - - >>> from collections import OrderedDict, defaultdict - >>> df.to_dict(into=OrderedDict) # doctest: +SKIP - OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), - ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) - - If you want a `defaultdict`, you need to initialize it: - - >>> dd = defaultdict(list) - >>> df.to_dict('records', into=dd) - [defaultdict(, {'col1': 1, 'col2': 0.5}), - defaultdict(, {'col1': 2, 'col2': 0.75})] - """ # noqa: E501 - orient = orient.lower() - - if orient == "series": - # Special case needed to avoid converting - # cudf.Series objects into pd.Series - if not inspect.isclass(into): - cons = type(into) # type: ignore[assignment] - if isinstance(into, defaultdict): - cons = functools.partial(cons, into.default_factory) - elif issubclass(into, abc.Mapping): - cons = into # type: ignore[assignment] - if issubclass(into, defaultdict): - raise TypeError( - "to_dict() only accepts initialized defaultdicts" - ) - else: - raise TypeError(f"unsupported type: {into}") - return cons(self.items()) # type: ignore[misc] - - return self.to_pandas().to_dict(orient=orient, into=into, index=index) - - @_performance_tracking - def scatter_by_map( - self, map_index, map_size=None, keep_index=True, debug: bool = False - ): - """Scatter to a list of dataframes. - - Uses map_index to determine the destination - of each row of the original DataFrame. - - Parameters - ---------- - map_index : Series, str or list-like - Scatter assignment for each row - map_size : int - Length of output list. Must be >= uniques in map_index - keep_index : bool - Conserve original index values for each row - - Returns - ------- - A list of cudf.DataFrame objects. - - Raises - ------ - ValueError - If the map_index has invalid entries (not all in [0, - num_partitions)). - """ - # map_index might be a column name or array, - # make it a Column - if isinstance(map_index, str): - map_index = self._data[map_index] - elif isinstance(map_index, cudf.Series): - map_index = map_index._column - else: - map_index = as_column(map_index) - - # Convert float to integer - if map_index.dtype.kind == "f": - map_index = map_index.astype(np.int32) - - # Convert string or categorical to integer - if isinstance(map_index, cudf.core.column.StringColumn): - cat_index = cast( - cudf.core.column.CategoricalColumn, - map_index.astype("category"), - ) - map_index = cat_index.codes - warnings.warn( - "Using StringColumn for map_index in scatter_by_map. " - "Use an integer array/column for better performance." 
- ) - elif isinstance(map_index, cudf.core.column.CategoricalColumn): - map_index = map_index.codes - warnings.warn( - "Using CategoricalColumn for map_index in scatter_by_map. " - "Use an integer array/column for better performance." - ) - - if debug and map_size is not None: - count = map_index.distinct_count() - if map_size < count: - raise ValueError( - f"ERROR: map_size must be >= {count} (got {map_size})." - ) - - partitioned_columns, output_offsets = libcudf.partitioning.partition( - [*(self.index._columns if keep_index else ()), *self._columns], - map_index, - map_size, - ) - partitioned = self._from_columns_like_self( - partitioned_columns, - column_names=self._column_names, - index_names=list(self._index_names) if keep_index else None, - ) - - # due to the split limitation mentioned - # here: https://github.com/rapidsai/cudf/issues/4607 - # we need to remove first & last elements in offsets. - # TODO: Remove this after the above issue is fixed. - output_offsets = output_offsets[1:-1] - - result = partitioned._split(output_offsets, keep_index=keep_index) - - if map_size: - result += [ - self._empty_like(keep_index) - for _ in range(map_size - len(result)) - ] - - return result - - @_performance_tracking - def update( - self, - other, - join="left", - overwrite=True, - filter_func=None, - errors="ignore", - ): - """ - Modify a DataFrame in place using non-NA values from another DataFrame. - - Aligns on indices. There is no return value. - - Parameters - ---------- - other : DataFrame, or object coercible into a DataFrame - Should have at least one matching index/column label with the - original DataFrame. If a Series is passed, its name attribute must - be set, and that will be used as the column name to align with the - original DataFrame. - - join : {'left'}, default 'left' - Only left join is implemented, keeping the index and - columns of the original object. - - overwrite : {True, False}, default True - How to handle non-NA values for overlapping keys: - True: overwrite original DataFrame's values with values from other. - False: only update values that are NA in the original DataFrame. - - filter_func : None - filter_func is not supported yet - Return True for values that should be updated.S - - errors : {'raise', 'ignore'}, default 'ignore' - If 'raise', will raise a ValueError if the DataFrame and other - both contain non-NA data in the same place. - - - Returns - ------- - None : method directly changes calling object - - Raises - ------ - ValueError - - When ``errors`` = 'raise' and there's overlapping non-NA data. 
- - When ``errors`` is not either 'ignore' or 'raise' - - NotImplementedError - - If ``join`` != 'left' - """ - # TODO: Support other joins - if join != "left": - raise NotImplementedError("Only left join is supported") - if errors not in {"ignore", "raise"}: - raise ValueError( - "The parameter errors must be either 'ignore' or 'raise'" - ) - if filter_func is not None: - raise NotImplementedError("filter_func is not supported yet") - - if not isinstance(other, DataFrame): - other = DataFrame(other) - - self_cols = self._data.to_pandas_index() - if not self_cols.equals(other._data.to_pandas_index()): - other = other.reindex(self_cols, axis=1) - if not self.index.equals(other.index): - other = other.reindex(self.index, axis=0) - - source_df = self.copy(deep=False) - for col in source_df._column_names: - this = source_df[col] - that = other[col] - - if errors == "raise": - mask_this = that.notna() - mask_that = this.notna() - if (mask_this & mask_that).any(): - raise ValueError("Data overlaps.") - - if overwrite: - mask = that.isna() - else: - mask = this.notna() - - # don't overwrite columns unnecessarily - if mask.all(): - continue - source_df[col] = source_df[col].where(mask, that) - - self._mimic_inplace(source_df, inplace=True) - - @_performance_tracking - def __iter__(self): - return iter(self._column_names) - - @_performance_tracking - def __contains__(self, item): - # This must check against containment in the pandas Index and not - # self._column_names to handle NA, None, nan, etc. correctly. - return item in self._data.to_pandas_index() - - @_performance_tracking - def items(self): - """Iterate over column names and series pairs""" - for k in self: - yield (k, self[k]) - - @_performance_tracking - def equals(self, other) -> bool: - ret = super().equals(other) - # If all other checks matched, validate names. - if ret: - for self_name, other_name in zip( - self._column_names, other._column_names - ): - if self_name != other_name: - ret = False - break - return ret - - @property - def iat(self): - """ - Alias for ``DataFrame.iloc``; provided for compatibility with Pandas. - """ - return _DataFrameiAtIndexer(self) - - @property - def at(self): - """ - Alias for ``DataFrame.loc``; provided for compatibility with Pandas. - """ - return _DataFrameAtIndexer(self) - - @property # type: ignore - @_external_only_api( - "Use _column_names instead, or _data.to_pandas_index() if a pandas " - "index is absolutely necessary. For checking if the columns are a " - "MultiIndex, use _data.multiindex." 
- ) - @_performance_tracking - def columns(self): - """Returns a tuple of columns""" - return self._data.to_pandas_index() - - @columns.setter # type: ignore - @_performance_tracking - def columns(self, columns): - multiindex = False - rangeindex = False - label_dtype = None - level_names = None - if isinstance(columns, (pd.MultiIndex, cudf.MultiIndex)): - multiindex = True - if isinstance(columns, cudf.MultiIndex): - pd_columns = columns.to_pandas() - else: - pd_columns = columns - if pd_columns.nunique(dropna=False) != len(pd_columns): - raise ValueError("Duplicate column names are not allowed") - level_names = list(pd_columns.names) - elif isinstance(columns, (cudf.BaseIndex, ColumnBase, Series)): - level_names = (getattr(columns, "name", None),) - rangeindex = isinstance(columns, cudf.RangeIndex) - if rangeindex: - unique_count = len(columns) - else: - columns = as_column(columns) - unique_count = columns.distinct_count(dropna=False) - if unique_count != len(columns): - raise ValueError("Duplicate column names are not allowed") - pd_columns = pd.Index(columns.to_pandas()) - label_dtype = pd_columns.dtype - else: - pd_columns = pd.Index(columns) - if pd_columns.nunique(dropna=False) != len(pd_columns): - raise ValueError("Duplicate column names are not allowed") - rangeindex = isinstance(pd_columns, pd.RangeIndex) - level_names = (pd_columns.name,) - label_dtype = pd_columns.dtype - - if len(pd_columns) != self._num_columns: - raise ValueError( - f"Length mismatch: expected {self._num_columns} elements, " - f"got {len(pd_columns)} elements" - ) - - self._data = ColumnAccessor( - data=dict(zip(pd_columns, self._columns)), - multiindex=multiindex, - level_names=level_names, - label_dtype=label_dtype, - rangeindex=rangeindex, - verify=False, - ) - - def _set_columns_like(self, other: ColumnAccessor) -> None: - """ - Modify self with the column properties of other. - - * Whether .columns is a MultiIndex/RangeIndex - * The possible .columns.dtype - * The .columns.names/name (depending on if it's a MultiIndex) - """ - if self._num_columns != len(other.names): - raise ValueError( - f"Length mismatch: expected {len(other)} elements, " - f"got {len(self)} elements" - ) - self._data = ColumnAccessor( - data=dict(zip(other.names, self._columns)), - multiindex=other.multiindex, - rangeindex=other.rangeindex, - level_names=other.level_names, - label_dtype=other.label_dtype, - verify=False, - ) - - @_performance_tracking - def reindex( - self, - labels=None, - index=None, - columns=None, - axis=None, - method=None, - copy=True, - level=None, - fill_value=NA, - limit=None, - tolerance=None, - ): - """ - Conform DataFrame to new index. Places NA/NaN in locations - having no value in the previous index. A new object is produced - unless the new index is equivalent to the current one and copy=False. - - Parameters - ---------- - labels : Index, Series-convertible, optional, default None - New labels / index to conform the axis specified by ``axis`` to. - index : Index, Series-convertible, optional, default None - The index labels specifying the index to conform to. - columns : array-like, optional, default None - The column labels specifying the columns to conform to. - axis : Axis to target. - Can be either the axis name - (``index``, ``columns``) or number (0, 1). - method : Not supported - copy : boolean, default True - Return a new object, even if the passed indexes are the same. - level : Not supported - fill_value : Value to use for missing values. 
- Defaults to ``NA``, but can be any "compatible" value. - limit : Not supported - tolerance : Not supported - - Returns - ------- - DataFrame with changed index. - - Examples - -------- - ``DataFrame.reindex`` supports two calling conventions - * ``(index=index_labels, columns=column_labels, ...)`` - * ``(labels, axis={'index', 'columns'}, ...)`` - We _highly_ recommend using keyword arguments to clarify your intent. - - Create a dataframe with some fictional data. - - >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] - >>> df = cudf.DataFrame({'http_status': [200, 200, 404, 404, 301], - ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, - ... index=index) - >>> df - http_status response_time - Firefox 200 0.04 - Chrome 200 0.02 - Safari 404 0.07 - IE10 404 0.08 - Konqueror 301 1.00 - >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10', - ... 'Chrome'] - >>> df.reindex(new_index) - http_status response_time - Safari 404 0.07 - Iceweasel - Comodo Dragon - IE10 404 0.08 - Chrome 200 0.02 - - .. pandas-compat:: - :meth:`pandas.DataFrame.reindex` - - Note: One difference from Pandas is that ``NA`` is used for rows - that do not match, rather than ``NaN``. One side effect of this is - that the column ``http_status`` retains an integer dtype in cuDF - where it is cast to float in Pandas. - - We can fill in the missing values by - passing a value to the keyword ``fill_value``. - - >>> df.reindex(new_index, fill_value=0) - http_status response_time - Safari 404 0.07 - Iceweasel 0 0.00 - Comodo Dragon 0 0.00 - IE10 404 0.08 - Chrome 200 0.02 - - We can also reindex the columns. - - >>> df.reindex(columns=['http_status', 'user_agent']) - http_status user_agent - Firefox 200 - Chrome 200 - Safari 404 - IE10 404 - Konqueror 301 - - Or we can use "axis-style" keyword arguments - - >>> df.reindex(columns=['http_status', 'user_agent']) - http_status user_agent - Firefox 200 - Chrome 200 - Safari 404 - IE10 404 - Konqueror 301 - """ - - if labels is None and index is None and columns is None: - return self.copy(deep=copy) - - # pandas simply ignores the labels keyword if it is provided in - # addition to index and columns, but it prohibits the axis arg. - if (index is not None or columns is not None) and axis is not None: - raise TypeError( - "Cannot specify both 'axis' and any of 'index' or 'columns'." - ) - - axis = 0 if axis is None else self._get_axis_from_axis_arg(axis) - if axis == 0: - if index is None: - index = labels - else: - if columns is None: - columns = labels - if columns is None: - df = self - else: - columns = cudf.Index(columns) - intersection = self._data.to_pandas_index().intersection( - columns.to_pandas() - ) - df = self.loc[:, intersection] - - return df._reindex( - column_names=columns, - dtypes=dict(self._dtypes), - deep=copy, - index=index, - inplace=False, - fill_value=fill_value, - level=level, - method=method, - limit=limit, - tolerance=tolerance, - ) - - @_performance_tracking - def set_index( - self, - keys, - drop=True, - append=False, - inplace=False, - verify_integrity=False, - ): - """Return a new DataFrame with a new index - - Parameters - ---------- - keys : Index, Series-convertible, label-like, or list - Index : the new index. - Series-convertible : values for the new index. - Label-like : Label of column to be used as index. - List : List of items from above. 
- drop : boolean, default True - Whether to drop corresponding column for str index argument - append : boolean, default True - Whether to append columns to the existing index, - resulting in a MultiIndex. - inplace : boolean, default False - Modify the DataFrame in place (do not create a new object). - verify_integrity : boolean, default False - Check for duplicates in the new index. - - Examples - -------- - >>> df = cudf.DataFrame({ - ... "a": [1, 2, 3, 4, 5], - ... "b": ["a", "b", "c", "d","e"], - ... "c": [1.0, 2.0, 3.0, 4.0, 5.0] - ... }) - >>> df - a b c - 0 1 a 1.0 - 1 2 b 2.0 - 2 3 c 3.0 - 3 4 d 4.0 - 4 5 e 5.0 - - Set the index to become the 'b' column: - - >>> df.set_index('b') - a c - b - a 1 1.0 - b 2 2.0 - c 3 3.0 - d 4 4.0 - e 5 5.0 - - Create a MultiIndex using columns 'a' and 'b': - - >>> df.set_index(["a", "b"]) - c - a b - 1 a 1.0 - 2 b 2.0 - 3 c 3.0 - 4 d 4.0 - 5 e 5.0 - - Set new Index instance as index: - - >>> df.set_index(cudf.RangeIndex(10, 15)) - a b c - 10 1 a 1.0 - 11 2 b 2.0 - 12 3 c 3.0 - 13 4 d 4.0 - 14 5 e 5.0 - - Setting `append=True` will combine current index with column `a`: - - >>> df.set_index("a", append=True) - b c - a - 0 1 a 1.0 - 1 2 b 2.0 - 2 3 c 3.0 - 3 4 d 4.0 - 4 5 e 5.0 - - `set_index` supports `inplace` parameter too: - - >>> df.set_index("a", inplace=True) - >>> df - b c - a - 1 a 1.0 - 2 b 2.0 - 3 c 3.0 - 4 d 4.0 - 5 e 5.0 - """ - - if not isinstance(keys, list): - keys = [keys] - if len(keys) == 0: - raise ValueError("No valid columns to be added to index.") - if append: - keys = [self.index] + keys - - # Preliminary type check - labels_not_found = [] - data_to_add = [] - names = [] - to_drop = [] - for col in keys: - # label-like - if is_scalar(col) or isinstance(col, tuple): - if col in self._column_names: - data_to_add.append(self[col]._column) - names.append(col) - if drop: - to_drop.append(col) - else: - labels_not_found.append(col) - # index-like - elif isinstance(col, (MultiIndex, pd.MultiIndex)): - if isinstance(col, pd.MultiIndex): - col = MultiIndex.from_pandas(col) - data_to_add.extend(col._columns) - names.extend(col.names) - elif isinstance( - col, (cudf.Series, cudf.Index, pd.Series, pd.Index) - ): - data_to_add.append(as_column(col)) - names.append(col.name) - else: - try: - col = as_column(col) - except TypeError as err: - msg = f"{col} cannot be converted to column-like." 
- raise TypeError(msg) from err - data_to_add.append(col) - names.append(None) - - if labels_not_found: - raise KeyError(f"None of {labels_not_found} are in the columns") - - if ( - len(data_to_add) == 1 - and len(keys) == 1 - and not isinstance(keys[0], (cudf.MultiIndex, pd.MultiIndex)) - ): - # Don't turn single level MultiIndex into an Index - idx = cudf.Index._from_column(data_to_add[0], name=names[0]) - else: - idx = MultiIndex._from_data(dict(enumerate(data_to_add))) - idx.names = names - - # TODO: Change to deep=False when copy-on-write is default - df = self if inplace else self.copy(deep=True) - - if verify_integrity and not idx.is_unique: - raise ValueError(f"Values in Index are not unique: {idx}") - - if to_drop: - df.drop(columns=to_drop, inplace=True) - - df.index = idx - return df if not inplace else None - - @_performance_tracking - def fillna( - self, value=None, method=None, axis=None, inplace=False, limit=None - ): # noqa: D102 - if isinstance(value, (pd.Series, pd.DataFrame)): - value = cudf.from_pandas(value) - if isinstance(value, cudf.Series): - # Align value.index to self.columns - value = value.reindex(self._column_names) - elif isinstance(value, cudf.DataFrame): - if not self.index.equals(value.index): - # Align value.index to self.index - value = value.reindex(self.index) - value = dict(value.items()) - elif isinstance(value, abc.Mapping): - # Align value.indexes to self.index - value = { - key: value.reindex(self.index) - if isinstance(value, cudf.Series) - else value - for key, value in value.items() - } - return super().fillna( - value=value, method=method, axis=axis, inplace=inplace, limit=limit - ) - - @_performance_tracking - def where(self, cond, other=None, inplace=False, axis=None, level=None): - if axis is not None: - raise NotImplementedError("axis is not supported.") - elif level is not None: - raise NotImplementedError("level is not supported.") - - from cudf.core._internals.where import ( - _check_and_cast_columns_with_other, - ) - - # First process the condition. - if isinstance(cond, Series): - cond = self._from_data( - self._data._from_columns_like_self( - itertools.repeat(cond._column, len(self._column_names)), - verify=False, - ) - ) - elif hasattr(cond, "__cuda_array_interface__"): - cond = DataFrame( - cond, columns=self._column_names, index=self.index - ) - elif ( - hasattr(cond, "__array_interface__") - and cond.__array_interface__["shape"] != self.shape - ): - raise ValueError("conditional must be same shape as self") - elif not isinstance(cond, DataFrame): - cond = cudf.DataFrame(cond) - - if set(self._column_names).intersection(set(cond._column_names)): - if not self.index.equals(cond.index): - cond = cond.reindex(self.index) - else: - if cond.shape != self.shape: - raise ValueError( - "Array conditional must be same shape as self" - ) - # Setting `self` column names to `cond` as it has no column names. - cond._set_columns_like(self._data) - - # If other was provided, process that next. 
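For reference, the condition handling above implements the user-facing contract of ``DataFrame.where``: values are kept where ``cond`` is True and replaced by ``other`` elsewhere, with null used when no ``other`` is given. A minimal usage sketch (the frame contents are illustrative, not taken from this diff; assumes a CUDA-capable environment with cudf installed):

import cudf

df = cudf.DataFrame({"a": [1, -2, 3], "b": [-4, 5, -6]})

# Keep values where the condition holds, replace the rest with `other`.
masked = df.where(df > 0, other=0)

# With no `other`, non-matching cells are set to cuDF's null (<NA>).
nulled = df.where(df > 0)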
- if isinstance(other, DataFrame): - other_cols = [other._data[col] for col in self._column_names] - elif cudf.api.types.is_scalar(other): - other_cols = [other] * len(self._column_names) - elif isinstance(other, cudf.Series): - other_cols = other.to_pandas() - else: - other_cols = other - - if len(self._columns) != len(other_cols): - raise ValueError( - """Replacement list length or number of data columns - should be equal to number of columns of self""" - ) - - out = [] - for (name, col), other_col in zip( - self._column_labels_and_values, other_cols - ): - source_col, other_col = _check_and_cast_columns_with_other( - source_col=col, - other=other_col, - inplace=inplace, - ) - - if cond_col := cond._data.get(name): - result = cudf._lib.copying.copy_if_else( - source_col, other_col, cond_col - ) - - out.append(result._with_type_metadata(col.dtype)) - else: - out_mask = cudf._lib.null_mask.create_null_mask( - len(source_col), - state=cudf._lib.null_mask.MaskState.ALL_NULL, - ) - out.append(source_col.set_mask(out_mask)) - - return self._mimic_inplace( - self._from_data_like_self(self._data._from_columns_like_self(out)), - inplace=inplace, - ) - - @docutils.doc_apply( - doc_reset_index_template.format( - klass="DataFrame", - argument="", - return_type="DataFrame or None", - return_doc="", - example=""" - >>> df = cudf.DataFrame([('bird', 389.0), - ... ('bird', 24.0), - ... ('mammal', 80.5), - ... ('mammal', np.nan)], - ... index=['falcon', 'parrot', 'lion', 'monkey'], - ... columns=('class', 'max_speed')) - >>> df - class max_speed - falcon bird 389.0 - parrot bird 24.0 - lion mammal 80.5 - monkey mammal - >>> df.reset_index() - index class max_speed - 0 falcon bird 389.0 - 1 parrot bird 24.0 - 2 lion mammal 80.5 - 3 monkey mammal - >>> df.reset_index(drop=True) - class max_speed - 0 bird 389.0 - 1 bird 24.0 - 2 mammal 80.5 - 3 mammal - - You can also use ``reset_index`` with MultiIndex. - - >>> index = cudf.MultiIndex.from_tuples([('bird', 'falcon'), - ... ('bird', 'parrot'), - ... ('mammal', 'lion'), - ... ('mammal', 'monkey')], - ... names=['class', 'name']) - >>> df = cudf.DataFrame([(389.0, 'fly'), - ... ( 24.0, 'fly'), - ... ( 80.5, 'run'), - ... (np.nan, 'jump')], - ... index=index, - ... columns=('speed', 'type')) - >>> df - speed type - class name - bird falcon 389.0 fly - parrot 24.0 fly - mammal lion 80.5 run - monkey jump - >>> df.reset_index(level='class') - class speed type - name - falcon bird 389.0 fly - parrot bird 24.0 fly - lion mammal 80.5 run - monkey mammal jump - """, - ) - ) - def reset_index( - self, - level=None, - drop=False, - inplace=False, - col_level=0, - col_fill="", - allow_duplicates: bool = False, - names: abc.Hashable | abc.Sequence[abc.Hashable] | None = None, - ): - return self._mimic_inplace( - DataFrame._from_data( - *self._reset_index( - level=level, - drop=drop, - col_level=col_level, - col_fill=col_fill, - allow_duplicates=allow_duplicates, - names=names, - ) - ), - inplace=inplace, - ) - - @_performance_tracking - def insert( - self, - loc, - column, - value, - allow_duplicates: bool = False, - nan_as_null=no_default, - ): - """Add a column to DataFrame at the index specified by loc. - - Parameters - ---------- - loc : int - location to insert by index, cannot be greater then num columns + 1 - column : number or string - column or label of column to be inserted - value : Series or array-like - nan_as_null : bool, Default None - If ``None``/``True``, converts ``np.nan`` values to - ``null`` values. - If ``False``, leaves ``np.nan`` values as is. 
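The ``insert`` parameters documented above are removed here without an accompanying example; a short hedged sketch of the call shape (column names and values are illustrative):

import numpy as np
import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]})

# Insert a column labelled "b" at position 1, i.e. between "a" and "c".
df.insert(1, "b", [4, 5, 6])

# nan_as_null=False keeps np.nan as floating-point NaN instead of
# converting it to null.
df.insert(3, "d", [np.nan, 1.0, 2.0], nan_as_null=False)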
- """ - if allow_duplicates is not False: - raise NotImplementedError( - "allow_duplicates is currently not implemented." - ) - if nan_as_null is no_default: - nan_as_null = not cudf.get_option("mode.pandas_compatible") - return self._insert( - loc=loc, - name=column, - value=value, - nan_as_null=nan_as_null, - ignore_index=False, - ) - - @_performance_tracking - def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): - """ - Same as `insert`, with additional `ignore_index` param. - - ignore_index : bool, default True - If True, there will be no index equality check & reindexing - happening. - If False, a reindexing operation is performed if - `value.index` is not equal to `self.index`. - """ - num_cols = self._num_columns - if loc < 0: - loc += num_cols + 1 - - if not (0 <= loc <= num_cols): - raise ValueError( - f"insert location must be within range " - f"{-(num_cols + 1) * (num_cols > 0)}, " - f"{num_cols * (num_cols > 0)}" - ) - - # TODO: This check is currently necessary because - # _is_scalar_or_zero_d_array below will treat a length 1 pd.Categorical - # as a scalar and attempt to use column.full, which can't handle it. - # Maybe _is_scalar_or_zero_d_array should be changed, or maybe we just - # shouldn't support pd.Categorical at all, but those changes will at - # least require a deprecation cycle because we currently support - # inserting a pd.Categorical. - if isinstance(value, pd.Categorical): - value = as_column(value) - - if _is_scalar_or_zero_d_array(value): - dtype = None - if isinstance(value, (np.ndarray, cupy.ndarray)): - dtype = value.dtype - value = value.item() - if libcudf.scalar._is_null_host_scalar(value): - dtype = "str" - value = as_column( - value, - length=len(self), - dtype=dtype, - ) - - if len(self) == 0: - if isinstance(value, (pd.Series, Series)): - if not ignore_index: - self.index = cudf.Index(value.index) - elif (length := len(value)) > 0: - if num_cols != 0: - ca = self._data._from_columns_like_self( - ( - column.column_empty_like( - col_data, masked=True, newsize=length - ) - for col_data in self._columns - ), - verify=False, - ) - else: - ca = ColumnAccessor({}) - self._data = ca - self._index = RangeIndex(length) - - elif isinstance(value, (pd.Series, Series)): - value = Series(value, nan_as_null=nan_as_null) - if not ignore_index: - value = value._align_to_index( - self.index, how="right", sort=False - ) - - value = column.as_column(value, nan_as_null=nan_as_null) - - self._data.insert(name, value, loc=loc) - - @property # type:ignore - @_performance_tracking - def axes(self): - """ - Return a list representing the axes of the DataFrame. - - DataFrame.axes returns a list of two elements: - element zero is the row index and element one is the columns. - - Examples - -------- - >>> import cudf - >>> cdf1 = cudf.DataFrame() - >>> cdf1["key"] = [0,0,1,1] - >>> cdf1["k2"] = [1,2,2,3] - >>> cdf1["val"] = [1,2,3,4] - >>> cdf1["temp"] = [-1,2,2,3] - >>> cdf1.axes - [RangeIndex(start=0, stop=4, step=1), - Index(['key', 'k2', 'val', 'temp'], dtype='object')] - - """ - return [self.index, self._data.to_pandas_index()] - - def diff(self, periods=1, axis=0): - """ - First discrete difference of element. - - Calculates the difference of a DataFrame element compared with another - element in the DataFrame (default is element in previous row). - - Parameters - ---------- - periods : int, default 1 - Periods to shift for calculating difference, - accepts negative values. 
- axis : {0 or 'index', 1 or 'columns'}, default 0 - Take difference over rows (0) or columns (1). - Only row-wise (0) shift is supported. - - Returns - ------- - DataFrame - First differences of the DataFrame. - - Examples - -------- - >>> import cudf - >>> gdf = cudf.DataFrame({'a': [1, 2, 3, 4, 5, 6], - ... 'b': [1, 1, 2, 3, 5, 8], - ... 'c': [1, 4, 9, 16, 25, 36]}) - >>> gdf - a b c - 0 1 1 1 - 1 2 1 4 - 2 3 2 9 - 3 4 3 16 - 4 5 5 25 - 5 6 8 36 - >>> gdf.diff(periods=2) - a b c - 0 - 1 - 2 2 1 8 - 3 2 2 12 - 4 2 3 16 - 5 2 5 20 - - .. pandas-compat:: - :meth:`pandas.DataFrame.diff` - - Diff currently only supports numeric dtype columns. - """ - if not isinstance(periods, int): - if not (isinstance(periods, float) and periods.is_integer()): - raise ValueError("periods must be an integer") - periods = int(periods) - - axis = self._get_axis_from_axis_arg(axis) - if axis != 0: - raise NotImplementedError("Only axis=0 is supported.") - - if abs(periods) > len(self): - df = cudf.DataFrame._from_data( - { - name: column_empty(len(self), dtype=dtype, masked=True) - for name, dtype in zip(self._column_names, self.dtypes) - } - ) - return df - - return self - self.shift(periods=periods) - - @_performance_tracking - def drop_duplicates( - self, - subset=None, - keep="first", - inplace=False, - ignore_index=False, - ): - """ - Return DataFrame with duplicate rows removed. - - Considering certain columns is optional. Indexes, including time - indexes are ignored. - - Parameters - ---------- - subset : column label or sequence of labels, optional - Only consider certain columns for identifying duplicates, by - default use all of the columns. - keep : {'first', 'last', ``False``}, default 'first' - Determines which duplicates (if any) to keep. - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. - inplace : bool, default ``False`` - Whether to drop duplicates in place or to return a copy. - ignore_index : bool, default ``False`` - If True, the resulting axis will be labeled 0, 1, ..., n - 1. - - Returns - ------- - DataFrame or None - DataFrame with duplicates removed or None if ``inplace=True``. - - See Also - -------- - DataFrame.value_counts: Count unique combinations of columns. - - Examples - -------- - Consider a dataset containing ramen ratings. - - >>> import cudf - >>> df = cudf.DataFrame({ - ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], - ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], - ... 'rating': [4, 4, 3.5, 15, 5] - ... }) - >>> df - brand style rating - 0 Yum Yum cup 4.0 - 1 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 3 Indomie pack 15.0 - 4 Indomie pack 5.0 - - By default, it removes duplicate rows based on all columns. - - >>> df.drop_duplicates() - brand style rating - 0 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 3 Indomie pack 15.0 - 4 Indomie pack 5.0 - - To remove duplicates on specific column(s), use ``subset``. - - >>> df.drop_duplicates(subset=['brand']) - brand style rating - 0 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - - To remove duplicates and keep last occurrences, use ``keep``. 
- - >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') - brand style rating - 1 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 4 Indomie pack 5.0 - """ # noqa: E501 - outdf = super().drop_duplicates( - subset=subset, - keep=keep, - ignore_index=ignore_index, - ) - - return self._mimic_inplace(outdf, inplace=inplace) - - @_performance_tracking - def pop(self, item): - """Return a column and drop it from the DataFrame.""" - popped = self[item] - del self[item] - return popped - - @_performance_tracking - def rename( - self, - mapper=None, - index=None, - columns=None, - axis=0, - copy=True, - inplace=False, - level=None, - errors="ignore", - ): - """Alter column and index labels. - - Function / dict values must be unique (1-to-1). Labels not contained in - a dict / Series will be left as-is. Extra labels listed don't throw an - error. - - ``DataFrame.rename`` supports two calling conventions: - - ``(index=index_mapper, columns=columns_mapper, ...)`` - - ``(mapper, axis={0/'index' or 1/'column'}, ...)`` - - We highly recommend using keyword arguments to clarify your intent. - - Parameters - ---------- - mapper : dict-like or function, default None - optional dict-like or functions transformations to apply to - the index/column values depending on selected ``axis``. - index : dict-like, default None - Optional dict-like transformations to apply to the index axis' - values. Does not support functions for axis 0 yet. - columns : dict-like or function, default None - optional dict-like or functions transformations to apply to - the columns axis' values. - axis : int, default 0 - Axis to rename with mapper. - 0 or 'index' for index - 1 or 'columns' for columns - copy : boolean, default True - Also copy underlying data - inplace : boolean, default False - Return new DataFrame. If True, assign columns without copy - level : int or level name, default None - In case of a MultiIndex, only rename labels in the specified level. - errors : {'raise', 'ignore', 'warn'}, default 'ignore' - *Only 'ignore' supported* - Control raising of exceptions on invalid data for provided dtype. - - - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original - object. - - ``warn`` : prints last exceptions as warnings and - return original object. - - Returns - ------- - DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - >>> df - A B - 0 1 4 - 1 2 5 - 2 3 6 - - Rename columns using a mapping: - - >>> df.rename(columns={"A": "a", "B": "c"}) - a c - 0 1 4 - 1 2 5 - 2 3 6 - - Rename index using a mapping: - - >>> df.rename(index={0: 10, 1: 20, 2: 30}) - A B - 10 1 4 - 20 2 5 - 30 3 6 - - .. pandas-compat:: - :meth:`pandas.DataFrame.rename` - - * Not Supporting: level - - Rename will not overwrite column names. If a list with - duplicates is passed, column names will be postfixed - with a number. 
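The ``rename`` docstring above describes both calling conventions but only demonstrates the keyword form; a hedged sketch of the axis-style form, together with the ``pop`` call documented earlier in this hunk (values are illustrative):

import cudf

df = cudf.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

# Axis-style spelling, equivalent to df.rename(columns={"A": "a", "B": "c"}).
renamed = df.rename({"A": "a", "B": "c"}, axis=1)

# pop removes the column from the frame and returns it as a Series.
b = df.pop("B")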
- """ - if errors != "ignore": - raise NotImplementedError( - "Only errors='ignore' is currently supported" - ) - - if mapper is None and index is None and columns is None: - return self.copy(deep=copy) - - index = mapper if index is None and axis in (0, "index") else index - columns = ( - mapper if columns is None and axis in (1, "columns") else columns - ) - - result = self if inplace else self.copy(deep=copy) - - out_index = None - if index: - if ( - any(isinstance(item, str) for item in index.values()) - and self.index.dtype != "object" - ): - raise NotImplementedError( - "Implicit conversion of index to " - "mixed type is not yet supported." - ) - - if level is not None and isinstance(self.index, MultiIndex): - level = self.index._get_level_label(level) - level_values = self.index.get_level_values(level) - ca = self.index._data.copy(deep=copy) - ca[level] = level_values._column.find_and_replace( - to_replace=list(index.keys()), - replacement=list(index.values()), - ) - out_index = type(self.index)._from_data( - ca, name=self.index.name - ) - else: - to_replace = list(index.keys()) - vals = list(index.values()) - is_all_na = vals.count(None) == len(vals) - - try: - out_index = _index_from_data( - { - name: col.find_and_replace( - to_replace, vals, is_all_na - ) - for name, col in self.index._column_labels_and_values - } - ) - except OverflowError: - pass - - if out_index is not None: - result.index = out_index - - if columns: - result._data = result._data.rename_levels( - mapper=columns, level=level - ) - - return result - - @_performance_tracking - def add_prefix(self, prefix, axis=None): - if axis is not None: - raise NotImplementedError("axis is currently not implemented.") - # TODO: Change to deep=False when copy-on-write is default - out = self.copy(deep=True) - out.columns = [prefix + col_name for col_name in self._column_names] - return out - - @_performance_tracking - def add_suffix(self, suffix, axis=None): - if axis is not None: - raise NotImplementedError("axis is currently not implemented.") - # TODO: Change to deep=False when copy-on-write is default - out = self.copy(deep=True) - out.columns = [col_name + suffix for col_name in self._column_names] - return out - - @_performance_tracking - def agg(self, aggs, axis=None): - """ - Aggregate using one or more operations over the specified axis. - - Parameters - ---------- - aggs : Iterable (set, list, string, tuple or dict) - Function to use for aggregating data. Accepted types are: - * string name, e.g. ``"sum"`` - * list of functions, e.g. ``["sum", "min", "max"]`` - * dict of axis labels specified operations per column, - e.g. ``{"a": "sum"}`` - - axis : not yet supported - - Returns - ------- - Aggregation Result : ``Series`` or ``DataFrame`` - When ``DataFrame.agg`` is called with single agg, - ``Series`` is returned. - When ``DataFrame.agg`` is called with several aggs, - ``DataFrame`` is returned. - - .. 
pandas-compat:: - :meth:`pandas.DataFrame.agg` - - * Not supporting: ``axis``, ``*args``, ``**kwargs`` - - """ - dtypes = [self[col].dtype for col in self._column_names] - common_dtype = find_common_type(dtypes) - if common_dtype.kind != "b" and any( - dtype.kind == "b" for dtype in dtypes - ): - raise MixedTypeError("Cannot create a column with mixed types") - - if any(is_string_dtype(dt) for dt in dtypes): - raise NotImplementedError( - "DataFrame.agg() is not supported for " - "frames containing string columns" - ) - - if axis == 0 or axis is not None: - raise NotImplementedError("axis not implemented yet") - - if isinstance(aggs, abc.Iterable) and not isinstance( - aggs, (str, dict) - ): - result = DataFrame() - # TODO : Allow simultaneous pass for multi-aggregation as - # a future optimization - for agg in aggs: - result[agg] = getattr(self, agg)() - return result.T.sort_index(axis=1, ascending=True) - - elif isinstance(aggs, str): - if not hasattr(self, aggs): - raise AttributeError( - f"{aggs} is not a valid function for " - f"'DataFrame' object" - ) - result = DataFrame() - result[aggs] = getattr(self, aggs)() - result = result.iloc[:, 0] - result.name = None - return result - - elif isinstance(aggs, dict): - cols = aggs.keys() - if any(callable(val) for val in aggs.values()): - raise NotImplementedError( - "callable parameter is not implemented yet" - ) - elif all(isinstance(val, str) for val in aggs.values()): - res = {} - for key, value in aggs.items(): - col = self[key] - if not hasattr(col, value): - raise AttributeError( - f"{value} is not a valid function for " - f"'Series' object" - ) - res[key] = getattr(col, value)() - result = cudf.Series(list(res.values()), index=res.keys()) - elif all(isinstance(val, abc.Iterable) for val in aggs.values()): - idxs = set() - for val in aggs.values(): - if isinstance(val, str): - idxs.add(val) - elif isinstance(val, abc.Iterable): - idxs.update(val) - idxs = sorted(list(idxs)) - for agg in idxs: - if agg is callable: - raise NotImplementedError( - "callable parameter is not implemented yet" - ) - result = DataFrame(index=idxs, columns=cols) - for key in aggs.keys(): - col = self[key] - col_empty = column_empty( - len(idxs), dtype=col.dtype, masked=True - ) - ans = cudf.Series._from_column( - col_empty, index=cudf.Index(idxs) - ) - if isinstance(aggs.get(key), abc.Iterable): - # TODO : Allow simultaneous pass for multi-aggregation - # as a future optimization - for agg in aggs.get(key): - if not hasattr(col, agg): - raise AttributeError( - f"{agg} is not a valid function for " - f"'Series' object" - ) - ans[agg] = getattr(col, agg)() - elif isinstance(aggs.get(key), str): - if not hasattr(col, aggs.get(key)): - raise AttributeError( - f"{aggs.get(key)} is not a valid function for " - f"'Series' object" - ) - ans[aggs.get(key)] = getattr(col, agg)() - result[key] = ans - else: - raise ValueError("values of dict must be a string or list") - - return result - - elif callable(aggs): - raise NotImplementedError( - "callable parameter is not implemented yet" - ) - - else: - raise ValueError("argument must be a string, list or dict") - - @_performance_tracking - def nlargest(self, n, columns, keep="first"): - """Return the first *n* rows ordered by *columns* in descending order. - - Return the first *n* rows with the largest values in *columns*, in - descending order. The columns that are not specified are returned as - well, but not used for ordering. - - Parameters - ---------- - n : int - Number of rows to return. 
- columns : label or list of labels - Column label(s) to order by. - keep : {'first', 'last'}, default 'first' - Where there are duplicate values: - - - `first` : prioritize the first occurrence(s) - - `last` : prioritize the last occurrence(s) - - Returns - ------- - DataFrame - The first `n` rows ordered by the given columns in descending - order. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'population': [59000000, 65000000, 434000, - ... 434000, 434000, 337000, 11300, - ... 11300, 11300], - ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, - ... 17036, 182, 38, 311], - ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", - ... "IS", "NR", "TV", "AI"]}, - ... index=["Italy", "France", "Malta", - ... "Maldives", "Brunei", "Iceland", - ... "Nauru", "Tuvalu", "Anguilla"]) - >>> df - population GDP alpha-2 - Italy 59000000 1937894 IT - France 65000000 2583560 FR - Malta 434000 12011 MT - Maldives 434000 4520 MV - Brunei 434000 12128 BN - Iceland 337000 17036 IS - Nauru 11300 182 NR - Tuvalu 11300 38 TV - Anguilla 11300 311 AI - >>> df.nlargest(3, 'population') - population GDP alpha-2 - France 65000000 2583560 FR - Italy 59000000 1937894 IT - Malta 434000 12011 MT - >>> df.nlargest(3, 'population', keep='last') - population GDP alpha-2 - France 65000000 2583560 FR - Italy 59000000 1937894 IT - Brunei 434000 12128 BN - - .. pandas-compat:: - :meth:`pandas.DataFrame.nlargest` - - - Only a single column is supported in *columns* - """ - return self._n_largest_or_smallest(True, n, columns, keep) - - def nsmallest(self, n, columns, keep="first"): - """Return the first *n* rows ordered by *columns* in ascending order. - - Return the first *n* rows with the smallest values in *columns*, in - ascending order. The columns that are not specified are returned as - well, but not used for ordering. - - Parameters - ---------- - n : int - Number of items to retrieve. - columns : list or str - Column name or names to order by. - keep : {'first', 'last'}, default 'first' - Where there are duplicate values: - - - ``first`` : take the first occurrence. - - ``last`` : take the last occurrence. - - Returns - ------- - DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'population': [59000000, 65000000, 434000, - ... 434000, 434000, 337000, 337000, - ... 11300, 11300], - ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, - ... 17036, 182, 38, 311], - ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", - ... "IS", "NR", "TV", "AI"]}, - ... index=["Italy", "France", "Malta", - ... "Maldives", "Brunei", "Iceland", - ... "Nauru", "Tuvalu", "Anguilla"]) - >>> df - population GDP alpha-2 - Italy 59000000 1937894 IT - France 65000000 2583560 FR - Malta 434000 12011 MT - Maldives 434000 4520 MV - Brunei 434000 12128 BN - Iceland 337000 17036 IS - Nauru 337000 182 NR - Tuvalu 11300 38 TV - Anguilla 11300 311 AI - - In the following example, we will use ``nsmallest`` to select the - three rows having the smallest values in column "population". - - >>> df.nsmallest(3, 'population') - population GDP alpha-2 - Tuvalu 11300 38 TV - Anguilla 11300 311 AI - Iceland 337000 17036 IS - - When using ``keep='last'``, ties are resolved in reverse order: - - >>> df.nsmallest(3, 'population', keep='last') - population GDP alpha-2 - Anguilla 11300 311 AI - Tuvalu 11300 38 TV - Nauru 337000 182 NR - - .. 
pandas-compat:: - :meth:`pandas.DataFrame.nsmallest` - - - Only a single column is supported in *columns* - """ - return self._n_largest_or_smallest(False, n, columns, keep) - - @_performance_tracking - def swaplevel(self, i=-2, j=-1, axis=0): - """ - Swap level i with level j. - Calling this method does not change the ordering of the values. - - Parameters - ---------- - i : int or str, default -2 - First level of index to be swapped. - j : int or str, default -1 - Second level of index to be swapped. - axis : The axis to swap levels on. - 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - - Examples - -------- - >>> import cudf - >>> midx = cudf.MultiIndex(levels=[['llama', 'cow', 'falcon'], - ... ['speed', 'weight', 'length'],['first','second']], - ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2], - ... [0, 0, 0, 0, 0, 0, 1, 1, 1]]) - >>> cdf = cudf.DataFrame(index=midx, columns=['big', 'small'], - ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], - ... [250, 150], [1.5, 0.8], [320, 250], [1, 0.8], [0.3, 0.2]]) - - >>> cdf - big small - llama speed first 45.0 30.0 - weight first 200.0 100.0 - length first 1.5 1.0 - cow speed first 30.0 20.0 - weight first 250.0 150.0 - length first 1.5 0.8 - falcon speed second 320.0 250.0 - weight second 1.0 0.8 - length second 0.3 0.2 - - >>> cdf.swaplevel() - big small - llama first speed 45.0 30.0 - weight 200.0 100.0 - length 1.5 1.0 - cow first speed 30.0 20.0 - weight 250.0 150.0 - length 1.5 0.8 - falcon second speed 320.0 250.0 - weight 1.0 0.8 - length 0.3 0.2 - """ - # TODO: Change to deep=False when copy-on-write is default - result = self.copy(deep=True) - - # To get axis number - axis = self._get_axis_from_axis_arg(axis) - - if axis == 0: - if not isinstance(result.index, MultiIndex): - raise TypeError("Can only swap levels on a hierarchical axis.") - result.index = result.index.swaplevel(i, j) - else: - if not result._data.multiindex: - raise TypeError("Can only swap levels on a hierarchical axis.") - result._data = result._data.swaplevel(i, j) - - return result - - @_performance_tracking - def transpose(self): - """Transpose index and columns. - - Returns - ------- - a new (ncol x nrow) dataframe. self is (nrow x ncol) - - .. pandas-compat:: - :meth:`pandas.DataFrame.transpose`, :attr:`pandas.DataFrame.T` - - Not supporting *copy* because default and only behavior is - copy=True - """ - index = self._data.to_pandas_index() - columns = self.index.copy(deep=False) - if self._num_columns == 0 or self._num_rows == 0: - return DataFrame(index=index, columns=columns) - - # No column from index is transposed with libcudf. 
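``transpose`` (and its ``T`` alias) is documented above without an example; a minimal sketch, keeping in mind the constraint enforced below that all columns must share a single dtype (data values are illustrative):

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["x", "y", "z"])

# Old column labels become the index and the old index becomes the columns,
# so `tdf` is indexed by ['a', 'b'] with columns ['x', 'y', 'z'].
tdf = df.T  # same as df.transpose()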
- source_columns = [*self._columns] - source_dtype = source_columns[0].dtype - if isinstance(source_dtype, cudf.CategoricalDtype): - if any( - not isinstance(c.dtype, cudf.CategoricalDtype) - for c in source_columns - ): - raise ValueError("Columns must all have the same dtype") - cats = list(c.categories for c in source_columns) - cats = cudf.core.column.concat_columns(cats).unique() - source_columns = [ - col._set_categories(cats, is_unique=True).codes - for col in source_columns - ] - - if any(c.dtype != source_columns[0].dtype for c in source_columns): - raise ValueError("Columns must all have the same dtype") - - result_columns = libcudf.transpose.transpose(source_columns) - - if isinstance(source_dtype, cudf.CategoricalDtype): - result_columns = [ - codes._with_type_metadata( - cudf.core.dtypes.CategoricalDtype(categories=cats) - ) - for codes in result_columns - ] - else: - result_columns = [ - result_column._with_type_metadata(source_dtype) - for result_column in result_columns - ] - - # Set the old column names as the new index - result = self.__class__._from_data( - ColumnAccessor(dict(enumerate(result_columns)), verify=False), - index=cudf.Index(index), - ) - # Set the old index as the new column names - result.columns = columns - return result - - T = property(transpose, doc=transpose.__doc__) - - @_performance_tracking - def melt( - self, - id_vars=None, - value_vars=None, - var_name=None, - value_name="value", - col_level=None, - ignore_index: bool = True, - ): - """Unpivots a DataFrame from wide format to long format, - optionally leaving identifier variables set. - - Parameters - ---------- - frame : DataFrame - id_vars : tuple, list, or ndarray, optional - Column(s) to use as identifier variables. - default: None - value_vars : tuple, list, or ndarray, optional - Column(s) to unpivot. - default: all columns that are not set as `id_vars`. - var_name : scalar - Name to use for the `variable` column. - default: frame.columns.name or 'variable' - value_name : str - Name to use for the `value` column. - default: 'value' - - Returns - ------- - out : DataFrame - Melted result - """ - from cudf.core.reshape import melt - - return melt( - self, - id_vars=id_vars, - value_vars=value_vars, - var_name=var_name, - value_name=value_name, - col_level=col_level, - ignore_index=ignore_index, - ) - - @_performance_tracking - def merge( - self, - right, - how="inner", - on=None, - left_on=None, - right_on=None, - left_index=False, - right_index=False, - sort=False, - suffixes=("_x", "_y"), - indicator=False, - validate=None, - ): - """Merge GPU DataFrame objects by performing a database-style join - operation by columns or indexes. - - Parameters - ---------- - right : DataFrame - on : label or list; defaults to None - Column or index level names to join on. These must be found in - both DataFrames. - - If on is None and not merging on indexes then - this defaults to the intersection of the columns - in both DataFrames. - how : {'left', 'outer', 'inner', 'leftsemi', 'leftanti'}, \ - default 'inner' - Type of merge to be performed. - - - left : use only keys from left frame, similar to a SQL left - outer join. - - right : not supported. - - outer : use union of keys from both frames, similar to a SQL - full outer join. - - inner : use intersection of keys from both frames, similar to - a SQL inner join. - - leftsemi : similar to ``inner`` join, but only returns columns - from the left dataframe and ignores all columns from the - right dataframe. 
- - leftanti : returns only rows columns from the left dataframe - for non-matched records. This is exact opposite to ``leftsemi`` - join. - left_on : label or list, or array-like - Column or index level names to join on in the left DataFrame. - Can also be an array or list of arrays of the length of the - left DataFrame. These arrays are treated as if they are columns. - right_on : label or list, or array-like - Column or index level names to join on in the right DataFrame. - Can also be an array or list of arrays of the length of the - right DataFrame. These arrays are treated as if they are columns. - left_index : bool, default False - Use the index from the left DataFrame as the join key(s). - right_index : bool, default False - Use the index from the right DataFrame as the join key. - sort : bool, default False - Sort the resulting dataframe by the columns that were merged on, - starting from the left. - suffixes: Tuple[str, str], defaults to ('_x', '_y') - Suffixes applied to overlapping column names on the left and right - sides - - Returns - ------- - merged : DataFrame - - Examples - -------- - >>> import cudf - >>> df_a = cudf.DataFrame() - >>> df_a['key'] = [0, 1, 2, 3, 4] - >>> df_a['vals_a'] = [float(i + 10) for i in range(5)] - >>> df_b = cudf.DataFrame() - >>> df_b['key'] = [1, 2, 4] - >>> df_b['vals_b'] = [float(i+10) for i in range(3)] - >>> df_merged = df_a.merge(df_b, on=['key'], how='left') - >>> df_merged.sort_values('key') # doctest: +SKIP - key vals_a vals_b - 3 0 10.0 - 0 1 11.0 10.0 - 1 2 12.0 11.0 - 4 3 13.0 - 2 4 14.0 12.0 - - **Merging on categorical variables is only allowed in certain cases** - - Categorical variable typecasting logic depends on both `how` - and the specifics of the categorical variables to be merged. - Merging categorical variables when only one side is ordered - is ambiguous and not allowed. Merging when both categoricals - are ordered is allowed, but only when the categories are - exactly equal and have equal ordering, and will result in the - common dtype. - When both sides are unordered, the result categorical depends - on the kind of join: - - For inner joins, the result will be the intersection of the - categories - - For left or right joins, the result will be the left or - right dtype respectively. This extends to semi and anti joins. - - For outer joins, the result will be the union of categories - from both sides. - - .. pandas-compat:: - :meth:`pandas.DataFrame.merge` - - DataFrames merges in cuDF result in non-deterministic row - ordering. - """ - if indicator: - raise NotImplementedError( - "Only indicator=False is currently supported" - ) - if validate is not None: - raise NotImplementedError("validate is currently not supported.") - - lhs, rhs = self, right - merge_cls = Merge - if how == "right": - # Merge doesn't support right, so just swap - how = "left" - lhs, rhs = right, self - left_on, right_on = right_on, left_on - left_index, right_index = right_index, left_index - suffixes = (suffixes[1], suffixes[0]) - elif how in {"leftsemi", "leftanti"}: - merge_cls = MergeSemi - - return merge_cls( - lhs, - rhs, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - how=how, - sort=sort, - indicator=indicator, - suffixes=suffixes, - ).perform_merge() - - @_performance_tracking - def join( - self, - other, - on=None, - how="left", - lsuffix="", - rsuffix="", - sort=False, - validate: str | None = None, - ): - """Join columns with other DataFrame on index or on a key column. 
- - Parameters - ---------- - other : DataFrame - how : str - Only accepts "left", "right", "inner", "outer" - lsuffix, rsuffix : str - The suffices to add to the left (*lsuffix*) and right (*rsuffix*) - column names when avoiding conflicts. - sort : bool - Set to True to ensure sorted ordering. - validate : str, optional - If specified, checks if join is of specified type. - - * "one_to_one" or "1:1": check if join keys are unique in both left - and right datasets. - * "one_to_many" or "1:m": check if join keys are unique in left dataset. - * "many_to_one" or "m:1": check if join keys are unique in right dataset. - * "many_to_many" or "m:m": allowed, but does not result in checks. - - Currently not supported. - - Returns - ------- - joined : DataFrame - - .. pandas-compat:: - :meth:`pandas.DataFrame.join` - - - *other* must be a single DataFrame for now. - - *on* is not supported yet due to lack of multi-index support. - """ - if on is not None: - raise NotImplementedError("The on parameter is not yet supported") - elif validate is not None: - raise NotImplementedError( - "The validate parameter is not yet supported" - ) - - df = self.merge( - other, - left_index=True, - right_index=True, - how=how, - suffixes=(lsuffix, rsuffix), - sort=sort, - ) - df.index.name = ( - None if self.index.name != other.index.name else self.index.name - ) - return df - - @_performance_tracking - @docutils.doc_apply( - groupby_doc_template.format( - ret=textwrap.dedent( - """ - Returns - ------- - DataFrameGroupBy - Returns a DataFrameGroupBy object that contains - information about the groups. - """ - ) - ) - ) - def groupby( - self, - by=None, - axis=0, - level=None, - as_index=True, - sort=no_default, - group_keys=False, - observed=True, - dropna=True, - ): - return super().groupby( - by, - axis, - level, - as_index, - sort, - group_keys, - observed, - dropna, - ) - - def query(self, expr, local_dict=None): - """ - Query with a boolean expression using Numba to compile a GPU kernel. - - See :meth:`pandas.DataFrame.query`. - - Parameters - ---------- - expr : str - A boolean expression. Names in expression refer to columns. - `index` can be used instead of index name, but this is not - supported for MultiIndex. - - Names starting with `@` refer to Python variables. - - An output value will be `null` if any of the input values are - `null` regardless of expression. - - local_dict : dict - Containing the local variable to be used in query. - - Returns - ------- - filtered : DataFrame - - Examples - -------- - >>> df = cudf.DataFrame({ - ... "a": [1, 2, 2], - ... "b": [3, 4, 5], - ... }) - >>> expr = "(a == 2 and b == 4) or (b == 3)" - >>> df.query(expr) - a b - 0 1 3 - 1 2 4 - - DateTime conditionals: - - >>> import numpy as np - >>> import datetime - >>> df = cudf.DataFrame() - >>> data = np.array(['2018-10-07', '2018-10-08'], dtype='datetime64') - >>> df['datetimes'] = data - >>> search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') - >>> df.query('datetimes==@search_date') - datetimes - 1 2018-10-08 - - Using local_dict: - - >>> import numpy as np - >>> import datetime - >>> df = cudf.DataFrame() - >>> data = np.array(['2018-10-07', '2018-10-08'], dtype='datetime64') - >>> df['datetimes'] = data - >>> search_date2 = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') - >>> df.query('datetimes==@search_date', - ... local_dict={'search_date': search_date2}) - datetimes - 1 2018-10-08 - - .. 
pandas-compat:: - :meth:`pandas.DataFrame.query` - - One difference from pandas is that ``query`` currently only - supports numeric, datetime, timedelta, or bool dtypes. - """ - # can't use `annotate` decorator here as we inspect the calling - # environment. - with annotate("DATAFRAME_QUERY", color="purple", domain="cudf_python"): - if local_dict is None: - local_dict = {} - - if self.empty: - return self.copy() - - if not isinstance(local_dict, dict): - raise TypeError( - f"local_dict type: expected dict but found " - f"{type(local_dict)}" - ) - - # Get calling environment - callframe = inspect.currentframe().f_back - callenv = { - "locals": callframe.f_locals, - "globals": callframe.f_globals, - "local_dict": local_dict, - } - # Run query - boolmask = queryutils.query_execute(self, expr, callenv) - return self._apply_boolean_mask( - BooleanMask.from_column_unchecked(boolmask) - ) - - @_performance_tracking - def apply( - self, - func, - axis=1, - raw=False, - result_type=None, - args=(), - by_row: Literal[False, "compat"] = "compat", - engine: Literal["python", "numba"] = "python", - engine_kwargs: dict[str, bool] | None = None, - **kwargs, - ): - """ - Apply a function along an axis of the DataFrame. - ``apply`` relies on Numba to JIT compile ``func``. - Thus the allowed operations within ``func`` are limited to `those - supported by the CUDA Python Numba target - `__. - For more information, see the `cuDF guide to user defined functions - `__. - - Some string functions and methods are supported. Refer to the guide - to UDFs for details. - - Parameters - ---------- - func : function - Function to apply to each row. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Axis along which the function is applied. - - 0 or 'index': apply function to each column (not yet supported). - - 1 or 'columns': apply function to each row. - raw: bool, default False - Not yet supported - result_type: {'expand', 'reduce', 'broadcast', None}, default None - Not yet supported - args: tuple - Positional arguments to pass to func in addition to the dataframe. - by_row : False or "compat", default "compat" - Only has an effect when ``func`` is a listlike or dictlike of funcs - and the func isn't a string. - If "compat", will if possible first translate the func into pandas - methods (e.g. ``Series().apply(np.sum)`` will be translated to - ``Series().sum()``). If that doesn't work, will try call to apply again with - ``by_row=True`` and if that fails, will call apply again with - ``by_row=False`` (backward compatible). - If False, the funcs will be passed the whole Series at once. - - Currently not supported. - engine : {'python', 'numba'}, default 'python' - Unused. Added for compatibility with pandas. - engine_kwargs : dict - Unused. Added for compatibility with pandas. - **kwargs - Additional keyword arguments to pass as keywords arguments to - `func`. - - Examples - -------- - Simple function of a single variable which could be NA: - - >>> def f(row): - ... if row['a'] is cudf.NA: - ... return 0 - ... else: - ... return row['a'] + 1 - ... - >>> df = cudf.DataFrame({'a': [1, cudf.NA, 3]}) - >>> df.apply(f, axis=1) - 0 2 - 1 0 - 2 4 - dtype: int64 - - Function of multiple variables will operate in - a null aware manner: - - >>> def f(row): - ... return row['a'] - row['b'] - ... - >>> df = cudf.DataFrame({ - ... 'a': [1, cudf.NA, 3, cudf.NA], - ... 'b': [5, 6, cudf.NA, cudf.NA] - ... 
}) - >>> df.apply(f) - 0 -4 - 1 - 2 - 3 - dtype: int64 - - Functions may conditionally return NA as in pandas: - - >>> def f(row): - ... if row['a'] + row['b'] > 3: - ... return cudf.NA - ... else: - ... return row['a'] + row['b'] - ... - >>> df = cudf.DataFrame({ - ... 'a': [1, 2, 3], - ... 'b': [2, 1, 1] - ... }) - >>> df.apply(f, axis=1) - 0 3 - 1 3 - 2 - dtype: int64 - - Mixed types are allowed, but will return the common - type, rather than object as in pandas: - - >>> def f(row): - ... return row['a'] + row['b'] - ... - >>> df = cudf.DataFrame({ - ... 'a': [1, 2, 3], - ... 'b': [0.5, cudf.NA, 3.14] - ... }) - >>> df.apply(f, axis=1) - 0 1.5 - 1 - 2 6.14 - dtype: float64 - - Functions may also return scalar values, however the - result will be promoted to a safe type regardless of - the data: - - >>> def f(row): - ... if row['a'] > 3: - ... return row['a'] - ... else: - ... return 1.5 - ... - >>> df = cudf.DataFrame({ - ... 'a': [1, 3, 5] - ... }) - >>> df.apply(f, axis=1) - 0 1.5 - 1 1.5 - 2 5.0 - dtype: float64 - - Ops against N columns are supported generally: - - >>> def f(row): - ... v, w, x, y, z = ( - ... row['a'], row['b'], row['c'], row['d'], row['e'] - ... ) - ... return x + (y - (z / w)) % v - ... - >>> df = cudf.DataFrame({ - ... 'a': [1, 2, 3], - ... 'b': [4, 5, 6], - ... 'c': [cudf.NA, 4, 4], - ... 'd': [8, 7, 8], - ... 'e': [7, 1, 6] - ... }) - >>> df.apply(f, axis=1) - 0 - 1 4.8 - 2 5.0 - dtype: float64 - - UDFs manipulating string data are allowed, as long as - they neither modify strings in place nor create new strings. - For example, the following UDF is allowed: - - >>> def f(row): - ... st = row['str_col'] - ... scale = row['scale'] - ... if len(st) == 0: - ... return -1 - ... elif st.startswith('a'): - ... return 1 - scale - ... elif 'example' in st: - ... return 1 + scale - ... else: - ... return 42 - ... - >>> df = cudf.DataFrame({ - ... 'str_col': ['', 'abc', 'some_example'], - ... 'scale': [1, 2, 3] - ... }) - >>> df.apply(f, axis=1) # doctest: +SKIP - 0 -1 - 1 -1 - 2 4 - dtype: int64 - - However, the following UDF is not allowed since it includes an - operation that requires the creation of a new string: a call to the - ``upper`` method. Methods that are not supported in this manner - will raise an ``AttributeError``. - - >>> def f(row): - ... st = row['str_col'].upper() - ... return 'ABC' in st - >>> df.apply(f, axis=1) # doctest: +SKIP - - For a complete list of supported functions and methods that may be - used to manipulate string data, see the UDF guide, - - """ - if axis != 1: - raise NotImplementedError( - "DataFrame.apply currently only supports row wise ops" - ) - if raw: - raise NotImplementedError("The `raw` kwarg is not yet supported.") - if result_type is not None: - raise NotImplementedError( - "The `result_type` kwarg is not yet supported." - ) - if by_row != "compat": - raise NotImplementedError("by_row is currently not supported.") - - return self._apply(func, _get_row_kernel, *args, **kwargs) - - def applymap( - self, - func: Callable[[Any], Any], - na_action: str | None = None, - **kwargs, - ) -> DataFrame: - """ - Apply a function to a Dataframe elementwise. - - This method applies a function that accepts and returns a scalar - to every element of a DataFrame. - - Parameters - ---------- - func : callable - Python function, returns a single value from a single value. - na_action : {None, 'ignore'}, default None - If 'ignore', propagate NaN values, without passing them to func. - - Returns - ------- - DataFrame - Transformed DataFrame. 
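Neither ``applymap`` nor ``map`` (the replacement defined just below) carries an example in the docstrings removed here; a hedged sketch of the elementwise call, including the ``na_action`` behaviour described above (the lambdas and data are illustrative):

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
squared = df.map(lambda x: x * x)  # scalar in, scalar out, applied elementwise

df2 = cudf.DataFrame({"a": [1, None, 3]})
# na_action='ignore' propagates nulls without passing them to the function.
plus_one = df2.map(lambda x: x + 1, na_action="ignore")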
- """ - # Do not remove until pandas 3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." - warnings.warn( - "DataFrame.applymap has been deprecated. Use DataFrame.map " - "instead.", - FutureWarning, - ) - return self.map(func=func, na_action=na_action, **kwargs) - - def map( - self, - func: Callable[[Any], Any], - na_action: str | None = None, - **kwargs, - ) -> DataFrame: - """ - Apply a function to a Dataframe elementwise. - - This method applies a function that accepts and returns a scalar - to every element of a DataFrame. - - Parameters - ---------- - func : callable - Python function, returns a single value from a single value. - na_action : {None, 'ignore'}, default None - If 'ignore', propagate NaN values, without passing them to func. - - Returns - ------- - DataFrame - Transformed DataFrame. - """ - - if kwargs: - raise NotImplementedError( - "DataFrame.applymap does not yet support **kwargs." - ) - - if na_action not in {"ignore", None}: - raise ValueError( - f"na_action must be 'ignore' or None. Got {repr(na_action)}" - ) - - if na_action == "ignore": - devfunc = numba.cuda.jit(device=True)(func) - - # promote to a null-ignoring function - # this code is never run in python, it only - # exists to provide numba with the correct - # bytecode to generate the equivalent PTX - # as a null-ignoring version of the function - def _func(x): # pragma: no cover - if x is NA: - return NA - else: - return devfunc(x) - - else: - _func = func - - # TODO: naive implementation - # this could be written as a single kernel - result = {} - for name, col in self._column_labels_and_values: - apply_sr = Series._from_column(col) - result[name] = apply_sr.apply(_func)._column - - return DataFrame._from_data(result, index=self.index) - - @_performance_tracking - @applyutils.doc_apply() - def apply_rows( - self, - func, - incols, - outcols, - kwargs, - pessimistic_nulls=True, - cache_key=None, - ): - """ - Apply a row-wise user defined function. - - Parameters - ---------- - {params} - - Examples - -------- - The user function should loop over the columns and set the output for - each row. Loop execution order is arbitrary, so each iteration of - the loop **MUST** be independent of each other. - - When ``func`` is invoked, the array args corresponding to the - input/output are strided so as to improve GPU parallelism. - The loop in the function resembles serial code, but executes - concurrently in multiple threads. - - >>> import cudf - >>> import numpy as np - >>> df = cudf.DataFrame() - >>> nelem = 3 - >>> df['in1'] = np.arange(nelem) - >>> df['in2'] = np.arange(nelem) - >>> df['in3'] = np.arange(nelem) - - Define input columns for the kernel - - >>> in1 = df['in1'] - >>> in2 = df['in2'] - >>> in3 = df['in3'] - >>> def kernel(in1, in2, in3, out1, out2, kwarg1, kwarg2): - ... for i, (x, y, z) in enumerate(zip(in1, in2, in3)): - ... out1[i] = kwarg2 * x - kwarg1 * y - ... out2[i] = y - kwarg1 * z - - Call ``.apply_rows`` with the name of the input columns, the name and - dtype of the output columns, and, optionally, a dict of extra - arguments. - - >>> df.apply_rows(kernel, - ... incols=['in1', 'in2', 'in3'], - ... outcols=dict(out1=np.float64, out2=np.float64), - ... 
kwargs=dict(kwarg1=3, kwarg2=4)) - in1 in2 in3 out1 out2 - 0 0 0 0 0.0 0.0 - 1 1 1 1 1.0 -2.0 - 2 2 2 2 2.0 -4.0 - """ - for col in incols: - current_col_dtype = self._data[col].dtype - if is_string_dtype(current_col_dtype) or isinstance( - current_col_dtype, cudf.CategoricalDtype - ): - raise TypeError( - "User defined functions are currently not " - "supported on Series with dtypes `str` and `category`." - ) - return applyutils.apply_rows( - self, - func, - incols, - outcols, - kwargs, - pessimistic_nulls, - cache_key=cache_key, - ) - - @_performance_tracking - @applyutils.doc_applychunks() - def apply_chunks( - self, - func, - incols, - outcols, - kwargs=None, - pessimistic_nulls=True, - chunks=None, - blkct=None, - tpb=None, - ): - """ - Transform user-specified chunks using the user-provided function. - - Parameters - ---------- - {params} - {params_chunks} - - Examples - -------- - For ``tpb > 1``, ``func`` is executed by ``tpb`` number of threads - concurrently. To access the thread id and count, - use ``numba.cuda.threadIdx.x`` and ``numba.cuda.blockDim.x``, - respectively (See `numba CUDA kernel documentation`_). - - .. _numba CUDA kernel documentation:\ - https://numba.readthedocs.io/en/stable/cuda/kernels.html - - In the example below, the *kernel* is invoked concurrently on each - specified chunk. The *kernel* computes the corresponding output - for the chunk. - - By looping over the range - ``range(cuda.threadIdx.x, in1.size, cuda.blockDim.x)``, the *kernel* - function can be used with any *tpb* in an efficient manner. - - >>> from numba import cuda - >>> @cuda.jit - ... def kernel(in1, in2, in3, out1): - ... for i in range(cuda.threadIdx.x, in1.size, cuda.blockDim.x): - ... x = in1[i] - ... y = in2[i] - ... z = in3[i] - ... out1[i] = x * y + z - - See Also - -------- - DataFrame.apply_rows - """ - if kwargs is None: - kwargs = {} - if chunks is None: - raise ValueError("*chunks* must be defined") - return applyutils.apply_chunks( - self, - func, - incols, - outcols, - kwargs, - pessimistic_nulls, - chunks, - tpb=tpb, - ) - - @_performance_tracking - def partition_by_hash(self, columns, nparts, keep_index=True): - """Partition the dataframe by the hashed value of data in *columns*. - - Parameters - ---------- - columns : sequence of str - The names of the columns to be hashed. - Must have at least one name. - nparts : int - Number of output partitions - keep_index : boolean - Whether to keep the index or drop it - - Returns - ------- - partitioned: list of DataFrame - """ - key_indices = [self._column_names.index(k) for k in columns] - if keep_index: - cols = [*self.index._columns, *self._columns] - key_indices = [i + len(self.index._columns) for i in key_indices] - else: - cols = [*self._columns] - - output_columns, offsets = libcudf.hash.hash_partition( - cols, key_indices, nparts - ) - outdf = self._from_columns_like_self( - output_columns, - self._column_names, - self._index_names if keep_index else None, - ) - # Slice into partitions. Notice, `hash_partition` returns the start - # offset of each partition thus we skip the first offset - ret = outdf._split(offsets[1:], keep_index=keep_index) - - # Calling `_split()` on an empty dataframe returns an empty list - # so we add empty partitions here - ret += [self._empty_like(keep_index) for _ in range(nparts - len(ret))] - return ret - - def info( - self, - verbose=None, - buf=None, - max_cols=None, - memory_usage=None, - null_counts=None, - ): - """ - Print a concise summary of a DataFrame. 
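Stepping back to ``apply_chunks`` above: the removed docstring defines the kernel but the snippet stops short of showing the invocation. A hedged sketch of how such a kernel would be launched (the chunk size of 4 and ``tpb=8`` are illustrative choices, and ``chunks`` is assumed here to accept a chunk size):

import numpy as np
from numba import cuda
import cudf

df = cudf.DataFrame(
    {"in1": np.arange(8), "in2": np.arange(8), "in3": np.arange(8)}
)

@cuda.jit
def kernel(in1, in2, in3, out1):
    # Strided loop so any threads-per-block (tpb) value covers the chunk.
    for i in range(cuda.threadIdx.x, in1.size, cuda.blockDim.x):
        out1[i] = in1[i] * in2[i] + in3[i]

out = df.apply_chunks(
    kernel,
    incols=["in1", "in2", "in3"],
    outcols=dict(out1=np.float64),
    chunks=4,
    tpb=8,
)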
- - This method prints information about a DataFrame including - the index dtype and column dtypes, non-null values and memory usage. - - Parameters - ---------- - verbose : bool, optional - Whether to print the full summary. By default, the setting in - ``pandas.options.display.max_info_columns`` is followed. - buf : writable buffer, defaults to sys.stdout - Where to send the output. By default, the output is printed to - sys.stdout. Pass a writable buffer if you need to further process - the output. - max_cols : int, optional - When to switch from the verbose to the truncated output. If the - DataFrame has more than `max_cols` columns, the truncated output - is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used. - memory_usage : bool, str, optional - Specifies whether total memory usage of the DataFrame - elements (including the index) should be displayed. By default, - this follows the ``pandas.options.display.memory_usage`` setting. - True always show memory usage. False never shows memory usage. - A value of 'deep' is equivalent to "True with deep introspection". - Memory usage is shown in human-readable units (base-2 - representation). Without deep introspection a memory estimation is - made based in column dtype and number of rows assuming values - consume the same memory amount for corresponding dtypes. With deep - memory introspection, a real memory usage calculation is performed - at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the frame is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. - - Returns - ------- - None - This method prints a summary of a DataFrame and returns None. - - See Also - -------- - DataFrame.describe: Generate descriptive statistics of DataFrame - columns. - DataFrame.memory_usage: Memory usage of DataFrame columns. - - Examples - -------- - >>> import cudf - >>> int_values = [1, 2, 3, 4, 5] - >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] - >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] - >>> df = cudf.DataFrame({"int_col": int_values, - ... "text_col": text_values, - ... 
"float_col": float_values}) - >>> df - int_col text_col float_col - 0 1 alpha 0.00 - 1 2 beta 0.25 - 2 3 gamma 0.50 - 3 4 delta 0.75 - 4 5 epsilon 1.00 - - Prints information of all columns: - - >>> df.info(verbose=True) - - RangeIndex: 5 entries, 0 to 4 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 - dtypes: float64(1), int64(1), object(1) - memory usage: 130.0+ bytes - - Prints a summary of columns count and its dtypes but not per column - information: - - >>> df.info(verbose=False) - - RangeIndex: 5 entries, 0 to 4 - Columns: 3 entries, int_col to float_col - dtypes: float64(1), int64(1), object(1) - memory usage: 130.0+ bytes - - Pipe output of DataFrame.info to a buffer instead of sys.stdout and - print buffer contents: - - >>> import io - >>> buffer = io.StringIO() - >>> df.info(buf=buffer) - >>> print(buffer.getvalue()) - - RangeIndex: 5 entries, 0 to 4 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 - dtypes: float64(1), int64(1), object(1) - memory usage: 130.0+ bytes - - The `memory_usage` parameter allows deep introspection mode, specially - useful for big DataFrames and fine-tune memory optimization: - - >>> import numpy as np - >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) - >>> df = cudf.DataFrame({ - ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) - ... }) - >>> df.info(memory_usage='deep') - - RangeIndex: 1000000 entries, 0 to 999999 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object - dtypes: object(3) - memory usage: 14.3 MB - """ - if buf is None: - buf = sys.stdout - - lines = [str(type(self))] - - index_name = type(self.index).__name__ - if len(self.index) > 0: - entries_summary = f", {self.index[0]} to {self.index[-1]}" - else: - entries_summary = "" - index_summary = ( - f"{index_name}: {len(self.index)} entries{entries_summary}" - ) - lines.append(index_summary) - - if self._num_columns == 0: - lines.append(f"Empty {type(self).__name__}") - cudf.utils.ioutils.buffer_write_lines(buf, lines) - return - - cols = self._column_names - col_count = len(cols) - - if max_cols is None: - max_cols = pd.options.display.max_info_columns - - max_rows = pd.options.display.max_info_rows - - if null_counts is None: - show_counts = (col_count <= max_cols) and (len(self) < max_rows) - else: - show_counts = null_counts - - exceeds_info_cols = col_count > max_cols - - def _put_str(s, space): - return str(s)[:space].ljust(space) - - def _verbose_repr(): - lines.append(f"Data columns (total {col_count} columns):") - - id_head = " # " - column_head = "Column" - col_space = 2 - - max_col = max(len(pprint_thing(k)) for k in cols) - len_column = len(pprint_thing(column_head)) - space = max(max_col, len_column) + col_space - - max_id = len(pprint_thing(col_count)) - len_id = len(pprint_thing(id_head)) - space_num = max(max_id, len_id) + col_space - counts = None - - header = _put_str(id_head, space_num) + _put_str( - column_head, space - ) - if show_counts: - counts = 
self.count().to_pandas().tolist() - if col_count != len(counts): - raise AssertionError( - f"Columns must equal " - f"counts ({col_count} != {len(counts)})" - ) - count_header = "Non-Null Count" - len_count = len(count_header) - non_null = " non-null" - max_count = max(len(pprint_thing(k)) for k in counts) + len( - non_null - ) - space_count = max(len_count, max_count) + col_space - count_temp = "{count}" + non_null - else: - count_header = "" - space_count = len(count_header) - len_count = space_count - count_temp = "{count}" - - dtype_header = "Dtype" - len_dtype = len(dtype_header) - max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes) - space_dtype = max(len_dtype, max_dtypes) - header += ( - _put_str(count_header, space_count) - + _put_str(dtype_header, space_dtype).rstrip() - ) - - lines.append(header) - lines.append( - _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, space) - + _put_str("-" * len_count, space_count) - + _put_str("-" * len_dtype, space_dtype).rstrip() - ) - - for i, col in enumerate(self._column_names): - dtype = self.dtypes.iloc[i] - col = pprint_thing(col) - - line_no = _put_str(f" {i}", space_num) - count = "" - if show_counts: - count = counts[i] - - lines.append( - line_no - + _put_str(col, space) - + _put_str(count_temp.format(count=count), space_count) - + _put_str(dtype, space_dtype).rstrip() - ) - - def _non_verbose_repr(): - if col_count > 0: - entries_summary = f", {cols[0]} to {cols[-1]}" - else: - entries_summary = "" - columns_summary = f"Columns: {col_count} entries{entries_summary}" - lines.append(columns_summary) - - def _sizeof_fmt(num, size_qualifier): - # returns size in human readable format - for x in ["bytes", "KB", "MB", "GB", "TB"]: - if num < 1024.0: - return f"{num:3.1f}{size_qualifier} {x}" - num /= 1024.0 - return f"{num:3.1f}{size_qualifier} PB" - - if verbose: - _verbose_repr() - elif verbose is False: # specifically set to False, not nesc None - _non_verbose_repr() - else: - if exceeds_info_cols: - _non_verbose_repr() - else: - _verbose_repr() - - dtype_counts = defaultdict(int) - for col in self._data: - dtype_counts[self._data[col].dtype.name] += 1 - - dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(dtype_counts.items())] - lines.append(f"dtypes: {', '.join(dtypes)}") - - if memory_usage is None: - memory_usage = pd.options.display.memory_usage - - if memory_usage: - # append memory usage of df to display - size_qualifier = "" - if memory_usage == "deep": - deep = True - else: - deep = False - if "object" in dtype_counts or self.index.dtype == "object": - size_qualifier = "+" - mem_usage = self.memory_usage(index=True, deep=deep).sum() - lines.append( - f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n" - ) - - cudf.utils.ioutils.buffer_write_lines(buf, lines) - - @_performance_tracking - @docutils.doc_describe() - def describe( - self, - percentiles=None, - include=None, - exclude=None, - ): - """{docstring}""" - - if not include and not exclude: - default_include = [np.number, "datetime"] - data_to_describe = self.select_dtypes(include=default_include) - if data_to_describe._num_columns == 0: - data_to_describe = self - - elif include == "all": - if exclude is not None: - raise ValueError("exclude must be None when include is 'all'") - - data_to_describe = self - else: - data_to_describe = self.select_dtypes( - include=include, exclude=exclude - ) - - if data_to_describe.empty: - raise ValueError("No data of included types.") - - describe_series_list = [ - data_to_describe[col].describe( - 
percentiles=percentiles, - ) - for col in data_to_describe._column_names - ] - if len(describe_series_list) == 1: - return describe_series_list[0].to_frame() - else: - ldesc_indexes = sorted( - (x.index for x in describe_series_list), key=len - ) - names = dict.fromkeys( - [ - name - for idxnames in ldesc_indexes - for name in idxnames.to_pandas() - ], - None, - ) - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - res = cudf.concat( - [ - series.reindex(names, copy=False) - for series in describe_series_list - ], - axis=1, - sort=False, - ) - return res - - @_performance_tracking - def to_pandas( - self, *, nullable: bool = False, arrow_type: bool = False - ) -> pd.DataFrame: - """ - Convert to a Pandas DataFrame. - - Parameters - ---------- - nullable : Boolean, Default False - If ``nullable`` is ``True``, the resulting columns - in the dataframe will be having a corresponding - nullable Pandas dtype. If there is no corresponding - nullable Pandas dtype present, the resulting dtype - will be a regular pandas dtype. - If ``nullable`` is ``False``, - the resulting columns will either convert null - values to ``np.nan`` or ``None`` depending on the dtype. - arrow_type : bool, Default False - Return the columns with a ``pandas.ArrowDtype`` - - Returns - ------- - out : Pandas DataFrame - - Notes - ----- - nullable and arrow_type cannot both be set to ``True`` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [0, 1, 2], 'b': [-3, 2, 0]}) - >>> pdf = df.to_pandas() - >>> pdf - a b - 0 0 -3 - 1 1 2 - 2 2 0 - >>> type(pdf) - - - ``nullable=True`` converts the result to pandas nullable types: - - >>> df = cudf.DataFrame({'a': [0, None, 2], 'b': [True, False, None]}) - >>> df - a b - 0 0 True - 1 False - 2 2 - >>> pdf = df.to_pandas(nullable=True) - >>> pdf - a b - 0 0 True - 1 False - 2 2 - >>> pdf.dtypes - a Int64 - b boolean - dtype: object - >>> pdf = df.to_pandas(nullable=False) - >>> pdf - a b - 0 0.0 True - 1 NaN False - 2 2.0 None - >>> pdf.dtypes - a float64 - b object - dtype: object - - ``arrow_type=True`` converts the result to ``pandas.ArrowDtype``: - - >>> df.to_pandas(arrow_type=True).dtypes - a int64[pyarrow] - b bool[pyarrow] - dtype: object - """ - out_index = self.index.to_pandas() - out_data = { - i: col.to_pandas(nullable=nullable, arrow_type=arrow_type) - for i, col in enumerate(self._columns) - } - - out_df = pd.DataFrame(out_data, index=out_index) - out_df.columns = self._data.to_pandas_index() - - return out_df - - @classmethod - @_performance_tracking - def from_pandas(cls, dataframe, nan_as_null=no_default): - """ - Convert from a Pandas DataFrame. - - Parameters - ---------- - dataframe : Pandas DataFrame object - A Pandas DataFrame object which has to be converted - to cuDF DataFrame. - nan_as_null : bool, Default True - If ``True``, converts ``np.nan`` values to ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - - Raises - ------ - TypeError for invalid input type. 
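As a quick complement to the conversion docstrings, a small round-trip sketch between cudf and pandas (assumes a GPU is available; the column values mirror the nullable example above):

import cudf

gdf = cudf.DataFrame({"a": [0, None, 2], "b": [True, False, None]})
pdf_nullable = gdf.to_pandas(nullable=True)           # Int64 / boolean nullable dtypes
pdf_plain = gdf.to_pandas(nullable=False)             # nulls fall back to NaN / None
gdf_again = cudf.DataFrame.from_pandas(pdf_nullable)  # back to the GPU, nulls preserved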
- - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> data = [[0,1], [1,2], [3,4]] - >>> pdf = pd.DataFrame(data, columns=['a', 'b'], dtype=int) - >>> cudf.from_pandas(pdf) - a b - 0 0 1 - 1 1 2 - 2 3 4 - """ - if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) - - if isinstance(dataframe, pd.DataFrame): - data = { - i: column.as_column(col_value.array, nan_as_null=nan_as_null) - for i, (_, col_value) in enumerate(dataframe.items()) - } - if isinstance(dataframe.index, pd.MultiIndex): - index = cudf.MultiIndex.from_pandas( - dataframe.index, nan_as_null=nan_as_null - ) - else: - index = cudf.Index.from_pandas( - dataframe.index, nan_as_null=nan_as_null - ) - df = cls._from_data(data, index) - # Checks duplicate columns and sets column metadata - df.columns = dataframe.columns - return df - elif hasattr(dataframe, "__dataframe__"): - # TODO: Probably should be handled in the constructor as - # this isn't pandas specific - return from_dataframe(dataframe, allow_copy=True) - else: - raise TypeError( - f"Could not construct DataFrame from {type(dataframe)}" - ) - - @classmethod - @_performance_tracking - def from_arrow(cls, table): - """ - Convert from PyArrow Table to DataFrame. - - Parameters - ---------- - table : PyArrow Table Object - PyArrow Table Object which has to be converted to cudf DataFrame. - - Raises - ------ - TypeError for invalid input type. - - Returns - ------- - cudf DataFrame - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> data = pa.table({"a":[1, 2, 3], "b":[4, 5, 6]}) - >>> cudf.DataFrame.from_arrow(data) - a b - 0 1 4 - 1 2 5 - 2 3 6 - - .. pandas-compat:: - `pandas.DataFrame.from_arrow` - - This method does not exist in pandas but it is similar to - how :meth:`pyarrow.Table.to_pandas` works for PyArrow Tables i.e. - it does not support automatically setting index column(s). - """ - index_col = None - col_index_names = None - physical_column_md = [] - if isinstance(table, pa.Table) and isinstance( - table.schema.pandas_metadata, dict - ): - physical_column_md = table.schema.pandas_metadata["columns"] - index_col = table.schema.pandas_metadata["index_columns"] - if "column_indexes" in table.schema.pandas_metadata: - col_index_names = [] - for col_meta in table.schema.pandas_metadata["column_indexes"]: - col_index_names.append(col_meta["name"]) - - out = super().from_arrow(table) - if col_index_names is not None: - out._data._level_names = col_index_names - if index_col: - if isinstance(index_col[0], dict): - range_meta = index_col[0] - idx = cudf.RangeIndex( - start=range_meta["start"], - stop=range_meta["stop"], - step=range_meta["step"], - name=range_meta["name"], - ) - if len(idx) == len(out): - # `idx` is generated from arrow `pandas_metadata` - # which can get out of date with many of the - # arrow operations. Hence verifying if the - # lengths match, or else don't need to set - # an index at all i.e., Default RangeIndex - # will be set. 
- # See more about the discussion here: - # https://github.com/apache/arrow/issues/15178 - out = out.set_index(idx) - else: - out = out.set_index(index_col) - - if ( - "__index_level_0__" in out.index.names - and len(out.index.names) == 1 - ): - real_index_name = None - for md in physical_column_md: - if md["field_name"] == "__index_level_0__": - real_index_name = md["name"] - break - out.index.name = real_index_name - - return out - - @_performance_tracking - def to_arrow(self, preserve_index=None) -> pa.Table: - """ - Convert to a PyArrow Table. - - Parameters - ---------- - preserve_index : bool, optional - whether index column and its meta data needs to be saved - or not. The default of None will store the index as a - column, except for a RangeIndex which is stored as - metadata only. Setting preserve_index to True will force - a RangeIndex to be materialized. - - Returns - ------- - PyArrow Table - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame( - ... {"a":[1, 2, 3], "b":[4, 5, 6]}, index=[1, 2, 3]) - >>> df.to_arrow() - pyarrow.Table - a: int64 - b: int64 - index: int64 - ---- - a: [[1,2,3]] - b: [[4,5,6]] - index: [[1,2,3]] - >>> df.to_arrow(preserve_index=False) - pyarrow.Table - a: int64 - b: int64 - ---- - a: [[1,2,3]] - b: [[4,5,6]] - """ - - data = self - index_descr = [] - write_index = preserve_index is not False - keep_range_index = write_index and preserve_index is None - index = self.index - index_levels = [self.index] - if write_index: - if isinstance(index, cudf.RangeIndex) and keep_range_index: - index_descr = [ - { - "kind": "range", - "name": index.name, - "start": index.start, - "stop": index.stop, - "step": index.step, - } - ] - else: - if isinstance(index, cudf.RangeIndex): - index = index._as_int_index() - index.name = "__index_level_0__" - if isinstance(index, MultiIndex): - index_descr = index._column_names - index_levels = index.levels - else: - index_descr = ( - index.names if index.name is not None else ("index",) - ) - data = data.copy(deep=False) - for gen_name, col_name in zip( - index_descr, index._column_names - ): - data._insert( - data.shape[1], - gen_name, - index._data[col_name], - ) - - out = super(DataFrame, data).to_arrow() - metadata = pa.pandas_compat.construct_metadata( - columns_to_convert=[self[col] for col in self._column_names], - df=self, - column_names=out.schema.names, - index_levels=index_levels, - index_descriptors=index_descr, - preserve_index=preserve_index, - types=out.schema.types, - ) - - return out.replace_schema_metadata(metadata) - - @_performance_tracking - def to_records(self, index=True, column_dtypes=None, index_dtypes=None): - """Convert to a numpy recarray - - Parameters - ---------- - index : bool - Whether to include the index in the output. - column_dtypes : str, type, dict, default None - If a string or type, the data type to store all columns. If - a dictionary, a mapping of column names and indices (zero-indexed) - to specific data types. Currently not supported. - index_dtypes : str, type, dict, default None - If a string or type, the data type to store all index levels. If - a dictionary, a mapping of index level names and indices - (zero-indexed) to specific data types. - This mapping is applied only if `index=True`. - Currently not supported. - - Returns - ------- - numpy recarray - """ - if column_dtypes is not None: - raise NotImplementedError( - "column_dtypes is currently not supported." 
- ) - elif index_dtypes is not None: - raise NotImplementedError( - "column_dtypes is currently not supported." - ) - members = [("index", self.index.dtype)] if index else [] - members += list(self._dtypes) - dtype = np.dtype(members) - ret = np.recarray(len(self), dtype=dtype) - if index: - ret["index"] = self.index.to_numpy() - for col in self._column_names: - ret[col] = self[col].to_numpy() - return ret - - @classmethod - @_performance_tracking - def from_records( - cls, - data, - index=None, - exclude=None, - columns=None, - coerce_float: bool = False, - nrows: int | None = None, - nan_as_null=False, - ): - """ - Convert structured or record ndarray to DataFrame. - - Parameters - ---------- - data : numpy structured dtype or recarray of ndim=2 - index : str, array-like - The name of the index column in *data*. - If None, the default index is used. - exclude : sequence, default None - Columns or fields to exclude. - Currently not implemented. - columns : list of str - List of column names to include. - coerce_float : bool, default False - Attempt to convert values of non-string, non-numeric objects (like - decimal.Decimal) to floating point, useful for SQL result sets. - Currently not implemented. - nrows : int, default None - Number of rows to read if data is an iterator. - Currently not implemented. - - Returns - ------- - DataFrame - """ - if exclude is not None: - raise NotImplementedError("exclude is currently not supported.") - if coerce_float is not False: - raise NotImplementedError( - "coerce_float is currently not supported." - ) - if nrows is not None: - raise NotImplementedError("nrows is currently not supported.") - - if data.ndim != 1 and data.ndim != 2: - raise ValueError( - f"records dimension expected 1 or 2 but found {data.ndim}" - ) - - num_cols = len(data[0]) - - if columns is None and data.dtype.names is None: - names = range(num_cols) - - elif data.dtype.names is not None: - names = data.dtype.names - - else: - if len(columns) != num_cols: - raise ValueError( - f"columns length expected {num_cols} " - f"but found {len(columns)}" - ) - names = columns - - if data.ndim == 2: - ca_data = { - k: column.as_column(data[:, i], nan_as_null=nan_as_null) - for i, k in enumerate(names) - } - elif data.ndim == 1: - ca_data = { - name: column.as_column(data[name], nan_as_null=nan_as_null) - for name in names - } - - if not is_scalar(index): - new_index = ensure_index(index) - else: - new_index = None - - if isinstance(columns, (pd.Index, cudf.Index)): - level_names = tuple(columns.names) - else: - level_names = None - - df = cls._from_data( - ColumnAccessor( - data=ca_data, # type: ignore[arg-type] - multiindex=isinstance( - columns, (pd.MultiIndex, cudf.MultiIndex) - ), - rangeindex=isinstance( - columns, (range, pd.RangeIndex, cudf.RangeIndex) - ), - level_names=level_names, - label_dtype=getattr(columns, "dtype", None), - verify=False, - ), - index=new_index, - ) - if is_scalar(index) and index is not None: - df = df.set_index(index) - return df - - @classmethod - @_performance_tracking - def _from_arrays( - cls, - data, - index=None, - columns=None, - nan_as_null=False, - ): - """ - Convert an object implementing an array interface to DataFrame. - - Parameters - ---------- - data : object of ndim 1 or 2, - Object implementing ``__array_interface__`` or ``__cuda_array_interface__`` - index : Index or array-like - Index to use for resulting frame. Will default to - RangeIndex if no indexing information part of input data and - no index provided. 
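A brief usage sketch for ``from_records`` with a NumPy structured array (field names are illustrative):

import numpy as np
import cudf

rec = np.array([(1, 0.5), (2, 1.5), (3, 2.5)], dtype=[("a", "i8"), ("b", "f8")])
df = cudf.DataFrame.from_records(rec)                     # columns come from the dtype field names
df_indexed = cudf.DataFrame.from_records(rec, index="a")  # promote field "a" to the index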
- columns : list of str - List of column names to include. - - Returns - ------- - DataFrame - """ - array_data: np.ndarray | cupy.ndarray - if hasattr(data, "__cuda_array_interface__"): - array_data = cupy.asarray(data, order="F") - elif hasattr(data, "__array_interface__"): - array_data = np.asarray(data, order="F") - else: - raise ValueError( - "data must be an object implementing __cuda_array_interface__ or __array_interface__" - ) - - if array_data.ndim not in {1, 2}: - raise ValueError( - f"records dimension expected 1 or 2 but found: {array_data.ndim}" - ) - - if data.ndim == 2: - num_cols = array_data.shape[1] - else: - # Since we validate ndim to be either 1 or 2 above, - # this case can be assumed to be ndim == 1. - num_cols = 1 - - if columns is None: - names = range(num_cols) - else: - if len(columns) != num_cols: - raise ValueError( - f"columns length expected {num_cols} but " - f"found {len(columns)}" - ) - elif len(columns) != len(set(columns)): - raise ValueError("Duplicate column names are not allowed") - names = columns - - if array_data.ndim == 2: - ca_data = { - k: column.as_column(array_data[:, i], nan_as_null=nan_as_null) - for i, k in enumerate(names) - } - elif array_data.ndim == 1: - ca_data = { - names[0]: column.as_column(array_data, nan_as_null=nan_as_null) - } - - if index is not None: - index = ensure_index(index) - - if isinstance(columns, (pd.Index, cudf.Index)): - level_names = tuple(columns.names) - else: - level_names = None - - return cls._from_data( - ColumnAccessor( - data=ca_data, - multiindex=isinstance( - columns, (pd.MultiIndex, cudf.MultiIndex) - ), - rangeindex=isinstance( - columns, (range, pd.RangeIndex, cudf.RangeIndex) - ), - level_names=level_names, - label_dtype=getattr(columns, "dtype", None), - verify=False, - ), - index=index, - ) - - @_performance_tracking - def interpolate( - self, - method="linear", - axis=0, - limit=None, - inplace=False, - limit_direction=None, - limit_area=None, - downcast=None, - **kwargs, - ): - if all(dt == np.dtype("object") for dt in self.dtypes): - raise TypeError( - "Cannot interpolate with all object-dtype " - "columns in the DataFrame. Try setting at " - "least one column to a numeric dtype." - ) - - return super().interpolate( - method=method, - axis=axis, - limit=limit, - inplace=inplace, - limit_direction=limit_direction, - limit_area=limit_area, - downcast=downcast, - **kwargs, - ) - - @_performance_tracking - def quantile( - self, - q=0.5, - axis=0, - numeric_only=True, - interpolation=None, - method="single", - columns=None, - exact=True, - ): - """ - Return values at the given quantile. - - Parameters - ---------- - q : float or array-like - 0 <= q <= 1, the quantile(s) to compute - axis : int - axis is a NON-FUNCTIONAL parameter - numeric_only : bool, default True - If False, the quantile of datetime and timedelta data will be - computed as well. - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} - This parameter specifies the interpolation method to use, - when the desired quantile lies between two data points i and j. - Default is ``'linear'`` for ``method="single"``, and ``'nearest'`` - for ``method="table"``. - - * linear: `i + (j - i) * fraction`, where `fraction` is the - fractional part of the index surrounded by `i` and `j`. - * lower: `i`. - * higher: `j`. - * nearest: `i` or `j` whichever is nearest. - * midpoint: (`i` + `j`) / 2. - method : {'single', 'table'}, default `'single'` - Whether to compute quantiles per-column ('single') or over all - columns ('table'). 
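The ``interpolate`` override above only rejects all-object frames and otherwise defers to the shared implementation; a minimal sketch of the common numeric case (values are illustrative):

import cudf

df = cudf.DataFrame({"a": [1.0, None, 3.0, None, 5.0]})
df.interpolate(method="linear")   # interior nulls are filled by linear interpolation down the column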
When 'table', the only allowed interpolation - methods are 'nearest', 'lower', and 'higher'. - columns : list of str - List of column names to include. - exact : boolean - Whether to use approximate or exact quantile algorithm. - - Returns - ------- - Series or DataFrame - If q is an array or numeric_only is set to False, a DataFrame - will be returned where index is q, the columns are the columns - of self, and the values are the quantile. - - If q is a float, a Series will be returned where the index is - the columns of self and the values are the quantiles. - - Examples - -------- - >>> import cupy as cp - >>> import cudf - >>> df = cudf.DataFrame(cp.array([[1, 1], [2, 10], [3, 100], [4, 100]]), - ... columns=['a', 'b']) - >>> df - a b - 0 1 1 - 1 2 10 - 2 3 100 - 3 4 100 - >>> df.quantile(0.1) - a 1.3 - b 3.7 - Name: 0.1, dtype: float64 - >>> df.quantile([.1, .5]) - a b - 0.1 1.3 3.7 - 0.5 2.5 55.0 - - .. pandas-compat:: - :meth:`pandas.DataFrame.quantile` - - One notable difference from Pandas is when DataFrame is of - non-numeric types and result is expected to be a Series in case of - Pandas. cuDF will return a DataFrame as it doesn't support mixed - types under Series. - """ # noqa: E501 - if axis not in (0, None): - raise NotImplementedError("axis is not implemented yet") - - data_df = self - if numeric_only: - data_df = data_df.select_dtypes( - include=[np.number], exclude=["datetime64", "timedelta64"] - ) - - if columns is None: - columns = set(data_df._column_names) - - if isinstance(q, numbers.Number): - q_is_number = True - qs = [float(q)] - elif pd.api.types.is_list_like(q): - q_is_number = False - qs = q - else: - msg = "`q` must be either a single element or list" - raise TypeError(msg) - - if method == "table": - interpolation = interpolation or "nearest" - result = self._quantile_table(qs, interpolation.upper()) - - if q_is_number: - result = result.transpose() - return Series._from_column( - result._columns[0], name=q, index=result.index - ) - else: - # Ensure that qs is non-scalar so that we always get a column back. - interpolation = interpolation or "linear" - result = {} - for k in data_df._column_names: - if k in columns: - ser = data_df[k] - res = ser.quantile( - qs, - interpolation=interpolation, - exact=exact, - quant_index=False, - )._column - if len(res) == 0: - res = column.column_empty_like( - qs, dtype=ser.dtype, masked=True, newsize=len(qs) - ) - result[k] = res - result = DataFrame._from_data(result) - - if q_is_number and numeric_only: - result = result.fillna(np.nan).iloc[0] - result.index = data_df.keys() - result.name = q - return result - - result.index = cudf.Index(list(map(float, qs)), dtype="float64") - return result - - @_performance_tracking - def isin(self, values): - """ - Whether each element in the DataFrame is contained in values. - - Parameters - ---------- - values : iterable, Series, DataFrame or dict - The result will only be true at a location if all - the labels match. If values is a Series, that's the index. - If values is a dict, the keys must be the column names, - which must match. If values is a DataFrame, then both the - index and column labels must match. - - Returns - ------- - DataFrame: - DataFrame of booleans showing whether each element in - the DataFrame is contained in values. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, - ... 
index=['falcon', 'dog']) - >>> df - num_legs num_wings - falcon 2 2 - dog 4 0 - - When ``values`` is a list check whether every value in the DataFrame - is present in the list (which animals have 0 or 2 legs or wings) - - >>> df.isin([0, 2]) - num_legs num_wings - falcon True True - dog False True - - When ``values`` is a dict, we can pass values to check for each - column separately: - - >>> df.isin({'num_wings': [0, 3]}) - num_legs num_wings - falcon False False - dog False True - - When ``values`` is a Series or DataFrame the index and column must - match. Note that 'falcon' does not match based on the number of legs - in other. - - >>> other = cudf.DataFrame({'num_legs': [8, 2], 'num_wings': [0, 2]}, - ... index=['spider', 'falcon']) - >>> df.isin(other) - num_legs num_wings - falcon True True - dog False False - """ - # TODO: propagate nulls through isin - # https://github.com/rapidsai/cudf/issues/7556 - - fill_value = cudf.Scalar(False) - - def make_false_column_like_self(): - return column.as_column(fill_value, length=len(self), dtype="bool") - - # Preprocess different input types into a mapping from column names to - # a list of values to check. - result = {} - if isinstance(values, IndexedFrame): - # Note: In the case where values is a Series, computing some - # information about the values column outside the loop may result - # in performance gains. However, since categorical conversion - # depends on the current column in the loop, using the correct - # precomputed variables inside the loop requires nontrivial logic. - # This optimization could be attempted if `isin` ever becomes a - # bottleneck. - if ( - isinstance(values, (Series, DataFrame)) - and not values.index.is_unique - ): - # if DataFrame ever supports duplicate columns - # would need to check that here - raise ValueError("cannot compute isin with a duplicate axis.") - values = values.reindex(self.index) - other_cols = ( - values._data - if isinstance(values, DataFrame) - else {name: values._column for name in self._data} - ) - for col, self_col in self._column_labels_and_values: - if col in other_cols: - other_col = other_cols[col] - self_is_cat = isinstance(self_col, CategoricalColumn) - other_is_cat = isinstance(other_col, CategoricalColumn) - - if self_is_cat != other_is_cat: - # It is valid to compare the levels of a categorical - # column to a non-categorical column. - if self_is_cat: - self_col = self_col._get_decategorized_column() - else: - other_col = other_col._get_decategorized_column() - - # We use the type checks from _before_ the conversion - # because if only one was categorical then it's already - # been converted and we have to check if they're strings. - if self_is_cat and other_is_cat: - self_is_str = other_is_str = False - else: - # These checks must happen after the conversions above - # since numpy can't handle categorical dtypes. - self_is_str = is_string_dtype(self_col.dtype) - other_is_str = is_string_dtype(other_col.dtype) - - if self_is_str != other_is_str: - # Strings can't compare to anything else. 
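The quantile examples above only exercise the default ``method='single'``; a short sketch of the ``'table'`` path, where interpolation is restricted to 'nearest', 'lower', or 'higher' (data is illustrative):

import cudf

df = cudf.DataFrame({"a": [1, 2, 3, 4], "b": [1, 10, 100, 100]})
df.quantile(0.5, method="table", interpolation="nearest")         # selects a whole existing row
df.quantile([0.25, 0.75], method="table", interpolation="lower")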
- result[col] = make_false_column_like_self() - else: - result[col] = (self_col == other_col).fillna(False) - else: - result[col] = make_false_column_like_self() - elif is_dict_like(values): - for name, col in self._column_labels_and_values: - if name in values: - result[name] = col.isin(values[name]) - else: - result[name] = make_false_column_like_self() - elif is_list_like(values): - for name, col in self._column_labels_and_values: - result[name] = col.isin(values) - else: - raise TypeError( - "only list-like or dict-like objects are " - "allowed to be passed to DataFrame.isin(), " - "you passed a " - f"'{type(values).__name__}'" - ) - - # TODO: Update this logic to properly preserve MultiIndex columns. - return DataFrame._from_data(result, self.index) - - # - # Stats - # - @_performance_tracking - def _prepare_for_rowwise_op(self, method, skipna, numeric_only): - """Prepare a DataFrame for CuPy-based row-wise operations.""" - - if method not in _cupy_nan_methods_map and any( - col.nullable for col in self._columns - ): - msg = ( - f"Row-wise operations to calculate '{method}' do not " - f"currently support columns with null values. " - f"Consider removing them with .dropna() " - f"or using .fillna()." - ) - raise ValueError(msg) - - if numeric_only: - filtered = self.select_dtypes(include=[np.number, np.bool_]) - else: - filtered = self.copy(deep=False) - - is_pure_dt = all(dt.kind == "M" for dt in filtered.dtypes) - - common_dtype = find_common_type(filtered.dtypes) - if ( - not numeric_only - and is_string_dtype(common_dtype) - and any(not is_string_dtype(dt) for dt in filtered.dtypes) - ): - raise TypeError( - f"Cannot perform row-wise {method} across mixed-dtype columns," - " try type-casting all the columns to same dtype." - ) - - if not skipna and any(col.nullable for col in filtered._columns): - length = filtered._data.nrows - ca = ColumnAccessor( - { - name: col._get_mask_as_column() - if col.nullable - else as_column(True, length=length) - for name, col in filtered._data.items() - }, - verify=False, - ) - mask = DataFrame._from_data(ca) - mask = mask.all(axis=1) - else: - mask = None - - coerced = filtered.astype(common_dtype, copy=False) - if is_pure_dt: - # Further convert into cupy friendly types - coerced = coerced.astype("int64", copy=False) - return coerced, mask, common_dtype - - @_performance_tracking - def count(self, axis=0, numeric_only=False): - """ - Count ``non-NA`` cells for each column or row. - - The values ``None``, ``NaN``, ``NaT`` are considered ``NA``. - - Returns - ------- - Series - For each column/row the number of non-NA/null entries. - - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> df = cudf.DataFrame({"Person": - ... ["John", "Myla", "Lewis", "John", "Myla"], - ... "Age": [24., np.nan, 21., 33, 26], - ... "Single": [False, True, True, True, False]}) - >>> df.count() - Person 5 - Age 4 - Single 5 - dtype: int64 - - .. pandas-compat:: - :meth:`pandas.DataFrame.count` - - Parameters currently not supported are `axis` and `numeric_only`. 
- """ - axis = self._get_axis_from_axis_arg(axis) - if axis != 0: - raise NotImplementedError("Only axis=0 is currently supported.") - length = len(self) - return Series._from_column( - as_column([length - col.null_count for col in self._columns]), - index=cudf.Index(self._column_names), - ) - - _SUPPORT_AXIS_LOOKUP = { - 0: 0, - 1: 1, - "index": 0, - "columns": 1, - } - - @_performance_tracking - def _reduce( - self, - op, - axis=None, - numeric_only=False, - **kwargs, - ): - source = self - - if axis is None: - assert PANDAS_LT_300, "Replace if/else with just axis=2" - # TODO(pandas3.0): Remove if/else for just axis = 2 - if op in {"sum", "product", "std", "var"}: - # pandas only raises FutureWarning for these ops - # though it applies for all reductions - warnings.warn( - f"In a future version, {type(self).__name__}" - f".{op}(axis=None) will return a scalar {op} over " - "the entire DataFrame. To retain the old behavior, " - f"use '{type(self).__name__}.{op}(axis=0)' or " - f"just '{type(self)}.{op}()'", - FutureWarning, - ) - axis = 0 - else: - axis = 2 - elif axis is no_default: - axis = 0 - else: - axis = source._get_axis_from_axis_arg(axis) - - if numeric_only: - numeric_cols = ( - name for name, dtype in self._dtypes if is_numeric_dtype(dtype) - ) - source = self._get_columns_by_label(numeric_cols) - if source.empty: - return Series( - index=self._data.to_pandas_index()[:0] - if axis == 0 - else source.index, - dtype="float64", - ) - if ( - axis == 2 - and op in {"kurtosis", "skew"} - and self._num_rows < 4 - and self._num_columns > 1 - ): - # Total number of elements may satisfy the min number of values - # to compute skew/kurtosis - return getattr(concat_columns(source._columns), op)(**kwargs) - elif axis == 1: - return source._apply_cupy_method_axis_1(op, **kwargs) - else: - axis_0_results = [] - for col_label, col in source._column_labels_and_values: - try: - axis_0_results.append(getattr(col, op)(**kwargs)) - except AttributeError as err: - if numeric_only: - raise NotImplementedError( - f"Column {col_label} with type {col.dtype} does not support {op}" - ) from err - elif not is_numeric_dtype(col.dtype): - raise TypeError( - "Non numeric columns passed with " - "`numeric_only=False`, pass `numeric_only=True` " - f"to perform DataFrame.{op}" - ) from err - else: - raise - if axis == 2: - return getattr( - as_column(axis_0_results, nan_as_null=False), op - )(**kwargs) - else: - source_dtypes = [dtype for _, dtype in source._dtypes] - common_dtype = find_common_type(source_dtypes) - if ( - is_object_dtype(common_dtype) - and any( - not is_object_dtype(dtype) for dtype in source_dtypes - ) - or common_dtype.kind != "b" - and any(dtype.kind == "b" for dtype in source_dtypes) - ): - raise TypeError( - "Columns must all have the same dtype to " - f"perform {op=} with {axis=}" - ) - pd_index = source._data.to_pandas_index() - if source._data.multiindex: - idx = MultiIndex.from_pandas(pd_index) - else: - idx = cudf.Index.from_pandas(pd_index) - return Series._from_column( - as_column(axis_0_results), index=idx - ) - - @_performance_tracking - def _scan( - self, - op, - axis=None, - *args, - **kwargs, - ): - if axis is None: - axis = 0 - axis = self._get_axis_from_axis_arg(axis) - - if axis == 0: - return super()._scan(op, axis=axis, *args, **kwargs) - elif axis == 1: - return self._apply_cupy_method_axis_1(op, **kwargs) - - @_performance_tracking - def mode(self, axis=0, numeric_only=False, dropna=True): - """ - Get the mode(s) of each element along the selected axis. 
- - The mode of a set of values is the value that appears most often. - It can be multiple values. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to iterate over while searching for the mode: - - - 0 or 'index' : get mode of each column - - 1 or 'columns' : get mode of each row. - numeric_only : bool, default False - If True, only apply to numeric columns. - dropna : bool, default True - Don't consider counts of NA/NaN/NaT. - - Returns - ------- - DataFrame - The modes of each column or row. - - See Also - -------- - cudf.Series.mode : Return the highest frequency value - in a Series. - cudf.Series.value_counts : Return the counts of values - in a Series. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({ - ... "species": ["bird", "mammal", "arthropod", "bird"], - ... "legs": [2, 4, 8, 2], - ... "wings": [2.0, None, 0.0, None] - ... }) - >>> df - species legs wings - 0 bird 2 2.0 - 1 mammal 4 - 2 arthropod 8 0.0 - 3 bird 2 - - By default, missing values are not considered, and the mode of wings - are both 0 and 2. The second row of species and legs contains ``NA``, - because they have only one mode, but the DataFrame has two rows. - - >>> df.mode() - species legs wings - 0 bird 2 0.0 - 1 2.0 - - Setting ``dropna=False``, ``NA`` values are considered and they can be - the mode (like for wings). - - >>> df.mode(dropna=False) - species legs wings - 0 bird 2 - - Setting ``numeric_only=True``, only the mode of numeric columns is - computed, and columns of other types are ignored. - - >>> df.mode(numeric_only=True) - legs wings - 0 2 0.0 - 1 2.0 - - .. pandas-compat:: - :meth:`pandas.DataFrame.transpose` - - ``axis`` parameter is currently not supported. - """ - if axis not in (0, "index"): - raise NotImplementedError("Only axis=0 is currently supported") - - if numeric_only: - data_df = self.select_dtypes( - include=[np.number], exclude=["datetime64", "timedelta64"] - ) - else: - data_df = self - - mode_results = [ - data_df[col].mode(dropna=dropna) for col in data_df._data - ] - - if len(mode_results) == 0: - return DataFrame() - - with warnings.catch_warnings(): - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.simplefilter("ignore", FutureWarning) - df = cudf.concat(mode_results, axis=1) - - if isinstance(df, Series): - df = df.to_frame() - - df._set_columns_like(data_df._data) - - return df - - @_performance_tracking - def all(self, axis=0, bool_only=None, skipna=True, **kwargs): - obj = self.select_dtypes(include="bool") if bool_only else self - return super(DataFrame, obj).all(axis, skipna, **kwargs) - - @_performance_tracking - def any(self, axis=0, bool_only=None, skipna=True, **kwargs): - obj = self.select_dtypes(include="bool") if bool_only else self - return super(DataFrame, obj).any(axis, skipna, **kwargs) - - @_performance_tracking - def _apply_cupy_method_axis_1(self, method, *args, **kwargs): - # This method uses cupy to perform scans and reductions along rows of a - # DataFrame. Since cuDF is designed around columnar storage and - # operations, we convert DataFrames to 2D cupy arrays for these ops. - - # for dask metadata compatibility - skipna = kwargs.pop("skipna", None) - skipna = True if skipna is None else skipna - if method not in _cupy_nan_methods_map and skipna not in ( - None, - True, - 1, - ): - raise NotImplementedError( - f"Row-wise operations to calculate '{method}'" - f" currently do not support `skipna=False`." 
- ) - - level = kwargs.pop("level", None) - if level not in (None,): - raise NotImplementedError( - "Row-wise operations currently do not support `level`." - ) - - numeric_only = kwargs.pop("numeric_only", False) - - min_count = kwargs.pop("min_count", None) - if min_count not in (None, 0): - raise NotImplementedError( - "Row-wise operations currently do not support `min_count`." - ) - - bool_only = kwargs.pop("bool_only", None) - if bool_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not support `bool_only`." - ) - - # This parameter is only necessary for axis 0 reductions that cuDF - # performs internally. cupy already upcasts smaller integer/bool types - # to int64 when accumulating. - kwargs.pop("cast_to_int", None) - - prepared, mask, common_dtype = self._prepare_for_rowwise_op( - method, skipna, numeric_only - ) - for col in prepared._column_names: - if prepared._data[col].nullable: - prepared._data[col] = ( - prepared._data[col] - .astype( - cudf.utils.dtypes.get_min_float_dtype( - prepared._data[col] - ) - if common_dtype.kind != "M" - else cudf.dtype("float64") - ) - .fillna(np.nan) - ) - arr = prepared.to_cupy() - - if skipna is not False and method in _cupy_nan_methods_map: - method = _cupy_nan_methods_map[method] - - result = getattr(cupy, method)(arr, axis=1, **kwargs) - - if result.ndim == 1: - type_coerced_methods = { - "count", - "min", - "max", - "sum", - "prod", - "cummin", - "cummax", - "cumsum", - "cumprod", - } - result_dtype = ( - common_dtype - if method in type_coerced_methods - or (common_dtype is not None and common_dtype.kind == "M") - else None - ) - result = column.as_column(result, dtype=result_dtype) - if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) - return Series._from_column(result, index=self.index) - else: - result_df = DataFrame(result, index=self.index) - result_df._set_columns_like(prepared._data) - return result_df - - @_performance_tracking - def select_dtypes(self, include=None, exclude=None): - """Return a subset of the DataFrame's columns based on the column dtypes. - - Parameters - ---------- - include : str or list - which columns to include based on dtypes - exclude : str or list - which columns to exclude based on dtypes - - Returns - ------- - DataFrame - The subset of the frame including the dtypes - in ``include`` and excluding the dtypes in ``exclude``. - - Raises - ------ - ValueError - - If both of ``include`` and ``exclude`` are empty - - If ``include`` and ``exclude`` have overlapping elements - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2] * 3, - ... 'b': [True, False] * 3, - ... 
'c': [1.0, 2.0] * 3}) - >>> df - a b c - 0 1 True 1.0 - 1 2 False 2.0 - 2 1 True 1.0 - 3 2 False 2.0 - 4 1 True 1.0 - 5 2 False 2.0 - >>> df.select_dtypes(include='bool') - b - 0 True - 1 False - 2 True - 3 False - 4 True - 5 False - >>> df.select_dtypes(include=['float64']) - c - 0 1.0 - 1 2.0 - 2 1.0 - 3 2.0 - 4 1.0 - 5 2.0 - >>> df.select_dtypes(exclude=['int']) - b c - 0 True 1.0 - 1 False 2.0 - 2 True 1.0 - 3 False 2.0 - 4 True 1.0 - 5 False 2.0 - """ # noqa: E501 - - # code modified from: - # https://github.com/pandas-dev/pandas/blob/master/pandas/core/frame.py#L3196 - - if not isinstance(include, (list, tuple)): - include = (include,) if include is not None else () - if not isinstance(exclude, (list, tuple)): - exclude = (exclude,) if exclude is not None else () - - # cudf_dtype_from_pydata_dtype can distinguish between - # np.float and np.number - selection = tuple(map(frozenset, (include, exclude))) - - if not any(selection): - raise ValueError( - "at least one of include or exclude must be nonempty" - ) - - include, exclude = map( - lambda x: frozenset(map(cudf_dtype_from_pydata_dtype, x)), - selection, - ) - - # can't both include AND exclude! - if not include.isdisjoint(exclude): - raise ValueError( - f"include and exclude overlap on {(include & exclude)}" - ) - - # include all subtypes - include_subtypes = set() - for dtype in self.dtypes: - for i_dtype in include: - # category handling - if i_dtype == cudf.CategoricalDtype: - # Matches cudf & pandas dtype objects - include_subtypes.add(i_dtype) - elif inspect.isclass(dtype.type): - if issubclass(dtype.type, i_dtype): - include_subtypes.add(dtype.type) - - # exclude all subtypes - exclude_subtypes = set() - for dtype in self.dtypes: - for e_dtype in exclude: - # category handling - if e_dtype == cudf.CategoricalDtype: - # Matches cudf & pandas dtype objects - exclude_subtypes.add(e_dtype) - elif inspect.isclass(dtype.type): - if issubclass(dtype.type, e_dtype): - exclude_subtypes.add(dtype.type) - - include_all = {cudf_dtype_from_pydata_dtype(d) for d in self.dtypes} - - if include: - inclusion = include_all & include_subtypes - elif exclude: - inclusion = include_all - else: - inclusion = set() - # remove all exclude types - inclusion = inclusion - exclude_subtypes - - to_select = [ - label - for label, dtype in self._dtypes - if cudf_dtype_from_pydata_dtype(dtype) in inclusion - ] - return self.loc[:, to_select] - - @ioutils.doc_to_parquet() - def to_parquet( - self, - path, - engine="cudf", - compression="snappy", - index=None, - partition_cols=None, - partition_file_name=None, - partition_offsets=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - storage_options=None, - return_metadata=False, - use_dictionary=True, - header_version="1.0", - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, - *args, - **kwargs, - ): - """{docstring}""" - from cudf.io import parquet - - return parquet.to_parquet( - self, - path=path, - engine=engine, - compression=compression, - index=index, - partition_cols=partition_cols, - partition_file_name=partition_file_name, - partition_offsets=partition_offsets, - statistics=statistics, - metadata_file_path=metadata_file_path, - int96_timestamps=int96_timestamps, - row_group_size_bytes=row_group_size_bytes, - row_group_size_rows=row_group_size_rows, - max_page_size_bytes=max_page_size_bytes, - 
max_page_size_rows=max_page_size_rows, - storage_options=storage_options, - return_metadata=return_metadata, - use_dictionary=use_dictionary, - header_version=header_version, - skip_compression=skip_compression, - column_encoding=column_encoding, - column_type_length=column_type_length, - output_as_binary=output_as_binary, - *args, - **kwargs, - ) - - @ioutils.doc_to_feather() - def to_feather(self, path, *args, **kwargs): - """{docstring}""" - from cudf.io import feather - - feather.to_feather(self, path, *args, **kwargs) - - @ioutils.doc_dataframe_to_csv() - def to_csv( - self, - path_or_buf=None, - sep=",", - na_rep="", - columns=None, - header=True, - index=True, - encoding=None, - compression=None, - lineterminator=None, - chunksize=None, - storage_options=None, - ): - """{docstring}""" - from cudf.io import csv - - if lineterminator is None: - lineterminator = os.linesep - return csv.to_csv( - self, - path_or_buf=path_or_buf, - sep=sep, - na_rep=na_rep, - columns=columns, - header=header, - index=index, - lineterminator=lineterminator, - chunksize=chunksize, - encoding=encoding, - compression=compression, - storage_options=storage_options, - ) - - @ioutils.doc_to_orc() - def to_orc( - self, - fname, - compression="snappy", - statistics="ROWGROUP", - stripe_size_bytes=None, - stripe_size_rows=None, - row_index_stride=None, - cols_as_map_type=None, - storage_options=None, - index=None, - ): - """{docstring}""" - from cudf.io import orc - - return orc.to_orc( - df=self, - fname=fname, - compression=compression, - statistics=statistics, - stripe_size_bytes=stripe_size_bytes, - stripe_size_rows=stripe_size_rows, - row_index_stride=row_index_stride, - cols_as_map_type=cols_as_map_type, - storage_options=storage_options, - index=index, - ) - - @_performance_tracking - def stack(self, level=-1, dropna=no_default, future_stack=False): - """Stack the prescribed level(s) from columns to index - - Return a reshaped DataFrame or Series having a multi-level - index with one or more new inner-most levels compared to - the current DataFrame. The new inner-most levels are created - by pivoting the columns of the current dataframe: - - - if the columns have a single level, the output is a Series; - - if the columns have multiple levels, the new index - level(s) is (are) taken from the prescribed level(s) and - the output is a DataFrame. - - Parameters - ---------- - level : int, str, list default -1 - Level(s) to stack from the column axis onto the index axis, - defined as one index or label, or a list of indices or labels. - dropna : bool, default True - Whether to drop rows in the resulting Frame/Series with missing - values. When multiple levels are specified, `dropna==False` is - unsupported. - - Returns - ------- - DataFrame or Series - Stacked dataframe or series. - - See Also - -------- - DataFrame.unstack : Unstack prescribed level(s) from index axis - onto column axis. - DataFrame.pivot : Reshape dataframe from long format to wide - format. - DataFrame.pivot_table : Create a spreadsheet-style pivot table - as a DataFrame. - - Notes - ----- - The function is named by analogy with a collection of books - being reorganized from being side by side on a horizontal - position (the columns of the dataframe) to being stacked - vertically on top of each other (in the index of the - dataframe). - - Examples - -------- - **Single level columns** - - >>> df_single_level_cols = cudf.DataFrame([[0, 1], [2, 3]], - ... index=['cat', 'dog'], - ... 
columns=['weight', 'height']) - - Stacking a dataframe with a single level column axis returns a Series: - - >>> df_single_level_cols - weight height - cat 0 1 - dog 2 3 - >>> df_single_level_cols.stack() - cat height 1 - weight 0 - dog height 3 - weight 2 - dtype: int64 - - **Multi level columns: simple case** - - >>> import pandas as pd - >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), - ... ('weight', 'pounds')]) - >>> df_multi_level_cols1 = cudf.DataFrame([[1, 2], [2, 4]], - ... index=['cat', 'dog'], - ... columns=multicol1) - - Stacking a dataframe with a multi-level column axis: - - >>> df_multi_level_cols1 - weight - kg pounds - cat 1 2 - dog 2 4 - >>> df_multi_level_cols1.stack() - weight - cat kg 1 - pounds 2 - dog kg 2 - pounds 4 - - **Missing values** - - >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), - ... ('height', 'm')]) - >>> df_multi_level_cols2 = cudf.DataFrame([[1.0, 2.0], [3.0, 4.0]], - ... index=['cat', 'dog'], - ... columns=multicol2) - - It is common to have missing values when stacking a dataframe - with multi-level columns, as the stacked dataframe typically - has more values than the original dataframe. Missing values - are filled with NULLs: - - >>> df_multi_level_cols2 - weight height - kg m - cat 1.0 2.0 - dog 3.0 4.0 - >>> df_multi_level_cols2.stack() - weight height - cat kg 1.0 - m 2.0 - dog kg 3.0 - m 4.0 - - **Prescribing the level(s) to be stacked** - - The first parameter controls which level or levels are stacked: - - >>> df_multi_level_cols2.stack(0) - kg m - cat height 2.0 - weight 1.0 - dog height 4.0 - weight 3.0 - - >>> df_multi_level_cols2.stack([0, 1]) - cat height m 2.0 - weight kg 1.0 - dog height m 4.0 - weight kg 3.0 - dtype: float64 - """ - if future_stack: - if dropna is not no_default: - raise ValueError( - "dropna must be unspecified with future_stack=True as " - "the new implementation does not introduce rows of NA " - "values. This argument will be removed in a future " - "version of cudf." - ) - else: - if dropna is not no_default or self._data.nlevels > 1: - warnings.warn( - "The previous implementation of stack is deprecated and " - "will be removed in a future version of cudf. Specify " - "future_stack=True to adopt the new implementation and " - "silence this warning.", - FutureWarning, - ) - if dropna is no_default: - dropna = True - - if isinstance(level, (int, str)): - level = [level] - elif isinstance(level, list): - if not all(isinstance(lv, (int, str)) for lv in level): - raise ValueError( - "level must be either an int/str, or a list of int/str." - ) - else: - raise ValueError( - "level must be either an int/str, or a list of int/str." - ) - - level = [level] if not isinstance(level, list) else level - - if not future_stack and len(level) > 1 and not dropna: - raise NotImplementedError( - "When stacking multiple levels, setting `dropna` to False " - "will generate new column combination that does not exist " - "in original dataframe. This behavior is unsupported in " - "cuDF. 
See pandas deprecation note: " - "https://github.com/pandas-dev/pandas/issues/53515" - ) - - # Compute the columns to stack based on specified levels - - level_indices: list[int] = [] - - # If all passed in level names match up to the dataframe column's level - # names, cast them to indices - if all(lv in self._data.level_names for lv in level): - level_indices = [self._data.level_names.index(lv) for lv in level] - elif not all(isinstance(lv, int) for lv in level): - raise ValueError( - "`level` must either be a list of names or positions, not a " - "mixture of both." - ) - else: - # Must be a list of positions, normalize negative positions - level_indices = [ - lv + self._data.nlevels if lv < 0 else lv for lv in level - ] - - unnamed_levels_indices = [ - i for i in range(self._data.nlevels) if i not in level_indices - ] - has_unnamed_levels = len(unnamed_levels_indices) > 0 - - column_name_idx = self._data.to_pandas_index() - # Construct new index from the levels specified by `level` - named_levels = pd.MultiIndex.from_arrays( - [column_name_idx.get_level_values(lv) for lv in level_indices] - ) - - # Since `level` may only specify a subset of all levels, `unique()` is - # required to remove duplicates. In pandas, the order of the keys in - # the specified levels are always sorted. - unique_named_levels = named_levels.unique() - if not future_stack: - unique_named_levels = unique_named_levels.sort_values() - - # Each index from the original dataframe should repeat by the number - # of unique values in the named_levels - repeated_index = self.index.repeat(len(unique_named_levels)) - - # Each column name should tile itself by len(df) times - tiled_index = libcudf.reshape.tile( - [ - as_column(unique_named_levels.get_level_values(i)) - for i in range(unique_named_levels.nlevels) - ], - self.shape[0], - ) - - # Assemble the final index - new_index_columns = [*repeated_index._columns, *tiled_index] - index_names = [*self.index.names, *unique_named_levels.names] - new_index = MultiIndex._from_data(dict(enumerate(new_index_columns))) - new_index.names = index_names - - # Compute the column indices that serves as the input for - # `interleave_columns` - column_idx_df = pd.DataFrame( - data=range(self._num_columns), index=named_levels - ) - - column_indices: list[list[int]] = [] - if has_unnamed_levels: - unnamed_level_values = list( - map(column_name_idx.get_level_values, unnamed_levels_indices) - ) - unnamed_level_values = pd.MultiIndex.from_arrays( - unnamed_level_values - ) - - def unnamed_group_generator(): - if has_unnamed_levels: - for _, grpdf in column_idx_df.groupby(by=unnamed_level_values): - # When stacking part of the levels, some combinations - # of keys may not be present in this group but can be - # present in others. Reindexing with the globally computed - # `unique_named_levels` assigns -1 to these key - # combinations, representing an all-null column that - # is used in the subsequent libcudf call. - if future_stack: - yield grpdf.reindex( - unique_named_levels, axis=0, fill_value=-1 - ).values - else: - yield ( - grpdf.reindex( - unique_named_levels, axis=0, fill_value=-1 - ) - .sort_index() - .values - ) - else: - if future_stack: - yield column_idx_df.values - else: - yield column_idx_df.sort_index().values - - column_indices = list(unnamed_group_generator()) - - # For each of the group constructed from the unnamed levels, - # invoke `interleave_columns` to stack the values. 
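The stack examples earlier use the legacy path; a short sketch of opting into the new behaviour with ``future_stack`` (data is illustrative):

import cudf

df = cudf.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["cat", "dog"])
df.stack()                   # legacy implementation, dropna defaults to True
df.stack(future_stack=True)  # new implementation; dropna must be left unspecified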
- stacked = [] - - for column_idx in column_indices: - # Collect columns based on indices, append None for -1 indices. - columns = [ - None if i == -1 else self._data.select_by_index(i).columns[0] - for i in column_idx - ] - - # Collect datatypes and cast columns as that type - common_type = np.result_type( - *(col.dtype for col in columns if col is not None) - ) - - all_nulls = functools.cache( - functools.partial( - column_empty, self.shape[0], common_type, masked=True - ) - ) - - # homogenize the dtypes of the columns - homogenized = [ - col.astype(common_type) if col is not None else all_nulls() - for col in columns - ] - - stacked.append(libcudf.reshape.interleave_columns(homogenized)) - - # Construct the resulting dataframe / series - if not has_unnamed_levels: - result = Series._from_column(stacked[0], index=new_index) - else: - if unnamed_level_values.nlevels == 1: - unnamed_level_values = unnamed_level_values.get_level_values(0) - unnamed_level_values = unnamed_level_values.unique() - - data = ColumnAccessor( - dict( - zip( - unnamed_level_values, - [ - stacked[i] - for i in unnamed_level_values.argsort().argsort() - ] - if not future_stack - else [ - stacked[i] for i in unnamed_level_values.argsort() - ], - ) - ), - isinstance(unnamed_level_values, pd.MultiIndex), - unnamed_level_values.names, - ) - - result = DataFrame._from_data(data, index=new_index) - - if not future_stack and dropna: - return result.dropna(how="all") - else: - return result - - @_performance_tracking - def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False): - """Compute the covariance matrix of a DataFrame. - - Parameters - ---------- - min_periods : int, optional - Minimum number of observations required per pair of columns to - have a valid result. - Currently not supported. - - ddof : int, default 1 - Delta degrees of freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements. - - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - Currently not supported. - - Returns - ------- - cov : DataFrame - """ - if min_periods is not None: - raise NotImplementedError( - "min_periods is currently not supported." - ) - - if numeric_only is not False: - raise NotImplementedError( - "numeric_only is currently not supported." - ) - - cov = cupy.cov(self.values, ddof=ddof, rowvar=False) - cols = self._data.to_pandas_index() - df = DataFrame(cupy.asfortranarray(cov), index=cols) - df._set_columns_like(self._data) - return df - - def corr( - self, method="pearson", min_periods=None, numeric_only: bool = False - ): - """Compute the correlation matrix of a DataFrame. - - Parameters - ---------- - method : {'pearson', 'spearman'}, default 'pearson' - Method used to compute correlation: - - - pearson : Standard correlation coefficient - - spearman : Spearman rank correlation - - min_periods : int, optional - Minimum number of observations required per pair of columns to - have a valid result. - - Returns - ------- - DataFrame - The requested correlation matrix. - """ - if method == "pearson": - values = self.values - elif method == "spearman": - values = self.rank().values - else: - raise ValueError("method must be either 'pearson', 'spearman'") - - if min_periods is not None: - raise NotImplementedError("Unsupported argument 'min_periods'") - - if numeric_only is not False: - raise NotImplementedError( - "numeric_only is currently not supported." 
- ) - - corr = cupy.corrcoef(values, rowvar=False) - cols = self._data.to_pandas_index() - df = DataFrame(cupy.asfortranarray(corr), index=cols) - df._set_columns_like(self._data) - return df - - @_performance_tracking - def to_struct(self, name=None): - """ - Return a struct Series composed of the columns of the DataFrame. - - Parameters - ---------- - name: optional - Name of the resulting Series - - Notes - ----- - Note: a copy of the columns is made. - """ - if not all(isinstance(name, str) for name in self._column_names): - warnings.warn( - "DataFrame contains non-string column name(s). Struct column " - "requires field name to be string. Non-string column names " - "will be casted to string as the field name." - ) - fields = {str(name): dtype for name, dtype in self._dtypes} - col = StructColumn( - data=None, - dtype=cudf.StructDtype(fields=fields), - children=tuple(col.copy(deep=True) for col in self._columns), - size=len(self), - offset=0, - ) - return cudf.Series._from_column( - col, - index=self.index, - name=name, - ) - - @_performance_tracking - def keys(self): - """ - Get the columns. - This is index for Series, columns for DataFrame. - - Returns - ------- - Index - Columns of DataFrame. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'one' : [1, 2, 3], 'five' : ['a', 'b', 'c']}) - >>> df - one five - 0 1 a - 1 2 b - 2 3 c - >>> df.keys() - Index(['one', 'five'], dtype='object') - >>> df = cudf.DataFrame(columns=[0, 1, 2, 3]) - >>> df - Empty DataFrame - Columns: [0, 1, 2, 3] - Index: [] - >>> df.keys() - Index([0, 1, 2, 3], dtype='int64') - """ - return self._data.to_pandas_index() - - def itertuples(self, index=True, name="Pandas"): - """ - Iteration is unsupported. - - See :ref:`iteration ` for more - information. - """ - raise TypeError( - "cuDF does not support iteration of DataFrame " - "via itertuples. Consider using " - "`.to_pandas().itertuples()` " - "if you wish to iterate over namedtuples." - ) - - def iterrows(self): - """ - Iteration is unsupported. - - See :ref:`iteration ` for more - information. - """ - raise TypeError( - "cuDF does not support iteration of DataFrame " - "via iterrows. Consider using " - "`.to_pandas().iterrows()` " - "if you wish to iterate over each row." - ) - - @_performance_tracking - @copy_docstring(reshape.pivot) - def pivot(self, *, columns, index=no_default, values=no_default): - return cudf.core.reshape.pivot( - self, index=index, columns=columns, values=values - ) - - @_performance_tracking - @copy_docstring(reshape.pivot_table) - def pivot_table( - self, - values=None, - index=None, - columns=None, - aggfunc="mean", - fill_value=None, - margins=False, - dropna=None, - margins_name="All", - observed=False, - sort=True, - ): - return cudf.core.reshape.pivot_table( - self, - values=values, - index=index, - columns=columns, - aggfunc=aggfunc, - fill_value=fill_value, - margins=margins, - dropna=dropna, - margins_name=margins_name, - observed=observed, - sort=sort, - ) - - @_performance_tracking - @copy_docstring(reshape.unstack) - def unstack(self, level=-1, fill_value=None, sort: bool = True): - return cudf.core.reshape.unstack( - self, level=level, fill_value=fill_value, sort=sort - ) - - @_performance_tracking - def explode(self, column, ignore_index=False): - """ - Transform each element of a list-like to a row, replicating index - values. - - Parameters - ---------- - column : str - Column to explode. - ignore_index : bool, default False - If True, the resulting index will be labeled 0, 1, …, n - 1. 
- - Returns - ------- - DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({ - ... "a": [[1, 2, 3], [], None, [4, 5]], - ... "b": [11, 22, 33, 44], - ... }) - >>> df - a b - 0 [1, 2, 3] 11 - 1 [] 22 - 2 None 33 - 3 [4, 5] 44 - >>> df.explode('a') - a b - 0 1 11 - 0 2 11 - 0 3 11 - 1 22 - 2 33 - 3 4 44 - 3 5 44 - """ - return super()._explode(column, ignore_index) - - def pct_change( - self, - periods=1, - fill_method=no_default, - limit=no_default, - freq=None, - **kwargs, - ): - """ - Calculates the percent change between sequential elements - in the DataFrame. - - Parameters - ---------- - periods : int, default 1 - Periods to shift for forming percent change. - fill_method : str, default 'ffill' - How to handle NAs before computing percent changes. - - .. deprecated:: 24.04 - All options of `fill_method` are deprecated - except `fill_method=None`. - limit : int, optional - The number of consecutive NAs to fill before stopping. - Not yet implemented. - - .. deprecated:: 24.04 - `limit` is deprecated. - freq : str, optional - Increment to use from time series API. - Not yet implemented. - **kwargs - Additional keyword arguments are passed into - `DataFrame.shift`. - - Returns - ------- - DataFrame - """ - if limit is not no_default: - raise NotImplementedError("limit parameter not supported yet.") - if freq is not None: - raise NotImplementedError("freq parameter not supported yet.") - elif fill_method not in { - no_default, - None, - "ffill", - "pad", - "bfill", - "backfill", - }: - raise ValueError( - "fill_method must be one of None, 'ffill', 'pad', " - "'bfill', or 'backfill'." - ) - - if fill_method not in (no_default, None) or limit is not no_default: - # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.warn( - "The 'fill_method' and 'limit' keywords in " - f"{type(self).__name__}.pct_change are deprecated and will be " - "removed in a future version. Either fill in any non-leading " - "NA values prior to calling pct_change or specify " - "'fill_method=None' to not fill NA values.", - FutureWarning, - ) - if fill_method is no_default: - fill_method = "ffill" - if limit is no_default: - limit = None - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - data = self.fillna(method=fill_method, limit=limit) - - return data.diff(periods=periods) / data.shift( - periods=periods, freq=freq, **kwargs - ) - - def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True - ): - return df_protocol.__dataframe__( - self, nan_as_null=nan_as_null, allow_copy=allow_copy - ) - - def nunique(self, axis=0, dropna: bool = True) -> Series: - """ - Count number of distinct elements in specified axis. - Return Series with number of distinct elements. Can ignore NaN values. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for - column-wise. - dropna : bool, default True - Don't include NaN in the counts. 
- - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) - >>> df.nunique() - A 3 - B 2 - dtype: int64 - """ - if axis != 0: - raise NotImplementedError("axis parameter is not supported yet.") - counts = [col.distinct_count(dropna=dropna) for col in self._columns] - return self._constructor_sliced( - counts, index=self._data.to_pandas_index() - ) - - def _sample_axis_1( - self, - n: int, - weights: ColumnLike | None, - replace: bool, - random_state: np.random.RandomState, - ignore_index: bool, - ): - if replace: - # Since cuDF does not support multiple columns with same name, - # sample with replace=True at axis 1 is unsupported. - raise NotImplementedError( - "Sample is not supported for axis 1/`columns` when" - "`replace=True`." - ) - - sampled_column_labels = random_state.choice( - self._column_names, size=n, replace=False, p=weights - ) - - result = self._get_columns_by_label(sampled_column_labels) - if ignore_index: - result.reset_index(drop=True) - - return result - - def _from_columns_like_self( - self, - columns: list[ColumnBase], - column_names: abc.Iterable[str] | None = None, - index_names: list[str] | None = None, - ) -> DataFrame: - result = super()._from_columns_like_self( - columns, - column_names, - index_names, - ) - result._set_columns_like(self._data) - return result - - @_performance_tracking - def interleave_columns(self): - """ - Interleave Series columns of a table into a single column. - - Converts the column major table `cols` into a row major column. - - Parameters - ---------- - cols : input Table containing columns to interleave. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({0: ['A1', 'A2', 'A3'], 1: ['B1', 'B2', 'B3']}) - >>> df - 0 1 - 0 A1 B1 - 1 A2 B2 - 2 A3 B3 - >>> df.interleave_columns() - 0 A1 - 1 B1 - 2 A2 - 3 B2 - 4 A3 - 5 B3 - dtype: object - - Returns - ------- - The interleaved columns as a single column - - .. pandas-compat:: - `pandas.DataFrame.interleave_columns` - - This method does not exist in pandas but it can be run - as ``pd.Series(np.vstack(df.to_numpy()).reshape((-1,)))``. - """ - if ("category" == self.dtypes).any(): - raise ValueError( - "interleave_columns does not support 'category' dtype." - ) - - return self._constructor_sliced._from_column( - libcudf.reshape.interleave_columns([*self._columns]) - ) - - @_performance_tracking - def eval(self, expr: str, inplace: bool = False, **kwargs): - """Evaluate a string describing operations on DataFrame columns. - - Operates on columns only, not specific rows or elements. - - Parameters - ---------- - expr : str - The expression string to evaluate. - inplace : bool, default False - If the expression contains an assignment, whether to perform the - operation inplace and mutate the existing DataFrame. Otherwise, - a new DataFrame is returned. - **kwargs - Not supported. - - Returns - ------- - DataFrame, Series, or None - Series if a single column is returned (the typical use case), - DataFrame if any assignment statements are included in - ``expr``, or None if ``inplace=True``. - - - Examples - -------- - >>> df = cudf.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) - >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 - >>> df.eval('A + B') - 0 11 - 1 10 - 2 9 - 3 8 - 4 7 - dtype: int64 - - Assignment is allowed though by default the original DataFrame is not - modified. 
- - >>> df.eval('C = A + B') - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 - >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 - - Use ``inplace=True`` to modify the original DataFrame. - - >>> df.eval('C = A + B', inplace=True) - >>> df - A B C - 0 1 10 11 - 1 2 8 10 - 2 3 6 9 - 3 4 4 8 - 4 5 2 7 - - Multiple columns can be assigned to using multi-line expressions: - - >>> df.eval( - ... ''' - ... C = A + B - ... D = A - B - ... ''' - ... ) - A B C D - 0 1 10 11 -9 - 1 2 8 10 -6 - 2 3 6 9 -3 - 3 4 4 8 0 - 4 5 2 7 3 - - .. pandas-compat:: - :meth:`pandas.DataFrame.eval` - - * Additional kwargs are not supported. - * Bitwise and logical operators are not dtype-dependent. - Specifically, `&` must be used for bitwise operators on integers, - not `and`, which is specifically for the logical and between - booleans. - * Only numerical types are currently supported. - * Operators generally will not cast automatically. Users are - responsible for casting columns to suitable types before - evaluating a function. - * Multiple assignments to the same name (i.e. a sequence of - assignment statements where later statements are conditioned upon - the output of earlier statements) is not supported. - """ - if kwargs: - raise ValueError( - "Keyword arguments other than `inplace` are not supported" - ) - - # Have to use a regex match to avoid capturing ==, >=, or <= - equals_sign_regex = "[^=><]=[^=]" - includes_assignment = re.search(equals_sign_regex, expr) is not None - - # Check if there were multiple statements. Filter out empty lines. - statements = tuple(filter(None, expr.strip().split("\n"))) - if len(statements) > 1 and any( - re.search(equals_sign_regex, st) is None for st in statements - ): - raise ValueError( - "Multi-line expressions are only valid if all expressions " - "contain an assignment." - ) - - if not includes_assignment: - if inplace: - raise ValueError( - "Cannot operate inplace if there is no assignment" - ) - return Series._from_column( - libcudf.transform.compute_column( - [*self._columns], self._column_names, statements[0] - ) - ) - - targets = [] - exprs = [] - for st in statements: - try: - t, e = re.split("[^=]=[^=]", st) - except ValueError as err: - if "too many values" in str(err): - raise ValueError( - f"Statement {st} contains too many assignments ('=')" - ) - raise - targets.append(t.strip()) - exprs.append(e.strip()) - - cols = ( - libcudf.transform.compute_column( - [*self._columns], self._column_names, e - ) - for e in exprs - ) - ret = self if inplace else self.copy(deep=False) - for name, col in zip(targets, cols): - ret._data[name] = col - if not inplace: - return ret - - def value_counts( - self, - subset=None, - normalize=False, - sort=True, - ascending=False, - dropna=True, - ): - """ - Return a Series containing counts of unique rows in the DataFrame. - - Parameters - ---------- - subset: list-like, optional - Columns to use when counting unique combinations. - normalize: bool, default False - Return proportions rather than frequencies. - sort: bool, default True - Sort by frequencies. - ascending: bool, default False - Sort in ascending order. - dropna: bool, default True - Don't include counts of rows that contain NA values. - - Returns - ------- - Series - - Notes - ----- - The returned Series will have a MultiIndex with one level per input - column. By default, rows that contain any NA values are omitted from - the result. 
By default, the resulting Series will be in descending - order so that the first element is the most frequently-occurring row. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'num_legs': [2, 4, 4, 6], - ... 'num_wings': [2, 0, 0, 0]}, - ... index=['falcon', 'dog', 'cat', 'ant']) - >>> df - num_legs num_wings - falcon 2 2 - dog 4 0 - cat 4 0 - ant 6 0 - >>> df.value_counts().sort_index() - num_legs num_wings - 2 2 1 - 4 0 2 - 6 0 1 - Name: count, dtype: int64 - """ - if subset: - diff = set(subset) - set(self._data) - if len(diff) != 0: - raise KeyError(f"columns {diff} do not exist") - columns = list(self._column_names) if subset is None else subset - result = ( - self.groupby( - by=columns, - dropna=dropna, - ) - .size() - .astype("int64") - ) - if sort: - result = result.sort_values(ascending=ascending) - if normalize: - result = result / result._column.sum() - # Pandas always returns MultiIndex even if only one column. - if not isinstance(result.index, MultiIndex): - result.index = MultiIndex._from_data(result.index._data) - result.name = "proportion" if normalize else "count" - return result - - -def from_dataframe(df, allow_copy: bool = False) -> DataFrame: - """ - Build a :class:`DataFrame` from an object supporting the dataframe interchange protocol. - - .. note:: - - If you have a ``pandas.DataFrame``, use :func:`from_pandas` instead. - - Parameters - ---------- - df : DataFrameXchg - Object supporting the interchange protocol, i.e. ``__dataframe__`` method. - allow_copy : bool, default: True - Whether to allow copying the memory to perform the conversion - (if false then zero-copy approach is requested). - - Returns - ------- - :class:`DataFrame` - """ - return df_protocol.from_dataframe(df, allow_copy=allow_copy) - - -def make_binop_func(op, postprocess=None): - # This function is used to wrap binary operations in Frame with an - # appropriate API for DataFrame as required for pandas compatibility. The - # main effect is reordering and error-checking parameters in - # DataFrame-specific ways. The postprocess argument is a callable that may - # optionally be provided to modify the result of the binop if additional - # processing is needed for pandas compatibility. The callable must have the - # signature - # def postprocess(left, right, output) - # where left and right are the inputs to the binop and output is the result - # of calling the wrapped Frame binop. - wrapped_func = getattr(IndexedFrame, op) - - @functools.wraps(wrapped_func) - def wrapper(self, other, axis="columns", level=None, fill_value=None): - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - output = wrapped_func(self, other, axis, level, fill_value) - if postprocess is None: - return output - return postprocess(self, other, output) - - # functools.wraps copies module level attributes to `wrapper` and sets - # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature - # string of a function by recursively delving into __wrapped__ until - # it hits the first function that has __signature__ attribute set. To make - # the signature string of `wrapper` matches with its actual parameter list, - # we directly set the __signature__ attribute of `wrapper` below. - - new_sig = inspect.signature( - lambda self, other, axis="columns", level=None, fill_value=None: None - ) - - wrapper.__signature__ = new_sig - return wrapper - - -# Wrap arithmetic Frame binop functions with the expected API for Series. 
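# Illustrative sketch, not part of the original diff: why make_binop_func
# assigns __signature__ explicitly. inspect.signature() follows the
# __wrapped__ attribute installed by functools.wraps, stopping only at an
# object that defines __signature__, so without the override the wrapper
# would report the wrapped Frame method's parameters rather than its own
# (self, other, axis, level, fill_value) list. All names below are made up
# for the demonstration.
import functools
import inspect


def _frame_binop(self, other, *args, **kwargs):
    """Stand-in for the wrapped Frame-level binop."""


@functools.wraps(_frame_binop)
def _df_binop(self, other, axis="columns", level=None, fill_value=None):
    return _frame_binop(self, other, axis, level, fill_value)


# Follows __wrapped__, so this prints "(self, other, *args, **kwargs)".
print(inspect.signature(_df_binop))

# Overriding __signature__ makes introspection match the real parameter list.
_df_binop.__signature__ = inspect.signature(
    lambda self, other, axis="columns", level=None, fill_value=None: None
)
print(inspect.signature(_df_binop))  # now reports the wrapper's own parameters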
-for binop in [ - "add", - "radd", - "subtract", - "sub", - "rsub", - "multiply", - "mul", - "rmul", - "mod", - "rmod", - "pow", - "rpow", - "floordiv", - "rfloordiv", - "truediv", - "div", - "divide", - "rtruediv", - "rdiv", -]: - setattr(DataFrame, binop, make_binop_func(binop)) - - -def _make_replacement_func(value): - # This function generates a postprocessing function suitable for use with - # make_binop_func that fills null columns with the desired fill value. - - def func(left, right, output): - # This function may be passed as the postprocess argument to - # make_binop_func. Columns that are only present in one of the inputs - # will be null in the output. This function postprocesses the output to - # replace those nulls with some desired output. - if isinstance(right, Series): - uncommon_columns = set(left._column_names) ^ set(right.index) - elif isinstance(right, DataFrame): - uncommon_columns = set(left._column_names) ^ set( - right._column_names - ) - elif _is_scalar_or_zero_d_array(right): - for name, col in output._column_labels_and_values: - output._data[name] = col.fillna(value) - return output - else: - return output - - for name in uncommon_columns: - output._data[name] = as_column( - value, length=len(output), dtype="bool" - ) - return output - - return func - - -# The ne comparator needs special postprocessing because elements that missing -# in one operand should be treated as null and result in True in the output -# rather than simply propagating nulls. -DataFrame.ne = make_binop_func("ne", _make_replacement_func(True)) - - -# All other comparison operators needs return False when one of the operands is -# missing in the input. -for binop in [ - "eq", - "lt", - "le", - "gt", - "ge", -]: - setattr( - DataFrame, binop, make_binop_func(binop, _make_replacement_func(False)) - ) - - -@_performance_tracking -def from_pandas(obj, nan_as_null=no_default): - """ - Convert certain Pandas objects into the cudf equivalent. - - Supports DataFrame, Series, Index, or MultiIndex. - - Returns - ------- - DataFrame/Series/Index/MultiIndex - Return type depends on the passed input. - - Raises - ------ - TypeError for invalid input type. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> data = [[0, 1], [1, 2], [3, 4]] - >>> pdf = pd.DataFrame(data, columns=['a', 'b'], dtype=int) - >>> pdf - a b - 0 0 1 - 1 1 2 - 2 3 4 - >>> gdf = cudf.from_pandas(pdf) - >>> gdf - a b - 0 0 1 - 1 1 2 - 2 3 4 - >>> type(gdf) - - >>> type(pdf) - - - Converting a Pandas Series to cuDF Series: - - >>> psr = pd.Series(['a', 'b', 'c', 'd'], name='apple', dtype='str') - >>> psr - 0 a - 1 b - 2 c - 3 d - Name: apple, dtype: object - >>> gsr = cudf.from_pandas(psr) - >>> gsr - 0 a - 1 b - 2 c - 3 d - Name: apple, dtype: object - >>> type(gsr) - - >>> type(psr) - - - Converting a Pandas Index to cuDF Index: - - >>> pidx = pd.Index([1, 2, 10, 20]) - >>> pidx - Index([1, 2, 10, 20], dtype='int64') - >>> gidx = cudf.from_pandas(pidx) - >>> gidx - Index([1, 2, 10, 20], dtype='int64') - >>> type(gidx) - - >>> type(pidx) - - - Converting a Pandas MultiIndex to cuDF MultiIndex: - - >>> pmidx = pd.MultiIndex( - ... levels=[[1, 3, 4, 5], [1, 2, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... 
) - >>> pmidx - MultiIndex([(1, 1), - (1, 5), - (3, 2), - (4, 2), - (5, 1)], - names=['x', 'y']) - >>> gmidx = cudf.from_pandas(pmidx) - >>> gmidx - MultiIndex([(1, 1), - (1, 5), - (3, 2), - (4, 2), - (5, 1)], - names=['x', 'y']) - >>> type(gmidx) - - >>> type(pmidx) - - """ - if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) - - if isinstance(obj, pd.DataFrame): - return DataFrame.from_pandas(obj, nan_as_null=nan_as_null) - elif isinstance(obj, pd.Series): - return Series.from_pandas(obj, nan_as_null=nan_as_null) - # This carveout for cudf.pandas is undesirable, but fixes crucial issues - # for core RAPIDS projects like cuML and cuGraph that rely on - # `cudf.from_pandas`, so we allow it for now. - elif (ret := getattr(obj, "_fsproxy_wrapped", None)) is not None: - return ret - elif isinstance(obj, pd.MultiIndex): - return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) - elif isinstance(obj, pd.Index): - return cudf.Index.from_pandas(obj, nan_as_null=nan_as_null) - elif isinstance(obj, pd.CategoricalDtype): - return cudf.CategoricalDtype.from_pandas(obj) - elif isinstance(obj, pd.IntervalDtype): - return cudf.IntervalDtype.from_pandas(obj) - else: - raise TypeError( - f"from_pandas unsupported for object of type {type(obj).__name__}" - ) - - -@_performance_tracking -def merge(left, right, *args, **kwargs): - if isinstance(left, Series): - left = left.to_frame() - return left.merge(right, *args, **kwargs) - - -# a bit of fanciness to inject docstring with left parameter -merge_doc = DataFrame.merge.__doc__ -if merge_doc is not None: - idx = merge_doc.find("right") - merge.__doc__ = "".join( - [ - merge_doc[:idx], - "\n\tleft : Series or DataFrame\n\t", - merge_doc[idx:], - ] - ) - - -def _align_indices(lhs, rhs): - """ - Internal util to align the indices of two DataFrames. Returns a tuple of - the aligned dataframes, or the original arguments if the indices are the - same, or if rhs isn't a DataFrame. 
- """ - lhs_out, rhs_out = lhs, rhs - if isinstance(rhs, DataFrame) and not lhs.index.equals(rhs.index): - df = lhs.merge( - rhs, - sort=True, - how="outer", - left_index=True, - right_index=True, - suffixes=("_x", "_y"), - ) - df = df.sort_index() - lhs_out = DataFrame(index=df.index) - rhs_out = DataFrame(index=df.index) - common = set(lhs._column_names) & set(rhs._column_names) - common_x = {f"{x}_x": x for x in common} - common_y = {f"{x}_y": x for x in common} - for col in df._column_names: - if col in common_x: - lhs_out[common_x[col]] = df[col] - elif col in common_y: - rhs_out[common_y[col]] = df[col] - elif col in lhs: - lhs_out[col] = df[col] - elif col in rhs: - rhs_out[col] = df[col] - - return lhs_out, rhs_out - - -def _setitem_with_dataframe( - input_df: DataFrame, - replace_df: DataFrame, - input_cols: Any = None, - mask: ColumnBase | None = None, - ignore_index: bool = False, -): - """ - This function sets item dataframes relevant columns with replacement df - :param input_df: Dataframe to be modified inplace - :param replace_df: Replacement DataFrame to replace values with - :param input_cols: columns to replace in the input dataframe - :param mask: boolean mask in case of masked replacing - :param ignore_index: Whether to conduct index equality and reindex - """ - - if input_cols is None: - input_cols = input_df._column_names - - if len(input_cols) != len(replace_df._column_names): - raise ValueError( - "Number of Input Columns must be same replacement Dataframe" - ) - - if ( - not ignore_index - and len(input_df) != 0 - and not input_df.index.equals(replace_df.index) - ): - replace_df = replace_df.reindex(input_df.index) - - for col_1, col_2 in zip(input_cols, replace_df._column_names): - if col_1 in input_df._column_names: - if mask is not None: - input_df._data[col_1][mask] = column.as_column( - replace_df[col_2] - ) - else: - input_df._data[col_1] = column.as_column(replace_df[col_2]) - else: - if mask is not None: - raise ValueError("Can not insert new column with a bool mask") - else: - # handle append case - input_df._insert( - loc=len(input_df._data), - name=col_1, - value=replace_df[col_2], - ) - - -def extract_col(df, col): - """ - Extract column from dataframe `df` with their name `col`. - If `col` is index and there are no columns with name `index`, - then this will return index column. 
- """ - try: - return df._data[col] - except KeyError: - if ( - col == "index" - and col not in df.index._data - and not isinstance(df.index, MultiIndex) - ): - return df.index._column - return df.index._data[col] - - -def _get_union_of_indices(indexes): - if len(indexes) == 1: - return indexes[0] - else: - merged_index = cudf.core.index.Index._concat(indexes) - return merged_index.drop_duplicates() - - -def _get_union_of_series_names(series_list): - names_list = [] - unnamed_count = 0 - for series in series_list: - if series.name is None: - names_list.append(f"Unnamed {unnamed_count}") - unnamed_count += 1 - else: - names_list.append(series.name) - if unnamed_count == len(series_list): - names_list = range(len(series_list)) - - return names_list - - -# Create a dictionary of the common, non-null columns -def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns): - # A mapping of {idx: np.dtype} - dtypes = dict() - # A mapping of {idx: [...columns]}, where `[...columns]` - # is a list of columns with at least one valid value for each - # column name across all input frames - non_null_columns = dict() - for idx in col_idxs: - for cols in list_of_columns: - # Skip columns not in this frame - if idx >= len(cols) or cols[idx] is None: - continue - # Store the first dtype we find for a column, even if it's - # all-null. This ensures we always have at least one dtype - # for each name. This dtype will be overwritten later if a - # non-null Column with the same name is found. - if idx not in dtypes: - dtypes[idx] = cols[idx].dtype - if cols[idx].null_count != len(cols[idx]): - if idx not in non_null_columns: - non_null_columns[idx] = [cols[idx]] - else: - non_null_columns[idx].append(cols[idx]) - return non_null_columns, dtypes - - -def _find_common_dtypes_and_categories( - non_null_columns, dtypes -) -> dict[Any, ColumnBase]: - # A mapping of {idx: categories}, where `categories` is a - # column of all the unique categorical values from each - # categorical column across all input frames - categories = dict() - for idx, cols in non_null_columns.items(): - # default to the first non-null dtype - dtypes[idx] = cols[0].dtype - # If all the non-null dtypes are int/float, find a common dtype - if all(is_numeric_dtype(col.dtype) for col in cols): - dtypes[idx] = find_common_type([col.dtype for col in cols]) - # If all categorical dtypes, combine the categories - elif all( - isinstance(col, cudf.core.column.CategoricalColumn) for col in cols - ): - # Combine and de-dupe the categories - categories[idx] = concat_columns( - [col.categories for col in cols] - ).unique() - # Set the column dtype to the codes' dtype. 
The categories - # will be re-assigned at the end - dtypes[idx] = min_signed_type(len(categories[idx])) - # Otherwise raise an error if columns have different dtypes - elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols): - raise ValueError("All columns must be the same type") - return categories - - -def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories): - # Cast all columns to a common dtype, assign combined categories, - # and back-fill missing columns with all-null columns - for idx in col_idxs: - dtype = dtypes[idx] - for cols in list_of_columns: - # If column not in this df, fill with an all-null column - if idx >= len(cols) or cols[idx] is None: - n = len(next(x for x in cols if x is not None)) - cols[idx] = column_empty(row_count=n, dtype=dtype, masked=True) - else: - # If column is categorical, rebase the codes with the - # combined categories, and cast the new codes to the - # min-scalar-sized dtype - if idx in categories: - cols[idx] = ( - cols[idx] - ._set_categories( - categories[idx], - is_unique=True, - ) - .codes - ) - cols[idx] = cols[idx].astype(dtype) - - -def _reassign_categories(categories, cols, col_idxs): - for name, idx in zip(cols, col_idxs): - if idx in categories: - codes = as_unsigned_codes(len(categories[idx]), cols[name]) - cols[name] = CategoricalColumn( - data=None, - size=codes.size, - dtype=cudf.CategoricalDtype( - categories=categories[idx], ordered=False - ), - mask=codes.base_mask, - offset=codes.offset, - children=(codes,), - ) - - -def _from_dict_create_index(indexlist, namelist, library): - if len(namelist) > 1: - index = library.MultiIndex.from_tuples(indexlist, names=namelist) - else: - index = library.Index(indexlist, name=namelist[0]) - return index diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py deleted file mode 100644 index 5250a741d3d..00000000000 --- a/python/cudf/cudf/core/df_protocol.py +++ /dev/null @@ -1,899 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import enum -from collections import abc -from typing import Any, Iterable, Mapping, Sequence, Tuple, cast - -import cupy as cp -import numpy as np -from numba.cuda import as_cuda_array - -import rmm - -import cudf -from cudf.core.buffer import Buffer, as_buffer -from cudf.core.column import ( - CategoricalColumn, - NumericalColumn, - as_column, - build_column, -) - -# Implementation of interchange protocol classes -# ---------------------------------------------- - - -class _DtypeKind(enum.IntEnum): - INT = 0 - UINT = 1 - FLOAT = 2 - BOOL = 20 - STRING = 21 # UTF-8 - DATETIME = 22 - CATEGORICAL = 23 - - -class _Device(enum.IntEnum): - CPU = 1 - CUDA = 2 - CPU_PINNED = 3 - OPENCL = 4 - VULKAN = 7 - METAL = 8 - VPI = 9 - ROCM = 10 - - -class _MaskKind(enum.IntEnum): - NON_NULLABLE = 0 - NAN = 1 - SENTINEL = 2 - BITMASK = 3 - BYTEMASK = 4 - - -_SUPPORTED_KINDS = { - _DtypeKind.INT, - _DtypeKind.UINT, - _DtypeKind.FLOAT, - _DtypeKind.CATEGORICAL, - _DtypeKind.BOOL, - _DtypeKind.STRING, -} -ProtoDtype = Tuple[_DtypeKind, int, str, str] - - -class _CuDFBuffer: - """ - Data in the buffer is guaranteed to be contiguous in memory. - """ - - def __init__( - self, - buf: Buffer, - dtype: np.dtype, - allow_copy: bool = True, - ) -> None: - """ - Use Buffer object. 
- """ - # Store the cudf buffer where the data resides as a private - # attribute, so we can use it to retrieve the public attributes - self._buf = buf - self._dtype = dtype - self._allow_copy = allow_copy - - @property - def bufsize(self) -> int: - """ - The Buffer size in bytes. - """ - return self._buf.size - - @property - def ptr(self) -> int: - """ - Pointer to start of the buffer as an integer. - """ - return self._buf.get_ptr(mode="write") - - def __dlpack__(self): - # DLPack not implemented in NumPy yet, so leave it out here. - try: - cuda_array = as_cuda_array(self._buf).view(self._dtype) - return cp.asarray(cuda_array).toDlpack() - except ValueError: - raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`") - - def __dlpack_device__(self) -> tuple[_Device, int]: - """ - _Device type and _Device ID for where the data in the buffer resides. - """ - return (_Device.CUDA, cp.asarray(self._buf).device.id) - - def __repr__(self) -> str: - return f"{self.__class__.__name__}(" + str( - { - "bufsize": self.bufsize, - "ptr": self.ptr, - "device": self.__dlpack_device__()[0].name, - } - ) - +")" - - -class _CuDFColumn: - """ - A column object, with only the methods and properties required by the - interchange protocol defined. - - A column can contain one or more chunks. Each chunk can contain up to three - buffers - a data buffer, a mask buffer (depending on null representation), - and an offsets buffer (if variable-size binary; e.g., variable-length - strings). - - Note: this Column object can only be produced by ``__dataframe__``, so - doesn't need its own version or ``__column__`` protocol. - - """ - - def __init__( - self, - column: cudf.core.column.ColumnBase, - nan_as_null: bool = True, - allow_copy: bool = True, - ) -> None: - """ - Note: doesn't deal with extension arrays yet, just assume a regular - Series/ndarray for now. - """ - if not isinstance(column, cudf.core.column.ColumnBase): - raise TypeError( - "column must be a subtype of df.core.column.ColumnBase," - f"got {type(column)}" - ) - self._col = column - self._nan_as_null = nan_as_null - self._allow_copy = allow_copy - - def size(self) -> int: - """ - Size of the column, in elements. - """ - return self._col.size - - @property - def offset(self) -> int: - """ - Offset of first element. Always zero. - """ - return 0 - - @property - def dtype(self) -> ProtoDtype: - """ - Dtype description as a tuple - ``(kind, bit-width, format string, endianness)`` - - Kind : - - - INT = 0 - - UINT = 1 - - FLOAT = 2 - - BOOL = 20 - - STRING = 21 # UTF-8 - - DATETIME = 22 - - CATEGORICAL = 23 - - Bit-width : the number of bits as an integer - Format string : data type description format string in Apache Arrow C - Data Interface format. - Endianness : current only native endianness (``=``) is supported - - Notes - ----- - - Kind specifiers are aligned with DLPack where possible - (hence the jump to 20, leave enough room for future extension) - - Masks must be specified as boolean with either bit width 1 - (for bit masks) or 8 (for byte masks). - - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case - in the future we need to support non-native endianness - - Went with Apache Arrow format strings over NumPy format strings - because they're more complete from a dataframe perspective - - Format strings are mostly useful for datetime specification, - and for categoricals. - - For categoricals, the format string describes the type of the - categorical in the data buffer. 
In case of a separate encoding - of the categorical (e.g. an integer to string mapping), - this can be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, - binary, decimal, and nested (list, struct, map, union) dtypes. - """ - dtype = self._col.dtype - - # For now, assume that, if the column dtype is 'O' (i.e., `object`), - # then we have an array of strings - if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == "O": - return (_DtypeKind.STRING, 8, "u", "=") - - return self._dtype_from_cudfdtype(dtype) - - def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: - """ - See `self.dtype` for details. - """ - # Note: 'c' (complex) not handled yet (not in array spec v1). - # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) - # not handled datetime and timedelta both map to datetime - # (is timedelta handled?) - _np_kinds = { - "i": _DtypeKind.INT, - "u": _DtypeKind.UINT, - "f": _DtypeKind.FLOAT, - "b": _DtypeKind.BOOL, - "U": _DtypeKind.STRING, - "M": _DtypeKind.DATETIME, - "m": _DtypeKind.DATETIME, - } - kind = _np_kinds.get(dtype.kind, None) - if kind is None: - # Not a NumPy/CuPy dtype. Check if it's a categorical maybe - if isinstance(dtype, cudf.CategoricalDtype): - kind = _DtypeKind.CATEGORICAL - # Codes and categories' dtypes are different. - # We use codes' dtype as these are stored in the buffer. - codes = cast( - cudf.core.column.CategoricalColumn, self._col - ).codes - dtype = codes.dtype - else: - raise ValueError( - f"Data type {dtype} not supported by exchange protocol" - ) - - if kind not in _SUPPORTED_KINDS: - raise NotImplementedError(f"Data type {dtype} not handled yet") - - bitwidth = dtype.itemsize * 8 - format_str = dtype.str - endianness = dtype.byteorder if kind != _DtypeKind.CATEGORICAL else "=" - return (kind, bitwidth, format_str, endianness) - - @property - def describe_categorical(self) -> tuple[bool, bool, dict[int, Any]]: - """ - If the dtype is categorical, there are two options: - - - There are only values in the data buffer. - - There is a separate dictionary-style encoding for categorical values. - - Raises TypeError if the dtype is not categorical - - Content of returned dict: - - - "is_ordered" : bool, whether the ordering of dictionary - indices is semantically meaningful. - - "is_dictionary" : bool, whether a dictionary-style mapping of - categorical values to other objects exists - - "mapping" : dict, Python-level only (e.g. ``{int: str}``). - None if not a dictionary-style categorical. - """ - if not self.dtype[0] == _DtypeKind.CATEGORICAL: - raise TypeError( - "`describe_categorical only works on " - "a column with categorical dtype!" - ) - categ_col = cast(cudf.core.column.CategoricalColumn, self._col) - ordered = bool(categ_col.dtype.ordered) - is_dictionary = True - # NOTE: this shows the children approach is better, transforming - # `categories` to a "mapping" dict is inefficient - categories = categ_col.categories - mapping = {ix: val for ix, val in enumerate(categories.values_host)} - return ordered, is_dictionary, mapping - - @property - def describe_null(self) -> tuple[int, Any]: - """ - Return the missing value (or "null") representation the column dtype - uses, as a tuple ``(kind, value)``. - - Kind: - - - 0 : non-nullable - - 1 : NaN/NaT - - 2 : sentinel value - - 3 : bit mask - - 4 : byte mask - - Value : if kind is "sentinel value", the actual value. - If kind is a bit mask or a byte mask, the value (0 or 1) - indicating a missing value. - None otherwise. 
- """ - kind = self.dtype[0] - if self.null_count == 0: - # there is no validity mask so it is non-nullable - return _MaskKind.NON_NULLABLE, None - - elif kind in _SUPPORTED_KINDS: - # currently, we return a bit mask - return _MaskKind.BITMASK, 0 - - else: - raise NotImplementedError( - f"Data type {self.dtype} not yet supported" - ) - - @property - def null_count(self) -> int: - """ - Number of null elements. Should always be known. - """ - return self._col.null_count - - @property - def metadata(self) -> dict[str, Any]: - """ - Store specific metadata of the column. - """ - return {} - - def num_chunks(self) -> int: - """ - Return the number of chunks the column consists of. - """ - return 1 - - def get_chunks( - self, n_chunks: int | None = None - ) -> Iterable["_CuDFColumn"]: - """ - Return an iterable yielding the chunks. - - See `DataFrame.get_chunks` for details on ``n_chunks``. - """ - return (self,) - - def get_buffers( - self, - ) -> Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None]: - """ - Return a dictionary containing the underlying buffers. - - The returned dictionary has the following contents: - - - "data": a two-element tuple whose first element is a buffer - containing the data and whose second element is the data - buffer's associated dtype. - - "validity": a two-element tuple whose first element is a buffer - containing mask values indicating missing data and - whose second element is the mask value buffer's - associated dtype. None if the null representation is - not a bit or byte mask. - - "offsets": a two-element tuple whose first element is a buffer - containing the offset values for variable-size binary - data (e.g., variable-length strings) and whose second - element is the offsets buffer's associated dtype. None - if the data buffer does not have an associated offsets - buffer. - """ - buffers = {} - try: - buffers["validity"] = self._get_validity_buffer() - except RuntimeError: - buffers["validity"] = None - - try: - buffers["offsets"] = self._get_offsets_buffer() - except RuntimeError: - buffers["offsets"] = None - - buffers["data"] = self._get_data_buffer() - - return buffers - - def _get_validity_buffer( - self, - ) -> tuple[_CuDFBuffer, ProtoDtype] | None: - """ - Return the buffer containing the mask values - indicating missing data and the buffer's associated dtype. - - Raises RuntimeError if null representation is not a bit or byte mask. - """ - null, invalid = self.describe_null - - if null == _MaskKind.BITMASK: - assert self._col.mask is not None - buffer = _CuDFBuffer( - self._col.mask, cp.uint8, allow_copy=self._allow_copy - ) - dtype = (_DtypeKind.UINT, 8, "C", "=") - return buffer, dtype - - elif null == _MaskKind.NAN: - raise RuntimeError( - "This column uses NaN as null " - "so does not have a separate mask" - ) - elif null == _MaskKind.NON_NULLABLE: - raise RuntimeError( - "This column is non-nullable so does not have a mask" - ) - else: - raise NotImplementedError( - f"See {self.__class__.__name__}.describe_null method." - ) - - def _get_offsets_buffer( - self, - ) -> tuple[_CuDFBuffer, ProtoDtype] | None: - """ - Return the buffer containing the offset values for - variable-size binary data (e.g., variable-length strings) - and the buffer's associated dtype. - - Raises RuntimeError if the data buffer does not have an associated - offsets buffer. 
- """ - if self.dtype[0] == _DtypeKind.STRING: - offsets = self._col.children[0] - assert (offsets is not None) and (offsets.data is not None), " " - "offsets(.data) should not be None for string column" - - buffer = _CuDFBuffer( - offsets.data, offsets.dtype, allow_copy=self._allow_copy - ) - dtype = self._dtype_from_cudfdtype(offsets.dtype) - else: - raise RuntimeError( - "This column has a fixed-length dtype " - "so does not have an offsets buffer" - ) - - return buffer, dtype - - def _get_data_buffer( - self, - ) -> tuple[_CuDFBuffer, ProtoDtype]: - """ - Return the buffer containing the data and - the buffer's associated dtype. - """ - if self.dtype[0] in ( - _DtypeKind.INT, - _DtypeKind.UINT, - _DtypeKind.FLOAT, - _DtypeKind.BOOL, - ): - col_data = self._col - dtype = self.dtype - - elif self.dtype[0] == _DtypeKind.CATEGORICAL: - col_data = cast( - cudf.core.column.CategoricalColumn, self._col - ).codes - dtype = self._dtype_from_cudfdtype(col_data.dtype) - - elif self.dtype[0] == _DtypeKind.STRING: - col_data = build_column( - data=self._col.data, dtype=np.dtype("int8") - ) - dtype = self._dtype_from_cudfdtype(col_data.dtype) - - else: - raise NotImplementedError( - f"Data type {self._col.dtype} not handled yet" - ) - assert (col_data is not None) and (col_data.data is not None), " " - f"col_data(.data) should not be None when dtype = {dtype}" - buffer = _CuDFBuffer( - col_data.data, col_data.dtype, allow_copy=self._allow_copy - ) - - return buffer, dtype - - -class _CuDFDataFrame: - """ - A data frame class, with only the methods required by the interchange - protocol defined. - - Instances of this (private) class are returned from - ``cudf.DataFrame.__dataframe__`` as objects with the methods and - attributes defined on this class. - """ - - def __init__( - self, - df: "cudf.core.dataframe.DataFrame", - nan_as_null: bool = True, - allow_copy: bool = True, - ) -> None: - """ - Constructor - an instance of this (private) class is returned from - `cudf.DataFrame.__dataframe__`. - """ - self._df = df - # ``nan_as_null`` is a keyword intended for the consumer to tell the - # producer to overwrite null values in the data with - # ``NaN`` (or ``NaT``). - # This currently has no effect; once support for nullable extension - # dtypes is added, this value should be propagated to columns. - self._nan_as_null = nan_as_null - self._allow_copy = allow_copy - - def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True - ) -> "_CuDFDataFrame": - """ - See the docstring of the `cudf.DataFrame.__dataframe__` for details - """ - return _CuDFDataFrame( - self._df, nan_as_null=nan_as_null, allow_copy=allow_copy - ) - - @property - def metadata(self): - # `index` isn't a regular column, and the protocol doesn't support row - # labels - so we export it as cuDF-specific metadata here. 
- return {"cudf.index": self._df.index} - - def num_columns(self) -> int: - return len(self._df._column_names) - - def num_rows(self) -> int: - return len(self._df) - - def num_chunks(self) -> int: - return 1 - - def column_names(self) -> Iterable[str]: - return self._df._column_names - - def get_column(self, i: int) -> _CuDFColumn: - return _CuDFColumn( - as_column(self._df.iloc[:, i]), allow_copy=self._allow_copy - ) - - def get_column_by_name(self, name: str) -> _CuDFColumn: - return _CuDFColumn( - as_column(self._df[name]), allow_copy=self._allow_copy - ) - - def get_columns(self) -> Iterable[_CuDFColumn]: - return [ - _CuDFColumn(as_column(self._df[name]), allow_copy=self._allow_copy) - for name in self._df.columns - ] - - def select_columns(self, indices: Sequence[int]) -> "_CuDFDataFrame": - if not isinstance(indices, abc.Sequence): - raise ValueError("`indices` is not a sequence") - - return _CuDFDataFrame(self._df.iloc[:, indices]) - - def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": - if not isinstance(names, abc.Sequence): - raise ValueError("`names` is not a sequence") - - return _CuDFDataFrame( - self._df.loc[:, names], self._nan_as_null, self._allow_copy - ) - - def get_chunks( - self, n_chunks: int | None = None - ) -> Iterable["_CuDFDataFrame"]: - """ - Return an iterator yielding the chunks. - """ - return (self,) - - -def __dataframe__( - self, nan_as_null: bool = False, allow_copy: bool = True -) -> _CuDFDataFrame: - """ - The public method to attach to cudf.DataFrame. - - ``nan_as_null`` is a keyword intended for the consumer to tell the - producer to overwrite null values in the data with ``NaN`` (or ``NaT``). - This currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. - - ``allow_copy`` is a keyword that defines whether or not the library is - allowed to make a copy of the data. For example, copying data would be - necessary if a library supports strided buffers, given that this protocol - specifies contiguous buffers. - """ - return _CuDFDataFrame(self, nan_as_null=nan_as_null, allow_copy=allow_copy) - - -""" -Implementation of the dataframe exchange protocol. - -Public API ----------- - -from_dataframe : construct a cudf.DataFrame from an input data frame which - implements the exchange protocol - -Notes ------ - -- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and - unsafe to do in pure Python. It's more general but definitely less friendly - than having ``to_arrow`` and ``to_numpy`` methods. So for the buffers which - lack ``__dlpack__`` (e.g., because the column dtype isn't supported by - DLPack), this is worth looking at again. - -""" - - -# A typing protocol could be added later to let Mypy validate code using -# `from_dataframe` better. -DataFrameObject = Any -ColumnObject = Any - - -_INTS = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} -_UINTS = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} -_FLOATS = {32: cp.float32, 64: cp.float64} -_CP_DTYPES = { - 0: _INTS, - 1: _UINTS, - 2: _FLOATS, - 20: {8: bool}, - 21: {8: cp.uint8}, -} - - -def from_dataframe( - df: DataFrameObject, allow_copy: bool = False -) -> cudf.DataFrame: - """ - Construct a ``DataFrame`` from ``df`` if it supports the - dataframe interchange protocol (``__dataframe__``). - - Parameters - ---------- - df : DataFrameObject - Object supporting dataframe interchange protocol - allow_copy : bool - If ``True``, allow copying of the data. 
If ``False``, a - ``TypeError`` is raised if data copying is required to - construct the ``DataFrame`` (e.g., if ``df`` lives in CPU - memory). - - Returns - ------- - DataFrame - - Examples - -------- - >>> import pandas as pd - >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']}) - >>> df = cudf.from_dataframe(pdf, allow_copy=True) - >>> type(df) - cudf.core.dataframe.DataFrame - >>> df - a b - 0 1 x - 1 2 y - 2 3 z - - Notes - ----- - See https://data-apis.org/dataframe-protocol/latest/index.html - for the dataframe interchange protocol spec and API - """ - if isinstance(df, cudf.DataFrame): - return df - - if not hasattr(df, "__dataframe__"): - raise ValueError("`df` does not support __dataframe__") - - df = df.__dataframe__(allow_copy=allow_copy) - - # Check number of chunks, if there's more than one we need to iterate - if df.num_chunks() > 1: - raise NotImplementedError("More than one chunk not handled yet") - - # We need a dict of columns here, with each column being a cudf column. - columns = dict() - _buffers = [] # hold on to buffers, keeps memory alive - for name in df.column_names(): - col = df.get_column_by_name(name) - - if col.dtype[0] in ( - _DtypeKind.INT, - _DtypeKind.UINT, - _DtypeKind.FLOAT, - _DtypeKind.BOOL, - ): - columns[name], _buf = _protocol_to_cudf_column_numeric( - col, allow_copy - ) - - elif col.dtype[0] == _DtypeKind.CATEGORICAL: - columns[name], _buf = _protocol_to_cudf_column_categorical( - col, allow_copy - ) - - elif col.dtype[0] == _DtypeKind.STRING: - columns[name], _buf = _protocol_to_cudf_column_string( - col, allow_copy - ) - - else: - raise NotImplementedError( - f"Data type {col.dtype[0]} not handled yet" - ) - - _buffers.append(_buf) - - df_new = cudf.DataFrame._from_data(columns) - df_new._buffers = _buffers - return df_new - - -def _protocol_to_cudf_column_numeric( - col, allow_copy: bool -) -> tuple[ - cudf.core.column.ColumnBase, - Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], -]: - """ - Convert an int, uint, float or bool protocol column - to the corresponding cudf column - """ - if col.offset != 0: - raise NotImplementedError("column.offset > 0 not handled yet") - - buffers = col.get_buffers() - assert buffers["data"] is not None, "data buffer should not be None" - _dbuffer, _ddtype = buffers["data"] - _dbuffer = _ensure_gpu_buffer(_dbuffer, _ddtype, allow_copy) - cudfcol_num = build_column( - _dbuffer._buf, - protocol_dtype_to_cupy_dtype(_ddtype), - ) - return _set_missing_values(col, cudfcol_num, allow_copy), buffers - - -def _ensure_gpu_buffer(buf, data_type, allow_copy: bool) -> _CuDFBuffer: - # if `buf` is a (protocol) buffer that lives on the GPU already, - # return it as is. Otherwise, copy it to the device and return - # the resulting buffer. - if buf.__dlpack_device__()[0] != _Device.CUDA: - if allow_copy: - dbuf = rmm.DeviceBuffer(ptr=buf.ptr, size=buf.bufsize) - return _CuDFBuffer( - as_buffer(dbuf, exposed=True), - protocol_dtype_to_cupy_dtype(data_type), - allow_copy, - ) - else: - raise TypeError( - "This operation must copy data from CPU to GPU. " - "Set `allow_copy=True` to allow it." 
- ) - return buf - - -def _set_missing_values( - protocol_col, - cudf_col: cudf.core.column.ColumnBase, - allow_copy: bool, -) -> cudf.core.column.ColumnBase: - valid_mask = protocol_col.get_buffers()["validity"] - if valid_mask is not None: - null, invalid = protocol_col.describe_null - if null == _MaskKind.BYTEMASK: - valid_mask = _ensure_gpu_buffer( - valid_mask[0], valid_mask[1], allow_copy - ) - boolmask = as_column(valid_mask._buf, dtype="bool") - bitmask = cudf._lib.transform.bools_to_mask(boolmask) - return cudf_col.set_mask(bitmask) - elif null == _MaskKind.BITMASK: - valid_mask = _ensure_gpu_buffer( - valid_mask[0], valid_mask[1], allow_copy - ) - bitmask = valid_mask._buf - return cudf_col.set_mask(bitmask) - return cudf_col - - -def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: - kind = _dtype[0] - bitwidth = _dtype[1] - if _dtype[0] not in _SUPPORTED_KINDS: - raise RuntimeError(f"Data type {_dtype[0]} not handled yet") - - return _CP_DTYPES[kind][bitwidth] - - -def _protocol_to_cudf_column_categorical( - col, allow_copy: bool -) -> tuple[ - cudf.core.column.ColumnBase, - Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], -]: - """ - Convert a categorical column to a Series instance - """ - ordered, is_dict, categories = col.describe_categorical - if not is_dict: - raise NotImplementedError( - "Non-dictionary categoricals not supported yet" - ) - buffers = col.get_buffers() - assert buffers["data"] is not None, "data buffer should not be None" - codes_buffer, codes_dtype = buffers["data"] - codes_buffer = _ensure_gpu_buffer(codes_buffer, codes_dtype, allow_copy) - cdtype = np.dtype(protocol_dtype_to_cupy_dtype(codes_dtype)) - codes = NumericalColumn( - data=codes_buffer._buf, - size=None, - dtype=cdtype, - ) - cudfcol = CategoricalColumn( - data=None, - size=codes.size, - dtype=cudf.CategoricalDtype(categories=categories, ordered=ordered), - mask=codes.base_mask, - offset=codes.offset, - children=(codes,), - ) - - return _set_missing_values(col, cudfcol, allow_copy), buffers - - -def _protocol_to_cudf_column_string( - col, allow_copy: bool -) -> tuple[ - cudf.core.column.ColumnBase, - Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], -]: - """ - Convert a string ColumnObject to cudf Column object. 
- """ - # Retrieve the data buffers - buffers = col.get_buffers() - - # Retrieve the data buffer containing the UTF-8 code units - assert buffers["data"] is not None, "data buffer should never be None" - data_buffer, data_dtype = buffers["data"] - data_buffer = _ensure_gpu_buffer(data_buffer, data_dtype, allow_copy) - encoded_string = build_column( - data_buffer._buf, - protocol_dtype_to_cupy_dtype(data_dtype), - ) - - # Retrieve the offsets buffer containing the index offsets demarcating - # the beginning and end of each string - assert buffers["offsets"] is not None, "not possible for string column" - offset_buffer, offset_dtype = buffers["offsets"] - offset_buffer = _ensure_gpu_buffer(offset_buffer, offset_dtype, allow_copy) - offsets = build_column( - offset_buffer._buf, - protocol_dtype_to_cupy_dtype(offset_dtype), - ) - offsets = offsets.astype("int32") - cudfcol_str = build_column( - None, dtype=cp.dtype("O"), children=(offsets, encoded_string) - ) - return _set_missing_values(col, cudfcol_str, allow_copy), buffers - - -def _protocol_buffer_to_cudf_buffer(protocol_buffer): - return as_buffer( - rmm.DeviceBuffer( - ptr=protocol_buffer.ptr, size=protocol_buffer.bufsize - ), - exposed=True, - ) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py deleted file mode 100644 index 2110e610c37..00000000000 --- a/python/cudf/cudf/core/dtypes.py +++ /dev/null @@ -1,1213 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import decimal -import operator -import pickle -import textwrap -import warnings -from functools import cached_property -from typing import TYPE_CHECKING, Any - -import numpy as np -import pandas as pd -import pyarrow as pa -from pandas.api import types as pd_types -from pandas.api.extensions import ExtensionDtype -from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - -import cudf -from cudf.core._compat import PANDAS_GE_210, PANDAS_LT_300 -from cudf.core.abc import Serializable -from cudf.utils.docutils import doc_apply - -if PANDAS_GE_210: - PANDAS_NUMPY_DTYPE = pd.core.dtypes.dtypes.NumpyEADtype -else: - PANDAS_NUMPY_DTYPE = pd.core.dtypes.dtypes.PandasDtype - -if TYPE_CHECKING: - from collections.abc import Callable - - from cudf._typing import Dtype - from cudf.core.buffer import Buffer - - -def dtype(arbitrary): - """ - Return the cuDF-supported dtype corresponding to `arbitrary`. - - Parameters - ---------- - arbitrary: dtype or scalar-like - - Returns - ------- - dtype: the cuDF-supported dtype that best matches `arbitrary` - """ - # first, check if `arbitrary` is one of our extension types: - if isinstance(arbitrary, cudf.core.dtypes._BaseDtype): - return arbitrary - - # next, try interpreting arbitrary as a NumPy dtype that we support: - try: - np_dtype = np.dtype(arbitrary) - except TypeError: - pass - else: - if np_dtype.kind in set("OU"): - return np.dtype("object") - elif np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES: - raise TypeError(f"Unsupported type {np_dtype}") - return np_dtype - - if isinstance(arbitrary, str) and arbitrary in {"hex", "hex32", "hex64"}: - # read_csv only accepts "hex" - # e.g. test_csv_reader_hexadecimals, test_csv_reader_hexadecimal_overflow - return arbitrary - - # use `pandas_dtype` to try and interpret - # `arbitrary` as a Pandas extension type. - # Return the corresponding NumPy/cuDF type. 
- pd_dtype = pd.api.types.pandas_dtype(arbitrary) - if cudf.api.types._is_pandas_nullable_extension_dtype(pd_dtype): - if cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError( - "Nullable types not supported in pandas compatibility mode" - ) - elif isinstance(pd_dtype, pd.StringDtype): - return np.dtype("object") - else: - return dtype(pd_dtype.numpy_dtype) - elif isinstance(pd_dtype, PANDAS_NUMPY_DTYPE): - return dtype(pd_dtype.numpy_dtype) - elif isinstance(pd_dtype, pd.CategoricalDtype): - return cudf.CategoricalDtype.from_pandas(pd_dtype) - elif isinstance(pd_dtype, pd.IntervalDtype): - return cudf.IntervalDtype.from_pandas(pd_dtype) - elif isinstance(pd_dtype, pd.DatetimeTZDtype): - return pd_dtype - else: - raise TypeError(f"Cannot interpret {arbitrary} as a valid cuDF dtype") - - -def _decode_type( - cls: type, - header: dict, - frames: list, - is_valid_class: Callable[[type, type], bool] = operator.is_, -) -> tuple[dict, list, type]: - """Decode metadata-encoded type and check validity - - Parameters - ---------- - cls : type - class performing deserialization - header : dict - metadata for deserialization - frames : list - buffers containing data for deserialization - is_valid_class : Callable - function to call to check if the encoded class type is valid for - serialization by `cls` (default is to check type equality), called - as `is_valid_class(decoded_class, cls)`. - - Returns - ------- - tuple - Tuple of validated headers, frames, and the decoded class - constructor. - - Raises - ------ - AssertionError - if the number of frames doesn't match the count encoded in the - headers, or `is_valid_class` is not true. - """ - assert header["frame_count"] == len(frames), ( - f"Deserialization expected {header['frame_count']} frames, " - f"but received {len(frames)}." - ) - klass = pickle.loads(header["type-serialized"]) - assert is_valid_class( - klass, cls - ), f"Header-encoded {klass=} does not match decoding {cls=}." - return header, frames, klass - - -class _BaseDtype(ExtensionDtype, Serializable): - # Base type for all cudf-specific dtypes - pass - - -class CategoricalDtype(_BaseDtype): - """ - Type for categorical data with the categories and orderedness. - - Parameters - ---------- - categories : sequence, optional - Must be unique, and must not contain any nulls. - The categories are stored in an Index, - and if an index is provided the dtype of that index will be used. - ordered : bool or None, default False - Whether or not this categorical is treated as a ordered categorical. - None can be used to maintain the ordered value of existing categoricals - when used in operations that combine categoricals, e.g. astype, and - will resolve to False if there is no existing ordered to maintain. - - Attributes - ---------- - categories - ordered - - Methods - ------- - from_pandas - to_pandas - - Examples - -------- - >>> import cudf - >>> dtype = cudf.CategoricalDtype(categories=['b', 'a'], ordered=True) - >>> cudf.Series(['a', 'b', 'a', 'c'], dtype=dtype) - 0 a - 1 b - 2 a - 3 - dtype: category - Categories (2, object): ['b' < 'a'] - """ - - def __init__(self, categories=None, ordered: bool = False) -> None: - self._categories = self._init_categories(categories) - self._ordered = ordered - - @property - def categories(self) -> cudf.Index: - """ - An ``Index`` containing the unique categories allowed. 
- - Examples - -------- - >>> import cudf - >>> dtype = cudf.CategoricalDtype(categories=['b', 'a'], ordered=True) - >>> dtype.categories - Index(['b', 'a'], dtype='object') - """ - if self._categories is None: - col = cudf.core.column.column_empty( - 0, dtype="object", masked=False - ) - else: - col = self._categories - return cudf.Index._from_column(col) - - @property - def type(self): - return self._categories.dtype.type - - @property - def name(self): - return "category" - - @property - def str(self): - return "|O08" - - @property - def ordered(self) -> bool: - """ - Whether the categories have an ordered relationship. - """ - return self._ordered - - @classmethod - def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype": - """ - Convert a ``pandas.CategrocialDtype`` to ``cudf.CategoricalDtype`` - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> pd_dtype = pd.CategoricalDtype(categories=['b', 'a'], ordered=True) - >>> pd_dtype - CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) - >>> cudf_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) - >>> cudf_dtype - CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) - """ # noqa: E501 - return CategoricalDtype( - categories=dtype.categories, ordered=dtype.ordered - ) - - def to_pandas(self) -> pd.CategoricalDtype: - """ - Convert a ``cudf.CategoricalDtype`` to ``pandas.CategoricalDtype`` - - Examples - -------- - >>> import cudf - >>> dtype = cudf.CategoricalDtype(categories=['b', 'a'], ordered=True) - >>> dtype - CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) - >>> dtype.to_pandas() - CategoricalDtype(categories=['b', 'a'], ordered=True, categories_dtype=object) - """ # noqa: E501 - if self._categories is None: - categories = None - elif self._categories.dtype.kind == "f": - categories = self._categories.dropna().to_pandas() - else: - categories = self._categories.to_pandas() - return pd.CategoricalDtype(categories=categories, ordered=self.ordered) - - def _init_categories( - self, categories: Any - ) -> cudf.core.column.ColumnBase | None: - if categories is None: - return categories - if len(categories) == 0 and not isinstance( - getattr(categories, "dtype", None), - (cudf.IntervalDtype, pd.IntervalDtype), - ): - dtype = "object" # type: Any - else: - dtype = None - - column = cudf.core.column.as_column(categories, dtype=dtype) - - if isinstance(column, cudf.core.column.CategoricalColumn): - return column.categories - else: - return column - - def __eq__(self, other: Dtype) -> bool: - if isinstance(other, str): - return other == self.name - elif other is self: - return True - elif not isinstance(other, self.__class__): - return False - elif self.ordered != other.ordered: - return False - elif self._categories is None or other._categories is None: - return True - else: - return ( - self._categories.dtype == other._categories.dtype - and self._categories.equals(other._categories) - ) - - def construct_from_string(self): - raise NotImplementedError() - - def serialize(self): - header = {} - header["type-serialized"] = pickle.dumps(type(self)) - header["ordered"] = self.ordered - - frames = [] - - if self.categories is not None: - categories_header, categories_frames = self.categories.serialize() - header["categories"] = categories_header - frames.extend(categories_frames) - header["frame_count"] = len(frames) - return header, frames - - @classmethod - def deserialize(cls, header, frames): - header, frames, klass = 
_decode_type(cls, header, frames) - ordered = header["ordered"] - categories_header = header["categories"] - categories_frames = frames - categories_type = pickle.loads(categories_header["type-serialized"]) - categories = categories_type.deserialize( - categories_header, categories_frames - ) - return klass(categories=categories, ordered=ordered) - - def __repr__(self): - return self.to_pandas().__repr__() - - -class ListDtype(_BaseDtype): - """ - Type to represent list data. - - Parameters - ---------- - element_type : object - A dtype with which represents the element types in the list. - - Attributes - ---------- - element_type - leaf_type - - Methods - ------- - from_arrow - to_arrow - - Examples - -------- - >>> import cudf - >>> list_dtype = cudf.ListDtype("int32") - >>> list_dtype - ListDtype(int32) - - A nested list dtype can be created by: - - >>> nested_list_dtype = cudf.ListDtype(list_dtype) - >>> nested_list_dtype - ListDtype(ListDtype(int32)) - """ - - _typ: pa.ListType - name: str = "list" - - def __init__(self, element_type: Any) -> None: - if isinstance(element_type, ListDtype): - self._typ = pa.list_(element_type._typ) - else: - element_type = cudf.utils.dtypes.cudf_dtype_to_pa_type( - element_type - ) - self._typ = pa.list_(element_type) - - @cached_property - def element_type(self) -> Dtype: - """ - Returns the element type of the ``ListDtype``. - - Returns - ------- - Dtype - - Examples - -------- - >>> import cudf - >>> deep_nested_type = cudf.ListDtype(cudf.ListDtype(cudf.ListDtype("float32"))) - >>> deep_nested_type - ListDtype(ListDtype(ListDtype(float32))) - >>> deep_nested_type.element_type - ListDtype(ListDtype(float32)) - >>> deep_nested_type.element_type.element_type - ListDtype(float32) - >>> deep_nested_type.element_type.element_type.element_type - 'float32' - """ # noqa: E501 - if isinstance(self._typ.value_type, pa.ListType): - return ListDtype.from_arrow(self._typ.value_type) - elif isinstance(self._typ.value_type, pa.StructType): - return StructDtype.from_arrow(self._typ.value_type) - else: - return cudf.dtype(self._typ.value_type.to_pandas_dtype()) - - @cached_property - def leaf_type(self): - """ - Returns the type of the leaf values. - - Examples - -------- - >>> import cudf - >>> deep_nested_type = cudf.ListDtype(cudf.ListDtype(cudf.ListDtype("float32"))) - >>> deep_nested_type - ListDtype(ListDtype(ListDtype(float32))) - >>> deep_nested_type.leaf_type - 'float32' - """ # noqa: E501 - if isinstance(self.element_type, ListDtype): - return self.element_type.leaf_type - else: - return self.element_type - - @property - def type(self): - # TODO: we should change this to return something like a - # ListDtypeType, once we figure out what that should look like - return pa.array - - @classmethod - def from_arrow(cls, typ): - """ - Creates a ``ListDtype`` from ``pyarrow.ListType``. - - Parameters - ---------- - typ : pyarrow.ListType - A ``pyarrow.ListType`` that has to be converted to - ``ListDtype``. 
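ListDtype composes recursively, so element_type peels off one level of nesting while leaf_type reports the innermost scalar type. A brief sketch, assuming cudf is installed:

import cudf

nested = cudf.ListDtype(cudf.ListDtype("int32"))
print(nested)               # ListDtype(ListDtype(int32))
print(nested.element_type)  # ListDtype(int32)
print(nested.leaf_type)     # int32

# Nested Python lists are inferred as list columns automatically.
ser = cudf.Series([[[1, 2], [3]], [[4]]])
print(ser.dtype)            # ListDtype(ListDtype(int64))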
- - Returns - ------- - obj : ``ListDtype`` - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> arrow_type = pa.infer_type([[1]]) - >>> arrow_type - ListType(list) - >>> list_dtype = cudf.ListDtype.from_arrow(arrow_type) - >>> list_dtype - ListDtype(int64) - """ - obj = object.__new__(cls) - obj._typ = typ - return obj - - def to_arrow(self): - """ - Convert to a ``pyarrow.ListType`` - - Examples - -------- - >>> import cudf - >>> list_dtype = cudf.ListDtype(cudf.ListDtype("float32")) - >>> list_dtype - ListDtype(ListDtype(float32)) - >>> list_dtype.to_arrow() - ListType(list>) - """ - return self._typ - - def __eq__(self, other): - if isinstance(other, str): - return other == self.name - if not isinstance(other, ListDtype): - return False - return self._typ.equals(other._typ) - - def __repr__(self): - if isinstance(self.element_type, (ListDtype, StructDtype)): - return f"{type(self).__name__}({repr(self.element_type)})" - else: - return f"{type(self).__name__}({self.element_type})" - - def __hash__(self): - return hash(self._typ) - - def serialize(self) -> tuple[dict, list]: - header: dict[str, Dtype] = {} - header["type-serialized"] = pickle.dumps(type(self)) - - frames = [] - - if isinstance(self.element_type, _BaseDtype): - header["element-type"], frames = self.element_type.serialize() - else: - header["element-type"] = getattr( - self.element_type, "name", self.element_type - ) - header["frame_count"] = len(frames) - return header, frames - - @classmethod - def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) - if isinstance(header["element-type"], dict): - element_type = pickle.loads( - header["element-type"]["type-serialized"] - ).deserialize(header["element-type"], frames) - else: - element_type = header["element-type"] - return klass(element_type=element_type) - - @cached_property - def itemsize(self): - return self.element_type.itemsize - - -class StructDtype(_BaseDtype): - """ - Type to represent a struct data. - - Parameters - ---------- - fields : dict - A mapping of field names to dtypes, the dtypes can themselves - be of ``StructDtype`` too. - - Attributes - ---------- - fields - itemsize - - Methods - ------- - from_arrow - to_arrow - - Examples - -------- - >>> import cudf - >>> struct_dtype = cudf.StructDtype({"a": "int64", "b": "string"}) - >>> struct_dtype - StructDtype({'a': dtype('int64'), 'b': dtype('O')}) - - A nested ``StructDtype`` can also be constructed in the following way: - - >>> nested_struct_dtype = cudf.StructDtype({"dict_data": struct_dtype, "c": "uint8"}) - >>> nested_struct_dtype - StructDtype({'dict_data': StructDtype({'a': dtype('int64'), 'b': dtype('O')}), 'c': dtype('uint8')}) - """ # noqa: E501 - - name = "struct" - - def __init__(self, fields): - pa_fields = { - k: cudf.utils.dtypes.cudf_dtype_to_pa_type(v) - for k, v in fields.items() - } - self._typ = pa.struct(pa_fields) - - @property - def fields(self): - """ - Returns an ordered dict of column name and dtype key-value. 
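StructDtype maps field names to dtypes (which may themselves be struct or list dtypes), and ``fields`` reports that mapping back. A short sketch, assuming cudf is installed:

import cudf

struct = cudf.StructDtype({"a": "int64", "b": "string"})
print(struct.fields)   # {'a': dtype('int64'), 'b': dtype('O')}

# Dictionaries are inferred as struct rows when building a Series.
ser = cudf.Series([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])
print(ser.dtype)       # StructDtype({'a': dtype('int64'), 'b': dtype('O')})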
- - Examples - -------- - >>> import cudf - >>> struct_dtype = cudf.StructDtype({"a": "int64", "b": "string"}) - >>> struct_dtype - StructDtype({'a': dtype('int64'), 'b': dtype('O')}) - >>> struct_dtype.fields - {'a': dtype('int64'), 'b': dtype('O')} - """ - return { - field.name: cudf.utils.dtypes.cudf_dtype_from_pa_type(field.type) - for field in self._typ - } - - @property - def type(self): - # TODO: we should change this to return something like a - # StructDtypeType, once we figure out what that should look like - return dict - - @classmethod - def from_arrow(cls, typ): - """ - Convert a ``pyarrow.StructType`` to ``StructDtype``. - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> pa_struct_type = pa.struct({'x': pa.int32(), 'y': pa.string()}) - >>> pa_struct_type - StructType(struct) - >>> cudf.StructDtype.from_arrow(pa_struct_type) - StructDtype({'x': dtype('int32'), 'y': dtype('O')}) - """ - obj = object.__new__(cls) - obj._typ = typ - return obj - - def to_arrow(self): - """ - Convert a ``StructDtype`` to a ``pyarrow.StructType``. - - Examples - -------- - >>> import cudf - >>> struct_type = cudf.StructDtype({"x": "int32", "y": "string"}) - >>> struct_type - StructDtype({'x': dtype('int32'), 'y': dtype('O')}) - >>> struct_type.to_arrow() - StructType(struct) - """ - return self._typ - - def __eq__(self, other): - if isinstance(other, str): - return other == self.name - if not isinstance(other, StructDtype): - return False - return self._typ.equals(other._typ) - - def __repr__(self): - return f"{type(self).__name__}({self.fields})" - - def __hash__(self): - return hash(self._typ) - - def serialize(self) -> tuple[dict, list]: - header: dict[str, Any] = {} - header["type-serialized"] = pickle.dumps(type(self)) - - frames: list[Buffer] = [] - - fields: dict[str, bytes | tuple[Any, tuple[int, int]]] = {} - - for k, dtype in self.fields.items(): - if isinstance(dtype, _BaseDtype): - dtype_header, dtype_frames = dtype.serialize() - fields[k] = ( - dtype_header, - (len(frames), len(frames) + len(dtype_frames)), - ) - frames.extend(dtype_frames) - else: - fields[k] = pickle.dumps(dtype) - header["fields"] = fields - header["frame_count"] = len(frames) - return header, frames - - @classmethod - def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) - fields = {} - for k, dtype in header["fields"].items(): - if isinstance(dtype, tuple): - dtype_header, (start, stop) = dtype - fields[k] = pickle.loads( - dtype_header["type-serialized"] - ).deserialize( - dtype_header, - frames[start:stop], - ) - else: - fields[k] = pickle.loads(dtype) - return cls(fields) - - @cached_property - def itemsize(self): - return sum( - cudf.utils.dtypes.cudf_dtype_from_pa_type(field.type).itemsize - for field in self._typ - ) - - -decimal_dtype_template = textwrap.dedent( - """ - Type to represent a ``decimal{size}`` data. - - Parameters - ---------- - precision : int - The total number of digits in each value of this dtype - scale : int, optional - The scale of the dtype. See Notes below. 
- - Attributes - ---------- - precision - scale - itemsize - - Methods - ------- - to_arrow - from_arrow - - Notes - ----- - When the scale is positive: - - numbers with fractional parts (e.g., 0.0042) can be represented - - the scale is the total number of digits to the right of the - decimal point - - When the scale is negative: - - only multiples of powers of 10 (including 10**0) can be - represented (e.g., 1729, 4200, 1000000) - - the scale represents the number of trailing zeros in the value. - - For example, 42 is representable with precision=2 and scale=0. - 13.0051 is representable with precision=6 and scale=4, - and *not* representable with precision<6 or scale<4. - - Examples - -------- - >>> import cudf - >>> decimal{size}_dtype = cudf.Decimal{size}Dtype(precision=9, scale=2) - >>> decimal{size}_dtype - Decimal{size}Dtype(precision=9, scale=2) - """ # noqa: E501 -) - - -class DecimalDtype(_BaseDtype): - _metadata = ("precision", "scale") - - def __init__(self, precision, scale=0): - self._validate(precision, scale) - self._typ = pa.decimal128(precision, scale) - - @property - def str(self): - return f"{str(self.name)}({self.precision}, {self.scale})" - - @property - def precision(self): - """ - The decimal precision, in number of decimal digits (an integer). - """ - return self._typ.precision - - @precision.setter - def precision(self, value): - self._validate(value, self.scale) - self._typ = pa.decimal128(precision=value, scale=self.scale) - - @property - def scale(self): - """ - The decimal scale (an integer). - """ - return self._typ.scale - - @property - def itemsize(self): - """ - Length of one column element in bytes. - """ - return self.ITEMSIZE - - @property - def type(self): - # might need to account for precision and scale here - return decimal.Decimal - - def to_arrow(self): - """ - Return the equivalent ``pyarrow`` dtype. 
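Following the precision/scale rules in the notes above (13.0051 needs precision=6 and scale=4), a minimal decimal sketch, assuming cudf is installed:

import decimal
import cudf

dt = cudf.Decimal64Dtype(precision=6, scale=4)
ser = cudf.Series([decimal.Decimal("13.0051")], dtype=dt)
print(dt.precision, dt.scale)  # 6 4
print(dt.itemsize)             # 8 -- bytes per element for decimal64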
- """ - return self._typ - - @classmethod - def from_arrow(cls, typ): - """ - Construct a cudf decimal dtype from a ``pyarrow`` dtype - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> pa_type = pa.decimal128(precision=9, scale=2) - - Constructing a ``Decimal32Dtype``: - - >>> cudf.Decimal32Dtype.from_arrow(pa_type) - Decimal64Dtype(precision=9, scale=2) - - Constructing a ``Decimal64Dtype``: - - >>> cudf.Decimal64Dtype.from_arrow(pa_type) - Decimal64Dtype(precision=9, scale=2) - - Constructing a ``Decimal128Dtype``: - - >>> cudf.Decimal128Dtype.from_arrow(pa_type) - Decimal128Dtype(precision=9, scale=2) - """ - return cls(typ.precision, typ.scale) - - def __repr__(self): - return ( - f"{self.__class__.__name__}" - f"(precision={self.precision}, scale={self.scale})" - ) - - @classmethod - def _validate(cls, precision, scale=0): - if precision > cls.MAX_PRECISION: - raise ValueError( - f"Cannot construct a {cls.__name__}" - f" with precision > {cls.MAX_PRECISION}" - ) - if abs(scale) > precision: - raise ValueError(f"scale={scale} exceeds precision={precision}") - - @classmethod - def _from_decimal(cls, decimal): - """ - Create a cudf.DecimalDtype from a decimal.Decimal object - """ - metadata = decimal.as_tuple() - precision = max(len(metadata.digits), -metadata.exponent) - return cls(precision, -metadata.exponent) - - def serialize(self) -> tuple[dict, list]: - return ( - { - "type-serialized": pickle.dumps(type(self)), - "precision": self.precision, - "scale": self.scale, - "frame_count": 0, - }, - [], - ) - - @classmethod - def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type( - cls, header, frames, is_valid_class=issubclass - ) - klass = pickle.loads(header["type-serialized"]) - return klass(header["precision"], header["scale"]) - - def __eq__(self, other: Dtype) -> bool: - if other is self: - return True - elif not isinstance(other, self.__class__): - return False - return self.precision == other.precision and self.scale == other.scale - - def __hash__(self): - return hash(self._typ) - - -@doc_apply( - decimal_dtype_template.format( - size="32", - ) -) -class Decimal32Dtype(DecimalDtype): - name = "decimal32" - MAX_PRECISION = np.floor(np.log10(np.iinfo("int32").max)) - ITEMSIZE = 4 - - -@doc_apply( - decimal_dtype_template.format( - size="64", - ) -) -class Decimal64Dtype(DecimalDtype): - name = "decimal64" - MAX_PRECISION = np.floor(np.log10(np.iinfo("int64").max)) - ITEMSIZE = 8 - - -@doc_apply( - decimal_dtype_template.format( - size="128", - ) -) -class Decimal128Dtype(DecimalDtype): - name = "decimal128" - MAX_PRECISION = 38 - ITEMSIZE = 16 - - -class IntervalDtype(StructDtype): - """ - subtype: str, np.dtype - The dtype of the Interval bounds. - closed: {'right', 'left', 'both', 'neither'}, default 'right' - Whether the interval is closed on the left-side, right-side, - both or neither. See the Notes for more detailed explanation. 
- """ - - name = "interval" - - def __init__(self, subtype, closed="right"): - super().__init__(fields={"left": subtype, "right": subtype}) - - if closed is None: - closed = "right" - if closed in ["left", "right", "neither", "both"]: - self.closed = closed - else: - raise ValueError("closed value is not valid") - - @property - def subtype(self): - return self.fields["left"] - - def __repr__(self) -> str: - return f"interval[{self.subtype}, {self.closed}]" - - def __str__(self) -> str: - return self.__repr__() - - @classmethod - def from_arrow(cls, typ): - return IntervalDtype(typ.subtype.to_pandas_dtype(), typ.closed) - - def to_arrow(self): - return ArrowIntervalType( - pa.from_numpy_dtype(self.subtype), self.closed - ) - - @classmethod - def from_pandas(cls, pd_dtype: pd.IntervalDtype) -> "IntervalDtype": - return cls(subtype=pd_dtype.subtype, closed=pd_dtype.closed) - - def to_pandas(self) -> pd.IntervalDtype: - return pd.IntervalDtype(subtype=self.subtype, closed=self.closed) - - def __eq__(self, other): - if isinstance(other, str): - # This means equality isn't transitive but mimics pandas - return other in (self.name, str(self)) - return ( - type(self) == type(other) - and self.subtype == other.subtype - and self.closed == other.closed - ) - - def __hash__(self): - return hash((self.subtype, self.closed)) - - def serialize(self) -> tuple[dict, list]: - header = { - "type-serialized": pickle.dumps(type(self)), - "fields": pickle.dumps((self.subtype, self.closed)), - "frame_count": 0, - } - return header, [] - - @classmethod - def deserialize(cls, header: dict, frames: list): - header, frames, klass = _decode_type(cls, header, frames) - klass = pickle.loads(header["type-serialized"]) - subtype, closed = pickle.loads(header["fields"]) - return klass(subtype, closed=closed) - - -def _is_categorical_dtype(obj): - if obj is None: - return False - - if isinstance( - obj, - ( - pd.CategoricalDtype, - cudf.CategoricalDtype, - cudf.core.index.CategoricalIndex, - cudf.core.column.CategoricalColumn, - pd.Categorical, - pd.CategoricalIndex, - ), - ): - return True - # Note that we cannot directly use `obj in (...)` because that triggers - # equality as well as identity checks and pandas extension dtypes won't - # allow converting that equality check to a boolean; `__nonzero__` is - # disabled because they treat dtypes as "array-like". - if any( - obj is t - for t in ( - cudf.CategoricalDtype, - pd.CategoricalDtype, - pd.CategoricalDtype.type, - ) - ): - return True - if isinstance(obj, (np.ndarray, np.dtype)): - return False - if isinstance(obj, str) and obj == "category": - return True - if isinstance(obj, cudf.core.index.BaseIndex): - return obj._is_categorical() - if isinstance( - obj, - ( - cudf.Series, - cudf.core.column.ColumnBase, - pd.Index, - pd.Series, - ), - ): - try: - return isinstance(cudf.dtype(obj.dtype), cudf.CategoricalDtype) - except TypeError: - return False - if hasattr(obj, "type"): - if obj.type is pd.CategoricalDtype.type: - return True - # TODO: A lot of the above checks are probably redundant and should be - # farmed out to this function here instead. - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - return pd_types.is_categorical_dtype(obj) - - -def is_categorical_dtype(obj): - """Check whether an array-like or dtype is of the Categorical dtype. - - .. deprecated:: 24.04 - Use isinstance(dtype, cudf.CategoricalDtype) instead - - Parameters - ---------- - obj : array-like or dtype - The array-like or dtype to check. 
- - Returns - ------- - bool - Whether or not the array-like or dtype is of a categorical dtype. - """ - # Do not remove until pandas 3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." - warnings.warn( - "is_categorical_dtype is deprecated and will be removed in a future " - "version. Use isinstance(dtype, cudf.CategoricalDtype) instead", - DeprecationWarning, - ) - return _is_categorical_dtype(obj) - - -def is_list_dtype(obj): - """Check whether an array-like or dtype is of the list dtype. - - Parameters - ---------- - obj : array-like or dtype - The array-like or dtype to check. - - Returns - ------- - bool - Whether or not the array-like or dtype is of the list dtype. - """ - return ( - type(obj) is cudf.core.dtypes.ListDtype - or obj is cudf.core.dtypes.ListDtype - or type(obj) is cudf.core.column.ListColumn - or obj is cudf.core.column.ListColumn - or (isinstance(obj, str) and obj == cudf.core.dtypes.ListDtype.name) - or ( - hasattr(obj, "dtype") - and isinstance(obj.dtype, cudf.core.dtypes.ListDtype) - ) - ) - - -def is_struct_dtype(obj): - """Check whether an array-like or dtype is of the struct dtype. - - Parameters - ---------- - obj : array-like or dtype - The array-like or dtype to check. - - Returns - ------- - bool - Whether or not the array-like or dtype is of the struct dtype. - """ - # TODO: This behavior is currently inconsistent for interval types. the - # actual class IntervalDtype will return False, but instances (e.g. - # IntervalDtype(int)) will return True. For now this is not being changed - # since the interval dtype is being modified as part of the array refactor, - # but this behavior should be made consistent afterwards. - return ( - isinstance(obj, cudf.core.dtypes.StructDtype) - or obj is cudf.core.dtypes.StructDtype - or (isinstance(obj, str) and obj == cudf.core.dtypes.StructDtype.name) - or ( - hasattr(obj, "dtype") - and isinstance(obj.dtype, cudf.core.dtypes.StructDtype) - ) - ) - - -def is_decimal_dtype(obj): - """Check whether an array-like or dtype is of the decimal dtype. - - Parameters - ---------- - obj : array-like or dtype - The array-like or dtype to check. - - Returns - ------- - bool - Whether or not the array-like or dtype is of the decimal dtype. - """ - return ( - is_decimal32_dtype(obj) - or is_decimal64_dtype(obj) - or is_decimal128_dtype(obj) - ) - - -def _is_interval_dtype(obj): - return ( - isinstance( - obj, - ( - cudf.core.dtypes.IntervalDtype, - pd.IntervalDtype, - ), - ) - or obj is cudf.core.dtypes.IntervalDtype - or (isinstance(obj, cudf.core.index.BaseIndex) and obj._is_interval()) - or ( - isinstance(obj, str) and obj == cudf.core.dtypes.IntervalDtype.name - ) - or ( - isinstance( - getattr(obj, "dtype", None), - (pd.IntervalDtype, cudf.core.dtypes.IntervalDtype), - ) - ) - ) - - -def is_interval_dtype(obj): - """Check whether an array-like or dtype is of the interval dtype. - - Parameters - ---------- - obj : array-like or dtype - The array-like or dtype to check. - - Returns - ------- - bool - Whether or not the array-like or dtype is of the interval dtype. - """ - warnings.warn( - "is_interval_dtype is deprecated and will be removed in a " - "future version. 
Use `isinstance(dtype, cudf.IntervalDtype)` instead", - DeprecationWarning, - ) - return _is_interval_dtype(obj) - - -def is_decimal32_dtype(obj): - return ( - type(obj) is cudf.core.dtypes.Decimal32Dtype - or obj is cudf.core.dtypes.Decimal32Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal32Dtype.name - ) - or (hasattr(obj, "dtype") and is_decimal32_dtype(obj.dtype)) - ) - - -def is_decimal64_dtype(obj): - return ( - type(obj) is cudf.core.dtypes.Decimal64Dtype - or obj is cudf.core.dtypes.Decimal64Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal64Dtype.name - ) - or (hasattr(obj, "dtype") and is_decimal64_dtype(obj.dtype)) - ) - - -def is_decimal128_dtype(obj): - return ( - type(obj) is cudf.core.dtypes.Decimal128Dtype - or obj is cudf.core.dtypes.Decimal128Dtype - or ( - isinstance(obj, str) - and obj == cudf.core.dtypes.Decimal128Dtype.name - ) - or (hasattr(obj, "dtype") and is_decimal128_dtype(obj.dtype)) - ) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py deleted file mode 100644 index 37ad6b8fabb..00000000000 --- a/python/cudf/cudf/core/frame.py +++ /dev/null @@ -1,1902 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import operator -import pickle -import warnings -from collections import abc -from typing import TYPE_CHECKING, Any, Literal, MutableMapping - -# TODO: The `numpy` import is needed for typing purposes during doc builds -# only, need to figure out why the `np` alias is insufficient then remove. -import cupy -import numpy -import numpy as np -import pyarrow as pa -from typing_extensions import Self - -import cudf -from cudf import _lib as libcudf -from cudf.api.types import is_dtype_equal, is_scalar -from cudf.core._compat import PANDAS_LT_300 -from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ( - ColumnBase, - as_column, - deserialize_columns, - serialize_columns, -) -from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes -from cudf.core.column_accessor import ColumnAccessor -from cudf.core.mixins import BinaryOperand, Scannable -from cudf.utils import ioutils -from cudf.utils.dtypes import find_common_type -from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf - -if TYPE_CHECKING: - from types import ModuleType - - from cudf._typing import Dtype, ScalarLike - - -# TODO: It looks like Frame is missing a declaration of `copy`, need to add -class Frame(BinaryOperand, Scannable): - """A collection of Column objects with an optional index. - - Parameters - ---------- - data : dict - An dict mapping column names to Columns - index : Table - A Frame representing the (optional) index columns. 
- """ - - _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS - - def __init__(self, data: ColumnAccessor | MutableMapping[Any, ColumnBase]): - self._data = ColumnAccessor(data) - - @property - def _num_columns(self) -> int: - return len(self._data) - - @property - def _num_rows(self) -> int: - return self._data.nrows - - @property - def _column_names(self) -> tuple[Any, ...]: - return self._data.names - - @property - def _columns(self) -> tuple[ColumnBase, ...]: - return self._data.columns - - @property - def _column_labels_and_values( - self, - ) -> abc.Iterable[tuple[abc.Hashable, ColumnBase]]: - return zip(self._column_names, self._columns) - - @property - def _dtypes(self) -> abc.Generator[tuple[abc.Hashable, Dtype], None, None]: - for label, col in self._column_labels_and_values: - yield label, col.dtype - - @property - def ndim(self) -> int: - raise NotImplementedError() - - @_performance_tracking - def serialize(self): - # TODO: See if self._data can be serialized outright - header = { - "type-serialized": pickle.dumps(type(self)), - "column_names": pickle.dumps(self._column_names), - "column_rangeindex": pickle.dumps(self._data.rangeindex), - "column_multiindex": pickle.dumps(self._data.multiindex), - "column_label_dtype": pickle.dumps(self._data.label_dtype), - "column_level_names": pickle.dumps(self._data._level_names), - } - header["columns"], frames = serialize_columns(self._columns) - return header, frames - - @classmethod - @_performance_tracking - def deserialize(cls, header, frames): - cls_deserialize = pickle.loads(header["type-serialized"]) - column_names = pickle.loads(header["column_names"]) - columns = deserialize_columns(header["columns"], frames) - kwargs = {} - for metadata in [ - "rangeindex", - "multiindex", - "label_dtype", - "level_names", - ]: - key = f"column_{metadata}" - if key in header: - kwargs[metadata] = pickle.loads(header[key]) - col_accessor = ColumnAccessor( - data=dict(zip(column_names, columns)), **kwargs - ) - return cls_deserialize._from_data(col_accessor) - - @classmethod - @_performance_tracking - def _from_data(cls, data: MutableMapping) -> Self: - """ - Construct cls from a ColumnAccessor-like mapping. - """ - obj = cls.__new__(cls) - Frame.__init__(obj, data) - return obj - - @_performance_tracking - def _from_data_like_self(self, data: MutableMapping) -> Self: - """ - Return type(self) from a ColumnAccessor-like mapping but - with the external properties, e.g. .index, .name, of self. - """ - return self._from_data(data) - - @_performance_tracking - def _from_columns_like_self( - self, - columns: list[ColumnBase], - column_names: abc.Iterable[str] | None = None, - ): - """Construct a Frame from a list of columns with metadata from self. - - If `column_names` is None, use column names from self. - """ - if column_names is None: - column_names = self._column_names - data = dict(zip(column_names, columns)) - frame = self.__class__._from_data(data) - return frame._copy_type_metadata(self) - - @_performance_tracking - def _mimic_inplace( - self, result: Self, inplace: bool = False - ) -> Self | None: - if inplace: - for col in self._column_names: - if col in result._data: - self._data[col]._mimic_inplace( - result._data[col], inplace=True - ) - self._data = result._data - return None - else: - return result - - @property - @_performance_tracking - def size(self) -> int: - """ - Return the number of elements in the underlying data. 
- - Returns - ------- - size : Size of the DataFrame / Index / Series / MultiIndex - - Examples - -------- - Size of an empty dataframe is 0. - - >>> import cudf - >>> df = cudf.DataFrame() - >>> df - Empty DataFrame - Columns: [] - Index: [] - >>> df.size - 0 - >>> df = cudf.DataFrame(index=[1, 2, 3]) - >>> df - Empty DataFrame - Columns: [] - Index: [1, 2, 3] - >>> df.size - 0 - - DataFrame with values - - >>> df = cudf.DataFrame({'a': [10, 11, 12], - ... 'b': ['hello', 'rapids', 'ai']}) - >>> df - a b - 0 10 hello - 1 11 rapids - 2 12 ai - >>> df.size - 6 - >>> df.index - RangeIndex(start=0, stop=3) - >>> df.index.size - 3 - - Size of an Index - - >>> index = cudf.Index([]) - >>> index - Index([], dtype='float64') - >>> index.size - 0 - >>> index = cudf.Index([1, 2, 3, 10]) - >>> index - Index([1, 2, 3, 10], dtype='int64') - >>> index.size - 4 - - Size of a MultiIndex - - >>> midx = cudf.MultiIndex( - ... levels=[["a", "b", "c", None], ["1", None, "5"]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> midx - MultiIndex([( 'a', '1'), - ( 'a', '5'), - ( 'b', ), - ( 'c', ), - (, '1')], - names=['x', 'y']) - >>> midx.size - 5 - """ - return self._num_columns * self._num_rows - - def memory_usage(self, deep=False): - """Return the memory usage of an object. - - Parameters - ---------- - deep : bool - The deep parameter is ignored and is only included for pandas - compatibility. - - Returns - ------- - The total bytes used. - """ - raise NotImplementedError - - @_performance_tracking - def __len__(self) -> int: - return self._num_rows - - @_performance_tracking - def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self: - casted = ( - col.astype(dtype.get(col_name, col.dtype), copy=copy) - for col_name, col in self._column_labels_and_values - ) - ca = self._data._from_columns_like_self(casted, verify=False) - return self._from_data_like_self(ca) - - @_performance_tracking - def equals(self, other) -> bool: - """ - Test whether two objects contain the same elements. - - This function allows two objects to be compared against - each other to see if they have the same shape and elements. NaNs in - the same location are considered equal. The column headers do not - need to have the same type. - - Parameters - ---------- - other : Index, Series, DataFrame - The other object to be compared with. - - Returns - ------- - bool - True if all elements are the same in both objects, False - otherwise. 
- - Examples - -------- - >>> import cudf - - Comparing Series with `equals`: - - >>> s = cudf.Series([1, 2, 3]) - >>> other = cudf.Series([1, 2, 3]) - >>> s.equals(other) - True - >>> different = cudf.Series([1.5, 2, 3]) - >>> s.equals(different) - False - - Comparing DataFrames with `equals`: - - >>> df = cudf.DataFrame({1: [10], 2: [20]}) - >>> df - 1 2 - 0 10 20 - >>> exactly_equal = cudf.DataFrame({1: [10], 2: [20]}) - >>> exactly_equal - 1 2 - 0 10 20 - >>> df.equals(exactly_equal) - True - - For two DataFrames to compare equal, the types of column - values must be equal, but the types of column labels - need not: - - >>> different_column_type = cudf.DataFrame({1.0: [10], 2.0: [20]}) - >>> different_column_type - 1.0 2.0 - 0 10 20 - >>> df.equals(different_column_type) - True - """ - if self is other: - return True - if not isinstance(other, type(self)) or len(self) != len(other): - return False - - return all( - self_col.equals(other_col, check_dtypes=True) - for self_col, other_col in zip(self._columns, other._columns) - ) - - @_performance_tracking - def _get_columns_by_label(self, labels) -> Self: - """ - Returns columns of the Frame specified by `labels`. - - Akin to cudf.DataFrame(...).loc[:, labels] - """ - return self._from_data_like_self(self._data.select_by_label(labels)) - - @property - @_performance_tracking - def values(self) -> cupy.ndarray: - """ - Return a CuPy representation of the DataFrame. - - Only the values in the DataFrame will be returned, the axes labels will - be removed. - - Returns - ------- - cupy.ndarray - The values of the DataFrame. - """ - return self.to_cupy() - - @property - @_performance_tracking - def values_host(self) -> np.ndarray: - """ - Return a NumPy representation of the data. - - Only the values in the DataFrame will be returned, the axes labels will - be removed. - - Returns - ------- - numpy.ndarray - A host representation of the underlying data. - """ - return self.to_numpy() - - @_performance_tracking - def __array__(self, dtype=None, copy=None): - raise TypeError( - "Implicit conversion to a host NumPy array via __array__ is not " - "allowed, To explicitly construct a GPU matrix, consider using " - ".to_cupy()\nTo explicitly construct a host matrix, consider " - "using .to_numpy()." - ) - - @_performance_tracking - def __arrow_array__(self, type=None): - raise TypeError( - "Implicit conversion to a host PyArrow object via __arrow_array__ " - "is not allowed. Consider using .to_arrow()" - ) - - @_performance_tracking - def _to_array( - self, - get_array: abc.Callable, - module: ModuleType, - copy: bool, - dtype: Dtype | None = None, - na_value=None, - ) -> cupy.ndarray | numpy.ndarray: - # Internal function to implement to_cupy and to_numpy, which are nearly - # identical except for the attribute they access to generate values. 
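to_cupy and to_numpy both funnel through _to_array; the practical difference is whether the values stay on the device (CuPy) or are copied to the host (NumPy), which is also why implicit __array__ conversion is disallowed. A short sketch, assuming a GPU-backed cudf environment:

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
gpu_matrix = df.to_cupy()    # cupy.ndarray, data stays on the GPU
host_matrix = df.to_numpy()  # numpy.ndarray, always a host copy
print(gpu_matrix.shape, host_matrix.dtype)  # (3, 2) int64

# np.asarray(df) raises TypeError by design; use to_numpy() instead.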
- - def to_array( - col: ColumnBase, dtype: np.dtype - ) -> cupy.ndarray | numpy.ndarray: - if na_value is not None: - col = col.fillna(na_value) - array = get_array(col) - casted_array = module.asarray(array, dtype=dtype) - if copy and casted_array is array: - # Don't double copy after asarray - casted_array = casted_array.copy() - return casted_array - - ncol = self._num_columns - if ncol == 0: - return module.empty( - shape=(len(self), ncol), - dtype=numpy.dtype("float64"), - order="F", - ) - - if dtype is None: - if ncol == 1: - dtype = next(self._dtypes)[1] - else: - dtype = find_common_type([dtype for _, dtype in self._dtypes]) - - if not isinstance(dtype, numpy.dtype): - raise NotImplementedError( - f"{dtype} cannot be exposed as an array" - ) - - if self.ndim == 1: - return to_array(self._columns[0], dtype) - else: - matrix = module.empty( - shape=(len(self), ncol), dtype=dtype, order="F" - ) - for i, col in enumerate(self._columns): - # TODO: col.values may fail if there is nullable data or an - # unsupported dtype. We may want to catch and provide a more - # suitable error. - matrix[:, i] = to_array(col, dtype) - return matrix - - # TODO: As of now, calling cupy.asarray is _much_ faster than calling - # to_cupy. We should investigate the reasons why and whether we can provide - # a more efficient method here by exploiting __cuda_array_interface__. In - # particular, we need to benchmark how much of the overhead is coming from - # (potentially unavoidable) local copies in to_cupy and how much comes from - # inefficiencies in the implementation. - @_performance_tracking - def to_cupy( - self, - dtype: Dtype | None = None, - copy: bool = False, - na_value=None, - ) -> cupy.ndarray: - """Convert the Frame to a CuPy array. - - Parameters - ---------- - dtype : str or :class:`numpy.dtype`, optional - The dtype to pass to :func:`numpy.asarray`. - copy : bool, default False - Whether to ensure that the returned value is not a view on - another array. Note that ``copy=False`` does not *ensure* that - ``to_cupy()`` is no-copy. Rather, ``copy=True`` ensure that - a copy is made, even if not strictly necessary. - na_value : Any, default None - The value to use for missing values. The default value depends on - dtype and the dtypes of the DataFrame columns. - - Returns - ------- - cupy.ndarray - """ - return self._to_array( - lambda col: col.values, - cupy, - copy, - dtype, - na_value, - ) - - @_performance_tracking - def to_numpy( - self, - dtype: Dtype | None = None, - copy: bool = True, - na_value=None, - ) -> numpy.ndarray: - """Convert the Frame to a NumPy array. - - Parameters - ---------- - dtype : str or :class:`numpy.dtype`, optional - The dtype to pass to :func:`numpy.asarray`. - copy : bool, default True - Whether to ensure that the returned value is not a view on - another array. This parameter must be ``True`` since cuDF must copy - device memory to host to provide a numpy array. - na_value : Any, default None - The value to use for missing values. The default value depends on - dtype and the dtypes of the DataFrame columns. - - Returns - ------- - numpy.ndarray - """ - if not copy: - raise ValueError( - "copy=False is not supported because conversion to a numpy " - "array always copies the data." - ) - - return self._to_array( - lambda col: col.values_host, numpy, copy, dtype, na_value - ) - - @_performance_tracking - def where(self, cond, other=None, inplace: bool = False) -> Self | None: - """ - Replace values where the condition is False. 
- - Parameters - ---------- - cond : bool Series/DataFrame, array-like - Where cond is True, keep the original value. - Where False, replace with corresponding value from other. - Callables are not supported. - other: scalar, list of scalars, Series/DataFrame - Entries where cond is False are replaced with - corresponding value from other. Callables are not - supported. Default is None. - - DataFrame expects only Scalar or array like with scalars or - dataframe with same dimension as self. - - Series expects only scalar or series like with same length - inplace : bool, default False - Whether to perform the operation in place on the data. - - Returns - ------- - Same type as caller - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]}) - >>> df.where(df % 2 == 0, [-1, -1]) - A B - 0 -1 -1 - 1 4 -1 - 2 -1 8 - - >>> ser = cudf.Series([4, 3, 2, 1, 0]) - >>> ser.where(ser > 2, 10) - 0 4 - 1 3 - 2 10 - 3 10 - 4 10 - dtype: int64 - >>> ser.where(ser > 2) - 0 4 - 1 3 - 2 - 3 - 4 - dtype: int64 - - .. pandas-compat:: - :meth:`pandas.DataFrame.where`, :meth:`pandas.Series.where` - - Note that ``where`` treats missing values as falsy, - in parallel with pandas treatment of nullable data: - - >>> gsr = cudf.Series([1, 2, 3]) - >>> gsr.where([True, False, cudf.NA]) - 0 1 - 1 - 2 - dtype: int64 - >>> gsr.where([True, False, False]) - 0 1 - 1 - 2 - dtype: int64 - """ - raise NotImplementedError - - @_performance_tracking - def fillna( - self, - value: None | ScalarLike | cudf.Series = None, - method: Literal["ffill", "bfill", "pad", "backfill", None] = None, - axis=None, - inplace: bool = False, - limit=None, - ) -> Self | None: - """Fill null values with ``value`` or specified ``method``. - - Parameters - ---------- - value : scalar, Series-like or dict - Value to use to fill nulls. If Series-like, null values - are filled with values in corresponding indices. - A dict can be used to provide different values to fill nulls - in different columns. Cannot be used with ``method``. - method : {'ffill', 'bfill'}, default None - Method to use for filling null values in the dataframe or series. - `ffill` propagates the last non-null values forward to the next - non-null value. `bfill` propagates backward with the next non-null - value. Cannot be used with ``value``. - - .. deprecated:: 24.04 - `method` is deprecated. - - Returns - ------- - result : DataFrame, Series, or Index - Copy with nulls filled. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, None], 'b': [3, None, 5]}) - >>> df - a b - 0 1 3 - 1 2 - 2 5 - >>> df.fillna(4) - a b - 0 1 3 - 1 2 4 - 2 4 5 - >>> df.fillna({'a': 3, 'b': 4}) - a b - 0 1 3 - 1 2 4 - 2 3 5 - - ``fillna`` on a Series object: - - >>> ser = cudf.Series(['a', 'b', None, 'c']) - >>> ser - 0 a - 1 b - 2 - 3 c - dtype: object - >>> ser.fillna('z') - 0 a - 1 b - 2 z - 3 c - dtype: object - - ``fillna`` can also supports inplace operation: - - >>> ser.fillna('z', inplace=True) - >>> ser - 0 a - 1 b - 2 z - 3 c - dtype: object - >>> df.fillna({'a': 3, 'b': 4}, inplace=True) - >>> df - a b - 0 1 3 - 1 2 4 - 2 3 5 - - ``fillna`` specified with fill ``method`` - - >>> ser = cudf.Series([1, None, None, 2, 3, None, None]) - >>> ser.fillna(method='ffill') - 0 1 - 1 1 - 2 1 - 3 2 - 4 3 - 5 3 - 6 3 - dtype: int64 - >>> ser.fillna(method='bfill') - 0 1 - 1 2 - 2 2 - 3 2 - 4 3 - 5 - 6 - dtype: int64 - """ - if limit is not None: - raise NotImplementedError("The limit keyword is not supported") - if axis: - raise NotImplementedError("The axis keyword is not supported") - - if value is not None and method is not None: - raise ValueError("Cannot specify both 'value' and 'method'.") - - if method: - # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.warn( - f"{type(self).__name__}.fillna with 'method' is " - "deprecated and will raise in a future version. " - "Use obj.ffill() or obj.bfill() instead.", - FutureWarning, - ) - if method not in {"ffill", "bfill", "pad", "backfill"}: - raise NotImplementedError( - f"Fill method {method} is not supported" - ) - if method == "pad": - method = "ffill" - elif method == "backfill": - method = "bfill" - - if is_scalar(value): - value = {name: value for name in self._column_names} - elif not isinstance(value, (abc.Mapping, cudf.Series)): - raise TypeError( - f'"value" parameter must be a scalar, dict ' - f"or Series, but you passed a " - f'"{type(value).__name__}"' - ) - - filled_columns = [ - col.fillna(value[name], method) if name in value else col.copy() - for name, col in self._column_labels_and_values - ] - - return self._mimic_inplace( - self._from_data_like_self( - self._data._from_columns_like_self( - filled_columns, verify=False - ) - ), - inplace=inplace, - ) - - @_performance_tracking - def _drop_column( - self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" - ) -> None: - """Drop a column by *name* inplace.""" - try: - del self._data[name] - except KeyError as err: - if errors != "ignore": - raise KeyError(f"column '{name}' does not exist") from err - - @_performance_tracking - def _quantile_table( - self, - q: float, - interpolation: Literal[ - "LINEAR", "LOWER", "HIGHER", "MIDPOINT", "NEAREST" - ] = "LINEAR", - is_sorted: bool = False, - column_order=(), - null_precedence=(), - ): - interpolation = libcudf.types.Interpolation[interpolation] - - is_sorted = libcudf.types.Sorted["YES" if is_sorted else "NO"] - - column_order = [libcudf.types.Order[key] for key in column_order] - - null_precedence = [ - libcudf.types.NullOrder[key] for key in null_precedence - ] - - return self._from_columns_like_self( - libcudf.quantiles.quantile_table( - [*self._columns], - q, - interpolation, - is_sorted, - column_order, - null_precedence, - ), - column_names=self._column_names, - ) - - @classmethod - @_performance_tracking - def from_arrow(cls, data: pa.Table) -> Self: - """Convert from PyArrow Table 
to Frame - - Parameters - ---------- - data : PyArrow Table - - Raises - ------ - TypeError for invalid input type. - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> data = pa.table({"a":[1, 2, 3], "b":[4, 5, 6]}) - >>> cudf.core.frame.Frame.from_arrow(data) - a b - 0 1 4 - 1 2 5 - 2 3 6 - """ - - if not isinstance(data, (pa.Table)): - raise TypeError( - "To create a multicolumn cudf data, " - "the data should be an arrow Table" - ) - - column_names = data.column_names - pandas_dtypes = {} - np_dtypes = {} - if isinstance(data.schema.pandas_metadata, dict): - metadata = data.schema.pandas_metadata - pandas_dtypes = { - col["field_name"]: col["pandas_type"] - for col in metadata["columns"] - if "field_name" in col - } - np_dtypes = { - col["field_name"]: col["numpy_type"] - for col in metadata["columns"] - if "field_name" in col - } - - # Currently we don't have support for - # pyarrow.DictionaryArray -> cudf Categorical column, - # so handling indices and dictionary as two different columns. - # This needs be removed once we have hooked libcudf dictionary32 - # with categorical. - if any( - isinstance(x.type, pa.DictionaryType) - and isinstance(x, pa.ChunkedArray) - for x in data - ): - data = data.combine_chunks() - - dict_indices = {} - dict_dictionaries = {} - dict_ordered = {} - for field in data.schema: - if isinstance(field.type, pa.DictionaryType): - dict_ordered[field.name] = field.type.ordered - dict_indices[field.name] = pa.chunked_array( - [chunk.indices for chunk in data[field.name].chunks], - type=field.type.index_type, - ) - dict_dictionaries[field.name] = pa.chunked_array( - [chunk.dictionary for chunk in data[field.name].chunks], - type=field.type.value_type, - ) - - # Handle dict arrays - cudf_category_frame = {} - if len(dict_indices): - dict_indices_table = pa.table(dict_indices) - data = data.drop(dict_indices_table.column_names) - indices_columns = libcudf.interop.from_arrow(dict_indices_table) - # as dictionary size can vary, it can't be a single table - cudf_dictionaries_columns = { - name: ColumnBase.from_arrow(dict_dictionaries[name]) - for name in dict_dictionaries.keys() - } - - for name, codes in zip( - dict_indices_table.column_names, indices_columns - ): - categories = cudf_dictionaries_columns[name] - codes = as_unsigned_codes(len(categories), codes) - cudf_category_frame[name] = CategoricalColumn( - data=None, - size=codes.size, - dtype=cudf.CategoricalDtype( - categories=categories, - ordered=dict_ordered[name], - ), - mask=codes.base_mask, - children=(codes,), - ) - - # Handle non-dict arrays - cudf_non_category_frame = { - name: col - for name, col in zip( - data.column_names, libcudf.interop.from_arrow(data) - ) - } - - result = {**cudf_non_category_frame, **cudf_category_frame} - - # There are some special cases that need to be handled - # based on metadata. - for name in result: - if ( - len(result[name]) == 0 - and pandas_dtypes.get(name) == "categorical" - ): - # When pandas_dtype is a categorical column and the size - # of column is 0 (i.e., empty) then we will have an - # int8 column in result._data[name] returned by libcudf, - # which needs to be type-casted to 'category' dtype. - result[name] = result[name].astype("category") - elif ( - pandas_dtypes.get(name) == "empty" - and np_dtypes.get(name) == "object" - ): - # When a string column has all null values, pandas_dtype is - # is specified as 'empty' and np_dtypes as 'object', - # hence handling this special case to type-cast the empty - # float column to str column. 
- result[name] = result[name].astype(cudf.dtype("str")) - elif name in data.column_names and isinstance( - data[name].type, - ( - pa.StructType, - pa.ListType, - pa.Decimal128Type, - pa.TimestampType, - ), - ): - # In case of struct column, libcudf is not aware of names of - # struct fields, hence renaming the struct fields is - # necessary by extracting the field names from arrow - # struct types. - - # In case of decimal column, libcudf is not aware of the - # decimal precision. - - # In case of list column, there is a possibility of nested - # list columns to have struct or decimal columns inside them. - - # Datetimes ("timestamps") may need timezone metadata - # attached to them, as libcudf is timezone-unaware - - # All of these cases are handled by calling the - # _with_type_metadata method on the column. - result[name] = result[name]._with_type_metadata( - cudf.utils.dtypes.cudf_dtype_from_pa_type(data[name].type) - ) - - return cls._from_data({name: result[name] for name in column_names}) - - @_performance_tracking - def to_arrow(self): - """ - Convert to arrow Table - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame( - ... {"a":[1, 2, 3], "b":[4, 5, 6]}, index=[1, 2, 3]) - >>> df.to_arrow() - pyarrow.Table - a: int64 - b: int64 - index: int64 - ---- - a: [[1,2,3]] - b: [[4,5,6]] - index: [[1,2,3]] - """ - return pa.Table.from_pydict( - { - str(name): col.to_arrow() - for name, col in self._column_labels_and_values - } - ) - - @_performance_tracking - def _positions_from_column_names(self, column_names) -> list[int]: - """Map each column name into their positions in the frame. - - The order of indices returned corresponds to the column order in this - Frame. - """ - return [ - i - for i, name in enumerate(self._column_names) - if name in set(column_names) - ] - - @_performance_tracking - def _copy_type_metadata(self: Self, other: Self) -> Self: - """ - Copy type metadata from each column of `other` to the corresponding - column of `self`. - - See `ColumnBase._with_type_metadata` for more information. - """ - for (name, col), (_, dtype) in zip( - self._column_labels_and_values, other._dtypes - ): - self._data.set_by_label(name, col._with_type_metadata(dtype)) - - return self - - @_performance_tracking - def isna(self): - """ - Identify missing values. - - Return a boolean same-sized object indicating if - the values are ````. ```` values gets mapped to - ``True`` values. Everything else gets mapped to - ``False`` values. ```` values include: - - * Values where null mask is set. - * ``NaN`` in float dtype. - * ``NaT`` in datetime64 and timedelta64 types. - - Characters such as empty strings ``''`` or - ``inf`` in case of float are not - considered ```` values. - - Returns - ------- - DataFrame/Series/Index - Mask of bool values for each element in - the object that indicates whether an element is an NA value. - - Examples - -------- - Show which entries in a DataFrame are NA. - - >>> import cudf - >>> import numpy as np - >>> import pandas as pd - >>> df = cudf.DataFrame({'age': [5, 6, np.nan], - ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], - ... 'name': ['Alfred', 'Batman', ''], - ... 'toy': [None, 'Batmobile', 'Joker']}) - >>> df - age born name toy - 0 5 Alfred - 1 6 1939-05-27 00:00:00.000000 Batman Batmobile - 2 1940-04-25 00:00:00.000000 Joker - >>> df.isna() - age born name toy - 0 False True False True - 1 False False False False - 2 True False False False - - Show which entries in a Series are NA. 
- - >>> ser = cudf.Series([5, 6, np.nan, np.inf, -np.inf]) - >>> ser - 0 5.0 - 1 6.0 - 2 - 3 Inf - 4 -Inf - dtype: float64 - >>> ser.isna() - 0 False - 1 False - 2 True - 3 False - 4 False - dtype: bool - - Show which entries in an Index are NA. - - >>> idx = cudf.Index([1, 2, None, np.nan, 0.32, np.inf]) - >>> idx - Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') - >>> idx.isna() - array([False, False, True, True, False, False]) - """ - data_columns = (col.isnull() for col in self._columns) - return self._from_data_like_self( - self._data._from_columns_like_self(data_columns) - ) - - # Alias for isna - isnull = isna - - @_performance_tracking - def notna(self): - """ - Identify non-missing values. - - Return a boolean same-sized object indicating if - the values are not ````. Non-missing values get - mapped to ``True``. ```` values get mapped to - ``False`` values. ```` values include: - - * Values where null mask is set. - * ``NaN`` in float dtype. - * ``NaT`` in datetime64 and timedelta64 types. - - Characters such as empty strings ``''`` or - ``inf`` in case of float are not - considered ```` values. - - Returns - ------- - DataFrame/Series/Index - Mask of bool values for each element in - the object that indicates whether an element is not an NA value. - - Examples - -------- - Show which entries in a DataFrame are NA. - - >>> import cudf - >>> import numpy as np - >>> import pandas as pd - >>> df = cudf.DataFrame({'age': [5, 6, np.nan], - ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), - ... pd.Timestamp('1940-04-25')], - ... 'name': ['Alfred', 'Batman', ''], - ... 'toy': [None, 'Batmobile', 'Joker']}) - >>> df - age born name toy - 0 5 Alfred - 1 6 1939-05-27 00:00:00.000000 Batman Batmobile - 2 1940-04-25 00:00:00.000000 Joker - >>> df.notna() - age born name toy - 0 True False True False - 1 True True True True - 2 False True True True - - Show which entries in a Series are NA. - - >>> ser = cudf.Series([5, 6, np.nan, np.inf, -np.inf]) - >>> ser - 0 5.0 - 1 6.0 - 2 - 3 Inf - 4 -Inf - dtype: float64 - >>> ser.notna() - 0 True - 1 True - 2 False - 3 True - 4 True - dtype: bool - - Show which entries in an Index are NA. - - >>> idx = cudf.Index([1, 2, None, np.nan, 0.32, np.inf]) - >>> idx - Index([1.0, 2.0, , , 0.32, Inf], dtype='float64') - >>> idx.notna() - array([ True, True, False, False, True, True]) - """ - data_columns = (col.notnull() for col in self._columns) - return self._from_data_like_self( - self._data._from_columns_like_self(data_columns) - ) - - # Alias for notna - notnull = notna - - @_performance_tracking - def searchsorted( - self, - values, - side: Literal["left", "right"] = "left", - sorter=None, - ascending: bool = True, - na_position: Literal["first", "last"] = "last", - ) -> ScalarLike | cupy.ndarray: - """Find indices where elements should be inserted to maintain order - - Parameters - ---------- - value : Frame (Shape must be consistent with self) - Values to be hypothetically inserted into Self - side : str {'left', 'right'} optional, default 'left' - If 'left', the index of the first suitable location found is given - If 'right', return the last such index - sorter : 1-D array-like, optional - Optional array of integer indices that sort `self` into ascending - order. They are typically the result of ``np.argsort``. - Currently not supported. 
- ascending : bool optional, default True - Sorted Frame is in ascending order (otherwise descending) - na_position : str {'last', 'first'} optional, default 'last' - Position of null values in sorted order - - Returns - ------- - 1-D cupy array of insertion points - - Examples - -------- - >>> s = cudf.Series([1, 2, 3]) - >>> s.searchsorted(4) - 3 - >>> s.searchsorted([0, 4]) - array([0, 3], dtype=int32) - >>> s.searchsorted([1, 3], side='left') - array([0, 2], dtype=int32) - >>> s.searchsorted([1, 3], side='right') - array([1, 3], dtype=int32) - - If the values are not monotonically sorted, wrong - locations may be returned: - - >>> s = cudf.Series([2, 1, 3]) - >>> s.searchsorted(1) - 0 # wrong result, correct would be 1 - - >>> df = cudf.DataFrame({'a': [1, 3, 5, 7], 'b': [10, 12, 14, 16]}) - >>> df - a b - 0 1 10 - 1 3 12 - 2 5 14 - 3 7 16 - >>> values_df = cudf.DataFrame({'a': [0, 2, 5, 6], - ... 'b': [10, 11, 13, 15]}) - >>> values_df - a b - 0 0 10 - 1 2 17 - 2 5 13 - 3 6 15 - >>> df.searchsorted(values_df, ascending=False) - array([4, 4, 4, 0], dtype=int32) - """ - # Note: pandas.DataFrame does not support searchsorted - - if na_position not in {"first", "last"}: - raise ValueError(f"invalid na_position: {na_position}") - elif sorter is not None: - raise NotImplementedError("sorter is currently not supported.") - - scalar_flag = None - if is_scalar(values): - scalar_flag = True - - if not isinstance(values, Frame): - values = [as_column(values)] - else: - values = [*values._columns] - if len(values) != len(self._data): - raise ValueError("Mismatch number of columns to search for.") - - # TODO: Change behavior based on the decision in - # https://github.com/pandas-dev/pandas/issues/54668 - common_dtype_list = [ - find_common_type([col.dtype, val.dtype]) - for col, val in zip(self._columns, values) - ] - sources = [ - col - if is_dtype_equal(col.dtype, common_dtype) - else col.astype(common_dtype) - for col, common_dtype in zip(self._columns, common_dtype_list) - ] - values = [ - val - if is_dtype_equal(val.dtype, common_dtype) - else val.astype(common_dtype) - for val, common_dtype in zip(values, common_dtype_list) - ] - - outcol = libcudf.search.search_sorted( - sources, - values, - side, - ascending=ascending, - na_position=na_position, - ) - - # Return result as cupy array if the values is non-scalar - # If values is scalar, result is expected to be scalar. - result = cupy.asarray(outcol.data_array_view(mode="read")) - if scalar_flag: - return result[0].item() - else: - return result - - @_performance_tracking - def argsort( - self, - by=None, - axis=0, - kind="quicksort", - order=None, - ascending=True, - na_position="last", - ) -> cupy.ndarray: - """Return the integer indices that would sort the Series values. - - Parameters - ---------- - by : str or list of str, default None - Name or list of names to sort by. If None, sort by all columns. - axis : {0 or "index"} - Has no effect but is accepted for compatibility with numpy. - kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' - Choice of sorting algorithm. See :func:`numpy.sort` for more - information. 'mergesort' and 'stable' are the only stable - algorithms. Only quicksort is supported in cuDF. - order : None - Has no effect but is accepted for compatibility with numpy. - ascending : bool or list of bool, default True - If True, sort values in ascending order, otherwise descending. 
- na_position : {'first' or 'last'}, default 'last' - Argument 'first' puts NaNs at the beginning, 'last' puts NaNs - at the end. - - Returns - ------- - cupy.ndarray: The indices sorted based on input. - - Examples - -------- - **Series** - - >>> import cudf - >>> s = cudf.Series([3, 1, 2]) - >>> s - 0 3 - 1 1 - 2 2 - dtype: int64 - >>> s.argsort() - 0 1 - 1 2 - 2 0 - dtype: int32 - >>> s[s.argsort()] - 1 1 - 2 2 - 0 3 - dtype: int64 - - **DataFrame** - >>> import cudf - >>> df = cudf.DataFrame({'foo': [3, 1, 2]}) - >>> df.argsort() - array([1, 2, 0], dtype=int32) - - **Index** - >>> import cudf - >>> idx = cudf.Index([3, 1, 2]) - >>> idx.argsort() - array([1, 2, 0], dtype=int32) - """ # noqa: E501 - if na_position not in {"first", "last"}: - raise ValueError(f"invalid na_position: {na_position}") - if kind != "quicksort": - if kind not in {"mergesort", "heapsort", "stable"}: - raise AttributeError( - f"{kind} is not a valid sorting algorithm for " - f"'DataFrame' object" - ) - warnings.warn( - f"GPU-accelerated {kind} is currently not supported, " - "defaulting to quicksort." - ) - - if isinstance(by, str): - by = [by] - return self._get_sorted_inds( - by=by, ascending=ascending, na_position=na_position - ).values - - @_performance_tracking - def _get_sorted_inds( - self, - by=None, - ascending=True, - na_position: Literal["first", "last"] = "last", - ) -> ColumnBase: - """ - Get the indices required to sort self according to the columns - specified in by. - """ - if by is None: - to_sort = self._columns - else: - to_sort = self._get_columns_by_label(list(by))._columns - - if is_scalar(ascending): - ascending_lst = [ascending] * len(to_sort) - else: - ascending_lst = list(ascending) - - return libcudf.sort.order_by( - list(to_sort), - ascending_lst, - na_position, - stable=True, - ) - - @_performance_tracking - def _split(self, splits): - """Split a frame with split points in ``splits``. Returns a list of - Frames of length `len(splits) + 1`. - """ - return [ - self._from_columns_like_self( - libcudf.copying.columns_split(list(self._columns), splits)[ - split_idx - ], - self._column_names, - ) - for split_idx in range(len(splits) + 1) - ] - - @_performance_tracking - def _encode(self): - columns, indices = libcudf.transform.table_encode(list(self._columns)) - keys = self._from_columns_like_self(columns) - return keys, indices - - @_performance_tracking - def _unaryop(self, op): - data_columns = (col.unary_operator(op) for col in self._columns) - return self._from_data_like_self( - self._data._from_columns_like_self(data_columns) - ) - - @classmethod - @_performance_tracking - def _colwise_binop( - cls, - operands: dict[str | None, tuple[ColumnBase, Any, bool, Any]], - fn: str, - ): - """Implement binary ops between two frame-like objects. - - Binary operations for Frames can be reduced to a sequence of binary - operations between column-like objects. Different types of frames need - to preprocess different inputs, so subclasses should implement binary - operations as a preprocessing step that calls this method. - - Parameters - ---------- - operands : Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]] - A mapping from column names to a tuple containing left and right - operands as well as a boolean indicating whether or not to reflect - an operation and fill value for nulls. - fn : str - The operation to perform. - - Returns - ------- - Dict[ColumnBase] - A dict of columns constructed from the result of performing the - requested operation on the operands. 
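The fill-value handling described above is the machinery behind the user-facing ``fill_value`` argument of the binary operators: a position that is null in only one operand is filled before the op, while a position that is null in both operands stays null. A minimal sketch of that behavior (the data and printed result are illustrative, not taken from the original docstrings):

    import cudf

    a = cudf.Series([1, None, 3, None])
    b = cudf.Series([10, 20, None, None])

    # Null in exactly one operand -> filled with 0 before adding;
    # null in both operands -> the result stays null.
    print(a.add(b, fill_value=0))
    # Expected (approximate repr):
    # 0      11
    # 1      20
    # 2       3
    # 3    <NA>
    # dtype: int64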
- """ - # Now actually perform the binop on the columns in left and right. - output = {} - for ( - col, - (left_column, right_column, reflect, fill_value), - ) in operands.items(): - output_mask = None - if fill_value is not None: - left_is_column = isinstance(left_column, ColumnBase) - right_is_column = isinstance(right_column, ColumnBase) - - if left_is_column and right_is_column: - # If both columns are nullable, pandas semantics dictate - # that nulls that are present in both left_column and - # right_column are not filled. - if left_column.nullable and right_column.nullable: - with acquire_spill_lock(): - lmask = as_column(left_column.nullmask) - rmask = as_column(right_column.nullmask) - output_mask = (lmask | rmask).data - left_column = left_column.fillna(fill_value) - right_column = right_column.fillna(fill_value) - elif left_column.nullable: - left_column = left_column.fillna(fill_value) - elif right_column.nullable: - right_column = right_column.fillna(fill_value) - elif left_is_column: - if left_column.nullable: - left_column = left_column.fillna(fill_value) - elif right_is_column: - if right_column.nullable: - right_column = right_column.fillna(fill_value) - else: - assert False, "At least one operand must be a column." - - # TODO: Disable logical and binary operators between columns that - # are not numerical using the new binops mixin. - - outcol = ( - getattr(operator, fn)(right_column, left_column) - if reflect - else getattr(operator, fn)(left_column, right_column) - ) - - if output_mask is not None: - outcol = outcol.set_mask(output_mask) - - output[col] = outcol - - return output - - @_performance_tracking - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - return _array_ufunc(self, ufunc, method, inputs, kwargs) - - @_performance_tracking - @acquire_spill_lock() - def _apply_cupy_ufunc_to_operands( - self, ufunc, cupy_func, operands, **kwargs - ) -> list[dict[Any, ColumnBase]]: - # Note: There are some operations that may be supported by libcudf but - # are not supported by pandas APIs. In particular, libcudf binary - # operations support logical and/or operations as well as - # trigonometric, but those operations are not defined on - # pd.Series/DataFrame. For now those operations will dispatch to cupy, - # but if ufuncs are ever a bottleneck we could add special handling to - # dispatch those (or any other) functions that we could implement - # without cupy. - - mask = None - data: list[dict[Any, ColumnBase]] = [{} for _ in range(ufunc.nout)] - for name, (left, right, _, _) in operands.items(): - cupy_inputs = [] - for inp in (left, right) if ufunc.nin == 2 else (left,): - if isinstance(inp, ColumnBase) and inp.has_nulls(): - new_mask = as_column(inp.nullmask) - - # TODO: This is a hackish way to perform a bitwise and - # of bitmasks. Once we expose - # cudf::detail::bitwise_and, then we can use that - # instead. - mask = new_mask if mask is None else (mask & new_mask) - - # Arbitrarily fill with zeros. For ufuncs, we assume - # that the end result propagates nulls via a bitwise - # and, so these elements are irrelevant. 
- inp = inp.fillna(0) - cupy_inputs.append(cupy.asarray(inp)) - - cp_output = cupy_func(*cupy_inputs, **kwargs) - if ufunc.nout == 1: - cp_output = (cp_output,) - for i, out in enumerate(cp_output): - data[i][name] = as_column(out).set_mask(mask) - return data - - # Unary logical operators - @_performance_tracking - def __neg__(self): - """Negate for integral dtypes, logical NOT for bools.""" - return self._from_data_like_self( - self._data._from_columns_like_self( - ( - col.unary_operator("not") - if col.dtype.kind == "b" - else -1 * col - for col in self._columns - ) - ) - ) - - @_performance_tracking - def __pos__(self): - return self.copy(deep=True) - - @_performance_tracking - def __abs__(self): - return self._unaryop("abs") - - def __bool__(self): - raise ValueError( - f"The truth value of a {type(self).__name__} is ambiguous. Use " - "a.empty, a.bool(), a.item(), a.any() or a.all()." - ) - - # Reductions - @classmethod - @_performance_tracking - def _get_axis_from_axis_arg(cls, axis): - try: - return cls._SUPPORT_AXIS_LOOKUP[axis] - except KeyError: - raise ValueError(f"No axis named {axis} for object type {cls}") - - @_performance_tracking - def _reduce(self, *args, **kwargs): - raise NotImplementedError( - f"Reductions are not supported for objects of type {type(self)}." - ) - - @_performance_tracking - def min( - self, - axis=0, - skipna=True, - numeric_only=False, - **kwargs, - ): - """ - Return the minimum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - numeric_only: bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> min_series = df.min() - >>> min_series - a 1 - b 7 - dtype: int64 - >>> min_series.min() - 1 - - .. pandas-compat:: - :meth:`pandas.DataFrame.min`, :meth:`pandas.Series.min` - - Parameters currently not supported are `level`, `numeric_only`. - """ - return self._reduce( - "min", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - @_performance_tracking - def max( - self, - axis=0, - skipna=True, - numeric_only=False, - **kwargs, - ): - """ - Return the maximum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - numeric_only: bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.max() - a 4 - b 10 - dtype: int64 - - .. pandas-compat:: - :meth:`pandas.DataFrame.max`, :meth:`pandas.Series.max` - - Parameters currently not supported are `level`, `numeric_only`. - """ - return self._reduce( - "max", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - @_performance_tracking - def all(self, axis=0, skipna=True, **kwargs): - """ - Return whether all elements are True in DataFrame. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns', None}, default 0 - Indicate which axis or axes should be reduced. 
For `Series` - this parameter is unused and defaults to `0`. - - - 0 or 'index' : reduce the index, return a Series - whose index is the original column labels. - - 1 or 'columns' : reduce the columns, return a Series - whose index is the original index. - - None : reduce all axes, return a scalar. - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `bool_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.all() - a True - b False - dtype: bool - - .. pandas-compat:: - :meth:`pandas.DataFrame.all`, :meth:`pandas.Series.all` - - Parameters currently not supported are `axis`, `bool_only`, - `level`. - """ - return self._reduce( - "all", - axis=axis, - skipna=skipna, - **kwargs, - ) - - @_performance_tracking - def any(self, axis=0, skipna=True, **kwargs): - """ - Return whether any elements is True in DataFrame. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns', None}, default 0 - Indicate which axis or axes should be reduced. For `Series` - this parameter is unused and defaults to `0`. - - - 0 or 'index' : reduce the index, return a Series - whose index is the original column labels. - - 1 or 'columns' : reduce the columns, return a Series - whose index is the original index. - - None : reduce all axes, return a scalar. - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `bool_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.any() - a True - b True - dtype: bool - - .. pandas-compat:: - :meth:`pandas.DataFrame.any`, :meth:`pandas.Series.any` - - Parameters currently not supported are `axis`, `bool_only`, - `level`. - """ - return self._reduce( - "any", - axis=axis, - skipna=skipna, - **kwargs, - ) - - @_performance_tracking - @ioutils.doc_to_dlpack() - def to_dlpack(self): - """{docstring}""" - - return cudf.io.dlpack.to_dlpack(self) - - @_performance_tracking - def __str__(self): - return repr(self) - - @_performance_tracking - def __deepcopy__(self, memo): - return self.copy(deep=True) - - @_performance_tracking - def __copy__(self): - return self.copy(deep=False) - - @_performance_tracking - def __invert__(self): - """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" - return self._from_data_like_self( - self._data._from_columns_like_self((~col for col in self._columns)) - ) - - @_performance_tracking - def nunique(self, dropna: bool = True): - """ - Returns a per column mapping with counts of unique values for - each column. - - Parameters - ---------- - dropna : bool, default True - Don't include NaN in the counts. - - Returns - ------- - dict - Name and unique value counts of each column in frame. 
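As ``__neg__`` and ``__invert__`` above show, the unary operators are dtype-dependent: ``-`` negates numeric columns but acts as a logical NOT on boolean columns, and ``~`` is a bitwise invert for integers and a logical NOT for booleans. A small illustrative sketch:

    import cudf

    ints = cudf.Series([1, 2, 3])
    flags = cudf.Series([True, False, True])

    print(-ints)    # numeric negation: -1, -2, -3
    print(~ints)    # bitwise invert:   -2, -3, -4
    print(-flags)   # logical NOT:      False, True, False
    print(~flags)   # logical NOT:      False, True, False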
- """ - raise NotImplementedError( - f"{type(self).__name__} does not implement nunique" - ) - - @staticmethod - @_performance_tracking - def _repeat( - columns: list[ColumnBase], repeats, axis=None - ) -> list[ColumnBase]: - if axis is not None: - raise NotImplementedError( - "Only axis=`None` supported at this time." - ) - - if not is_scalar(repeats): - repeats = as_column(repeats) - - return libcudf.filling.repeat(columns, repeats) - - @_performance_tracking - @_warn_no_dask_cudf - def __dask_tokenize__(self): - from dask.base import normalize_token - - return [ - type(self), - str(dict(self._dtypes)), - normalize_token(self.to_pandas()), - ] diff --git a/python/cudf/cudf/core/groupby/__init__.py b/python/cudf/cudf/core/groupby/__init__.py deleted file mode 100644 index 621edb316cf..00000000000 --- a/python/cudf/cudf/core/groupby/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.groupby.groupby import GroupBy, Grouper, NamedAgg - -__all__ = [ - "GroupBy", - "Grouper", - "NamedAgg", -] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py deleted file mode 100644 index 81b20488d8d..00000000000 --- a/python/cudf/cudf/core/groupby/groupby.py +++ /dev/null @@ -1,3339 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import copy -import itertools -import pickle -import textwrap -import warnings -from collections import abc -from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable, Literal - -import cupy as cp -import numpy as np -import pandas as pd - -import cudf -from cudf import _lib as libcudf -from cudf._lib import groupby as libgroupby -from cudf._lib.null_mask import bitmask_or -from cudf._lib.reshape import interleave_columns -from cudf._lib.sort import segmented_sort_by_key -from cudf._lib.types import size_type_dtype -from cudf.api.extensions import no_default -from cudf.api.types import is_list_like, is_numeric_dtype -from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable -from cudf.core.column.column import ColumnBase, StructDtype, as_column -from cudf.core.column_accessor import ColumnAccessor -from cudf.core.copy_types import GatherMap -from cudf.core.join._join_helpers import _match_join_keys -from cudf.core.mixins import Reducible, Scannable -from cudf.core.multiindex import MultiIndex -from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import GetAttrGetItemMixin - -if TYPE_CHECKING: - from cudf._typing import ( - AggType, - DataFrameOrSeries, - MultiColumnAggType, - ScalarLike, - ) - - -def _deprecate_collect(): - warnings.warn( - "Groupby.collect is deprecated and " - "will be removed in a future version. 
" - "Use `.agg(list)` instead.", - FutureWarning, - ) - - -# The three functions below return the quantiles [25%, 50%, 75%] -# respectively, which are called in the describe() method to output -# the summary stats of a GroupBy object -def _quantile_25(x): - return x.quantile(0.25) - - -def _quantile_50(x): - return x.quantile(0.50) - - -def _quantile_75(x): - return x.quantile(0.75) - - -def _is_row_of(chunk, obj): - return ( - isinstance(chunk, cudf.Series) - and isinstance(obj, cudf.DataFrame) - and len(chunk.index) == len(obj._column_names) - and (chunk.index.to_pandas() == pd.Index(obj._column_names)).all() - ) - - -NamedAgg = pd.NamedAgg - - -NamedAgg.__doc__ = """ -Helper for column specific aggregation with control over output column names. - -Subclass of typing.NamedTuple. - -Parameters ----------- -column : Hashable - Column label in the DataFrame to apply aggfunc. -aggfunc : function or str - Function to apply to the provided column. - -Examples --------- ->>> df = cudf.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]}) ->>> agg_a = cudf.NamedAgg(column="a", aggfunc="min") ->>> agg_1 = cudf.NamedAgg(column=1, aggfunc=lambda x: x.mean()) ->>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1) - result_a result_1 -key -1 -1 10.5 -2 1 12.0 -""" - - -groupby_doc_template = textwrap.dedent( - """Group using a mapper or by a Series of columns. - -A groupby operation involves some combination of splitting the object, -applying a function, and combining the results. This can be used to -group large amounts of data and compute operations on these groups. - -Parameters ----------- -by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. If by is a - function, it's called on each value of the object's index. - If a dict or Series is passed, the Series or dict VALUES will - be used to determine the groups (the Series' values are first - aligned; see .align() method). If an cupy array is passed, the - values are used as-is determine the groups. A label or list - of labels may be passed to group by the columns in self. - Notice that a tuple is interpreted as a (single) key. -level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. -as_index : bool, default True - For aggregated output, return object with group labels as - the index. Only relevant for DataFrame input. - as_index=False is effectively "SQL-style" grouped output. -sort : bool, default False - Sort result by group key. Differ from Pandas, cudf defaults to - ``False`` for better performance. Note this does not influence - the order of observations within each group. Groupby preserves - the order of rows within each group. -group_keys : bool, optional - When calling apply and the ``by`` argument produces a like-indexed - result, add group keys to index to identify pieces. By default group - keys are not included when the result's index (and column) labels match - the inputs, and are included otherwise. This argument has no effect if - the result produced is not like-indexed with respect to the input. -{ret} -Examples --------- -**Series** - ->>> ser = cudf.Series([390., 350., 30., 20.], -... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], -... 
name="Max Speed") ->>> ser -Falcon 390.0 -Falcon 350.0 -Parrot 30.0 -Parrot 20.0 -Name: Max Speed, dtype: float64 ->>> ser.groupby(level=0, sort=True).mean() -Falcon 370.0 -Parrot 25.0 -Name: Max Speed, dtype: float64 ->>> ser.groupby(ser > 100, sort=True).mean() -Max Speed -False 25.0 -True 370.0 -Name: Max Speed, dtype: float64 - -**DataFrame** - ->>> import cudf ->>> import pandas as pd ->>> df = cudf.DataFrame({{ -... 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'], -... 'Max Speed': [380., 370., 24., 26.], -... }}) ->>> df - Animal Max Speed -0 Falcon 380.0 -1 Falcon 370.0 -2 Parrot 24.0 -3 Parrot 26.0 ->>> df.groupby(['Animal'], sort=True).mean() - Max Speed -Animal -Falcon 375.0 -Parrot 25.0 - ->>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], -... ['Captive', 'Wild', 'Captive', 'Wild']] ->>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) ->>> df = cudf.DataFrame({{'Max Speed': [390., 350., 30., 20.]}}, -... index=index) ->>> df - Max Speed -Animal Type -Falcon Captive 390.0 - Wild 350.0 -Parrot Captive 30.0 - Wild 20.0 ->>> df.groupby(level=0, sort=True).mean() - Max Speed -Animal -Falcon 370.0 -Parrot 25.0 ->>> df.groupby(level="Type", sort=True).mean() - Max Speed -Type -Captive 210.0 -Wild 185.0 - ->>> df = cudf.DataFrame({{'A': 'a a b'.split(), -... 'B': [1,2,3], -... 'C': [4,6,5]}}) ->>> g1 = df.groupby('A', group_keys=False, sort=True) ->>> g2 = df.groupby('A', group_keys=True, sort=True) - -Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only -differ in their ``group_keys`` argument. Calling `apply` in various ways, -we can get different grouping results: - ->>> g1[['B', 'C']].apply(lambda x: x / x.sum()) - B C -0 0.333333 0.4 -1 0.666667 0.6 -2 1.000000 1.0 - -In the above, the groups are not part of the index. We can have them included -by using ``g2`` where ``group_keys=True``: - ->>> g2[['B', 'C']].apply(lambda x: x / x.sum()) - B C -A -a 0 0.333333 0.4 - 1 0.666667 0.6 -b 2 1.000000 1.0 -""" -) - - -class GroupBy(Serializable, Reducible, Scannable): - obj: "cudf.core.indexed_frame.IndexedFrame" - - _VALID_REDUCTIONS = { - "sum", - "prod", - "idxmin", - "idxmax", - "min", - "max", - "mean", - "median", - "nunique", - "first", - "last", - "var", - "std", - } - - _VALID_SCANS = { - "cumsum", - "cummin", - "cummax", - } - - # Necessary because the function names don't directly map to the docs. - _SCAN_DOCSTRINGS = { - "cumsum": {"op_name": "Cumulative sum"}, - "cummin": {"op_name": "Cumulative min"}, - "cummax": {"op_name": "Cumulative max"}, - } - - _MAX_GROUPS_BEFORE_WARN = 100 - - def __init__( - self, - obj, - by=None, - level=None, - sort=False, - as_index=True, - dropna=True, - group_keys=True, - ): - """ - Group a DataFrame or Series by a set of columns. - - Parameters - ---------- - by : optional - Specifies the grouping columns. Can be any of the following: - - A Python function called on each value of the object's index - - A dict or Series that maps index labels to group names - - A cudf.Index object - - A str indicating a column name - - An array of the same length as the object - - A Grouper object - - A list of the above - level : int, level_name or list, optional - For objects with a MultiIndex, `level` can be used to specify - grouping by one or more levels of the MultiIndex. - sort : bool, default False - Sort the result by group keys. Differ from Pandas, cudf defaults - to False for better performance. 
- as_index : bool, optional - If as_index=True (default), the group names appear - as the keys of the resulting DataFrame. - If as_index=False, the groups are returned as ordinary - columns of the resulting DataFrame, *if they are named columns*. - dropna : bool, optional - If True (default), do not include the "null" group. - """ - self.obj = obj - self._as_index = as_index - self._by = by.copy(deep=True) if isinstance(by, _Grouping) else by - self._level = level - self._sort = sort - self._dropna = dropna - self._group_keys = group_keys - - if isinstance(self._by, _Grouping): - self._by._obj = self.obj - self.grouping = self._by - else: - self.grouping = _Grouping(obj, self._by, level) - - def __iter__(self): - group_names, offsets, _, grouped_values = self._grouped() - if isinstance(group_names, cudf.BaseIndex): - group_names = group_names.to_pandas() - for i, name in enumerate(group_names): - yield ( - (name,) - if isinstance(self._by, list) and len(self._by) == 1 - else name, - grouped_values[offsets[i] : offsets[i + 1]], - ) - - def __len__(self) -> int: - return self.ngroups - - @property - def ngroups(self) -> int: - _, offsets, _, _ = self._grouped() - return len(offsets) - 1 - - @property - def ndim(self) -> int: - return self.obj.ndim - - @property - def dtypes(self): - """ - Return the dtypes in this group. - - .. deprecated:: 24.04 - Use `.dtypes` on base object instead. - - Returns - ------- - pandas.DataFrame - The data type of each column of the group. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 3], 'b': ['x', 'y', 'z', 'a'], - ... 'c':[10, 11, 12, 12]}) - >>> df.groupby("a").dtypes - a b c - a - 1 int64 object int64 - 2 int64 object int64 - 3 int64 object int64 - """ - warnings.warn( - f"{type(self).__name__}.dtypes is deprecated and will be " - "removed in a future version. Check the dtypes on the " - "base object instead", - FutureWarning, - ) - index = self.grouping.keys.unique().sort_values().to_pandas() - return pd.DataFrame( - {name: [dtype] * len(index) for name, dtype in self.obj._dtypes}, - index=index, - ) - - @cached_property - def groups(self): - """ - Returns a dictionary mapping group keys to row labels. - """ - group_names, offsets, _, grouped_values = self._grouped() - grouped_index = grouped_values.index - - if len(group_names) > self._MAX_GROUPS_BEFORE_WARN: - warnings.warn( - f"GroupBy.groups() performance scales poorly with " - f"number of groups. Got {len(group_names)} groups." - ) - - return dict( - zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1])) - ) - - @cached_property - def indices(self) -> dict[ScalarLike, cp.ndarray]: - """ - Dict {group name -> group indices}. 
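The iteration and inspection helpers above (``__iter__``, ``ngroups``, ``groups``, ``indices``) can be combined with the ``sort`` behavior noted in the constructor docstring; a short sketch with hypothetical data (printed values are illustrative):

    import cudf

    df = cudf.DataFrame({"key": ["b", "a", "b", "a"], "val": [1, 2, 3, 4]})
    gb = df.groupby("key", sort=True)   # cudf defaults to sort=False, unlike pandas

    print(gb.ngroups)                   # 2
    print(gb.indices)                   # e.g. {'a': array([1, 3]), 'b': array([0, 2])}
    for name, group in gb:              # yields (group name, sub-DataFrame) pairs
        print(name, len(group))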
- - Examples - -------- - >>> import cudf - >>> data = [[10, 20, 30], [10, 30, 40], [40, 50, 30]] - >>> df = cudf.DataFrame(data, columns=["a", "b", "c"]) - >>> df - a b c - 0 10 20 30 - 1 10 30 40 - 2 40 50 30 - >>> df.groupby(by=["a"]).indices - {10: array([0, 1]), 40: array([2])} - """ - offsets, group_keys, (indices,) = self._groupby.groups( - [ - cudf.core.column.as_column( - range(len(self.obj)), dtype=size_type_dtype - ) - ] - ) - - group_keys = libcudf.stream_compaction.drop_duplicates(group_keys) - if len(group_keys) > 1: - index = cudf.MultiIndex.from_arrays(group_keys) - else: - index = cudf.Index._from_column(group_keys[0]) - return dict( - zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1])) - ) - - @_performance_tracking - def get_group(self, name, obj=None): - """ - Construct DataFrame from group with provided name. - - Parameters - ---------- - name : object - The name of the group to get as a DataFrame. - obj : DataFrame, default None - The DataFrame to take the DataFrame out of. If - it is None, the object groupby was called on will - be used. - - Returns - ------- - group : same type as obj - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}) - >>> df - X Y - 0 A 1 - 1 B 4 - 2 A 3 - 3 B 2 - >>> df.groupby("X").get_group("A") - X Y - 0 A 1 - 2 A 3 - """ - if obj is None: - obj = self.obj - else: - warnings.warn( - "obj is deprecated and will be removed in a future version. " - "Use ``df.iloc[gb.indices.get(name)]`` " - "instead of ``gb.get_group(name, obj=df)``.", - FutureWarning, - ) - return obj.iloc[self.indices[name]] - - @_performance_tracking - def size(self): - """ - Return the size of each group. - """ - col = cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ) - return ( - cudf.Series._from_column(col) - .groupby(self.grouping, sort=self._sort, dropna=self._dropna) - .agg("size") - ) - - @_performance_tracking - def cumcount(self, ascending: bool = True): - """ - Return the cumulative count of keys in each group. - - Parameters - ---------- - ascending : bool, default True - If False, number in reverse, from length of group - 1 to 0. - Currently not supported - """ - if ascending is not True: - raise NotImplementedError( - "ascending is currently not implemented." - ) - return ( - cudf.Series._from_column( - cudf.core.column.column_empty( - len(self.obj), "int8", masked=False - ), - index=self.obj.index, - ) - .groupby(self.grouping, sort=self._sort) - .agg("cumcount") - ) - - @_performance_tracking - def rank( - self, - method="average", - ascending=True, - na_option="keep", - pct=False, - axis=0, - ): - """ - Return the rank of values within each group. - """ - if not axis == 0: - raise NotImplementedError("Only axis=0 is supported.") - - if na_option not in {"keep", "top", "bottom"}: - raise ValueError( - f"na_option must be one of 'keep', 'top', or 'bottom', " - f"but got {na_option}" - ) - - # TODO: in pandas compatibility mode, we should convert any - # NaNs to nulls in any float value columns, as Pandas - # treats NaNs the way we treat nulls. - if cudf.get_option("mode.pandas_compatible"): - if any( - col.dtype.kind == "f" for col in self.grouping.values._columns - ): - raise NotImplementedError( - "NaNs are not supported in groupby.rank." 
- ) - - def rank(x): - return getattr(x, "rank")( - method=method, - ascending=ascending, - na_option=na_option, - pct=pct, - ) - - result = self.agg(rank) - - if cudf.get_option("mode.pandas_compatible"): - # pandas always returns floats: - return result.astype("float64") - - return result - - @cached_property - def _groupby(self): - return libgroupby.GroupBy( - [*self.grouping.keys._columns], dropna=self._dropna - ) - - @_performance_tracking - def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): - """ - Apply aggregation(s) to the groups. - - Parameters - ---------- - func : str, callable, list or dict - Argument specifying the aggregation(s) to perform on the - groups. `func` can be any of the following: - - - string: the name of a supported aggregation - - callable: a function that accepts a Series/DataFrame and - performs a supported operation on it. - - list: a list of strings/callables specifying the - aggregations to perform on every column. - - dict: a mapping of column names to string/callable - specifying the aggregations to perform on those - columns. - - See :ref:`the user guide ` for supported - aggregations. - - Returns - ------- - A Series or DataFrame containing the combined results of the - aggregation(s). - - Examples - -------- - >>> import cudf - >>> a = cudf.DataFrame({ - ... 'a': [1, 1, 2], - ... 'b': [1, 2, 3], - ... 'c': [2, 2, 1] - ... }) - >>> a.groupby('a', sort=True).agg('sum') - b c - a - 1 3 4 - 2 3 1 - - Specifying a list of aggregations to perform on each column. - - >>> import cudf - >>> a = cudf.DataFrame({ - ... 'a': [1, 1, 2], - ... 'b': [1, 2, 3], - ... 'c': [2, 2, 1] - ... }) - >>> a.groupby('a', sort=True).agg(['sum', 'min']) - b c - sum min sum min - a - 1 3 1 4 2 - 2 3 3 1 1 - - Using a dict to specify aggregations to perform per column. - - >>> import cudf - >>> a = cudf.DataFrame({ - ... 'a': [1, 1, 2], - ... 'b': [1, 2, 3], - ... 'c': [2, 2, 1] - ... }) - >>> a.groupby('a', sort=True).agg({'a': 'max', 'b': ['min', 'mean']}) - a b - max min mean - a - 1 1 1 1.5 - 2 2 3 3.0 - - Using lambdas/callables to specify aggregations taking parameters. - - >>> import cudf - >>> a = cudf.DataFrame({ - ... 'a': [1, 1, 2], - ... 'b': [1, 2, 3], - ... 'c': [2, 2, 1] - ... }) - >>> f1 = lambda x: x.quantile(0.5); f1.__name__ = "q0.5" - >>> f2 = lambda x: x.quantile(0.75); f2.__name__ = "q0.75" - >>> a.groupby('a').agg([f1, f2]) - b c - q0.5 q0.75 q0.5 q0.75 - a - 1 1.5 1.75 2.0 2.0 - 2 3.0 3.00 1.0 1.0 - """ - if engine is not None: - raise NotImplementedError( - "engine is non-functional and added for compatibility with pandas" - ) - if engine_kwargs is not None: - raise NotImplementedError( - "engine_kwargs is non-functional added for compatibility with pandas" - ) - if args: - raise NotImplementedError( - "Passing args to func is currently not supported." - ) - - column_names, columns, normalized_aggs = self._normalize_aggs( - func, **kwargs - ) - orig_dtypes = tuple(c.dtype for c in columns) - - # Note: When there are no key columns, the below produces - # an Index with float64 dtype, while Pandas returns - # an Index with int64 dtype. 
- # (GH: 6945) - ( - result_columns, - grouped_key_cols, - included_aggregations, - ) = self._groupby.aggregate(columns, normalized_aggs) - - result_index = self.grouping.keys._from_columns_like_self( - grouped_key_cols, - ) - - multilevel = _is_multi_agg(func) - data = {} - for col_name, aggs, cols, orig_dtype in zip( - column_names, - included_aggregations, - result_columns, - orig_dtypes, - ): - for agg_tuple, col in zip(aggs, cols): - agg, agg_kind = agg_tuple - agg_name = agg.__name__ if callable(agg) else agg - if multilevel: - key = (col_name, agg_name) - else: - key = col_name - if ( - agg in {list, "collect"} - and orig_dtype != col.dtype.element_type - ): - # Structs lose their labels which we reconstruct here - col = col._with_type_metadata(cudf.ListDtype(orig_dtype)) - - if agg_kind in {"COUNT", "SIZE", "ARGMIN", "ARGMAX"}: - data[key] = col.astype("int64") - elif ( - self.obj.empty - and ( - isinstance(agg_name, str) - and agg_name in Reducible._SUPPORTED_REDUCTIONS - ) - and len(col) == 0 - and not isinstance( - col, - ( - cudf.core.column.ListColumn, - cudf.core.column.StructColumn, - cudf.core.column.DecimalBaseColumn, - ), - ) - ): - data[key] = col.astype(orig_dtype) - else: - data[key] = col - data = ColumnAccessor(data, multiindex=multilevel) - if not multilevel: - data = data.rename_levels({np.nan: None}, level=0) - result = cudf.DataFrame._from_data(data, index=result_index) - - if self._sort: - result = result.sort_index() - else: - if cudf.get_option( - "mode.pandas_compatible" - ) and not libgroupby._is_all_scan_aggregate(normalized_aggs): - # Even with `sort=False`, pandas guarantees that - # groupby preserves the order of rows within each group. - left_cols = list(self.grouping.keys.drop_duplicates()._columns) - right_cols = list(result_index._columns) - join_keys = [ - _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(left_cols, right_cols) - ] - # TODO: In future, see if we can centralize - # logic else where that has similar patterns. - join_keys = map(list, zip(*join_keys)) - # By construction, left and right keys are related by - # a permutation, so we can use an inner join. - left_order, right_order = libcudf.join.join( - *join_keys, how="inner" - ) - # left order is some permutation of the ordering we - # want, and right order is a matching gather map for - # the result table. Get the correct order by sorting - # the right gather map. - (right_order,) = libcudf.sort.sort_by_key( - [right_order], - [left_order], - [True], - ["first"], - stable=False, - ) - result = result._gather( - GatherMap.from_column_unchecked( - right_order, len(result), nullify=False - ) - ) - - if not self._as_index: - result = result.reset_index() - if libgroupby._is_all_scan_aggregate(normalized_aggs): - # Scan aggregations return rows in original index order - return self._mimic_pandas_order(result) - - return result - - def _reduce_numeric_only(self, op: str): - raise NotImplementedError( - f"numeric_only is not implemented for {type(self)}" - ) - - def _reduce( - self, - op: str, - numeric_only: bool = False, - min_count: int = 0, - *args, - **kwargs, - ): - """Compute {op} of group values. - - Parameters - ---------- - numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - min_count : int, default 0 - The required number of valid values to perform the operation. If - fewer than ``min_count`` non-NA values are present the result will - be NA. 
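Note that although ``min_count`` is documented above, the implementation that follows raises ``NotImplementedError`` for any non-zero value, so only the default of 0 is usable. A brief illustrative sketch (the frame is hypothetical):

    import cudf

    df = cudf.DataFrame({"k": [1, 1, 2], "v": [1.0, None, 3.0]})

    print(df.groupby("k").sum())        # nulls are skipped within each group
    # df.groupby("k").sum(min_count=1)  # would raise NotImplementedError in this version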
- - Returns - ------- - Series or DataFrame - Computed {op} of values within each group. - - .. pandas-compat:: - :meth:`pandas.core.groupby.DataFrameGroupBy.{op}`, - :meth:`pandas.core.groupby.SeriesGroupBy.{op}` - - The numeric_only, min_count - """ - if min_count != 0: - raise NotImplementedError( - "min_count parameter is not implemented yet" - ) - if numeric_only: - return self._reduce_numeric_only(op) - return self.agg(op) - - def _scan(self, op: str, *args, **kwargs): - """{op_name} for each group.""" - return self.agg(op) - - aggregate = agg - - def _head_tail(self, n, *, take_head: bool, preserve_order: bool): - """Return the head or tail of each group - - Parameters - ---------- - n - Number of entries to include (if negative, number of - entries to exclude) - take_head - Do we want the head or the tail of the group - preserve_order - If True, return the n rows from each group in original - dataframe order (this mimics pandas behavior though is - more expensive). - - Returns - ------- - New DataFrame or Series - - Notes - ----- - Unlike pandas, this returns an object in group order, not - original order, unless ``preserve_order`` is ``True``. - """ - # A more memory-efficient implementation would merge the take - # into the grouping, but that probably requires a new - # aggregation scheme in libcudf. This is probably "fast - # enough" for most reasonable input sizes. - _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) - size_per_group = np.diff(group_offsets) - # "Out of bounds" n for the group size either means no entries - # (negative) or all the entries (positive) - if n < 0: - size_per_group = np.maximum( - size_per_group + n, 0, out=size_per_group - ) - else: - size_per_group = np.minimum(size_per_group, n, out=size_per_group) - if take_head: - group_offsets = group_offsets[:-1] - else: - group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype) - fixup = np.empty_like(size_per_group) - fixup[0] = 0 - np.cumsum(size_per_group[:-1], out=fixup[1:]) - to_take += np.repeat(group_offsets - fixup, size_per_group) - to_take = as_column(to_take) - result = group_values.iloc[to_take] - if preserve_order: - # Can't use _mimic_pandas_order because we need to - # subsample the gather map from the full input ordering, - # rather than permuting the gather map of the output. - _, _, (ordering,) = self._groupby.groups( - [as_column(range(0, len(self.obj)))] - ) - # Invert permutation from original order to groups on the - # subset of entries we want. - gather_map = ordering.take(to_take).argsort() - return result.take(gather_map) - else: - return result - - @_performance_tracking - def head(self, n: int = 5, *, preserve_order: bool = True): - """Return first n rows of each group - - Parameters - ---------- - n - If positive: number of entries to include from start of group - If negative: number of entries to exclude from end of group - - preserve_order - If True (default), return the n rows from each group in - original dataframe order (this mimics pandas behavior - though is more expensive). If you don't need rows in - original dataframe order you will see a performance - improvement by setting ``preserve_order=False``. In both - cases, the original index is preserved, so ``.loc``-based - indexing will work identically. 
- - Returns - ------- - Series or DataFrame - Subset of the original grouped object as determined by n - - See Also - -------- - .tail - - Examples - -------- - >>> df = cudf.DataFrame( - ... { - ... "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3], - ... "b": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - ... } - ... ) - >>> df.groupby("a").head(1) - a b - 0 1 0 - 1 0 1 - 3 2 3 - 6 3 6 - >>> df.groupby("a").head(-2) - a b - 0 1 0 - 3 2 3 - 6 3 6 - 8 3 8 - """ - return self._head_tail( - n, take_head=True, preserve_order=preserve_order - ) - - @_performance_tracking - def tail(self, n: int = 5, *, preserve_order: bool = True): - """Return last n rows of each group - - Parameters - ---------- - n - If positive: number of entries to include from end of group - If negative: number of entries to exclude from start of group - - preserve_order - If True (default), return the n rows from each group in - original dataframe order (this mimics pandas behavior - though is more expensive). If you don't need rows in - original dataframe order you will see a performance - improvement by setting ``preserve_order=False``. In both - cases, the original index is preserved, so ``.loc``-based - indexing will work identically. - - Returns - ------- - Series or DataFrame - Subset of the original grouped object as determined by n - - - See Also - -------- - .head - - Examples - -------- - >>> df = cudf.DataFrame( - ... { - ... "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3], - ... "b": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - ... } - ... ) - >>> df.groupby("a").tail(1) - a b - 1 0 1 - 5 1 5 - 7 2 7 - 10 3 10 - >>> df.groupby("a").tail(-2) - a b - 5 1 5 - 7 2 7 - 9 3 9 - 10 3 10 - """ - return self._head_tail( - n, take_head=False, preserve_order=preserve_order - ) - - @_performance_tracking - def nth(self, n, dropna: Literal["any", "all", None] = None): - """ - Return the nth row from each group. - """ - if dropna is not None: - raise NotImplementedError("dropna is not currently supported.") - self.obj["__groupbynth_order__"] = range(0, len(self.obj)) # type: ignore[index] - # We perform another groupby here to have the grouping columns - # be a part of dataframe columns. - result = self.obj.groupby(self.grouping.keys).agg(lambda x: x.nth(n)) - sizes = self.size().reindex(result.index) - - result = result[sizes > n] - - result.index = self.obj.index.take( - result._data["__groupbynth_order__"] - ) - del result._data["__groupbynth_order__"] - del self.obj._data["__groupbynth_order__"] - return result - - @_performance_tracking - def ngroup(self, ascending=True): - """ - Number each group from 0 to the number of groups - 1. - - This is the enumerative complement of cumcount. Note that the - numbers given to the groups match the order in which the groups - would be seen when iterating over the groupby object, not the - order they are first observed. - - Parameters - ---------- - ascending : bool, default True - If False, number in reverse, from number of group - 1 to 0. - - Returns - ------- - Series - Unique numbers for each group. - - See Also - -------- - .cumcount : Number the rows in each group. 
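A compact sketch of the per-group row selection methods documented above (``head``, ``tail`` and ``nth``), using hypothetical data:

    import cudf

    df = cudf.DataFrame({"a": [1, 1, 2, 2, 2], "b": [0, 1, 2, 3, 4]})

    print(df.groupby("a").head(1))   # first row of each group, in original row order
    print(df.groupby("a").tail(1))   # last row of each group
    print(df.groupby("a").nth(1))    # second row per group; smaller groups drop out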
- - Examples - -------- - >>> df = cudf.DataFrame({"A": list("aaabba")}) - >>> df - A - 0 a - 1 a - 2 a - 3 b - 4 b - 5 a - >>> df.groupby('A').ngroup() - 0 0 - 1 0 - 2 0 - 3 1 - 4 1 - 5 0 - dtype: int64 - >>> df.groupby('A').ngroup(ascending=False) - 0 1 - 1 1 - 2 1 - 3 0 - 4 0 - 5 1 - dtype: int64 - >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup() - 0 0 - 1 0 - 2 1 - 3 3 - 4 2 - 5 0 - dtype: int64 - """ - index = self.grouping.keys.unique().sort_values() - num_groups = len(index) - _, has_null_group = bitmask_or([*index._columns]) - - if ascending: - # Count ascending from 0 to num_groups - 1 - groups = range(num_groups) - elif has_null_group: - # Count descending from num_groups - 1 to 0, but subtract one more - # for the null group making it num_groups - 2 to -1. - groups = range(num_groups - 2, -2, -1) - else: - # Count descending from num_groups - 1 to 0 - groups = range(num_groups - 1, -1, -1) - - group_ids = cudf.Series._from_column(as_column(groups)) - - if has_null_group: - group_ids.iloc[-1] = cudf.NA - - group_ids.index = index - return self._broadcast(group_ids) - - def sample( - self, - n: int | None = None, - frac: float | None = None, - replace: bool = False, - weights: abc.Sequence | "cudf.Series" | None = None, - random_state: np.random.RandomState | int | None = None, - ): - """Return a random sample of items in each group. - - Parameters - ---------- - n - Number of items to return for each group, if sampling - without replacement must be at most the size of the - smallest group. Cannot be used with frac. Default is - ``n=1`` if frac is None. - frac - Fraction of items to return. Cannot be used with n. - replace - Should sampling occur with or without replacement? - weights - Sampling probability for each element. Must be the same - length as the grouped frame. Not currently supported. - random_state - Seed for random number generation. - - Returns - ------- - New dataframe or series with samples of appropriate size drawn - from each group. - - """ - if weights is not None: - # To implement this case again needs different algorithms - # in both cases. - # - # Without replacement, use the weighted reservoir sampling - # approach of Efraimidas and Spirakis (2006) - # https://doi.org/10.1016/j.ipl.2005.11.003, essentially, - # do a segmented argsort sorting on weight-scaled - # logarithmic deviates. See - # https://timvieira.github.io/blog/post/ - # 2019/09/16/algorithms-for-sampling-without-replacement/ - # - # With replacement is trickier, one might be able to use - # the alias method, otherwise we're back to bucketed - # rejection sampling. - raise NotImplementedError("Sampling with weights is not supported") - if frac is not None and n is not None: - raise ValueError("Cannot supply both of frac and n") - elif n is None and frac is None: - n = 1 - elif frac is not None and not (0 <= frac <= 1): - raise ValueError( - "Sampling with fraction must provide fraction in " - f"[0, 1], got {frac=}" - ) - # TODO: handle random states properly. - if random_state is not None and not isinstance(random_state, int): - raise NotImplementedError( - "Only integer seeds are supported for random_state " - "in this case" - ) - # Get the groups - # TODO: convince Cython to convert the std::vector offsets - # into a numpy array directly, rather than a list. - # TODO: this uses the sort-based groupby, could one use hash-based? 
- _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) - size_per_group = np.diff(group_offsets) - if n is not None: - samples_per_group = np.broadcast_to( - size_type_dtype.type(n), size_per_group.shape - ) - if not replace and (minsize := size_per_group.min()) < n: - raise ValueError( - f"Cannot sample {n=} without replacement, " - f"smallest group is {minsize}" - ) - else: - # Pandas uses round-to-nearest, ties to even to - # pick sample sizes for the fractional case (unlike IEEE - # which is round-to-nearest, ties to sgn(x) * inf). - samples_per_group = np.round( - size_per_group * frac, decimals=0 - ).astype(size_type_dtype) - if replace: - # We would prefer to use cupy here, but their rng.integers - # interface doesn't take array-based low and high - # arguments. - low = 0 - high = np.repeat(size_per_group, samples_per_group) - rng = np.random.default_rng(seed=random_state) - indices = rng.integers(low, high, dtype=size_type_dtype) - indices += np.repeat(group_offsets[:-1], samples_per_group) - else: - # Approach: do a segmented argsort of the index array and take - # the first samples_per_group entries from sorted array. - # We will shuffle the group indices and then pick them out - # from the grouped dataframe index. - nrows = len(group_values) - indices = cp.arange(nrows, dtype=size_type_dtype) - if len(size_per_group) < 500: - # Empirically shuffling with cupy is faster at this scale - rs = cp.random.get_random_state() - rs.seed(seed=random_state) - for off, size in zip(group_offsets, size_per_group): - rs.shuffle(indices[off : off + size]) - else: - rng = cp.random.default_rng(seed=random_state) - (indices,) = segmented_sort_by_key( - [as_column(indices)], - [as_column(rng.random(size=nrows))], - as_column(group_offsets), - [], - [], - stable=True, - ) - indices = cp.asarray(indices.data_array_view(mode="read")) - # Which indices are we going to want? 
- want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) - scan = np.empty_like(samples_per_group) - scan[0] = 0 - np.cumsum(samples_per_group[:-1], out=scan[1:]) - want += np.repeat(group_offsets[:-1] - scan, samples_per_group) - indices = indices[want] - return group_values.iloc[indices] - - def serialize(self): - header = {} - frames = [] - - header["kwargs"] = { - "sort": self._sort, - "dropna": self._dropna, - "as_index": self._as_index, - } - - obj_header, obj_frames = self.obj.serialize() - header["obj"] = obj_header - header["obj_type"] = pickle.dumps(type(self.obj)) - header["num_obj_frames"] = len(obj_frames) - frames.extend(obj_frames) - - grouping_header, grouping_frames = self.grouping.serialize() - header["grouping"] = grouping_header - header["num_grouping_frames"] = len(grouping_frames) - frames.extend(grouping_frames) - - return header, frames - - @classmethod - def deserialize(cls, header, frames): - kwargs = header["kwargs"] - - obj_type = pickle.loads(header["obj_type"]) - obj = obj_type.deserialize( - header["obj"], frames[: header["num_obj_frames"]] - ) - grouping = _Grouping.deserialize( - header["grouping"], frames[header["num_obj_frames"] :] - ) - return cls(obj, grouping, **kwargs) - - def _grouped(self, *, include_groups: bool = True): - offsets, grouped_key_cols, grouped_value_cols = self._groupby.groups( - [*self.obj.index._columns, *self.obj._columns] - ) - grouped_keys = cudf.core.index._index_from_data( - dict(enumerate(grouped_key_cols)) - ) - if isinstance(self.grouping.keys, cudf.MultiIndex): - grouped_keys.names = self.grouping.keys.names - to_drop = self.grouping.keys.names - else: - grouped_keys.name = self.grouping.keys.name - to_drop = (self.grouping.keys.name,) - grouped_values = self.obj._from_columns_like_self( - grouped_value_cols, - column_names=self.obj._column_names, - index_names=self.obj._index_names, - ) - if not include_groups: - for col_name in to_drop: - del grouped_values[col_name] - group_names = grouped_keys.unique().sort_values() - return (group_names, offsets, grouped_keys, grouped_values) - - def _normalize_aggs( - self, aggs: MultiColumnAggType, **kwargs - ) -> tuple[Iterable[Any], tuple[ColumnBase, ...], list[list[AggType]]]: - """ - Normalize aggs to a list of list of aggregations, where `out[i]` - is a list of aggregations for column `self.obj[i]`. We support four - different form of `aggs` input here: - - A single agg, such as "sum". This agg is applied to all value - columns. - - A list of aggs, such as ["sum", "mean"]. All aggs are applied to all - value columns. - - A mapping of column name to aggs, such as - {"a": ["sum"], "b": ["mean"]}, the aggs are applied to specified - column. - - Pairs of column name and agg tuples passed as kwargs - eg. col1=("a", "sum"), col2=("b", "prod"). The output column names are - the keys. The aggs are applied to the corresponding column in the tuple. - Each agg can be string or lambda functions. - """ - - aggs_per_column: Iterable[AggType | Iterable[AggType]] - # TODO: Remove isinstance condition when the legacy dask_cudf API is removed. - # See https://github.com/rapidsai/cudf/pull/16528#discussion_r1715482302 for information. 
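The ``GroupBy.sample`` method above carries no usage example in its docstring; a minimal sketch of the supported arguments (data is hypothetical and the selected rows depend on the seed):

    import cudf

    df = cudf.DataFrame({"g": [0, 0, 0, 1, 1, 1], "x": list(range(6))})

    # One row per group by default (n=1 when frac is not given).
    print(df.groupby("g").sample(random_state=42))

    # A fraction of each group, without replacement; n and frac are mutually exclusive.
    print(df.groupby("g").sample(frac=0.5, random_state=42))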
- if aggs or isinstance(aggs, dict): - if isinstance(aggs, dict): - column_names, aggs_per_column = aggs.keys(), aggs.values() - columns = tuple(self.obj._data[col] for col in column_names) - else: - values = self.grouping.values - column_names = values._column_names - columns = values._columns - aggs_per_column = (aggs,) * len(columns) - elif not aggs and kwargs: - column_names = kwargs.keys() - - def _raise_invalid_type(x): - raise TypeError( - f"Invalid keyword argument {x} of type {type(x)} was passed to agg" - ) - - columns, aggs_per_column = zip( - *( - (self.obj._data[x[0]], x[1]) - if isinstance(x, tuple) - else _raise_invalid_type(x) - for x in kwargs.values() - ) - ) - else: - raise TypeError("Must provide at least one aggregation function.") - - # is_list_like performs type narrowing but type-checkers don't - # know it. One could add a TypeGuard annotation to - # is_list_like (see PEP647), but that is less useful than it - # seems because unlike the builtin narrowings it only performs - # narrowing in the positive case. - normalized_aggs = [ - list(agg) if is_list_like(agg) else [agg] # type: ignore - for agg in aggs_per_column - ] - return column_names, columns, normalized_aggs - - @_performance_tracking - def pipe(self, func, *args, **kwargs): - """ - Apply a function `func` with arguments to this GroupBy - object and return the function's result. - - Parameters - ---------- - func : function - Function to apply to this GroupBy object or, - alternatively, a ``(callable, data_keyword)`` tuple where - ``data_keyword`` is a string indicating the keyword of - ``callable`` that expects the GroupBy object. - args : iterable, optional - Positional arguments passed into ``func``. - kwargs : mapping, optional - A dictionary of keyword arguments passed into ``func``. - - Returns - ------- - object : the return type of ``func``. - - See Also - -------- - cudf.Series.pipe - Apply a function with arguments to a series. - - cudf.DataFrame.pipe - Apply a function with arguments to a dataframe. - - apply - Apply function to each group instead of to the full GroupBy object. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'A': ['a', 'b', 'a', 'b'], 'B': [1, 2, 3, 4]}) - >>> df - A B - 0 a 1 - 1 b 2 - 2 a 3 - 3 b 4 - - To get the difference between each groups maximum and minimum value - in one pass, you can do - - >>> df.groupby('A', sort=True).pipe(lambda x: x.max() - x.min()) - B - A - a 2 - b 2 - """ - return cudf.core.common.pipe(self, func, *args, **kwargs) - - @_performance_tracking - def _jit_groupby_apply( - self, function, group_names, offsets, group_keys, grouped_values, *args - ): - chunk_results = jit_groupby_apply( - offsets, grouped_values, function, *args - ) - return self._post_process_chunk_results( - chunk_results, group_names, group_keys, grouped_values - ) - - @_performance_tracking - def _iterative_groupby_apply( - self, function, group_names, offsets, group_keys, grouped_values, *args - ): - ngroups = len(offsets) - 1 - if ngroups > self._MAX_GROUPS_BEFORE_WARN: - warnings.warn( - f"GroupBy.apply() performance scales poorly with " - f"number of groups. Got {ngroups} groups. 
Some functions " - "may perform better by passing engine='jit'", - RuntimeWarning, - ) - - chunks = [ - grouped_values[s:e] for s, e in zip(offsets[:-1], offsets[1:]) - ] - chunk_results = [function(chk, *args) for chk in chunks] - return self._post_process_chunk_results( - chunk_results, group_names, group_keys, grouped_values - ) - - def _post_process_chunk_results( - self, chunk_results, group_names, group_keys, grouped_values - ): - if not len(chunk_results): - return self.obj.head(0) - if isinstance(chunk_results, ColumnBase) or cudf.api.types.is_scalar( - chunk_results[0] - ): - data = ColumnAccessor( - {None: as_column(chunk_results)}, verify=False - ) - ty = cudf.Series if self._as_index else cudf.DataFrame - result = ty._from_data(data, index=group_names) - result.index.names = self.grouping.names - return result - - elif isinstance(chunk_results[0], cudf.Series) and isinstance( - self.obj, cudf.DataFrame - ): - # When the UDF is like df.sum(), the result for each - # group is a row-like "Series" where the index labels - # are the same as the original calling DataFrame - if _is_row_of(chunk_results[0], self.obj): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - result = cudf.concat(chunk_results, axis=1).T - result.index = group_names - result.index.names = self.grouping.names - # When the UDF is like df.x + df.y, the result for each - # group is the same length as the original group - elif (total_rows := sum(len(chk) for chk in chunk_results)) in { - len(self.obj), - len(group_names), - }: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - result = cudf.concat(chunk_results) - if total_rows == len(group_names): - result.index = group_names - # TODO: Is there a better way to determine what - # the column name should be, especially if we applied - # a nameless UDF. - result = result.to_frame( - name=grouped_values._column_names[0] - ) - else: - index_data = group_keys._data.copy(deep=True) - index_data[None] = grouped_values.index._column - result.index = cudf.MultiIndex._from_data(index_data) - elif len(chunk_results) == len(group_names): - result = cudf.concat(chunk_results, axis=1).T - result.index = group_names - result.index.names = self.grouping.names - else: - raise TypeError( - "Error handling Groupby apply output with input of " - f"type {type(self.obj)} and output of " - f"type {type(chunk_results[0])}" - ) - else: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - result = cudf.concat(chunk_results) - if self._group_keys: - index_data = group_keys._data.copy(deep=True) - index_data[None] = grouped_values.index._column - result.index = cudf.MultiIndex._from_data(index_data) - return result - - @_performance_tracking - def apply( - self, func, *args, engine="auto", include_groups: bool = True, **kwargs - ): - """Apply a python transformation function over the grouped chunk. - - Parameters - ---------- - func : callable - The python transformation function that will be applied - on the grouped chunk. - args : tuple - Optional positional arguments to pass to the function. - engine: 'auto', 'cudf', or 'jit', default 'auto' - Selects the GroupBy.apply implementation. Use `jit` to - select the numba JIT pipeline. Only certain operations are allowed - within the function when using this option: min, max, sum, mean, var, - std, idxmax, and idxmin and any arithmetic formula involving them are - allowed. 
Binary operations are not yet supported, so syntax like - `df['x'] * 2` is not yet allowed. - For more information, see the `cuDF guide to user defined functions - `__. - Use `cudf` to select the iterative groupby apply algorithm which aims - to provide maximum flexibility at the expense of performance. - The default value `auto` will attempt to use the numba JIT pipeline - where possible and will fall back to the iterative algorithm if - necessary. - include_groups : bool, default True - When True, will attempt to apply ``func`` to the groupings in - the case that they are columns of the DataFrame. In the future, - this will default to ``False``. - kwargs : dict - Optional keyword arguments to pass to the function. - Currently not supported - - Examples - -------- - .. code-block:: python - - from cudf import DataFrame - df = DataFrame() - df['key'] = [0, 0, 1, 1, 2, 2, 2] - df['val'] = [0, 1, 2, 3, 4, 5, 6] - groups = df.groupby(['key']) - - # Define a function to apply to each row in a group - def mult(df): - df['out'] = df['key'] * df['val'] - return df - - result = groups.apply(mult) - print(result) - - Output: - - .. code-block:: python - - key val out - 0 0 0 0 - 1 0 1 0 - 2 1 2 2 - 3 1 3 3 - 4 2 4 8 - 5 2 5 10 - 6 2 6 12 - - .. pandas-compat:: - :meth:`pandas.core.groupby.DataFrameGroupBy.apply`, - :meth:`pandas.core.groupby.SeriesGroupBy.apply` - - cuDF's ``groupby.apply`` is limited compared to pandas. - In some situations, Pandas returns the grouped keys as part of - the index while cudf does not due to redundancy. For example: - - .. code-block:: - - >>> import pandas as pd - >>> df = pd.DataFrame({ - ... 'a': [1, 1, 2, 2], - ... 'b': [1, 2, 1, 2], - ... 'c': [1, 2, 3, 4], - ... }) - >>> gdf = cudf.from_pandas(df) - >>> df.groupby('a')[["b", "c"]].apply(lambda x: x.iloc[[0]]) - b c - a - 1 0 1 1 - 2 2 1 3 - >>> gdf.groupby('a')[["b", "c"]].apply(lambda x: x.iloc[[0]]) - b c - 0 1 1 - 2 1 3 - - ``engine='jit'`` may be used to accelerate certain functions, - initially those that contain reductions and arithmetic operations - between results of those reductions: - - >>> import cudf - >>> df = cudf.DataFrame({'a':[1,1,2,2,3,3], 'b':[1,2,3,4,5,6]}) - >>> df.groupby('a').apply( - ... lambda group: group['b'].max() - group['b'].min(), - ... engine='jit' - ... ) - a - 1 1 - 2 1 - 3 1 - dtype: int64 - - """ - if kwargs: - raise NotImplementedError( - "Passing kwargs to func is currently not supported." - ) - if self.obj.empty: - if func in {"count", "size", "idxmin", "idxmax"}: - res = cudf.Series([], dtype="int64") - else: - res = self.obj.copy(deep=True) - res.index = self.grouping.keys - if func in {"sum", "product"}: - # For `sum` & `product`, boolean types - # will need to result in `int64` type. 
- for name, col in res._column_labels_and_values: - if col.dtype.kind == "b": - res._data[name] = col.astype("int") - return res - - if not callable(func): - raise TypeError(f"type {type(func)} is not callable") - group_names, offsets, group_keys, grouped_values = self._grouped( - include_groups=include_groups - ) - - if engine == "auto": - if _can_be_jitted(grouped_values, func, args): - engine = "jit" - else: - engine = "cudf" - if engine == "jit": - result = self._jit_groupby_apply( - func, - group_names, - offsets, - group_keys, - grouped_values, - *args, - ) - elif engine == "cudf": - result = self._iterative_groupby_apply( - func, - group_names, - offsets, - group_keys, - grouped_values, - *args, - ) - else: - raise ValueError(f"Unsupported engine '{engine}'") - - if self._sort: - result = result.sort_index() - if self._as_index is False: - result = result.reset_index() - return result - - @_performance_tracking - def apply_grouped(self, function, **kwargs): - """Apply a transformation function over the grouped chunk. - - This uses numba's CUDA JIT compiler to convert the Python - transformation function into a CUDA kernel, thus will have a - compilation overhead during the first run. - - Parameters - ---------- - func : function - The transformation function that will be executed on the CUDA GPU. - incols: list - A list of names of input columns. - outcols: list - A dictionary of output column names and their dtype. - kwargs : dict - name-value of extra arguments. These values are passed directly into - the function. - - Examples - -------- - .. code-block:: python - - from cudf import DataFrame - from numba import cuda - import numpy as np - - df = DataFrame() - df['key'] = [0, 0, 1, 1, 2, 2, 2] - df['val'] = [0, 1, 2, 3, 4, 5, 6] - groups = df.groupby(['key']) - - # Define a function to apply to each group - def mult_add(key, val, out1, out2): - for i in range(cuda.threadIdx.x, len(key), cuda.blockDim.x): - out1[i] = key[i] * val[i] - out2[i] = key[i] + val[i] - - result = groups.apply_grouped(mult_add, - incols=['key', 'val'], - outcols={'out1': np.int32, - 'out2': np.int32}, - # threads per block - tpb=8) - - print(result) - - Output: - - .. code-block:: python - - key val out1 out2 - 0 0 0 0 0 - 1 0 1 0 1 - 2 1 2 2 3 - 3 1 3 3 4 - 4 2 4 8 6 - 5 2 5 10 7 - 6 2 6 12 8 - - - - .. code-block:: python - - import cudf - import numpy as np - from numba import cuda - import pandas as pd - from random import randint - - - # Create a random 15 row dataframe with one categorical - # feature and one random integer valued feature - df = cudf.DataFrame( - { - "cat": [1] * 5 + [2] * 5 + [3] * 5, - "val": [randint(0, 100) for _ in range(15)], - } - ) - - # Group the dataframe by its categorical feature - groups = df.groupby("cat") - - # Define a kernel which takes the moving average of a - # sliding window - def rolling_avg(val, avg): - win_size = 3 - for i in range(cuda.threadIdx.x, len(val), cuda.blockDim.x): - if i < win_size - 1: - # If there is not enough data to fill the window, - # take the average to be NaN - avg[i] = np.nan - else: - total = 0 - for j in range(i - win_size + 1, i + 1): - total += val[j] - avg[i] = total / win_size - - # Compute moving averages on all groups - results = groups.apply_grouped(rolling_avg, - incols=['val'], - outcols=dict(avg=np.float64)) - print("Results:", results) - - # Note this gives the same result as its pandas equivalent - pdf = df.to_pandas() - pd_results = pdf.groupby('cat')['val'].rolling(3).mean() - - - Output: - - .. 
code-block:: python - - Results: - cat val avg - 0 1 16 - 1 1 45 - 2 1 62 41.0 - 3 1 45 50.666666666666664 - 4 1 26 44.333333333333336 - 5 2 5 - 6 2 51 - 7 2 77 44.333333333333336 - 8 2 1 43.0 - 9 2 46 41.333333333333336 - [5 more rows] - - This is functionally equivalent to `pandas.DataFrame.Rolling - `_ - - """ - if not callable(function): - raise TypeError(f"type {type(function)} is not callable") - - _, offsets, _, grouped_values = self._grouped() - kwargs.update({"chunks": offsets}) - return grouped_values.apply_chunks(function, **kwargs) - - @_performance_tracking - def _broadcast(self, values: cudf.Series) -> cudf.Series: - """ - Broadcast the results of an aggregation to the group - - Parameters - ---------- - values: Series - A Series representing the results of an aggregation. The - index of the Series must be the (unique) values - representing the group keys. - - Returns - ------- - A Series of the same size and with the same index as - ``self.obj``. - """ - if not values.index.equals(self.grouping.keys): - values = values._align_to_index( - self.grouping.keys, how="right", allow_non_unique=True - ) - values.index = self.obj.index - return values - - @_performance_tracking - def transform( - self, func, *args, engine=None, engine_kwargs=None, **kwargs - ): - """Apply an aggregation, then broadcast the result to the group size. - - Parameters - ---------- - func: str or callable - Aggregation to apply to each group. Note that the set of - operations currently supported by `transform` is identical - to that supported by the `agg` method. - - Returns - ------- - A Series or DataFrame of the same size as the input, with the - result of the aggregation per group broadcasted to the group - size. - - Examples - -------- - .. code-block:: python - - import cudf - df = cudf.DataFrame({'a': [2, 1, 1, 2, 2], 'b': [1, 2, 3, 4, 5]}) - df.groupby('a').transform('max') - b - 0 5 - 1 3 - 2 3 - 3 5 - 4 5 - - See Also - -------- - agg - """ - if engine is not None: - raise NotImplementedError( - "engine is non-functional and added for compatibility with pandas" - ) - if engine_kwargs is not None: - raise NotImplementedError( - "engine_kwargs is non-functional added for compatibility with pandas" - ) - if args: - raise NotImplementedError( - "Passing args to func is currently not supported." - ) - if kwargs: - raise NotImplementedError( - "Passing kwargs to func is currently not supported." - ) - - if not (isinstance(func, str) or callable(func)): - raise TypeError( - "Aggregation must be a named aggregation or a callable" - ) - try: - result = self.agg(func) - except TypeError as e: - raise NotImplementedError( - "Currently, `transform()` supports only aggregations." - ) from e - # If the aggregation is a scan, don't broadcast - if libgroupby._is_all_scan_aggregate([[func]]): - if len(result) != len(self.obj): - raise AssertionError( - "Unexpected result length for scan transform" - ) - return result - return self._broadcast(result) - - def rolling(self, *args, **kwargs): - """ - Returns a `RollingGroupby` object that enables rolling window - calculations on the groups. - - See Also - -------- - cudf.core.window.Rolling - """ - return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) - - @_performance_tracking - def count(self, dropna=True): - """Compute the number of values in each column. - - Parameters - ---------- - dropna : bool - If ``True``, don't include null values in the count. 
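        A minimal usage sketch (illustrative only; ``df`` below is a
        hypothetical two-column frame containing one null value):

        >>> df = cudf.DataFrame({"k": [1, 1, 2], "v": [10, None, 30]})
        >>> df.groupby("k", sort=True).count()               # null in "v" skipped: counts 1 and 1
        >>> df.groupby("k", sort=True).count(dropna=False)   # null included: counts 2 and 1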
- """ - - def func(x): - return getattr(x, "count")(dropna=dropna) - - return self.agg(func) - - @_performance_tracking - def describe(self, percentiles=None, include=None, exclude=None): - """ - Generate descriptive statistics that summarizes the central tendency, - dispersion and shape of a dataset's distribution, excluding NaN values. - - Analyzes numeric DataFrames only - - Parameters - ---------- - percentiles : list-like of numbers, optional - The percentiles to include in the output. - Currently not supported. - - include: 'all', list-like of dtypes or None (default), optional - list of data types to include in the result. - Ignored for Series. - - exclude: list-like of dtypes or None (default), optional, - list of data types to omit from the result. - Ignored for Series. - - Returns - ------- - Series or DataFrame - Summary statistics of the Dataframe provided. - - Examples - -------- - >>> import cudf - >>> gdf = cudf.DataFrame({ - ... "Speed": [380.0, 370.0, 24.0, 26.0], - ... "Score": [50, 30, 90, 80], - ... }) - >>> gdf - Speed Score - 0 380.0 50 - 1 370.0 30 - 2 24.0 90 - 3 26.0 80 - >>> gdf.groupby('Score').describe() - Speed - count mean std min 25% 50% 75% max - Score - 30 1 370.0 370.0 370.0 370.0 370.0 370.0 - 50 1 380.0 380.0 380.0 380.0 380.0 380.0 - 80 1 26.0 26.0 26.0 26.0 26.0 26.0 - 90 1 24.0 24.0 24.0 24.0 24.0 24.0 - - """ - if percentiles is not None: - raise NotImplementedError("percentiles is currently not supported") - if exclude is not None: - raise NotImplementedError("exclude is currently not supported") - if include is not None: - raise NotImplementedError("include is currently not supported") - - res = self.agg( - [ - "count", - "mean", - "std", - "min", - _quantile_25, - _quantile_50, - _quantile_75, - "max", - ] - ) - res.rename( - columns={ - "_quantile_25": "25%", - "_quantile_50": "50%", - "_quantile_75": "75%", - }, - level=1, - inplace=True, - ) - return res - - @_performance_tracking - def cov(self, min_periods=0, ddof=1, numeric_only: bool = False): - """ - Compute the pairwise covariance among the columns of a DataFrame, - excluding NA/null values. - - The returned DataFrame is the covariance matrix of the columns of - the DataFrame. - - Both NA and null values are automatically excluded from the - calculation. See the note below about bias from missing values. - - A threshold can be set for the minimum number of observations - for each value created. Comparisons with observations below this - threshold will be returned as `NA`. - - This method is generally used for the analysis of time series data to - understand the relationship between different measures across time. - - Parameters - ---------- - min_periods: int, optional - Minimum number of observations required per pair of columns - to have a valid result. - - ddof: int, optional - Delta degrees of freedom, default is 1. - - Returns - ------- - DataFrame - Covariance matrix. - - Notes - ----- - Returns the covariance matrix of the DataFrame's time series. - The covariance is normalized by N-ddof. - - For DataFrames that have Series that are missing data - (assuming that data is missing at random) the returned covariance - matrix will be an unbiased estimate of the variance and covariance - between the member Series. - - However, for many applications this estimate may not be acceptable - because the estimate covariance matrix is not guaranteed to be - positive semi-definite. 
This could lead to estimate correlations - having absolute values which are greater than one, and/or a - non-invertible covariance matrix. See - `Estimation of covariance matrices - ` - for more details. - - Examples - -------- - >>> import cudf - >>> gdf = cudf.DataFrame({ - ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], - ... }) - >>> gdf - id val1 val2 val3 - 0 a 5 4 4 - 1 a 4 5 5 - 2 a 6 6 6 - 3 b 4 1 1 - 4 b 8 2 2 - 5 b 7 9 9 - 6 c 4 8 8 - 7 c 5 5 5 - 8 c 2 1 1 - >>> gdf.groupby("id").cov() - val1 val2 val3 - id - a val1 1.000000 0.500000 0.500000 - val2 0.500000 1.000000 1.000000 - val3 0.500000 1.000000 1.000000 - b val1 4.333333 3.500000 3.500000 - val2 3.500000 19.000000 19.000000 - val3 3.500000 19.000000 19.000000 - c val1 2.333333 3.833333 3.833333 - val2 3.833333 12.333333 12.333333 - val3 3.833333 12.333333 12.333333 - """ - if numeric_only is not False: - raise NotImplementedError( - "numeric_only is currently not supported." - ) - - return self._cov_or_corr( - lambda x: x.cov(min_periods, ddof), "Covariance" - ) - - def _cov_or_corr(self, func, method_name): - """ - Internal function that is called by either corr() or cov() - for sort groupby correlation and covariance computations, - respectively. - """ - # create expanded dataframe consisting all combinations of the - # struct columns-pairs to be used in the correlation or covariance - # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - column_names = self.grouping.values._column_names - num_cols = len(column_names) - - column_pair_structs = {} - for x, y in itertools.combinations_with_replacement(column_names, 2): - # The number of output columns is the number of input columns - # squared. We directly call the struct column factory here to - # reduce overhead and avoid copying data. Since libcudf groupby - # maintains a cache of aggregation requests, reusing the same - # column also makes use of previously cached column means and - # reduces kernel costs. - - # checks if input column names are string, raise a warning if - # not so and cast them to strings - if not (isinstance(x, str) and isinstance(y, str)): - warnings.warn( - "DataFrame contains non-string column name(s). " - "Struct columns require field names to be strings. " - "Non-string column names will be cast to strings " - "in the result's field names." 
- ) - x, y = str(x), str(y) - - column_pair_structs[(x, y)] = cudf.core.column.StructColumn( - data=None, - dtype=StructDtype( - fields={x: self.obj._data[x].dtype, y: self.obj._data[y].dtype} - ), - children=(self.obj._data[x], self.obj._data[y]), - size=len(self.obj), - offset=0, - ) - - column_pair_groupby = cudf.DataFrame._from_data( - column_pair_structs - ).groupby(by=self.grouping.keys) - - try: - gb_cov_corr = column_pair_groupby.agg(func) - except RuntimeError as e: - if "Unsupported groupby reduction type-agg combination" in str(e): - raise TypeError( - f"{method_name} accepts only numerical column-pairs" - ) - raise - - # ensure that column-pair labels are arranged in ascending order - cols_list = [ - (y, x) if i > j else (x, y) - for j, y in enumerate(column_names) - for i, x in enumerate(column_names) - ] - cols_split = [ - cols_list[i : i + num_cols] - for i in range(0, len(cols_list), num_cols) - ] - - # interleave: combines the correlation or covariance results for each - # column-pair into a single column - res = cudf.DataFrame._from_data( - { - x: interleave_columns([gb_cov_corr._data[y] for y in ys]) - for ys, x in zip(cols_split, column_names) - } - ) - - # create a multiindex for the groupby covariance or correlation - # dataframe, to match pandas behavior - unsorted_idx = gb_cov_corr.index.repeat(num_cols) - idx_sort_order = unsorted_idx._get_sorted_inds() - sorted_idx = unsorted_idx._gather(idx_sort_order) - if len(gb_cov_corr): - # TO-DO: Should the operation below be done on the CPU instead? - sorted_idx._data[None] = as_column( - np.tile(column_names, len(gb_cov_corr.index)) - ) - res.index = MultiIndex._from_data(sorted_idx._data) - - return res - - @_performance_tracking - def var( - self, - ddof=1, - engine=None, - engine_kwargs=None, - numeric_only: bool = False, - ): - """Compute the column-wise variance of the values in each group. - - Parameters - ---------- - ddof : int - The delta degrees of freedom. N - ddof is the divisor used to - normalize the variance. - """ - if engine is not None: - raise NotImplementedError( - "engine is non-functional and added for compatibility with pandas" - ) - if engine_kwargs is not None: - raise NotImplementedError( - "engine_kwargs is non-functional added for compatibility with pandas" - ) - if numeric_only is not False: - raise NotImplementedError( - "numeric_only is currently not supported." - ) - - def func(x): - return getattr(x, "var")(ddof=ddof) - - return self.agg(func) - - @_performance_tracking - def nunique(self, dropna: bool = True): - """ - Return number of unique elements in the group. - - Parameters - ---------- - dropna : bool, default True - Don't include NaN in the counts. - """ - - def func(x): - return getattr(x, "nunique")(dropna=dropna) - - return self.agg(func) - - @_performance_tracking - def std( - self, - ddof=1, - engine=None, - engine_kwargs=None, - numeric_only: bool = False, - ): - """Compute the column-wise std of the values in each group. - - Parameters - ---------- - ddof : int - The delta degrees of freedom. N - ddof is the divisor used to - normalize the standard deviation. - """ - if engine is not None: - raise NotImplementedError( - "engine is non-functional and added for compatibility with pandas" - ) - if engine_kwargs is not None: - raise NotImplementedError( - "engine_kwargs is non-functional added for compatibility with pandas" - ) - if numeric_only is not False: - raise NotImplementedError( - "numeric_only is currently not supported."
- ) - - def func(x): - return getattr(x, "std")(ddof=ddof) - - return self.agg(func) - - @_performance_tracking - def quantile( - self, q=0.5, interpolation="linear", numeric_only: bool = False - ): - """Compute the column-wise quantiles of the values in each group. - - Parameters - ---------- - q : float or array-like - The quantiles to compute. - interpolation : {"linear", "lower", "higher", "midpoint", "nearest"} - The interpolation method to use when the desired quantile lies - between two data points. Defaults to "linear". - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. - Currently not supported - """ - if numeric_only is not False: - raise NotImplementedError( - "numeric_only is not currently supported." - ) - - def func(x): - return getattr(x, "quantile")(q=q, interpolation=interpolation) - - return self.agg(func) - - @_performance_tracking - def collect(self): - """Get a list of all the values for each column in each group.""" - _deprecate_collect() - return self.agg(list) - - @_performance_tracking - def unique(self): - """Get a list of the unique values for each column in each group.""" - return self.agg("unique") - - @_performance_tracking - def diff(self, periods=1, axis=0): - """Get the difference between the values in each group. - - Parameters - ---------- - periods : int, default 1 - Periods to shift for calculating difference, - accepts negative values. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Take difference over rows (0) or columns (1). - Only row-wise (0) shift is supported. - - Returns - ------- - Series or DataFrame - First differences of the Series or DataFrame. - """ - - if not axis == 0: - raise NotImplementedError("Only axis=0 is supported.") - - values = self.obj.__class__._from_data( - self.grouping.values._data, self.obj.index - ) - return values - self.shift(periods=periods) - - def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: - """Internal implementation for `ffill` and `bfill`""" - values = self.grouping.values - result = self.obj._from_data( - dict( - zip( - values._column_names, - self._groupby.replace_nulls([*values._columns], method), - ) - ) - ) - result = self._mimic_pandas_order(result) - return result._copy_type_metadata(values) - - def ffill(self, limit=None): - """Forward fill NA values. - - Parameters - ---------- - limit : int, default None - Unsupported - """ - - if limit is not None: - raise NotImplementedError("Does not support limit param yet.") - - return self._scan_fill("ffill", limit) - - def bfill(self, limit=None): - """Backward fill NA values. - - Parameters - ---------- - limit : int, default None - Unsupported - """ - if limit is not None: - raise NotImplementedError("Does not support limit param yet.") - - return self._scan_fill("bfill", limit) - - @_performance_tracking - def fillna( - self, - value=None, - method=None, - axis=0, - inplace=False, - limit=None, - downcast=None, - ): - """Fill NA values using the specified method. - - Parameters - ---------- - value : scalar, dict - Value to use to fill the holes. Cannot be specified with method. - method : { 'bfill', 'ffill', None}, default None - Method to use for filling holes in reindexed Series - - - ffill: propagate last valid observation forward to next valid - - bfill: use next valid observation to fill gap - axis : {0 or 'index', 1 or 'columns'} - Unsupported - inplace : bool, default False - If `True`, fill inplace. Note: this will modify other views on this - object. 
- limit : int, default None - Unsupported - downcast : dict, default None - Unsupported - - Returns - ------- - DataFrame or Series - """ - warnings.warn( - "groupby fillna is deprecated and " - "will be removed in a future version. Use groupby ffill " - "or groupby bfill for forward or backward filling instead.", - FutureWarning, - ) - if inplace: - raise NotImplementedError("Does not support inplace yet.") - if limit is not None: - raise NotImplementedError("Does not support limit param yet.") - if downcast is not None: - raise NotImplementedError("Does not support downcast yet.") - if not axis == 0: - raise NotImplementedError("Only support axis == 0.") - - if value is None and method is None: - raise ValueError("Must specify a fill 'value' or 'method'.") - if value is not None and method is not None: - raise ValueError("Cannot specify both 'value' and 'method'.") - - if method is not None: - if method not in {"ffill", "bfill"}: - raise ValueError("Method can only be of 'ffill', 'bfill'.") - return getattr(self, method, limit)() - - values = self.obj.__class__._from_data( - self.grouping.values._data, self.obj.index - ) - return values.fillna( - value=value, inplace=inplace, axis=axis, limit=limit - ) - - @_performance_tracking - def shift( - self, - periods=1, - freq=None, - axis=0, - fill_value=None, - suffix: str | None = None, - ): - """ - Shift each group by ``periods`` positions. - - Parameters - ---------- - periods : int, default 1 - Number of periods to shift. - freq : str, unsupported - axis : 0, axis to shift - Shift direction. Only row-wise shift is supported - fill_value : scalar or list of scalars, optional - The scalar value to use for newly introduced missing values. Can be - specified with `None`, a single value or multiple values: - - - `None` (default): sets all indeterminable values to null. - - Single value: fill all shifted columns with this value. Should - match the data type of all columns. - - List of values: fill shifted columns with corresponding value in - the list. The length of the list should match the number of - columns shifted. Each value should match the data type of the - column to fill. - suffix : str, optional - A string to add to each shifted column if there are multiple periods. - Ignored otherwise. - Currently not supported. - - Returns - ------- - Series or DataFrame - Object shifted within each group. - - .. pandas-compat:: - :meth:`pandas.core.groupby.DataFrameGroupBy.shift`, - :meth:`pandas.core.groupby.SeriesGroupBy.shift` - - Parameter ``freq`` is unsupported. - """ - - if freq is not None: - raise NotImplementedError("Parameter freq is unsupported.") - - if not axis == 0: - raise NotImplementedError("Only axis=0 is supported.") - - if suffix is not None: - raise NotImplementedError("shift is not currently supported.") - - values = self.grouping.values - if is_list_like(fill_value): - if len(fill_value) != len(values._data): - raise ValueError( - "Mismatched number of columns and values to fill." - ) - else: - fill_value = [fill_value] * len(values._data) - - result = self.obj.__class__._from_data( - dict( - zip( - values._column_names, - self._groupby.shift( - [*values._columns], periods, fill_value - )[0], - ) - ) - ) - result = self._mimic_pandas_order(result) - return result._copy_type_metadata(values) - - @_performance_tracking - def pct_change( - self, - periods=1, - fill_method=no_default, - axis=0, - limit=no_default, - freq=None, - ): - """ - Calculates the percent change between sequential elements - in the group. 
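        For intuition, a small illustrative sketch (hypothetical data; the
        first row of each group has no prior element, so it comes back null):

        >>> df = cudf.DataFrame({"g": [1, 1, 2, 2], "v": [2.0, 3.0, 4.0, 6.0]})
        >>> df.groupby("g").pct_change()  # "v": null, 0.5, null, 0.5 in the original row order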
- - Parameters - ---------- - periods : int, default 1 - Periods to shift for forming percent change. - fill_method : str, default 'ffill' - How to handle NAs before computing percent changes. - - .. deprecated:: 24.04 - All options of `fill_method` are deprecated - except `fill_method=None`. - limit : int, optional - The number of consecutive NAs to fill before stopping. - Not yet implemented. - - .. deprecated:: 24.04 - `limit` is deprecated. - freq : str, optional - Increment to use from time series API. - Not yet implemented. - - Returns - ------- - Series or DataFrame - Percentage changes within each group - """ - if not axis == 0: - raise NotImplementedError("Only axis=0 is supported.") - if limit is not no_default: - raise NotImplementedError("limit parameter not supported yet.") - if freq is not None: - raise NotImplementedError("freq parameter not supported yet.") - elif fill_method not in {no_default, None, "ffill", "bfill"}: - raise ValueError( - "fill_method must be one of 'ffill', or" "'bfill'." - ) - - if fill_method not in (no_default, None) or limit is not no_default: - # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.warn( - "The 'fill_method' keyword being not None and the 'limit' " - f"keywords in {type(self).__name__}.pct_change are " - "deprecated and will be removed in a future version. " - "Either fill in any non-leading NA values prior " - "to calling pct_change or specify 'fill_method=None' " - "to not fill NA values.", - FutureWarning, - ) - - if fill_method in (no_default, None): - fill_method = "ffill" - if limit is no_default: - limit = None - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - filled = self.fillna(method=fill_method, limit=limit) - - fill_grp = filled.groupby(self.grouping) - shifted = fill_grp.shift(periods=periods, freq=freq) - return (filled / shifted) - 1 - - def _mimic_pandas_order( - self, result: DataFrameOrSeries - ) -> DataFrameOrSeries: - """Given a groupby result from libcudf, reconstruct the row orders - matching that of pandas. This also adds appropriate indices. - """ - # TODO: copy metadata after this method is a common pattern, should - # merge in this method. - - # This function is used to reorder the results of scan-based - # groupbys which have the same output size as input size. - # However, if the grouping key has NAs and dropna=True, the - # result coming back from libcudf has null_count few rows than - # the input, so we must produce an ordering from the full - # input range. - _, _, (ordering,) = self._groupby.groups( - [as_column(range(0, len(self.obj)))] - ) - if self._dropna and any( - c.has_nulls(include_nan=True) > 0 - for c in self.grouping._key_columns - ): - # Scan aggregations with null/nan keys put nulls in the - # corresponding output rows in pandas, to do that here - # expand the result by reindexing. - ri = cudf.RangeIndex(0, len(self.obj)) - result.index = cudf.Index._from_column(ordering) - # This reorders and expands - result = result.reindex(ri) - else: - # Just reorder according to the groupings - result = result.take(ordering.argsort()) - # Now produce the actual index we first thought of - result.index = self.obj.index - return result - - def ohlc(self): - """ - Compute open, high, low and close values of a group, excluding missing values. - - Currently not implemented. 
- """ - raise NotImplementedError("ohlc is currently not implemented") - - @property - def plot(self): - """ - Make plots of a grouped Series or DataFrame. - - Currently not implemented. - """ - raise NotImplementedError("plot is currently not implemented") - - def resample(self, rule, *args, include_groups: bool = True, **kwargs): - """ - Provide resampling when using a TimeGrouper. - - Currently not implemented. - """ - raise NotImplementedError("resample is currently not implemented") - - def take(self, indices): - """ - Return the elements in the given *positional* indices in each group. - - Currently not implemented. - """ - raise NotImplementedError("take is currently not implemented") - - def filter(self, func, dropna: bool = True, *args, **kwargs): - """ - Filter elements from groups that don't satisfy a criterion. - - Currently not implemented. - """ - raise NotImplementedError("filter is currently not implemented") - - def expanding(self, *args, **kwargs): - """ - Return an expanding grouper, providing expanding - functionality per group. - - Currently not implemented. - """ - raise NotImplementedError("expanding is currently not implemented") - - def ewm(self, *args, **kwargs): - """ - Return an ewm grouper, providing ewm functionality per group. - - Currently not implemented. - """ - raise NotImplementedError("expanding is currently not implemented") - - def any(self, skipna: bool = True): - """ - Return True if any value in the group is truthful, else False. - - Currently not implemented. - """ - raise NotImplementedError("any is currently not implemented") - - def all(self, skipna: bool = True): - """ - Return True if all values in the group are truthful, else False. - - Currently not implemented. - """ - raise NotImplementedError("all is currently not implemented") - - -class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): - obj: "cudf.core.dataframe.DataFrame" - - _PROTECTED_KEYS = frozenset(("obj",)) - - def _reduce_numeric_only(self, op: str): - columns = list( - name - for name, dtype in self.obj._dtypes - if (is_numeric_dtype(dtype) and name not in self.grouping.names) - ) - return self[columns].agg(op) - - def __getitem__(self, key): - return self.obj[key].groupby( - by=self.grouping.keys, - dropna=self._dropna, - sort=self._sort, - group_keys=self._group_keys, - as_index=self._as_index, - ) - - def value_counts( - self, - subset=None, - normalize: bool = False, - sort: bool = True, - ascending: bool = False, - dropna: bool = True, - ) -> DataFrameOrSeries: - """ - Return a Series or DataFrame containing counts of unique rows. - - Parameters - ---------- - subset : list-like, optional - Columns to use when counting unique combinations. - normalize : bool, default False - Return proportions rather than frequencies. - sort : bool, default True - Sort by frequencies. - ascending : bool, default False - Sort in ascending order. - dropna : bool, default True - Don't include counts of rows that contain NA values. - - Returns - ------- - Series or DataFrame - Series if the groupby as_index is True, otherwise DataFrame. - - See Also - -------- - Series.value_counts: Equivalent method on Series. - DataFrame.value_counts: Equivalent method on DataFrame. - SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy. - - Notes - ----- - - If the groupby as_index is True then the returned Series will have a - MultiIndex with one level per input column. - - If the groupby as_index is False then the returned DataFrame will - have an additional column with the value_counts. 
The column is - labelled 'count' or 'proportion', depending on the ``normalize`` - parameter. - - By default, rows that contain any NA values are omitted from - the result. - - By default, the result will be in descending order so that the - first element of each group is the most frequently-occurring row. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({ - ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'], - ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'], - ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR'] - ... }) - - >>> df - gender education country - 0 male low US - 1 male medium FR - 2 female high US - 3 male low FR - 4 female high FR - 5 male low FR - - >>> df.groupby('gender').value_counts() - gender education country - female high FR 1 - US 1 - male low FR 2 - US 1 - medium FR 1 - Name: count, dtype: int64 - - >>> df.groupby('gender').value_counts(ascending=True) - gender education country - female high FR 1 - US 1 - male low US 1 - medium FR 1 - low FR 2 - Name: count, dtype: int64 - - >>> df.groupby('gender').value_counts(normalize=True) - gender education country - female high FR 0.50 - US 0.50 - male low FR 0.50 - US 0.25 - medium FR 0.25 - Name: proportion, dtype: float64 - - >>> df.groupby('gender', as_index=False).value_counts() - gender education country count - 0 female high FR 1 - 1 female high US 1 - 2 male low FR 2 - 3 male low US 1 - 4 male medium FR 1 - - >>> df.groupby('gender', as_index=False).value_counts(normalize=True) - gender education country proportion - 0 female high FR 0.50 - 1 female high US 0.50 - 2 male low FR 0.50 - 3 male low US 0.25 - 4 male medium FR 0.25 - """ - - df = cudf.DataFrame.copy(self.obj) - groupings = self.grouping.names - name = "proportion" if normalize else "count" - - if subset is None: - subset = [i for i in df._column_names if i not in groupings] - # Check subset exists in dataframe - elif set(subset) - set(df._column_names): - raise ValueError( - f"Keys {set(subset) - set(df._column_names)} in subset " - f"do not exist in the DataFrame." - ) - # Catch case where groupby and subset share an element - elif set(subset) & set(groupings): - raise ValueError( - f"Keys {set(subset) & set(groupings)} in subset " - "cannot be in the groupby column keys." - ) - - df["__placeholder"] = 1 - result = ( - df.groupby(groupings + list(subset), dropna=dropna)[ - "__placeholder" - ] - .count() - .sort_index() - .astype(np.int64) - ) - - if normalize: - levels = list(range(len(groupings), result.index.nlevels)) - result /= result.groupby( - result.index.droplevel(levels), - ).transform("sum") - - if sort: - result = result.sort_values(ascending=ascending).sort_index( - level=range(len(groupings)), sort_remaining=False - ) - - if not self._as_index: - if name in df._column_names: - raise ValueError( - f"Column label '{name}' is duplicate of result column" - ) - result.name = name - result = result.to_frame().reset_index() - else: - result.name = name - - return result - - @_performance_tracking - def corr( - self, method="pearson", min_periods=1, numeric_only: bool = False - ): - """ - Compute pairwise correlation of columns, excluding NA/null values. - - Parameters - ---------- - method: {"pearson", "kendall", "spearman"} or callable, - default "pearson". Currently only the pearson correlation - coefficient is supported. - - min_periods: int, optional - Minimum number of observations required per pair of columns - to have a valid result. - - Returns - ------- - DataFrame - Correlation matrix. 
- - Examples - -------- - >>> import cudf - >>> gdf = cudf.DataFrame({ - ... "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) - >>> gdf - id val1 val2 val3 - 0 a 5 4 4 - 1 a 4 5 5 - 2 a 6 6 6 - 3 b 4 1 1 - 4 b 8 2 2 - 5 b 7 9 9 - 6 c 4 8 8 - 7 c 5 5 5 - 8 c 2 1 1 - >>> gdf.groupby("id").corr(method="pearson") - val1 val2 val3 - id - a val1 1.000000 0.500000 0.500000 - val2 0.500000 1.000000 1.000000 - val3 0.500000 1.000000 1.000000 - b val1 1.000000 0.385727 0.385727 - val2 0.385727 1.000000 1.000000 - val3 0.385727 1.000000 1.000000 - c val1 1.000000 0.714575 0.714575 - val2 0.714575 1.000000 1.000000 - val3 0.714575 1.000000 1.000000 - """ - - if method != "pearson": - raise NotImplementedError( - "Only pearson correlation is currently supported" - ) - if numeric_only is not False: - raise NotImplementedError( - "numeric_only is currently not supported." - ) - - return self._cov_or_corr( - lambda x: x.corr(method, min_periods), "Correlation" - ) - - def hist( - self, - column=None, - by=None, - grid: bool = True, - xlabelsize: int | None = None, - xrot: float | None = None, - ylabelsize: int | None = None, - yrot: float | None = None, - ax=None, - sharex: bool = False, - sharey: bool = False, - figsize: tuple[float, float] | None = None, - layout: tuple[int, int] | None = None, - bins: int | abc.Sequence[int] = 10, - backend: str | None = None, - legend: bool = False, - **kwargs, - ): - raise NotImplementedError("hist is not currently implemented") - - def boxplot( - self, - subplots: bool = True, - column=None, - fontsize: int | None = None, - rot: int = 0, - grid: bool = True, - ax=None, - figsize: tuple[float, float] | None = None, - layout=None, - sharex: bool = False, - sharey: bool = True, - backend=None, - **kwargs, - ): - raise NotImplementedError("boxplot is not currently implemented") - - -DataFrameGroupBy.__doc__ = groupby_doc_template.format(ret="") - - -class SeriesGroupBy(GroupBy): - obj: "cudf.core.series.Series" - - def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): - result = super().agg( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) - - # downcast the result to a Series: - if len(result._data): - if result.shape[1] == 1 and not is_list_like(func): - return result.iloc[:, 0] - - # drop the first level if we have a multiindex - if result._data.nlevels > 1: - result.columns = result._data.to_pandas_index().droplevel(0) - - return result - - aggregate = agg - - def apply(self, func, *args, **kwargs): - result = super().apply(func, *args, **kwargs) - - # apply Series name to result - result.name = self.obj.name - - return result - - @property - def dtype(self) -> pd.Series: - raise NotImplementedError("dtype is currently not implemented.") - - def hist( - self, - by=None, - ax=None, - grid: bool = True, - xlabelsize: int | None = None, - xrot: float | None = None, - ylabelsize: int | None = None, - yrot: float | None = None, - figsize: tuple[float, float] | None = None, - bins: int | abc.Sequence[int] = 10, - backend: str | None = None, - legend: bool = False, - **kwargs, - ): - raise NotImplementedError("hist is currently not implemented.") - - @property - def is_monotonic_increasing(self) -> cudf.Series: - """ - Return whether each group's values are monotonically increasing. 
- - Currently not implemented - """ - raise NotImplementedError( - "is_monotonic_increasing is currently not implemented." - ) - - @property - def is_monotonic_decreasing(self) -> cudf.Series: - """ - Return whether each group's values are monotonically decreasing. - - Currently not implemented - """ - raise NotImplementedError( - "is_monotonic_decreasing is currently not implemented." - ) - - def nlargest( - self, n: int = 5, keep: Literal["first", "last", "all"] = "first" - ) -> cudf.Series: - """ - Return the largest n elements. - - Currently not implemented - """ - raise NotImplementedError("nlargest is currently not implemented.") - - def nsmallest( - self, n: int = 5, keep: Literal["first", "last", "all"] = "first" - ) -> cudf.Series: - """ - Return the smallest n elements. - - Currently not implemented - """ - raise NotImplementedError("nsmallest is currently not implemented.") - - def value_counts( - self, - normalize: bool = False, - sort: bool = True, - ascending: bool = False, - bins=None, - dropna: bool = True, - ) -> cudf.Series | cudf.DataFrame: - raise NotImplementedError("value_counts is currently not implemented.") - - def corr( - self, - other: cudf.Series, - method: str = "pearson", - min_periods: int | None = None, - ) -> cudf.Series: - raise NotImplementedError("corr is currently not implemented.") - - -SeriesGroupBy.__doc__ = groupby_doc_template.format(ret="") - - -# TODO: should we define this as a dataclass instead? -class Grouper: - def __init__( - self, key=None, level=None, freq=None, closed=None, label=None - ): - if key is not None and level is not None: - raise ValueError("Grouper cannot specify both key and level") - if (key, level) == (None, None) and not freq: - raise ValueError("Grouper must specify either key or level") - self.key = key - self.level = level - self.freq = freq - self.closed = closed - self.label = label - - -class _Grouping(Serializable): - def __init__(self, obj, by=None, level=None): - self._obj = obj - self._key_columns = [] - self.names = [] - - # Need to keep track of named key columns - # to support `as_index=False` correctly - self._named_columns = [] - self._handle_by_or_level(by, level) - - if len(obj) and not len(self._key_columns): - raise ValueError("No group keys passed") - - def _handle_by_or_level(self, by=None, level=None): - if level is not None: - if by is not None: - raise ValueError("Cannot specify both by and level") - level_list = level if isinstance(level, list) else [level] - for level in level_list: - self._handle_level(level) - else: - by_list = by if isinstance(by, list) else [by] - - for by in by_list: - if callable(by): - self._handle_callable(by) - elif isinstance(by, cudf.Series): - self._handle_series(by) - elif isinstance(by, cudf.BaseIndex): - self._handle_index(by) - elif isinstance(by, abc.Mapping): - self._handle_mapping(by) - elif isinstance(by, Grouper): - self._handle_grouper(by) - elif isinstance(by, pd.Series): - self._handle_series(cudf.Series.from_pandas(by)) - elif isinstance(by, pd.Index): - self._handle_index(cudf.Index.from_pandas(by)) - else: - try: - self._handle_label(by) - except (KeyError, TypeError): - self._handle_misc(by) - - @property - def keys(self): - """Return grouping key columns as index""" - nkeys = len(self._key_columns) - - if nkeys == 0: - return cudf.Index([], name=None) - elif nkeys > 1: - return cudf.MultiIndex._from_data( - dict(zip(range(nkeys), self._key_columns)) - )._set_names(self.names) - else: - return cudf.Index._from_column( - self._key_columns[0], 
name=self.names[0] - ) - - @property - def values(self) -> cudf.core.frame.Frame: - """Return value columns as a frame. - - Note that in aggregation, value columns can be arbitrarily - specified. While this method returns all non-key columns from `obj` as - a frame. - - This is mainly used in transform-like operations. - """ - # If the key columns are in `obj`, filter them out - value_column_names = [ - x for x in self._obj._column_names if x not in self._named_columns - ] - value_columns = self._obj._data.select_by_label(value_column_names) - return self._obj.__class__._from_data(value_columns) - - def _handle_callable(self, by): - by = by(self._obj.index) - self.__init__(self._obj, by) - - def _handle_series(self, by): - by = by._align_to_index(self._obj.index, how="right") - self._key_columns.append(by._column) - self.names.append(by.name) - - def _handle_index(self, by): - self._key_columns.extend(by._columns) - self.names.extend(by._column_names) - - def _handle_mapping(self, by): - by = cudf.Series(by.values(), index=by.keys()) - self._handle_series(by) - - def _handle_label(self, by): - try: - self._key_columns.append(self._obj._data[by]) - except KeyError as e: - # `by` can be index name(label) too. - if by in self._obj.index.names: - self._key_columns.append(self._obj.index._data[by]) - else: - raise e - self.names.append(by) - self._named_columns.append(by) - - def _handle_grouper(self, by): - if by.freq: - self._handle_frequency_grouper(by) - elif by.key: - self._handle_label(by.key) - else: - self._handle_level(by.level) - - def _handle_frequency_grouper(self, by): - raise NotImplementedError() - - def _handle_level(self, by): - level_values = self._obj.index.get_level_values(by) - self._key_columns.append(level_values._values) - self.names.append(level_values.name) - - def _handle_misc(self, by): - by = cudf.core.column.as_column(by) - if len(by) != len(self._obj): - raise ValueError("Grouper and object must have same length") - self._key_columns.append(by) - self.names.append(None) - - def serialize(self): - header = {} - frames = [] - header["names"] = pickle.dumps(self.names) - header["_named_columns"] = pickle.dumps(self._named_columns) - column_header, column_frames = cudf.core.column.serialize_columns( - self._key_columns - ) - header["columns"] = column_header - frames.extend(column_frames) - return header, frames - - @classmethod - def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) - key_columns = cudf.core.column.deserialize_columns( - header["columns"], frames - ) - out = _Grouping.__new__(_Grouping) - out.names = names - out._named_columns = _named_columns - out._key_columns = key_columns - return out - - def copy(self, deep=True): - out = _Grouping.__new__(_Grouping) - out.names = copy.deepcopy(self.names) - out._named_columns = copy.deepcopy(self._named_columns) - out._key_columns = [col.copy(deep=deep) for col in self._key_columns] - return out - - -def _is_multi_agg(aggs): - """ - Returns True if more than one aggregation is performed - on any of the columns as specified in `aggs`. - """ - if isinstance(aggs, abc.Mapping): - return any(is_list_like(agg) for agg in aggs.values()) - if is_list_like(aggs): - return True - return False diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py deleted file mode 100644 index cd07c58c5d9..00000000000 --- a/python/cudf/cudf/core/index.py +++ /dev/null @@ -1,3964 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from __future__ import annotations - -import operator -import pickle -import warnings -from collections.abc import Hashable -from functools import cache, cached_property -from numbers import Number -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast - -import cupy -import numpy as np -import pandas as pd -import pyarrow as pa -from typing_extensions import Self - -import cudf -from cudf import _lib as libcudf -from cudf._lib.filling import sequence -from cudf._lib.search import search_sorted -from cudf._lib.types import size_type_dtype -from cudf.api.extensions import no_default -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_dtype_equal, - is_integer, - is_list_like, - is_scalar, - is_string_dtype, -) -from cudf.core._base_index import BaseIndex, _return_get_indexer_result -from cudf.core._compat import PANDAS_LT_300 -from cudf.core.column import ( - CategoricalColumn, - ColumnBase, - DatetimeColumn, - IntervalColumn, - NumericalColumn, - StringColumn, - StructColumn, - TimeDeltaColumn, - column, -) -from cudf.core.column.column import as_column, concat_columns -from cudf.core.column.string import StringMethods as StringMethods -from cudf.core.dtypes import IntervalDtype -from cudf.core.join._join_helpers import _match_join_keys -from cudf.core.mixins import BinaryOperand -from cudf.core.single_column_frame import SingleColumnFrame -from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import ( - _maybe_convert_to_default_type, - find_common_type, - is_mixed_with_object_dtype, -) -from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import _warn_no_dask_cudf, search_range - -if TYPE_CHECKING: - from collections.abc import Generator, Iterable - from datetime import tzinfo - - from cudf.core.frame import Frame - - -def ensure_index(index_like: Any) -> BaseIndex: - """ - Ensure an Index is returned. - - Avoids a shallow copy compared to calling cudf.Index(...) - """ - if not isinstance(index_like, BaseIndex): - return cudf.Index(index_like) - return index_like - - -class IndexMeta(type): - """Custom metaclass for Index that overrides instance/subclass tests.""" - - def __call__(cls, data, *args, **kwargs): - if kwargs.get("tupleize_cols", True) is not True: - raise NotImplementedError( - "tupleize_cols is currently not supported." - ) - - if cls is Index: - return as_index( - arbitrary=data, - *args, - **kwargs, - ) - return super().__call__(data, *args, **kwargs) - - def __instancecheck__(self, instance): - if self is cudf.Index: - return isinstance(instance, BaseIndex) - else: - return type.__instancecheck__(self, instance) - - def __subclasscheck__(self, subclass): - if self is cudf.Index: - return issubclass(subclass, BaseIndex) - else: - return type.__subclasscheck__(self, subclass) - - -def _lexsorted_equal_range( - idx: Index | cudf.MultiIndex, - keys: list[ColumnBase], - is_sorted: bool, -) -> tuple[int, int, ColumnBase | None]: - """Get equal range for key in lexicographically sorted index. If index - is not sorted when called, a sort will take place and `sort_inds` is - returned. Otherwise `None` is returned in that position. 
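    For intuition, a simplified single-column sketch of the two bounds
    computed below (hypothetical values, using NumPy's ``searchsorted`` in
    place of the libcudf-backed call):

    >>> import numpy as np
    >>> keys = np.array([1, 3, 3, 3, 7])
    >>> lo = np.searchsorted(keys, 3, side="left")    # -> 1
    >>> hi = np.searchsorted(keys, 3, side="right")   # -> 4
    >>> keys[lo:hi]                                   # every occurrence of the key
    array([3, 3, 3])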
- """ - if not is_sorted: - sort_inds = idx._get_sorted_inds() - sort_vals = idx._gather(sort_inds) - else: - sort_inds = None - sort_vals = idx - lower_bound = search_sorted( - list(sort_vals._columns), - keys, - side="left", - ascending=sort_vals.is_monotonic_increasing, - ).element_indexing(0) - upper_bound = search_sorted( - list(sort_vals._columns), - keys, - side="right", - ascending=sort_vals.is_monotonic_increasing, - ).element_indexing(0) - - return lower_bound, upper_bound, sort_inds - - -def _index_from_data(data: MutableMapping, name: Any = no_default): - """Construct an index of the appropriate type from some data.""" - - if len(data) == 0: - raise ValueError("Cannot construct Index from any empty Table") - if len(data) == 1: - values = next(iter(data.values())) - - if isinstance(values, NumericalColumn): - index_class_type = Index - elif isinstance(values, DatetimeColumn): - index_class_type = DatetimeIndex - elif isinstance(values, TimeDeltaColumn): - index_class_type = TimedeltaIndex - elif isinstance(values, StringColumn): - index_class_type = Index - elif isinstance(values, CategoricalColumn): - index_class_type = CategoricalIndex - elif isinstance(values, (IntervalColumn, StructColumn)): - index_class_type = IntervalIndex - else: - raise NotImplementedError( - "Unsupported column type passed to " - f"create an Index: {type(values)}" - ) - else: - index_class_type = cudf.MultiIndex - return index_class_type._from_data(data, name) - - -def validate_range_arg(arg, arg_name: Literal["start", "stop", "step"]) -> int: - """Validate start/stop/step argument in RangeIndex.__init__""" - if not is_integer(arg): - raise TypeError( - f"{arg_name} must be an integer, not {type(arg).__name__}" - ) - return int(arg) - - -class RangeIndex(BaseIndex, BinaryOperand): - """ - Immutable Index implementing a monotonic integer range. - - This is the default index type used by DataFrame and Series - when no explicit index is provided by the user. - - Parameters - ---------- - start : int (default: 0), or other range instance - stop : int (default: 0) - step : int (default: 1) - name : object, optional - Name to be stored in the index. - dtype : numpy dtype - Unused, accepted for homogeneity with other index types. - copy : bool, default False - Unused, accepted for homogeneity with other index types. 
- - Attributes - ---------- - start - stop - step - - Methods - ------- - to_numpy - to_arrow - - Examples - -------- - >>> import cudf - >>> cudf.RangeIndex(0, 10, 1, name="a") - RangeIndex(start=0, stop=10, step=1, name='a') - - >>> cudf.RangeIndex(range(1, 10, 1), name="a") - RangeIndex(start=1, stop=10, step=1, name='a') - """ - - _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS - - _range: range - - @_performance_tracking - def __init__( - self, start, stop=None, step=1, dtype=None, copy=False, name=None - ): - if not cudf.api.types.is_hashable(name): - raise ValueError("Name must be a hashable value.") - self._name = name - if dtype is not None and cudf.dtype(dtype).kind != "i": - raise ValueError(f"{dtype=} must be a signed integer type") - - if isinstance(start, range): - self._range = start - else: - if stop is None: - start, stop = 0, start - start = validate_range_arg(start, "start") - stop = validate_range_arg(stop, "stop") - if step is not None: - step = validate_range_arg(step, "step") - else: - step = 1 - try: - self._range = range(start, stop, step) - except ValueError as err: - if step == 0: - raise ValueError("Step must not be zero.") from err - raise - - def _copy_type_metadata(self: Self, other: Self) -> Self: - # There is no metadata to be copied for RangeIndex since it does not - # have an underlying column. - return self - - def searchsorted( - self, - value: int, - side: Literal["left", "right"] = "left", - ascending: bool = True, - na_position: Literal["first", "last"] = "last", - ): - assert (len(self) <= 1) or ( - ascending == (self.step > 0) - ), "Invalid ascending flag" - return search_range(value, self._range, side=side) - - def factorize( - self, sort: bool = False, use_na_sentinel: bool = True - ) -> tuple[cupy.ndarray, Self]: - if sort and self.step < 0: - codes = cupy.arange(len(self) - 1, -1, -1) - uniques = self[::-1] - else: - codes = cupy.arange(len(self), dtype=np.intp) - uniques = self - return codes, uniques - - @property # type: ignore - @_performance_tracking - def name(self): - return self._name - - @name.setter # type: ignore - @_performance_tracking - def name(self, value): - self._name = value - - @property - @_performance_tracking - def _column_names(self) -> tuple[Any]: - return (self.name,) - - @property - @_performance_tracking - def _columns(self) -> tuple[ColumnBase]: - return (self._values,) - - @property - def _column_labels_and_values(self) -> Iterable: - return zip(self._column_names, self._columns) - - @property # type: ignore - @_performance_tracking - def start(self) -> int: - """ - The value of the `start` parameter (0 if this was not supplied). - """ - return self._range.start - - @property # type: ignore - @_performance_tracking - def stop(self) -> int: - """ - The value of the stop parameter. - """ - return self._range.stop - - @property # type: ignore - @_performance_tracking - def step(self) -> int: - """ - The value of the step parameter. 
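        e.g. (illustrative):

        >>> import cudf
        >>> cudf.RangeIndex(1, 10, 2).step
        2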
- """ - return self._range.step - - @property # type: ignore - @_performance_tracking - def _num_rows(self) -> int: - return len(self) - - @cached_property # type: ignore - @_performance_tracking - def _values(self) -> ColumnBase: - if len(self) > 0: - return column.as_column(self._range, dtype=self.dtype) - else: - return column.column_empty(0, masked=False, dtype=self.dtype) - - def _clean_nulls_from_index(self) -> Self: - return self - - def _is_numeric(self) -> bool: - return True - - def _is_boolean(self) -> bool: - return False - - def _is_integer(self) -> bool: - return True - - def _is_floating(self) -> bool: - return False - - def _is_object(self) -> bool: - return False - - def _is_categorical(self) -> bool: - return False - - def _is_interval(self) -> bool: - return False - - @property # type: ignore - @_performance_tracking - def hasnans(self) -> bool: - return False - - @property # type: ignore - @_performance_tracking - def _data(self): - return cudf.core.column_accessor.ColumnAccessor( - {self.name: self._values}, verify=False - ) - - @_performance_tracking - def __contains__(self, item): - hash(item) - if not isinstance(item, (np.floating, np.integer, int, float)): - return False - elif isinstance(item, (np.timedelta64, np.datetime64, bool)): - # Cases that would pass the above check - return False - try: - int_item = int(item) - return int_item == item and int_item in self._range - except (ValueError, OverflowError): - return False - - @_performance_tracking - def copy(self, name=None, deep=False): - """ - Make a copy of this object. - - Parameters - ---------- - name : object optional (default: None), name of index - deep : Bool (default: False) - Ignored for RangeIndex - - Returns - ------- - New RangeIndex instance with same range - """ - - name = self.name if name is None else name - - return RangeIndex(self._range, name=name) - - @_performance_tracking - def astype(self, dtype, copy: bool = True): - if is_dtype_equal(dtype, self.dtype): - return self - return self._as_int_index().astype(dtype, copy=copy) - - def fillna(self, value, downcast=None): - return self.copy() - - @_performance_tracking - def drop_duplicates(self, keep="first"): - return self - - @_performance_tracking - def duplicated(self, keep="first") -> cupy.ndarray: - return cupy.zeros(len(self), dtype=bool) - - @_performance_tracking - def __repr__(self): - return ( - f"{self.__class__.__name__}(start={self.start}, stop={self.stop}" - f", step={self.step}" - + ( - f", name={pd.io.formats.printing.default_pprint(self.name)}" - if self.name is not None - else "" - ) - + ")" - ) - - @property - @_performance_tracking - def size(self) -> int: - return len(self) - - @_performance_tracking - def __len__(self): - return len(self._range) - - @_performance_tracking - def __getitem__(self, index): - if isinstance(index, slice): - sl_start, sl_stop, sl_step = index.indices(len(self)) - - lo = self.start + sl_start * self.step - hi = self.start + sl_stop * self.step - st = self.step * sl_step - return RangeIndex(start=lo, stop=hi, step=st, name=self._name) - - elif isinstance(index, Number): - len_self = len(self) - if index < 0: - index += len_self - if not (0 <= index < len_self): - raise IndexError("Index out of bounds") - return self.start + index * self.step - return self._as_int_index()[index] - - def _get_columns_by_label(self, labels) -> Index: - # used in .sort_values - if isinstance(labels, Hashable): - if labels == self.name: - return self._as_int_index() - elif is_list_like(labels): - if 
list(self.names) == list(labels): - return self._as_int_index() - raise KeyError(labels) - - @_performance_tracking - def equals(self, other) -> bool: - if isinstance(other, RangeIndex): - return self._range == other._range - return self._as_int_index().equals(other) - - @_performance_tracking - def serialize(self): - header = {} - header["index_column"] = {} - - # store metadata values of index separately - # We don't need to store the GPU buffer for RangeIndexes - # cuDF only needs to store start/stop and rehydrate - # during de-serialization - header["index_column"]["start"] = self.start - header["index_column"]["stop"] = self.stop - header["index_column"]["step"] = self.step - frames = [] - - header["name"] = pickle.dumps(self.name) - header["dtype"] = pickle.dumps(self.dtype) - header["type-serialized"] = pickle.dumps(type(self)) - header["frame_count"] = 0 - return header, frames - - @classmethod - @_performance_tracking - def deserialize(cls, header, frames): - h = header["index_column"] - name = pickle.loads(header["name"]) - start = h["start"] - stop = h["stop"] - step = h.get("step", 1) - return RangeIndex(start=start, stop=stop, step=step, name=name) - - @property # type: ignore - @_performance_tracking - def dtype(self): - """ - `dtype` of the range of values in RangeIndex. - - By default the dtype is 64 bit signed integer. This is configurable - via `default_integer_bitwidth` as 32 bit in `cudf.options` - """ - dtype = np.dtype(np.int64) - return _maybe_convert_to_default_type(dtype) - - @property - def _dtypes(self) -> Iterable: - return [(self.name, self.dtype)] - - @_performance_tracking - def to_pandas( - self, *, nullable: bool = False, arrow_type: bool = False - ) -> pd.RangeIndex: - if nullable: - raise NotImplementedError(f"{nullable=} is not implemented.") - elif arrow_type: - raise NotImplementedError(f"{arrow_type=} is not implemented.") - return pd.RangeIndex( - start=self.start, - stop=self.stop, - step=self.step, - dtype=self.dtype, - name=self.name, - ) - - def to_frame( - self, index: bool = True, name: Hashable = no_default - ) -> cudf.DataFrame: - return self._as_int_index().to_frame(index=index, name=name) - - @property - def is_unique(self) -> bool: - return True - - @cached_property # type: ignore - @_performance_tracking - def is_monotonic_increasing(self) -> bool: - return self.step > 0 or len(self) <= 1 - - @cached_property # type: ignore - @_performance_tracking - def is_monotonic_decreasing(self): - return self.step < 0 or len(self) <= 1 - - @_performance_tracking - def memory_usage(self, deep: bool = False) -> int: - if deep: - warnings.warn( - "The deep parameter is ignored and is only included " - "for pandas compatibility." - ) - return 0 - - def unique(self, level: int | None = None) -> Self: - # RangeIndex always has unique values - if level is not None and level > 0: - raise IndexError( - f"Too many levels: Index has only 1 level, not {level + 1}" - ) - return self.copy() - - @_performance_tracking - def __mul__(self, other): - # Multiplication by raw ints must return a RangeIndex to match pandas. 
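# Illustrative sketch of the slice arithmetic in RangeIndex.__getitem__
# above: slicing never materializes values, the new start/stop/step are
# computed directly from the old ones. `_slice_range_sketch` is a
# hypothetical plain-Python stand-in.
def _slice_range_sketch(rng: range, sl: slice) -> range:
    sl_start, sl_stop, sl_step = sl.indices(len(rng))
    lo = rng.start + sl_start * rng.step
    hi = rng.start + sl_stop * rng.step
    st = rng.step * sl_step
    return range(lo, hi, st)

r = range(3, 30, 3)  # 3, 6, ..., 27
assert list(_slice_range_sketch(r, slice(2, 7, 2))) == list(r)[2:7:2]
assert list(_slice_range_sketch(r, slice(None, None, -1))) == list(r)[::-1]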
- if isinstance(other, cudf.Scalar) and other.dtype.kind in "iu": - other = other.value - elif ( - isinstance(other, (np.ndarray, cupy.ndarray)) - and other.ndim == 0 - and other.dtype.kind in "iu" - ): - other = other.item() - if isinstance(other, (int, np.integer)): - return RangeIndex( - self.start * other, self.stop * other, self.step * other - ) - return self._as_int_index().__mul__(other) - - @_performance_tracking - def __rmul__(self, other): - # Multiplication is commutative. - return self.__mul__(other) - - @_performance_tracking - def _as_int_index(self) -> Index: - # Convert self to an integer index. This method is used to perform ops - # that are not defined directly on RangeIndex. - return cudf.Index._from_data(self._data) - - @_performance_tracking - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - return self._as_int_index().__array_ufunc__( - ufunc, method, *inputs, **kwargs - ) - - @_performance_tracking - def get_indexer(self, target, limit=None, method=None, tolerance=None): - target_col = cudf.core.column.as_column(target) - if method is not None or not isinstance( - target_col, cudf.core.column.NumericalColumn - ): - # TODO: See if we can implement this without converting to - # Integer index. - return self._as_int_index().get_indexer( - target=target, limit=limit, method=method, tolerance=tolerance - ) - - if self.step > 0: - start, stop, step = self.start, self.stop, self.step - else: - # Reversed - reverse = self._range[::-1] - start, stop, step = reverse.start, reverse.stop, reverse.step - - target_array = target_col.values - locs = target_array - start - valid = (locs % step == 0) & (locs >= 0) & (target_array < stop) - locs[~valid] = -1 - locs[valid] = locs[valid] / step - - if step != self.step: - # Reversed - locs[valid] = len(self) - 1 - locs[valid] - return locs - - @_performance_tracking - def get_loc(self, key): - if not is_scalar(key): - raise TypeError("Should be a scalar-like") - idx = (key - self.start) / self.step - idx_int_upper_bound = (self.stop - self.start) // self.step - if idx > idx_int_upper_bound or idx < 0: - raise KeyError(key) - - idx_int = (key - self.start) // self.step - if idx_int != idx: - raise KeyError(key) - return idx_int - - @_performance_tracking - def _union(self, other, sort=None): - if isinstance(other, RangeIndex): - # Variable suffixes are of the - # following notation: *_o -> other, *_s -> self, - # and *_r -> result - start_s, step_s = self.start, self.step - end_s = self.start + self.step * (len(self) - 1) - start_o, step_o = other.start, other.step - end_o = other.start + other.step * (len(other) - 1) - if self.step < 0: - start_s, step_s, end_s = end_s, -step_s, start_s - if other.step < 0: - start_o, step_o, end_o = end_o, -step_o, start_o - if len(self) == 1 and len(other) == 1: - step_s = step_o = abs(self.start - other.start) - elif len(self) == 1: - step_s = step_o - elif len(other) == 1: - step_o = step_s - - # Determine minimum start value of the result. - start_r = min(start_s, start_o) - # Determine maximum end value of the result. - end_r = max(end_s, end_o) - result = None - min_step = min(step_o, step_s) - - if ((start_s - start_o) % min_step) == 0: - # Checking to determine other is a subset of self with - # equal step size. - if ( - step_o == step_s - and (start_s - end_o) <= step_s - and (start_o - end_s) <= step_s - ): - result = type(self)(start_r, end_r + step_s, step_s) - # Checking if self is a subset of other with unequal - # step sizes. 
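# Illustrative sketch of the RangeIndex.get_indexer arithmetic above for an
# increasing step: a target maps to (target - start) // step when it is
# in range and divisible by the step, otherwise to -1. NumPy stands in for
# the GPU columns; `_range_get_indexer_sketch` is a hypothetical name.
import numpy as np

def _range_get_indexer_sketch(rng: range, target: np.ndarray) -> np.ndarray:
    locs = target - rng.start
    valid = (locs % rng.step == 0) & (locs >= 0) & (target < rng.stop)
    return np.where(valid, locs // rng.step, -1)

idx = range(10, 30, 5)  # 10, 15, 20, 25
assert list(_range_get_indexer_sketch(idx, np.array([15, 16, 25, 40]))) == [1, -1, 3, -1]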
- elif ( - step_o % step_s == 0 - and (start_o + step_s >= start_s) - and (end_o - step_s <= end_s) - ): - result = type(self)(start_r, end_r + step_s, step_s) - # Checking if other is a subset of self with unequal - # step sizes. - elif ( - step_s % step_o == 0 - and (start_s + step_o >= start_o) - and (end_s - step_o <= end_o) - ): - result = type(self)(start_r, end_r + step_o, step_o) - # Checking to determine when the steps are even but one of - # the inputs spans across is near half or less then half - # the other input. This case needs manipulation to step - # size. - elif ( - step_o == step_s - and (step_s % 2 == 0) - and (abs(start_s - start_o) <= step_s / 2) - and (abs(end_s - end_o) <= step_s / 2) - ): - result = type(self)(start_r, end_r + step_s / 2, step_s / 2) - if result is not None: - if sort in {None, True} and not result.is_monotonic_increasing: - return result.sort_values() - else: - return result - - # If all the above optimizations don't cater to the inputs, - # we materialize RangeIndexes into integer indexes and - # then perform `union`. - return self._try_reconstruct_range_index( - self._as_int_index()._union(other, sort=sort) - ) - - @_performance_tracking - def _intersection(self, other, sort=None): - if not isinstance(other, RangeIndex): - return self._try_reconstruct_range_index( - super()._intersection(other, sort=sort) - ) - - if not len(self) or not len(other): - return RangeIndex(0) - - first = self._range[::-1] if self.step < 0 else self._range - second = other._range[::-1] if other.step < 0 else other._range - - # check whether intervals intersect - # deals with in- and decreasing ranges - int_low = max(first.start, second.start) - int_high = min(first.stop, second.stop) - if int_high <= int_low: - return RangeIndex(0) - - # Method hint: linear Diophantine equation - # solve intersection problem - # performance hint: for identical step sizes, could use - # cheaper alternative - gcd, s, _ = _extended_gcd(first.step, second.step) - - # check whether element sets intersect - if (first.start - second.start) % gcd: - return RangeIndex(0) - - # calculate parameters for the RangeIndex describing the - # intersection disregarding the lower bounds - tmp_start = ( - first.start + (second.start - first.start) * first.step // gcd * s - ) - new_step = first.step * second.step // gcd - no_steps = -(-(int_low - tmp_start) // abs(new_step)) - new_start = tmp_start + abs(new_step) * no_steps - new_range = range(new_start, int_high, new_step) - new_index = RangeIndex(new_range) - - if (self.step < 0 and other.step < 0) is not (new_index.step < 0): - new_index = new_index[::-1] - if sort in {None, True}: - new_index = new_index.sort_values() - - return self._try_reconstruct_range_index(new_index) - - @_performance_tracking - def difference(self, other, sort=None): - if isinstance(other, RangeIndex) and self.equals(other): - return self[:0]._get_reconciled_name_object(other) - - return self._try_reconstruct_range_index( - super().difference(other, sort=sort) - ) - - def _try_reconstruct_range_index( - self, index: BaseIndex - ) -> Self | BaseIndex: - if isinstance(index, RangeIndex) or index.dtype.kind not in "iu": - return index - # Evenly spaced values can return a - # RangeIndex instead of a materialized Index. 
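# Illustrative sketch of the extended-gcd step used by _intersection above:
# two arithmetic progressions can only intersect when their starts agree
# modulo gcd(step1, step2), and the intersection then repeats every
# lcm(step1, step2). `_extended_gcd_sketch` is a hypothetical stand-in for
# the module-level helper.
def _extended_gcd_sketch(a: int, b: int) -> tuple[int, int, int]:
    # Returns (g, x, y) with a * x + b * y == g == gcd(a, b).
    old_r, r = a, b
    old_s, s = 1, 0
    old_t, t = 0, 1
    while r:
        q = old_r // r
        old_r, r = r, old_r - q * r
        old_s, s = s, old_s - q * s
        old_t, t = t, old_t - q * t
    return old_r, old_s, old_t

g, x, y = _extended_gcd_sketch(6, 4)
assert g == 2 and 6 * x + 4 * y == g
assert 6 * 4 // g == 12  # common elements of step-6 and step-4 ranges repeat every 12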
- if not index._column.has_nulls(): # type: ignore[attr-defined] - uniques = cupy.unique(cupy.diff(index.values)) - if len(uniques) == 1 and (diff := uniques[0].get()) != 0: - new_range = range(index[0], index[-1] + diff, diff) - return type(self)(new_range, name=index.name) - return index - - def sort_values( - self, - return_indexer=False, - ascending=True, - na_position="last", - key=None, - ): - if key is not None: - raise NotImplementedError("key parameter is not yet implemented.") - if na_position not in {"first", "last"}: - raise ValueError(f"invalid na_position: {na_position}") - - sorted_index = self - indexer = RangeIndex(range(len(self))) - - sorted_index = self - if ascending: - if self.step < 0: - sorted_index = self[::-1] - indexer = indexer[::-1] - else: - if self.step > 0: - sorted_index = self[::-1] - indexer = indexer = indexer[::-1] - - if return_indexer: - return sorted_index, indexer - else: - return sorted_index - - @_performance_tracking - def _gather(self, gather_map, nullify=False, check_bounds=True): - gather_map = cudf.core.column.as_column(gather_map) - return cudf.Index._from_column( - self._column.take(gather_map, nullify, check_bounds), - name=self.name, - ) - - @_performance_tracking - def _apply_boolean_mask(self, boolean_mask): - return cudf.Index._from_column( - self._column.apply_boolean_mask(boolean_mask), name=self.name - ) - - def repeat(self, repeats, axis=None): - return self._as_int_index().repeat(repeats, axis) - - def _split(self, splits): - return cudf.Index._from_column( - self._as_int_index()._split(splits), name=self.name - ) - - def _binaryop(self, other, op: str): - # TODO: certain binops don't require materializing range index and - # could use some optimization. - return self._as_int_index()._binaryop(other, op=op) - - def join( - self, other, how="left", level=None, return_indexers=False, sort=False - ): - if how in {"left", "right"} or self.equals(other): - # pandas supports directly merging RangeIndex objects and can - # intelligently create RangeIndex outputs depending on the type of - # join. Hence falling back to performing a merge on pd.RangeIndex - # since the conversion is cheap. 
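# Illustrative sketch of the check in _try_reconstruct_range_index above: an
# integer index whose consecutive differences collapse to one non-zero
# constant can be carried as a range again without storing data. NumPy
# stands in for the GPU column; the helper name is hypothetical.
import numpy as np

def _maybe_as_range_sketch(values: np.ndarray) -> range | None:
    diffs = np.unique(np.diff(values))
    if len(diffs) == 1 and diffs[0] != 0:
        d = int(diffs[0])
        return range(int(values[0]), int(values[-1]) + d, d)
    return None

assert _maybe_as_range_sketch(np.array([3, 6, 9, 12])) == range(3, 15, 3)
assert _maybe_as_range_sketch(np.array([1, 2, 4])) is None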
- if isinstance(other, RangeIndex): - result = self.to_pandas().join( - other.to_pandas(), - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - if return_indexers: - return tuple( - cudf.from_pandas(result[0]), result[1], result[2] - ) - else: - return cudf.from_pandas(result) - return self._as_int_index().join( - other, how, level, return_indexers, sort - ) - - @property # type: ignore - @_performance_tracking - def _column(self) -> ColumnBase: - return self._as_int_index()._column - - @property # type: ignore - @_performance_tracking - def _columns(self) -> list[ColumnBase]: - return self._as_int_index()._columns - - @property # type: ignore - @_performance_tracking - def values_host(self) -> np.ndarray: - return np.arange(start=self.start, stop=self.stop, step=self.step) - - @_performance_tracking - def argsort( - self, - ascending=True, - na_position="last", - ) -> cupy.ndarray: - if na_position not in {"first", "last"}: - raise ValueError(f"invalid na_position: {na_position}") - if (ascending and self.step < 0) or (not ascending and self.step > 0): - return cupy.arange(len(self) - 1, -1, -1) - else: - return cupy.arange(len(self)) - - @_performance_tracking - def where(self, cond, other=None, inplace=False): - return self._as_int_index().where(cond, other, inplace) - - @_performance_tracking - def to_numpy(self) -> np.ndarray: - return self.values_host - - @_performance_tracking - def to_cupy(self) -> cupy.ndarray: - return self.values - - @_performance_tracking - def to_arrow(self) -> pa.Array: - return pa.array(self._range, type=pa.from_numpy_dtype(self.dtype)) - - def __array__(self, dtype=None): - raise TypeError( - "Implicit conversion to a host NumPy array via __array__ is not " - "allowed, To explicitly construct a GPU matrix, consider using " - ".to_cupy()\nTo explicitly construct a host matrix, consider " - "using .to_numpy()." - ) - - @_performance_tracking - def nunique(self, dropna: bool = True) -> int: - return len(self) - - @_performance_tracking - def isna(self) -> cupy.ndarray: - return cupy.zeros(len(self), dtype=bool) - - isnull = isna - - @_performance_tracking - def notna(self) -> cupy.ndarray: - return cupy.ones(len(self), dtype=bool) - - notnull = isna - - @_performance_tracking - def _minmax(self, meth: str) -> int | float: - no_steps = len(self) - 1 - if no_steps == -1: - return np.nan - elif (meth == "min" and self.step > 0) or ( - meth == "max" and self.step < 0 - ): - return self.start - - return self.start + self.step * no_steps - - def min(self) -> int | float: - return self._minmax("min") - - def max(self) -> int | float: - return self._minmax("max") - - @property - def values(self) -> cupy.ndarray: - return cupy.arange(self.start, self.stop, self.step) - - def any(self) -> bool: - return any(self._range) - - def all(self) -> bool: - return 0 not in self._range - - def append(self, other): - result = self._as_int_index().append(other) - return self._try_reconstruct_range_index(result) - - def _indices_of(self, value) -> cudf.core.column.NumericalColumn: - if isinstance(value, (bool, np.bool_)): - raise ValueError( - f"Cannot use {type(value).__name__} to get an index of a " - f"{type(self).__name__}." 
- ) - try: - i = [self._range.index(value)] - except ValueError: - i = [] - return as_column(i, dtype=size_type_dtype) - - def isin(self, values, level=None): - if level is not None and level > 0: - raise IndexError( - f"Too many levels: Index has only 1 level, not {level + 1}" - ) - if is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a {type(values).__name__}" - ) - - return self._values.isin(values).values - - def __pos__(self) -> Self: - return self.copy() - - def __neg__(self) -> Self: - rng = range(-self.start, -self.stop, -self.step) - return type(self)(rng, name=self.name) - - def __abs__(self) -> Self | Index: - if len(self) == 0 or self.min() >= 0: - return self.copy() - elif self.max() <= 0: - return -self - else: - return abs(self._as_int_index()) - - def _columns_for_reset_index( - self, levels: tuple | None - ) -> Generator[tuple[Any, ColumnBase], None, None]: - """Return the columns and column names for .reset_index""" - # We need to explicitly materialize the RangeIndex to a column - yield "index" if self.name is None else self.name, as_column(self) - - @_warn_no_dask_cudf - def __dask_tokenize__(self): - return (type(self), self.start, self.stop, self.step) - - -class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta): - """ - Immutable sequence used for indexing and alignment. - - The basic object storing axis labels for all pandas objects. - - Parameters - ---------- - data : array-like (1-dimensional) - dtype : str, numpy.dtype, or ExtensionDtype, optional - Data type for the output Index. If not specified, this will be - inferred from `data`. - copy : bool, default False - Copy input data. - name : object - Name to be stored in the index. - tupleize_cols : bool (default: True) - When True, attempt to create a MultiIndex if possible. - Currently not supported. - """ - - @_performance_tracking - def __init__(self, data, **kwargs): - name = _getdefault_name(data, name=kwargs.get("name")) - super().__init__({name: data}) - - @_performance_tracking - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs) - - if ret is not None: - return ret - - # Attempt to dispatch all other functions to cupy. - cupy_func = getattr(cupy, ufunc.__name__) - if cupy_func: - if ufunc.nin == 2: - other = inputs[self is inputs[0]] - inputs = self._make_operands_for_binop(other) - else: - inputs = { - name: (col, None, False, None) - for name, col in self._column_labels_and_values - } - - data = self._apply_cupy_ufunc_to_operands( - ufunc, cupy_func, inputs, **kwargs - ) - - out = [_index_from_data(out) for out in data] - - # pandas returns numpy arrays when the outputs are boolean. - for i, o in enumerate(out): - # We explicitly _do not_ use isinstance here: we want only - # boolean Indexes, not dtype-specific subclasses. 
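# Illustrative sketch of the RangeIndex.__neg__ / __abs__ fast paths above:
# negation just negates start, stop and step, and abs() only has to fall
# back to a materialized integer Index when the range straddles zero. The
# helper names are hypothetical.
def _neg_range_sketch(rng: range) -> range:
    return range(-rng.start, -rng.stop, -rng.step)

assert list(_neg_range_sketch(range(2, 11, 3))) == [-2, -5, -8]

def _abs_needs_materialization_sketch(rng: range) -> bool:
    return len(rng) > 0 and min(rng) < 0 < max(rng)

assert not _abs_needs_materialization_sketch(range(0, 5))  # already non-negative
assert _abs_needs_materialization_sketch(range(-2, 3))     # mixed signs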
- if type(o) is Index and o.dtype.kind == "b": - out[i] = o.values - - return out[0] if ufunc.nout == 1 else tuple(out) - - return NotImplemented - - @classmethod - @_performance_tracking - def _from_column( - cls, column: ColumnBase, *, name: Hashable = None - ) -> Self: - if cls is Index: - ca = cudf.core.column_accessor.ColumnAccessor( - {name: column}, verify=False - ) - return _index_from_data(ca) - else: - return super()._from_column(column, name=name) - - @classmethod - @_performance_tracking - def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: - out = super()._from_data(data=data) - if name is not no_default: - out.name = name - return out - - @classmethod - @_performance_tracking - def _from_data_like_self( - cls, data: MutableMapping, name: Any = no_default - ) -> Self: - out = _index_from_data(data, name) - if name is not no_default: - out.name = name - return out - - @classmethod - @_performance_tracking - def from_arrow(cls, obj) -> Index | cudf.MultiIndex: - """Create from PyArrow Array/ChunkedArray. - - Parameters - ---------- - array : PyArrow Array/ChunkedArray - PyArrow Object which has to be converted. - - Raises - ------ - TypeError for invalid input type. - - Returns - ------- - SingleColumnFrame - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - Index(['a', 'b', ], dtype='object') - """ - try: - return cls._from_column(ColumnBase.from_arrow(obj)) - except TypeError: - # Try interpreting object as a MultiIndex before failing. - return cudf.MultiIndex.from_arrow(obj) - - @cached_property - def is_monotonic_increasing(self) -> bool: - return super().is_monotonic_increasing - - @cached_property - def is_monotonic_decreasing(self) -> bool: - return super().is_monotonic_decreasing - - def _binaryop( - self, - other: Frame, - op: str, - fill_value: Any = None, - *args, - **kwargs, - ) -> SingleColumnFrame: - reflect, op = self._check_reflected_op(op) - operands = self._make_operands_for_binop(other, fill_value, reflect) - if operands is NotImplemented: - return NotImplemented - binop_result = self._colwise_binop(operands, op) - - if isinstance(other, cudf.Series): - ret = other._from_data_like_self(binop_result) - other_name = other.name - else: - ret = _index_from_data(binop_result) - other_name = getattr(other, "name", self.name) - - ret.name = ( - self.name - if cudf.utils.utils._is_same_name(self.name, other_name) - else None - ) - - # pandas returns numpy arrays when the outputs are boolean. We - # explicitly _do not_ use isinstance here: we want only boolean - # Indexes, not dtype-specific subclasses. - if isinstance(ret, (Index, cudf.Series)) and ret.dtype.kind == "b": - if ret._column.has_nulls(): - ret = ret.fillna(op == "__ne__") - - return ret.values - return ret - - @property # type: ignore - @_performance_tracking - def _values(self) -> ColumnBase: - return self._column - - @classmethod - @_performance_tracking - def _concat(cls, objs): - non_empties = [index for index in objs if len(index)] - if len(objs) != len(non_empties): - # Do not remove until pandas-3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warning_msg = ( - "The behavior of array concatenation with empty entries is " - "deprecated. In a future version, this will no longer exclude " - "empty items when determining the result dtype. " - "To retain the old behavior, exclude the empty entries before " - "the concat operation." 
- ) - # Warn only if the type might _actually_ change - if len(non_empties) == 0: - if not all(objs[0].dtype == index.dtype for index in objs[1:]): - warnings.warn(warning_msg, FutureWarning) - else: - common_all_type = find_common_type( - [index.dtype for index in objs] - ) - common_non_empty_type = find_common_type( - [index.dtype for index in non_empties] - ) - if common_all_type != common_non_empty_type: - warnings.warn(warning_msg, FutureWarning) - if all(isinstance(obj, RangeIndex) for obj in non_empties): - result = _concat_range_index(non_empties) - else: - data = concat_columns([o._column for o in non_empties]) - result = Index._from_column(data) - - names = {obj.name for obj in objs} - if len(names) == 1: - name = names.pop() - else: - name = None - - result.name = name - return result - - @_performance_tracking - def memory_usage(self, deep: bool = False) -> int: - return self._column.memory_usage - - @cached_property # type: ignore - @_performance_tracking - def is_unique(self) -> bool: - return self._column.is_unique - - @_performance_tracking - def equals(self, other) -> bool: - if not isinstance(other, BaseIndex) or len(self) != len(other): - return False - - check_dtypes = False - - self_is_categorical = isinstance(self, CategoricalIndex) - other_is_categorical = isinstance(other, CategoricalIndex) - if self_is_categorical and not other_is_categorical: - other = other.astype(self.dtype) - check_dtypes = True - elif other_is_categorical and not self_is_categorical: - self = self.astype(other.dtype) - check_dtypes = True - - try: - return self._column.equals( - other._column, check_dtypes=check_dtypes - ) - except TypeError: - return False - - @_performance_tracking - def copy(self, name: Hashable = None, deep: bool = False) -> Self: - """ - Make a copy of this object. - - Parameters - ---------- - name : object, default None - Name of index, use original name when None - deep : bool, default True - Make a deep copy of the data. - With ``deep=False`` the original data is used - - Returns - ------- - New index instance. - """ - name = self.name if name is None else name - col = self._column.copy(deep=True) if deep else self._column - return type(self)._from_column(col, name=name) - - @_performance_tracking - def astype(self, dtype, copy: bool = True) -> Index: - return super().astype({self.name: dtype}, copy) - - @_performance_tracking - def get_indexer(self, target, method=None, limit=None, tolerance=None): - if is_scalar(target): - raise TypeError("Should be a sequence") - - if method not in { - None, - "ffill", - "bfill", - "pad", - "backfill", - "nearest", - }: - raise ValueError( - f"Invalid fill method. Expecting pad (ffill), backfill (bfill)" - f" or nearest. Got {method}" - ) - - if not self.is_unique: - raise ValueError("Cannot get index for a non-unique Index.") - - is_sorted = ( - self.is_monotonic_increasing or self.is_monotonic_decreasing - ) - - if not is_sorted and method is not None: - raise ValueError( - "index must be monotonic increasing or decreasing if `method`" - "is specified." 
- ) - - needle = as_column(target) - result = as_column( - -1, - length=len(needle), - dtype=libcudf.types.size_type_dtype, - ) - - if not len(self): - return _return_get_indexer_result(result.values) - try: - lcol, rcol = _match_join_keys(needle, self._column, "inner") - except ValueError: - return _return_get_indexer_result(result.values) - - scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner") - result = libcudf.copying.scatter([indices], scatter_map, [result])[0] - result_series = cudf.Series._from_column(result) - - if method in {"ffill", "bfill", "pad", "backfill"}: - result_series = _get_indexer_basic( - index=self, - positions=result_series, - method=method, - target_col=cudf.Series._from_column(needle), - tolerance=tolerance, - ) - elif method == "nearest": - result_series = _get_nearest_indexer( - index=self, - positions=result_series, - target_col=cudf.Series._from_column(needle), - tolerance=tolerance, - ) - elif method is not None: - raise ValueError( - f"{method=} is unsupported, only supported values are: " - "{['ffill'/'pad', 'bfill'/'backfill', 'nearest', None]}" - ) - - return _return_get_indexer_result(result_series.to_cupy()) - - @_performance_tracking - def get_loc(self, key) -> int | slice | cupy.ndarray: - if not is_scalar(key): - raise TypeError("Should be a scalar-like") - - is_sorted = ( - self.is_monotonic_increasing or self.is_monotonic_decreasing - ) - - lower_bound, upper_bound, sort_inds = _lexsorted_equal_range( - self, [as_column([key])], is_sorted - ) - - if lower_bound == upper_bound: - raise KeyError(key) - - if lower_bound + 1 == upper_bound: - # Search result is unique, return int. - return ( - lower_bound - if is_sorted - else sort_inds.element_indexing(lower_bound) # type: ignore[union-attr] - ) - - if is_sorted: - # In monotonic index, lex search result is continuous. A slice for - # the range is returned. - return slice(lower_bound, upper_bound) - - # Not sorted and not unique. Return a boolean mask - mask = cupy.full(len(self), False) - true_inds = sort_inds.slice(lower_bound, upper_bound).values # type: ignore[union-attr] - mask[true_inds] = True - return mask - - @_performance_tracking - def __repr__(self) -> str: - max_seq_items = pd.get_option("max_seq_items") or len(self) - mr = 0 - if 2 * max_seq_items < len(self): - mr = max_seq_items + 1 - - if len(self) > mr and mr != 0: - top = self[0:mr] - bottom = self[-1 * mr :] - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - preprocess = cudf.concat([top, bottom]) - else: - preprocess = self - - # TODO: Change below usages accordingly to - # utilize `Index.to_string` once it is implemented - # related issue : https://github.com/pandas-dev/pandas/issues/35389 - if isinstance(preprocess, CategoricalIndex): - if preprocess.categories.dtype.kind == "f": - output = repr( - preprocess.astype("str") - .to_pandas() - .astype( - dtype=pd.CategoricalDtype( - categories=preprocess.dtype.categories.astype( - "str" - ).to_pandas(), - ordered=preprocess.dtype.ordered, - ) - ) - ) - break_idx = output.find("ordered=") - output = ( - output[:break_idx].replace("'", "") + output[break_idx:] - ) - else: - # Too many non-unique categories will cause - # the output to take too long. In this case, we - # split the categories into data and categories - # and generate the repr separately and - # merge them. 
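# Illustrative sketch of the three Index.get_loc result shapes above, using
# pandas as the reference behaviour that cudf mirrors: a unique hit returns
# an int, a contiguous run in a monotonic index returns a slice, and
# duplicates in an unsorted index return a boolean mask.
import pandas as pd

assert pd.Index([10, 20, 30]).get_loc(20) == 1
assert pd.Index([1, 2, 2, 3]).get_loc(2) == slice(1, 3)
mask = pd.Index([2, 1, 2]).get_loc(2)
assert list(mask) == [True, False, True]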
- pd_cats = pd.Categorical( - preprocess.astype(preprocess.categories.dtype).to_pandas() - ) - pd_preprocess = pd.CategoricalIndex(pd_cats) - data_repr = repr(pd_preprocess).split("\n") - pd_preprocess.dtype._categories = ( - preprocess.categories.to_pandas() - ) - pd_preprocess.dtype._ordered = preprocess.dtype.ordered - cats_repr = repr(pd_preprocess).split("\n") - output = "\n".join(data_repr[:-1] + cats_repr[-1:]) - - output = output.replace("nan", str(cudf.NA)) - elif preprocess._values.nullable: - if isinstance(self._values, StringColumn): - output = repr(self.to_pandas(nullable=True)) - else: - output = repr(self._clean_nulls_from_index().to_pandas()) - # We should remove all the single quotes - # from the output due to the type-cast to - # object dtype happening above. - # Note : The replacing of single quotes has - # to happen only in case of non-Index[string] types, - # as we want to preserve single quotes in case - # of Index[string] and it is valid to have them. - output = output.replace("'", "") - else: - output = repr(preprocess.to_pandas()) - - # Fix and correct the class name of the output - # string by finding first occurrence of "(" in the output - index_class_split_index = output.find("(") - output = self.__class__.__name__ + output[index_class_split_index:] - - lines = output.split("\n") - - tmp_meta = lines[-1] - dtype_index = tmp_meta.rfind(" dtype=") - prior_to_dtype = tmp_meta[:dtype_index] - lines = lines[:-1] - keywords = [f"dtype='{self.dtype}'"] - if self.name is not None: - keywords.append(f"name={self.name!r}") - if "length" in tmp_meta: - keywords.append(f"length={len(self)}") - if ( - "freq" in tmp_meta - and isinstance(self, DatetimeIndex) - and self._freq is not None - ): - keywords.append( - f"freq={self._freq._maybe_as_fast_pandas_offset().freqstr!r}" - ) - joined_keywords = ", ".join(keywords) - lines.append(f"{prior_to_dtype} {joined_keywords})") - return "\n".join(lines) - - @_performance_tracking - def __getitem__(self, index): - res = self._get_elements_from_column(index) - if isinstance(res, ColumnBase): - res = Index._from_column(res, name=self.name) - return res - - @property # type: ignore - @_performance_tracking - def dtype(self): - """ - `dtype` of the underlying values in Index. - """ - return self._column.dtype - - @_performance_tracking - def isna(self) -> cupy.ndarray: - return self._column.isnull().values - - isnull = isna - - @_performance_tracking - def notna(self) -> cupy.ndarray: - return self._column.notnull().values - - notnull = notna - - def _is_numeric(self) -> bool: - return ( - isinstance(self._values, cudf.core.column.NumericalColumn) - and self.dtype.kind != "b" - ) - - def _is_boolean(self) -> bool: - return self.dtype.kind == "b" - - def _is_integer(self) -> bool: - return self.dtype.kind in "iu" - - def _is_floating(self) -> bool: - return self.dtype.kind == "f" - - def _is_object(self) -> bool: - return isinstance(self._column, cudf.core.column.StringColumn) - - def _is_categorical(self) -> bool: - return False - - def _is_interval(self) -> bool: - return False - - @property # type: ignore - @_performance_tracking - def hasnans(self) -> bool: - return self._column.has_nulls(include_nan=True) - - @_performance_tracking - def argsort( - self, - axis=0, - kind="quicksort", - order=None, - ascending=True, - na_position="last", - ) -> cupy.ndarray: - """Return the integer indices that would sort the index. - - Parameters - ---------- - axis : {0 or "index"} - Has no effect but is accepted for compatibility with numpy. 
- kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' - Choice of sorting algorithm. See :func:`numpy.sort` for more - information. 'mergesort' and 'stable' are the only stable - algorithms. Only quicksort is supported in cuDF. - order : None - Has no effect but is accepted for compatibility with numpy. - ascending : bool or list of bool, default True - If True, sort values in ascending order, otherwise descending. - na_position : {'first' or 'last'}, default 'last' - Argument 'first' puts NaNs at the beginning, 'last' puts NaNs - at the end. - - Returns - ------- - cupy.ndarray: The indices sorted based on input. - """ # noqa: E501 - return super().argsort( - axis=axis, - kind=kind, - order=order, - ascending=ascending, - na_position=na_position, - ) - - def repeat(self, repeats, axis=None) -> Self: - result = super()._repeat([self._column], repeats, axis)[0] - result = result._with_type_metadata(self.dtype) - return type(self)._from_column(result, name=self.name) - - @_performance_tracking - def where(self, cond, other=None, inplace=False) -> Index: - result_col = super().where(cond, other, inplace) - return self._mimic_inplace( - _index_from_data({self.name: result_col}), - inplace=inplace, - ) - - @property - def values(self) -> cupy.ndarray: - return self._column.values - - def __contains__(self, item) -> bool: - hash(item) - return item in self._column - - def _clean_nulls_from_index(self) -> Index: - if self._values.has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance(self, (DatetimeIndex, TimedeltaIndex)) - else str(cudf.NA) - ) - return cudf.Index._from_column( - self._column.astype("str").fillna(fill_value), - name=self.name, - ) - - return self - - def any(self) -> bool: - return self._column.any() - - def to_pandas( - self, *, nullable: bool = False, arrow_type: bool = False - ) -> pd.Index: - result = self._column.to_pandas( - nullable=nullable, arrow_type=arrow_type - ) - result.name = self.name - return result - - def to_frame( - self, index: bool = True, name: Hashable = no_default - ) -> cudf.DataFrame: - """Create a DataFrame with a column containing this Index - - Parameters - ---------- - index : boolean, default True - Set the index of the returned DataFrame as the original Index - name : object, defaults to index.name - The passed name should substitute for the index name (if it has - one). - - Returns - ------- - DataFrame - DataFrame containing the original Index data. - - See Also - -------- - Index.to_series : Convert an Index to a Series. - Series.to_frame : Convert Series to DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal') - >>> idx.to_frame() - animal - animal - Ant Ant - Bear Bear - Cow Cow - - By default, the original Index is reused. 
To enforce a new Index: - - >>> idx.to_frame(index=False) - animal - 0 Ant - 1 Bear - 2 Cow - - To override the name of the resulting column, specify `name`: - - >>> idx.to_frame(index=False, name='zoo') - zoo - 0 Ant - 1 Bear - 2 Cow - """ - return self._to_frame(name=name, index=self if index else None) - - def append(self, other): - if is_list_like(other): - to_concat = [self] - for obj in other: - if not isinstance(obj, BaseIndex): - raise TypeError("all inputs must be Index") - to_concat.append(obj) - else: - this = self - other = ensure_index(other) - - if len(this) == 0 or len(other) == 0: - # we'll filter out empties later in ._concat - to_concat = [this, other] - else: - if is_mixed_with_object_dtype(this, other): - got_dtype = ( - other.dtype - if this.dtype == cudf.dtype("object") - else this.dtype - ) - raise TypeError( - f"cudf does not support appending an Index of " - f"dtype `{cudf.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " - f"either one of them to same dtypes." - ) - - if ( - isinstance(self._column, cudf.core.column.NumericalColumn) - and self.dtype != other.dtype - ): - common_type = find_common_type((self.dtype, other.dtype)) - this = this.astype(common_type) - other = other.astype(common_type) - to_concat = [this, other] - - return self._concat(to_concat) - - def unique(self, level: int | None = None) -> Self: - if level is not None and level > 0: - raise IndexError( - f"Too many levels: Index has only 1 level, not {level + 1}" - ) - return type(self)._from_column(self._column.unique(), name=self.name) - - def isin(self, values, level=None) -> cupy.ndarray: - if level is not None and level > 0: - raise IndexError( - f"Too many levels: Index has only 1 level, not {level + 1}" - ) - if is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a {type(values).__name__}" - ) - - return self._column.isin(values).values - - @copy_docstring(StringMethods) # type: ignore - @property - @_performance_tracking - def str(self): - if is_string_dtype(self.dtype): - return StringMethods(parent=self) - else: - raise AttributeError( - "Can only use .str accessor with string values!" - ) - - @cache - @_warn_no_dask_cudf - def __dask_tokenize__(self): - # We can use caching, because an index is immutable - return super().__dask_tokenize__() - - -class DatetimeIndex(Index): - """ - Immutable , ordered and sliceable sequence of datetime64 data, - represented internally as int64. - - Parameters - ---------- - data : array-like (1-dimensional), optional - Optional datetime-like data to construct index with. - copy : bool - Make a copy of input. - freq : str, optional - Frequency of the DatetimeIndex - tz : pytz.timezone or dateutil.tz.tzfile - This is not yet supported - ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - This is not yet supported - name : object - Name to be stored in the index. - dayfirst : bool, default False - If True, parse dates in data with the day first order. - This is not yet supported - yearfirst : bool, default False - If True parse dates in data with the year first order. 
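# Illustrative sketch of the numeric type promotion in Index.append above:
# when two numeric indexes with different dtypes are appended, both sides
# are first cast to a common dtype. NumPy's promotion rules stand in here
# for cudf's find_common_type.
import numpy as np

left = np.array([1, 2], dtype=np.int32)
right = np.array([1.5, 2.5], dtype=np.float64)
common = np.result_type(left.dtype, right.dtype)
appended = np.concatenate([left.astype(common), right.astype(common)])
assert appended.dtype == np.float64
assert list(appended) == [1.0, 2.0, 1.5, 2.5]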
- This is not yet supported - - Attributes - ---------- - year - month - day - hour - minute - second - microsecond - nanosecond - date - time - dayofyear - day_of_year - weekday - quarter - freq - - Methods - ------- - ceil - floor - round - tz_convert - tz_localize - - Returns - ------- - DatetimeIndex - - Examples - -------- - >>> import cudf - >>> cudf.DatetimeIndex([1, 2, 3, 4], name="a") - DatetimeIndex(['1970-01-01 00:00:00.000000001', - '1970-01-01 00:00:00.000000002', - '1970-01-01 00:00:00.000000003', - '1970-01-01 00:00:00.000000004'], - dtype='datetime64[ns]', name='a') - """ - - @_performance_tracking - def __init__( - self, - data=None, - freq=None, - tz=None, - normalize: bool = False, - closed=None, - ambiguous: Literal["raise"] = "raise", - dayfirst: bool = False, - yearfirst: bool = False, - dtype=None, - copy: bool = False, - name=None, - ): - # we should be more strict on what we accept here but - # we'd have to go and figure out all the semantics around - # pandas dtindex creation first which. For now - # just make sure we handle np.datetime64 arrays - # and then just dispatch upstream - if tz is not None: - raise NotImplementedError("tz is not yet supported") - if normalize is not False: - warnings.warn( - "The 'normalize' keyword is " - "deprecated and will be removed in a future version. ", - FutureWarning, - ) - raise NotImplementedError("normalize == True is not yet supported") - if closed is not None: - warnings.warn( - "The 'closed' keyword is " - "deprecated and will be removed in a future version. ", - FutureWarning, - ) - raise NotImplementedError("closed is not yet supported") - if ambiguous != "raise": - raise NotImplementedError("ambiguous is not yet supported") - if dayfirst is not False: - raise NotImplementedError("dayfirst == True is not yet supported") - if yearfirst is not False: - raise NotImplementedError("yearfirst == True is not yet supported") - - self._freq = _validate_freq(freq) - - if dtype is None: - # nanosecond default matches pandas - dtype = "datetime64[ns]" - dtype = cudf.dtype(dtype) - if dtype.kind != "M": - raise TypeError("dtype must be a datetime type") - - name = _getdefault_name(data, name=name) - data = column.as_column(data) - - # TODO: if data.dtype.kind == "M" (i.e. 
data is already datetime type) - # We probably shouldn't always astype to datetime64[ns] - if not isinstance(data.dtype, pd.DatetimeTZDtype): - data = data.astype(dtype) - - if copy: - data = data.copy() - - super().__init__(data, name=name) - - if self._freq is not None: - unique_vals = self.to_series().diff().unique() - if len(unique_vals) > 2 or ( - len(unique_vals) == 2 - and unique_vals[1] != self._freq._maybe_as_fast_pandas_offset() - ): - raise ValueError("No unique frequency found") - - @_performance_tracking - def _copy_type_metadata(self: Self, other: Self) -> Self: - super()._copy_type_metadata(other) - self._freq = _validate_freq(other._freq) - return self - - @classmethod - def _from_data( - cls, data: MutableMapping, name: Any = no_default, freq: Any = None - ): - result = super()._from_data(data, name) - result._freq = _validate_freq(freq) - return result - - @classmethod - @_performance_tracking - def _from_column( - cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None - ) -> Self: - if column.dtype.kind != "M": - raise ValueError("column must have a datetime type.") - result = super()._from_column(column, name=name) - result._freq = _validate_freq(freq) - return result - - def __getitem__(self, index): - value = super().__getitem__(index) - if cudf.get_option("mode.pandas_compatible") and isinstance( - value, np.datetime64 - ): - return pd.Timestamp(value) - return value - - @_performance_tracking - def copy(self, name=None, deep=False): - idx_copy = super().copy(name=name, deep=deep) - return idx_copy._copy_type_metadata(self) - - def searchsorted( - self, - value, - side: Literal["left", "right"] = "left", - ascending: bool = True, - na_position: Literal["first", "last"] = "last", - ): - value = self.dtype.type(value) - return super().searchsorted( - value, side=side, ascending=ascending, na_position=na_position - ) - - def as_unit(self, unit: str, round_ok: bool = True) -> Self: - """ - Convert to a dtype with the given unit resolution. - - Currently not implemented. - - Parameters - ---------- - unit : {'s', 'ms', 'us', 'ns'} - round_ok : bool, default True - If False and the conversion requires rounding, raise ValueError. - """ - raise NotImplementedError("as_unit is currently not implemented") - - def mean(self, *, skipna: bool = True, axis: int | None = 0): - return self._column.mean(skipna=skipna) - - def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1): - return self._column.std(skipna=skipna, ddof=ddof) - - def strftime(self, date_format: str) -> Index: - """ - Convert to Index using specified date_format. - - Return an Index of formatted strings specified by date_format, which - supports the same string format as the python standard library. - - Parameters - ---------- - date_format : str - Date format string (e.g. "%Y-%m-%d"). 
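# Illustrative sketch of the frequency validation in DatetimeIndex.__init__
# above: an index is consistent with a fixed freq when its pairwise
# differences collapse to (at most) that single offset. Integer nanoseconds
# stand in for timestamps; the helper name is hypothetical and the real
# check also tolerates the leading null produced by diff().
import numpy as np

def _freq_consistent_sketch(stamps_ns: np.ndarray, freq_ns: int) -> bool:
    diffs = np.unique(np.diff(stamps_ns))
    return len(diffs) == 0 or (len(diffs) == 1 and diffs[0] == freq_ns)

one_day = 86_400 * 10**9
assert _freq_consistent_sketch(np.arange(0, 4 * one_day, one_day), one_day)
assert not _freq_consistent_sketch(np.array([0, one_day, 3 * one_day]), one_day)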
- """ - return Index._from_column( - self._column.strftime(date_format), name=self.name - ) - - @property - def asi8(self) -> cupy.ndarray: - return self._column.astype("int64").values - - @property - def inferred_freq(self) -> cudf.DateOffset | None: - raise NotImplementedError("inferred_freq is currently not implemented") - - @property - def freq(self) -> cudf.DateOffset | None: - return self._freq - - @freq.setter - def freq(self) -> None: - raise NotImplementedError("Setting freq is currently not supported.") - - @property - def freqstr(self) -> str: - raise NotImplementedError("freqstr is currently not implemented") - - @property - def resolution(self) -> str: - """ - Returns day, hour, minute, second, millisecond or microsecond - """ - raise NotImplementedError("resolution is currently not implemented") - - @property - def unit(self) -> str: - return self._column.time_unit - - @property - def tz(self) -> tzinfo | None: - """ - Return the timezone. - - Returns - ------- - datetime.tzinfo or None - Returns None when the array is tz-naive. - """ - return getattr(self.dtype, "tz", None) - - @property - def tzinfo(self) -> tzinfo | None: - """ - Alias for tz attribute - """ - return self.tz - - def to_pydatetime(self) -> np.ndarray: - """ - Return an ndarray of ``datetime.datetime`` objects. - - Returns - ------- - numpy.ndarray - An ndarray of ``datetime.datetime`` objects. - """ - return self.to_pandas().to_pydatetime() - - def to_julian_date(self) -> Index: - return Index._from_column( - self._column.to_julian_date(), name=self.name - ) - - def to_period(self, freq) -> pd.PeriodIndex: - return self.to_pandas().to_period(freq=freq) - - def normalize(self) -> Self: - """ - Convert times to midnight. - - Currently not implemented. - """ - return type(self)._from_column( - self._column.normalize(), name=self.name - ) - - @property - def time(self) -> np.ndarray: - """ - Returns numpy array of ``datetime.time`` objects. - - The time part of the Timestamps. - """ - return self.to_pandas().time - - @property - def timetz(self) -> np.ndarray: - """ - Returns numpy array of ``datetime.time`` objects with timezones. - - The time part of the Timestamps. - """ - return self.to_pandas().timetz - - @property - def date(self) -> np.ndarray: - """ - Returns numpy array of python ``datetime.date`` objects. - - Namely, the date part of Timestamps without time and - timezone information. - """ - return self.to_pandas().date - - @property - def is_month_start(self) -> cupy.ndarray: - """ - Booleans indicating if dates are the first day of the month. - """ - return self._column.is_month_start.values - - @property - def is_month_end(self) -> cupy.ndarray: - """ - Booleans indicating if dates are the last day of the month. - """ - return self._column.is_month_end.values - - @property - def is_quarter_end(self) -> cupy.ndarray: - """ - Booleans indicating if dates are the last day of the quarter. - """ - return self._column.is_quarter_end.values - - @property - def is_quarter_start(self) -> cupy.ndarray: - """ - Booleans indicating if dates are the start day of the quarter. - """ - return self._column.is_quarter_start.values - - @property - def is_year_end(self) -> cupy.ndarray: - """ - Booleans indicating if dates are the last day of the year. - """ - return self._column.is_year_end.values - - @property - def is_year_start(self) -> cupy.ndarray: - """ - Booleans indicating if dates are the first day of the year. 
- """ - return self._column.is_year_start.values - - @property - def is_normalized(self) -> bool: - """ - Returns True if all of the dates are at midnight ("no time") - """ - return self._column.is_normalized - - @property - def days_in_month(self) -> Index: - """ - Get the total number of days in the month that the date falls on. - """ - return Index._from_column(self._column.days_in_month, name=self.name) - - daysinmonth = days_in_month - - @property - def day_of_week(self) -> Index: - """ - Get the day of week that the date falls on. - """ - return Index._from_column(self._column.day_of_week, name=self.name) - - @property # type: ignore - @_performance_tracking - def year(self) -> Index: - """ - The year of the datetime. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", - ... periods=3, freq="Y")) - >>> datetime_index - DatetimeIndex(['2000-12-31', '2001-12-31', '2002-12-31'], dtype='datetime64[ns]') - >>> datetime_index.year - Index([2000, 2001, 2002], dtype='int16') - """ # noqa: E501 - return self._get_dt_field("year") - - @property # type: ignore - @_performance_tracking - def month(self) -> Index: - """ - The month as January=1, December=12. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", - ... periods=3, freq="M")) - >>> datetime_index - DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31'], dtype='datetime64[ns]') - >>> datetime_index.month - Index([1, 2, 3], dtype='int16') - """ # noqa: E501 - return self._get_dt_field("month") - - @property # type: ignore - @_performance_tracking - def day(self) -> Index: - """ - The day of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", - ... periods=3, freq="D")) - >>> datetime_index - DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], dtype='datetime64[ns]') - >>> datetime_index.day - Index([1, 2, 3], dtype='int16') - """ # noqa: E501 - return self._get_dt_field("day") - - @property # type: ignore - @_performance_tracking - def hour(self) -> Index: - """ - The hours of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", - ... periods=3, freq="h")) - >>> datetime_index - DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:00:00', - '2000-01-01 02:00:00'], - dtype='datetime64[ns]') - >>> datetime_index.hour - Index([0, 1, 2], dtype='int16') - """ - return self._get_dt_field("hour") - - @property # type: ignore - @_performance_tracking - def minute(self) -> Index: - """ - The minutes of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", - ... periods=3, freq="T")) - >>> datetime_index - DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00', - '2000-01-01 00:02:00'], - dtype='datetime64[ns]') - >>> datetime_index.minute - Index([0, 1, 2], dtype='int16') - """ - return self._get_dt_field("minute") - - @property # type: ignore - @_performance_tracking - def second(self) -> Index: - """ - The seconds of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", - ... 
periods=3, freq="s")) - >>> datetime_index - DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:00:01', - '2000-01-01 00:00:02'], - dtype='datetime64[ns]') - >>> datetime_index.second - Index([0, 1, 2], dtype='int16') - """ - return self._get_dt_field("second") - - @property # type: ignore - @_performance_tracking - def microsecond(self) -> Index: - """ - The microseconds of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", - ... periods=3, freq="us")) - >>> datetime_index - DatetimeIndex([ '2000-01-01 00:00:00', '2000-01-01 00:00:00.000001', - '2000-01-01 00:00:00.000002'], - dtype='datetime64[ns]') - >>> datetime_index.microsecond - Index([0, 1, 2], dtype='int32') - """ # noqa: E501 - return Index._from_column( - ( - # Need to manually promote column to int32 because - # pandas-matching binop behaviour requires that this - # __mul__ returns an int16 column. - self._column.get_dt_field("millisecond").astype("int32") - * cudf.Scalar(1000, dtype="int32") - ) - + self._column.get_dt_field("microsecond"), - name=self.name, - ) - - @property # type: ignore - @_performance_tracking - def nanosecond(self) -> Index: - """ - The nanoseconds of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2000-01-01", - ... periods=3, freq="ns")) - >>> datetime_index - DatetimeIndex([ '2000-01-01 00:00:00', - '2000-01-01 00:00:00.000000001', - '2000-01-01 00:00:00.000000002'], - dtype='datetime64[ns]') - >>> datetime_index.nanosecond - Index([0, 1, 2], dtype='int16') - """ - return self._get_dt_field("nanosecond") - - @property # type: ignore - @_performance_tracking - def weekday(self) -> Index: - """ - The day of the week with Monday=0, Sunday=6. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2016-12-31", - ... "2017-01-08", freq="D")) - >>> datetime_index - DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', - '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', - '2017-01-08'], - dtype='datetime64[ns]') - >>> datetime_index.weekday - Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') - """ - return self._get_dt_field("weekday") - - @property # type: ignore - @_performance_tracking - def dayofweek(self) -> Index: - """ - The day of the week with Monday=0, Sunday=6. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2016-12-31", - ... "2017-01-08", freq="D")) - >>> datetime_index - DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', - '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', - '2017-01-08'], - dtype='datetime64[ns]') - >>> datetime_index.dayofweek - Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int16') - """ - return self._get_dt_field("weekday") - - @property # type: ignore - @_performance_tracking - def dayofyear(self) -> Index: - """ - The day of the year, from 1-365 in non-leap years and - from 1-366 in leap years. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2016-12-31", - ... 
"2017-01-08", freq="D")) - >>> datetime_index - DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', - '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', - '2017-01-08'], - dtype='datetime64[ns]') - >>> datetime_index.dayofyear - Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') - """ - return self._get_dt_field("day_of_year") - - @property # type: ignore - @_performance_tracking - def day_of_year(self) -> Index: - """ - The day of the year, from 1-365 in non-leap years and - from 1-366 in leap years. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_index = cudf.Index(pd.date_range("2016-12-31", - ... "2017-01-08", freq="D")) - >>> datetime_index - DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', - '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', - '2017-01-08'], - dtype='datetime64[ns]') - >>> datetime_index.day_of_year - Index([366, 1, 2, 3, 4, 5, 6, 7, 8], dtype='int16') - """ - return self._get_dt_field("day_of_year") - - @property # type: ignore - @_performance_tracking - def is_leap_year(self) -> cupy.ndarray: - """ - Boolean indicator if the date belongs to a leap year. - - A leap year is a year, which has 366 days (instead of 365) including - 29th of February as an intercalary day. Leap years are years which are - multiples of four with the exception of years divisible by 100 but not - by 400. - - Returns - ------- - ndarray - Booleans indicating if dates belong to a leap year. - """ - res = self._column.is_leap_year.fillna(False) - return cupy.asarray(res) - - @property # type: ignore - @_performance_tracking - def quarter(self) -> Index: - """ - Integer indicator for which quarter of the year the date belongs in. - - There are 4 quarters in a year. With the first quarter being from - January - March, second quarter being April - June, third quarter - being July - September and fourth quarter being October - December. - - Returns - ------- - Index - Integer indicating which quarter the date belongs to. - - Examples - -------- - >>> import cudf - >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", - ... "1999-12-31 18:40:00"]) - >>> gIndex.quarter - Index([2, 4], dtype='int8') - """ - return Index._from_column(self._column.quarter.astype("int8")) - - @_performance_tracking - def day_name(self, locale: str | None = None) -> Index: - """ - Return the day names. Currently supports English locale only. - - Examples - -------- - >>> import cudf - >>> datetime_index = cudf.date_range("2016-12-31", "2017-01-08", freq="D") - >>> datetime_index - DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03', - '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07', - '2017-01-08'], - dtype='datetime64[ns]', freq='D') - >>> datetime_index.day_name() - Index(['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', - 'Friday', 'Saturday', 'Sunday'], - dtype='object') - """ - day_names = self._column.get_day_names(locale) - return Index._from_column(day_names, name=self.name) - - @_performance_tracking - def month_name(self, locale: str | None = None) -> Index: - """ - Return the month names. Currently supports English locale only. 
- - Examples - -------- - >>> import cudf - >>> datetime_index = cudf.date_range("2017-12-30", periods=6, freq='W') - >>> datetime_index - DatetimeIndex(['2017-12-30', '2018-01-06', '2018-01-13', '2018-01-20', - '2018-01-27', '2018-02-03'], - dtype='datetime64[ns]', freq='7D') - >>> datetime_index.month_name() - Index(['December', 'January', 'January', 'January', 'January', 'February'], dtype='object') - """ - month_names = self._column.get_month_names(locale) - return Index._from_column(month_names, name=self.name) - - @_performance_tracking - def isocalendar(self) -> cudf.DataFrame: - """ - Returns a DataFrame with the year, week, and day - calculated according to the ISO 8601 standard. - - Returns - ------- - DataFrame - with columns year, week and day - - Examples - -------- - >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", - ... "1999-12-31 18:40:00"]) - >>> gIndex.isocalendar() - year week day - 2020-05-31 08:00:00 2020 22 7 - 1999-12-31 18:40:00 1999 52 5 - """ - ca = cudf.core.column_accessor.ColumnAccessor( - self._column.isocalendar(), verify=False - ) - return cudf.DataFrame._from_data(ca, index=self) - - @_performance_tracking - def to_pandas( - self, *, nullable: bool = False, arrow_type: bool = False - ) -> pd.DatetimeIndex: - result = super().to_pandas(nullable=nullable, arrow_type=arrow_type) - if not arrow_type and self._freq is not None: - result.freq = self._freq._maybe_as_fast_pandas_offset() - return result - - @_performance_tracking - def _get_dt_field(self, field: str) -> Index: - """Return an Index of a numerical component of the DatetimeIndex.""" - out_column = self._column.get_dt_field(field) - out_column = NumericalColumn( - data=out_column.base_data, - size=out_column.size, - dtype=out_column.dtype, - mask=out_column.base_mask, - offset=out_column.offset, - ) - return Index._from_column(out_column, name=self.name) - - def _is_boolean(self) -> bool: - return False - - @_performance_tracking - def ceil(self, freq: str) -> Self: - """ - Perform ceil operation on the data to the specified freq. - - Parameters - ---------- - freq : str - One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. - Must be a fixed frequency like 'S' (second) not 'ME' (month end). - See `frequency aliases `__ - for more details on these aliases. - - Returns - ------- - DatetimeIndex - Index of the same type for a DatetimeIndex - - Examples - -------- - >>> import cudf - >>> gIndex = cudf.DatetimeIndex([ - ... "2020-05-31 08:05:42", - ... "1999-12-31 18:40:30", - ... ]) - >>> gIndex.ceil("T") - DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') - """ # noqa: E501 - return type(self)._from_column(self._column.ceil(freq), name=self.name) - - @_performance_tracking - def floor(self, freq: str) -> Self: - """ - Perform floor operation on the data to the specified freq. - - Parameters - ---------- - freq : str - One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. - Must be a fixed frequency like 'S' (second) not 'ME' (month end). - See `frequency aliases `__ - for more details on these aliases. - - Returns - ------- - DatetimeIndex - Index of the same type for a DatetimeIndex - - Examples - -------- - >>> import cudf - >>> gIndex = cudf.DatetimeIndex([ - ... "2020-05-31 08:59:59", - ... "1999-12-31 18:44:59", - ... 
]) - >>> gIndex.floor("T") - DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') - """ # noqa: E501 - return type(self)._from_column( - self._column.floor(freq), name=self.name - ) - - @_performance_tracking - def round(self, freq: str) -> Self: - """ - Perform round operation on the data to the specified freq. - - Parameters - ---------- - freq : str - One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. - Must be a fixed frequency like 'S' (second) not 'ME' (month end). - See `frequency aliases `__ - for more details on these aliases. - - Returns - ------- - DatetimeIndex - Index containing rounded datetimes. - - Examples - -------- - >>> import cudf - >>> dt_idx = cudf.Index([ - ... "2001-01-01 00:04:45", - ... "2001-01-01 00:04:58", - ... "2001-01-01 00:05:04", - ... ], dtype="datetime64[ns]") - >>> dt_idx - DatetimeIndex(['2001-01-01 00:04:45', '2001-01-01 00:04:58', - '2001-01-01 00:05:04'], - dtype='datetime64[ns]') - >>> dt_idx.round('H') - DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01'], dtype='datetime64[ns]') - >>> dt_idx.round('T') - DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]') - """ # noqa: E501 - return type(self)._from_column( - self._column.round(freq), name=self.name - ) - - def tz_localize( - self, - tz: str | None, - ambiguous: Literal["NaT"] = "NaT", - nonexistent: Literal["NaT"] = "NaT", - ) -> Self: - """ - Localize timezone-naive data to timezone-aware data. - - Parameters - ---------- - tz : str - Timezone to convert timestamps to. - - Returns - ------- - DatetimeIndex containing timezone aware timestamps. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> tz_naive = cudf.date_range('2018-03-01 09:00', periods=3, freq='D') - >>> tz_aware = tz_naive.tz_localize("America/New_York") - >>> tz_aware - DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00', - '2018-03-03 09:00:00-05:00'], - dtype='datetime64[ns, America/New_York]', freq='D') - - Ambiguous or nonexistent datetimes are converted to NaT. - - >>> s = cudf.to_datetime(cudf.Series(['2018-10-28 01:20:00', - ... '2018-10-28 02:36:00', - ... '2018-10-28 03:46:00'])) - >>> s.dt.tz_localize("CET") - 0 2018-10-28 01:20:00.000000000 - 1 NaT - 2 2018-10-28 03:46:00.000000000 - dtype: datetime64[ns, CET] - - Notes - ----- - 'NaT' is currently the only supported option for the - ``ambiguous`` and ``nonexistent`` arguments. Any - ambiguous or nonexistent timestamps are converted - to 'NaT'. - """ # noqa: E501 - result_col = self._column.tz_localize(tz, ambiguous, nonexistent) - return DatetimeIndex._from_column( - result_col, name=self.name, freq=self._freq - ) - - def tz_convert(self, tz: str | None) -> Self: - """ - Convert tz-aware datetimes from one time zone to another. - - Parameters - ---------- - tz : str - Time zone for time. Corresponding timestamps would be converted - to this time zone of the Datetime Array/Index. - A `tz` of None will convert to UTC and remove the timezone - information. - - Returns - ------- - DatetimeIndex containing timestamps corresponding to the timezone - `tz`. 
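[Editor's note] A hedged round-trip sketch of the timezone methods described in the surrounding docstrings. The offsets in the comments are taken from the docstring examples; it assumes cudf with a working GPU.

import cudf

naive = cudf.date_range("2018-03-01 09:00", periods=3, freq="D")
aware = naive.tz_localize("America/New_York")  # 09:00-05:00 local timestamps
london = aware.tz_convert("Europe/London")     # same instants, shown as 14:00+00:00
utc_naive = london.tz_convert(None)            # convert to UTC and drop the tz info

# Ambiguous or nonexistent local times become NaT rather than raising,
# since only ambiguous="NaT" / nonexistent="NaT" are supported.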
- - Examples - -------- - >>> import cudf - >>> dti = cudf.date_range('2018-03-01 09:00', periods=3, freq='D') - >>> dti = dti.tz_localize("America/New_York") - >>> dti - DatetimeIndex(['2018-03-01 09:00:00-05:00', '2018-03-02 09:00:00-05:00', - '2018-03-03 09:00:00-05:00'], - dtype='datetime64[ns, America/New_York]', freq='D') - >>> dti.tz_convert("Europe/London") - DatetimeIndex(['2018-03-01 14:00:00+00:00', - '2018-03-02 14:00:00+00:00', - '2018-03-03 14:00:00+00:00'], - dtype='datetime64[ns, Europe/London]') - """ # noqa: E501 - result_col = self._column.tz_convert(tz) - return DatetimeIndex._from_column(result_col, name=self.name) - - def repeat(self, repeats, axis=None) -> Self: - res = super().repeat(repeats, axis=axis) - res._freq = None - return res - - -class TimedeltaIndex(Index): - """ - Immutable, ordered and sliceable sequence of timedelta64 data, - represented internally as int64. - - Parameters - ---------- - data : array-like (1-dimensional), optional - Optional datetime-like data to construct index with. - unit : str, optional - This is not yet supported - copy : bool - Make a copy of input. - freq : str, optional - This is not yet supported - closed : str, optional - This is not yet supported - dtype : str or :class:`numpy.dtype`, optional - Data type for the output Index. If not specified, the - default dtype will be ``timedelta64[ns]``. - name : object - Name to be stored in the index. - - Attributes - ---------- - days - seconds - microseconds - nanoseconds - components - inferred_freq - - Methods - ------- - None - - Returns - ------- - TimedeltaIndex - - Examples - -------- - >>> import cudf - >>> cudf.TimedeltaIndex([1132223, 2023232, 342234324, 4234324], - ... dtype="timedelta64[ns]") - TimedeltaIndex(['0 days 00:00:00.001132223', '0 days 00:00:00.002023232', - '0 days 00:00:00.342234324', '0 days 00:00:00.004234324'], - dtype='timedelta64[ns]') - >>> cudf.TimedeltaIndex([1, 2, 3, 4], dtype="timedelta64[s]", - ... name="delta-index") - TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03', - '0 days 00:00:04'], - dtype='timedelta64[s]', name='delta-index') - """ - - @_performance_tracking - def __init__( - self, - data=None, - unit=None, - freq=None, - closed=None, - dtype=None, - copy: bool = False, - name=None, - ): - if freq is not None: - raise NotImplementedError("freq is not yet supported") - - if closed is not None: - warnings.warn( - "The 'closed' keyword is " - "deprecated and will be removed in a future version. ", - FutureWarning, - ) - raise NotImplementedError("closed is not yet supported") - - if unit is not None: - warnings.warn( - "The 'unit' keyword is " - "deprecated and will be removed in a future version. 
", - FutureWarning, - ) - raise NotImplementedError( - "unit is not yet supported, alternatively " - "dtype parameter is supported" - ) - - if dtype is None: - dtype = "timedelta64[ns]" - dtype = cudf.dtype(dtype) - if dtype.kind != "m": - raise TypeError("dtype must be a timedelta type") - - name = _getdefault_name(data, name=name) - data = column.as_column(data, dtype=dtype) - - if copy: - data = data.copy() - - super().__init__(data, name=name) - - @classmethod - @_performance_tracking - def _from_column( - cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None - ) -> Self: - if column.dtype.kind != "m": - raise ValueError("column must have a timedelta type.") - return super()._from_column(column, name=name) - - def __getitem__(self, index): - value = super().__getitem__(index) - if cudf.get_option("mode.pandas_compatible") and isinstance( - value, np.timedelta64 - ): - return pd.Timedelta(value) - return value - - def as_unit(self, unit: str, round_ok: bool = True) -> Self: - """ - Convert to a dtype with the given unit resolution. - - Currently not implemented. - - Parameters - ---------- - unit : {'s', 'ms', 'us', 'ns'} - round_ok : bool, default True - If False and the conversion requires rounding, raise ValueError. - """ - raise NotImplementedError("as_unit is currently not implemented") - - @property - def freq(self) -> cudf.DateOffset | None: - raise NotImplementedError("freq is currently not implemented") - - @property - def freqstr(self) -> str: - raise NotImplementedError("freqstr is currently not implemented") - - @property - def resolution(self) -> str: - """ - Returns day, hour, minute, second, millisecond or microsecond - """ - raise NotImplementedError("resolution is currently not implemented") - - @property - def unit(self) -> str: - return self._column.time_unit - - def to_pytimedelta(self) -> np.ndarray: - """ - Return an ndarray of ``datetime.timedelta`` objects. - - Returns - ------- - numpy.ndarray - An ndarray of ``datetime.timedelta`` objects. - """ - return self.to_pandas().to_pytimedelta() - - @property - def asi8(self) -> cupy.ndarray: - return self._column.astype("int64").values - - def sum(self, *, skipna: bool = True, axis: int | None = 0): - return self._column.sum(skipna=skipna) - - def mean(self, *, skipna: bool = True, axis: int | None = 0): - return self._column.mean(skipna=skipna) - - def median(self, *, skipna: bool = True, axis: int | None = 0): - return self._column.median(skipna=skipna) - - def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1): - return self._column.std(skipna=skipna, ddof=ddof) - - def total_seconds(self) -> cupy.ndarray: - """ - Return total duration of each element expressed in seconds. - - This method is currently not implemented. - """ - return self._column.total_seconds().values - - def ceil(self, freq: str) -> Self: - """ - Ceil to the specified resolution. - - This method is currently not implemented. - """ - return type(self)._from_column(self._column.ceil(freq), name=self.name) - - def floor(self, freq: str) -> Self: - """ - Floor to the specified resolution. - - This method is currently not implemented. - """ - return type(self)._from_column( - self._column.floor(freq), name=self.name - ) - - def round(self, freq: str) -> Self: - """ - Round to the specified resolution. - - This method is currently not implemented. 
- """ - return type(self)._from_column( - self._column.round(freq), name=self.name - ) - - @property # type: ignore - @_performance_tracking - def days(self) -> cudf.Index: - """ - Number of days for each element. - """ - # Need to specifically return `int64` to avoid overflow. - return Index._from_column( - self._column.days.astype("int64"), name=self.name - ) - - @property # type: ignore - @_performance_tracking - def seconds(self) -> cudf.Index: - """ - Number of seconds (>= 0 and less than 1 day) for each element. - """ - return Index._from_column( - self._column.seconds.astype("int32"), name=self.name - ) - - @property # type: ignore - @_performance_tracking - def microseconds(self) -> cudf.Index: - """ - Number of microseconds (>= 0 and less than 1 second) for each element. - """ - return Index._from_column( - self._column.microseconds.astype("int32"), name=self.name - ) - - @property # type: ignore - @_performance_tracking - def nanoseconds(self) -> cudf.Index: - """ - Number of nanoseconds (>= 0 and less than 1 microsecond) for each - element. - """ - return Index._from_column( - self._column.nanoseconds.astype("int32"), name=self.name - ) - - @property # type: ignore - @_performance_tracking - def components(self) -> cudf.DataFrame: - """ - Return a dataframe of the components (days, hours, minutes, - seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas. - """ - ca = cudf.core.column_accessor.ColumnAccessor( - self._column.components(), verify=False - ) - return cudf.DataFrame._from_data(ca) - - @property - def inferred_freq(self): - """ - Infers frequency of TimedeltaIndex. - - Notes - ----- - This property is currently not supported. - """ - raise NotImplementedError("inferred_freq is not yet supported") - - def _is_boolean(self) -> bool: - return False - - -class CategoricalIndex(Index): - """ - A categorical of orderable values that represent the indices of another - Column - - Parameters - ---------- - data : array-like (1-dimensional) - The values of the categorical. If categories are given, - values not in categories will be replaced with None/NaN. - categories : list-like, optional - The categories for the categorical. Items need to be unique. - If the categories are not given here (and also not in dtype), - they will be inferred from the data. - ordered : bool, optional - Whether or not this categorical is treated as an ordered categorical. - If not given here or in dtype, the resulting categorical will be - unordered. - dtype : CategoricalDtype or "category", optional - If CategoricalDtype, cannot be used together with categories or - ordered. - copy : bool, default False - Make a copy of input. - name : object, optional - Name to be stored in the index. - - Attributes - ---------- - codes - categories - - Methods - ------- - equals - - Returns - ------- - CategoricalIndex - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> cudf.CategoricalIndex( - ... data=[1, 2, 3, 4], categories=[1, 2], ordered=False, name="a") - CategoricalIndex([1, 2, , ], categories=[1, 2], ordered=False, dtype='category', name='a') - - >>> cudf.CategoricalIndex( - ... 
data=[1, 2, 3, 4], dtype=pd.CategoricalDtype([1, 2, 3]), name="a") - CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') - """ # noqa: E501 - - @_performance_tracking - def __init__( - self, - data=None, - categories=None, - ordered=None, - dtype=None, - copy=False, - name=None, - ): - if isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - if categories is not None or ordered is not None: - raise ValueError( - "Cannot specify `categories` or " - "`ordered` together with `dtype`." - ) - if copy: - data = column.as_column(data, dtype=dtype).copy(deep=True) - name = _getdefault_name(data, name=name) - if isinstance(data, CategoricalColumn): - data = data - elif isinstance(getattr(data, "dtype", None), pd.CategoricalDtype): - data = column.as_column(data) - else: - data = column.as_column( - data, dtype="category" if dtype is None else dtype - ) - # dtype has already been taken care - dtype = None - - if categories is not None: - data = data.set_categories(categories, ordered=ordered) - elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)): - data = data.set_categories(dtype.categories, ordered=ordered) - elif ordered is True and data.ordered is False: - data = data.as_ordered(ordered=True) - elif ordered is False and data.ordered is True: - data = data.as_ordered(ordered=False) - super().__init__(data, name=name) - - @classmethod - @_performance_tracking - def _from_column( - cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None - ) -> Self: - if not isinstance(column.dtype, cudf.CategoricalDtype): - raise ValueError("column must have a categorial type.") - return super()._from_column(column, name=name) - - @property - def ordered(self) -> bool: - return self._column.ordered - - @property # type: ignore - @_performance_tracking - def codes(self) -> cudf.Index: - """ - The category codes of this categorical. - """ - return Index._from_column(self._column.codes) - - @property # type: ignore - @_performance_tracking - def categories(self) -> cudf.Index: - """ - The categories of this categorical. - """ - return self.dtype.categories - - def _is_boolean(self) -> bool: - return False - - def _is_categorical(self) -> bool: - return True - - def add_categories(self, new_categories) -> Self: - """ - Add new categories. - - `new_categories` will be included at the last/highest place in the - categories and will be unused directly after this call. - """ - return type(self)._from_column( - self._column.add_categories(new_categories), name=self.name - ) - - def as_ordered(self) -> Self: - """ - Set the Categorical to be ordered. - """ - return type(self)._from_column( - self._column.as_ordered(ordered=True), name=self.name - ) - - def as_unordered(self) -> Self: - """ - Set the Categorical to be unordered. - """ - return type(self)._from_column( - self._column.as_ordered(ordered=False), name=self.name - ) - - def remove_categories(self, removals) -> Self: - """ - Remove the specified categories. - - `removals` must be included in the old categories. - - Parameters - ---------- - removals : category or list of categories - The categories which should be removed. - """ - return type(self)._from_column( - self._column.remove_categories(removals), name=self.name - ) - - def remove_unused_categories(self) -> Self: - """ - Remove categories which are not used. - - This method is currently not supported. 
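[Editor's note] A hedged sketch of the category-manipulation helpers documented above, assuming cudf with a GPU. Values outside the given categories are replaced with nulls, as the constructor docstring states.

import cudf

ci = cudf.CategoricalIndex([1, 2, 3, 4], categories=[1, 2], ordered=False, name="a")

ci.categories                   # Index([1, 2], ...)
ci.codes                        # integer category codes; the entries for 3 and 4 are null
ci = ci.add_categories([3, 4])  # appended at the end, unused until assigned
ci = ci.as_ordered()            # mark the categorical as ordered
ci = ci.remove_categories([4])  # removals must already be categories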
- """ - return type(self)._from_column( - self._column.remove_unused_categories(), name=self.name - ) - - def rename_categories(self, new_categories) -> Self: - """ - Rename categories. - - This method is currently not supported. - """ - return type(self)._from_column( - self._column.rename_categories(new_categories), name=self.name - ) - - def reorder_categories(self, new_categories, ordered=None) -> Self: - """ - Reorder categories as specified in new_categories. - - ``new_categories`` need to include all old categories and no new category - items. - - Parameters - ---------- - new_categories : Index-like - The categories in new order. - ordered : bool, optional - Whether or not the categorical is treated as a ordered categorical. - If not given, do not change the ordered information. - """ - return type(self)._from_column( - self._column.reorder_categories(new_categories, ordered=ordered), - name=self.name, - ) - - def set_categories( - self, new_categories, ordered=None, rename: bool = False - ) -> Self: - """ - Set the categories to the specified new_categories. - - Parameters - ---------- - new_categories : list-like - The categories in new order. - ordered : bool, default None - Whether or not the categorical is treated as - a ordered categorical. If not given, do - not change the ordered information. - rename : bool, default False - Whether or not the `new_categories` should be - considered as a rename of the old categories - or as reordered categories. - """ - return type(self)._from_column( - self._column.set_categories( - new_categories, ordered=ordered, rename=rename - ), - name=self.name, - ) - - -@_performance_tracking -def interval_range( - start=None, - end=None, - periods=None, - freq=None, - name=None, - closed="right", -) -> IntervalIndex: - """ - Returns a fixed frequency IntervalIndex. - - Parameters - ---------- - start : numeric, default None - Left bound for generating intervals. - end : numeric , default None - Right bound for generating intervals. - periods : int, default None - Number of periods to generate - freq : numeric, default None - The length of each interval. Must be consistent - with the type of start and end - name : str, default None - Name of the resulting IntervalIndex. - closed : {"left", "right", "both", "neither"}, default "right" - Whether the intervals are closed on the left-side, right-side, - both or neither. 
- - Returns - ------- - IntervalIndex - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> cudf.interval_range(start=0,end=5) - IntervalIndex([(0, 0], (1, 1], (2, 2], (3, 3], (4, 4], (5, 5]], - ...closed='right',dtype='interval') - >>> cudf.interval_range(start=0,end=10, freq=2,closed='left') - IntervalIndex([[0, 2), [2, 4), [4, 6), [6, 8), [8, 10)], - ...closed='left',dtype='interval') - >>> cudf.interval_range(start=0,end=10, periods=3,closed='left') - ...IntervalIndex([[0.0, 3.3333333333333335), - [3.3333333333333335, 6.666666666666667), - [6.666666666666667, 10.0)], - closed='left', - dtype='interval') - """ - nargs = sum(_ is not None for _ in (start, end, periods, freq)) - - # we need at least three of (start, end, periods, freq) - if nargs == 2 and freq is None: - freq = 1 - nargs += 1 - - if nargs != 3: - raise ValueError( - "Of the four parameters: start, end, periods, and " - "freq, exactly three must be specified" - ) - - start = cudf.Scalar(start) if start is not None else start - end = cudf.Scalar(end) if end is not None else end - if periods is not None and not cudf.api.types.is_integer(periods): - warnings.warn( - "Non-integer 'periods' in cudf.date_range, and cudf.interval_range" - " are deprecated and will raise in a future version.", - FutureWarning, - ) - periods = cudf.Scalar(int(periods)) if periods is not None else periods - freq = cudf.Scalar(freq) if freq is not None else freq - - if start is None: - start = end - freq * periods - elif freq is None: - quotient, remainder = divmod((end - start).value, periods.value) - if remainder: - freq = (end - start) / periods - else: - freq = cudf.Scalar(int(quotient)) - elif periods is None: - periods = cudf.Scalar(int((end - start) / freq)) - elif end is None: - end = start + periods * freq - - if any( - not _is_non_decimal_numeric_dtype(x.dtype) - for x in (start, periods, freq, end) - ): - raise ValueError("start, end, periods, freq must be numeric values.") - - periods = periods.astype("int64") - common_dtype = find_common_type((start.dtype, freq.dtype, end.dtype)) - start = start.astype(common_dtype) - freq = freq.astype(common_dtype) - - bin_edges = sequence( - size=periods + 1, - init=start.device_value, - step=freq.device_value, - ) - return IntervalIndex.from_breaks(bin_edges, closed=closed, name=name) - - -class IntervalIndex(Index): - """ - Immutable index of intervals that are closed on the same side. - - Parameters - ---------- - data : array-like (1-dimensional) - Array-like containing Interval objects from which to build the - IntervalIndex. - closed : {"left", "right", "both", "neither"}, default "right" - Whether the intervals are closed on the left-side, right-side, - both or neither. - dtype : dtype or None, default None - If None, dtype will be inferred. - copy : bool, default False - Copy the input data. - name : object, optional - Name to be stored in the index. 
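[Editor's note] A sketch of the "exactly three of four" rule enforced by interval_range above; it assumes cudf with a GPU, and the interval layouts in the comments follow the docstring examples.

import cudf

cudf.interval_range(start=0, end=10, freq=2, closed="left")
# [0, 2), [2, 4), [4, 6), [6, 8), [8, 10)

cudf.interval_range(start=0, end=10, periods=5)  # freq inferred as 2
cudf.interval_range(start=0, periods=5, freq=2)  # end inferred as 10

# With only two arguments, freq silently defaults to 1:
cudf.interval_range(start=0, end=5)

# Supplying all four (or fewer than two, even after the freq default) raises:
#   ValueError: Of the four parameters: start, end, periods, and freq,
#   exactly three must be specified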
- - Attributes - ---------- - values - - Methods - ------- - from_breaks - get_loc - - Returns - ------- - IntervalIndex - """ - - @_performance_tracking - def __init__( - self, - data, - closed: Literal["left", "right", "neither", "both"] | None = None, - dtype=None, - copy: bool = False, - name=None, - verify_integrity: bool = True, - ): - name = _getdefault_name(data, name=name) - - if dtype is not None: - dtype = cudf.dtype(dtype) - if not isinstance(dtype, IntervalDtype): - raise TypeError("dtype must be an IntervalDtype") - if closed is not None and closed != dtype.closed: - raise ValueError("closed keyword does not match dtype.closed") - closed = dtype.closed - - if closed is None and isinstance(dtype, IntervalDtype): - closed = dtype.closed - - closed = closed or "right" - - if len(data) == 0: - if not hasattr(data, "dtype"): - data = np.array([], dtype=np.int64) - elif isinstance(data.dtype, (pd.IntervalDtype, IntervalDtype)): - data = np.array([], dtype=data.dtype.subtype) - interval_col = IntervalColumn( - None, - dtype=IntervalDtype(data.dtype, closed), - size=len(data), - children=(as_column(data), as_column(data)), - ) - else: - col = as_column(data) - if not isinstance(col, IntervalColumn): - raise TypeError("data must be an iterable of Interval data") - if copy: - col = col.copy() - interval_col = IntervalColumn( - data=None, - dtype=IntervalDtype(col.dtype.subtype, closed), - mask=col.mask, - size=col.size, - offset=col.offset, - null_count=col.null_count, - children=col.children, # type: ignore[arg-type] - ) - - if dtype: - interval_col = interval_col.astype(dtype) # type: ignore[assignment] - - super().__init__(interval_col, name=name) - - @property - def closed(self) -> Literal["left", "right", "neither", "both"]: - return self.dtype.closed - - @classmethod - @_performance_tracking - def _from_column( - cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None - ) -> Self: - if not isinstance(column.dtype, cudf.IntervalDtype): - raise ValueError("column must have a interval type.") - return super()._from_column(column, name=name) - - @classmethod - @_performance_tracking - def from_breaks( - cls, - breaks, - closed: Literal["left", "right", "neither", "both"] | None = "right", - name=None, - copy: bool = False, - dtype=None, - ) -> Self: - """ - Construct an IntervalIndex from an array of splits. - - Parameters - ---------- - breaks : array-like (1-dimensional) - Left and right bounds for each interval. - closed : {"left", "right", "both", "neither"}, default "right" - Whether the intervals are closed on the left-side, right-side, - both or neither. - copy : bool, default False - Copy the input data. - name : object, optional - Name to be stored in the index. - dtype : dtype or None, default None - If None, dtype will be inferred. 
- - Returns - ------- - IntervalIndex - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> cudf.IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval[int64, right]') - """ - breaks = as_column(breaks, dtype=dtype) - if copy: - breaks = breaks.copy() - left_col = breaks.slice(0, len(breaks) - 1) - right_col = breaks.slice(1, len(breaks)) - # For indexing, children should both have 0 offset - right_col = type(right_col)( - data=right_col.data, - dtype=right_col.dtype, - size=right_col.size, - mask=right_col.mask, - offset=0, - null_count=right_col.null_count, - children=right_col.children, - ) - - interval_col = IntervalColumn( - data=None, - dtype=IntervalDtype(left_col.dtype, closed), - size=len(left_col), - children=(left_col, right_col), - ) - return IntervalIndex._from_column(interval_col, name=name) - - @classmethod - def from_arrays( - cls, - left, - right, - closed: Literal["left", "right", "both", "neither"] = "right", - copy: bool = False, - dtype=None, - ) -> Self: - raise NotImplementedError("from_arrays is currently not supported.") - - @classmethod - def from_tuples( - cls, - data, - closed: Literal["left", "right", "both", "neither"] = "right", - name=None, - copy: bool = False, - dtype=None, - ) -> Self: - piidx = pd.IntervalIndex.from_tuples( - data, closed=closed, name=name, copy=copy, dtype=dtype - ) - return cls.from_pandas(piidx) - - def __getitem__(self, index): - raise NotImplementedError( - "Getting a scalar from an IntervalIndex is not yet supported" - ) - - def _is_interval(self) -> bool: - return True - - def _is_boolean(self) -> bool: - return False - - def _clean_nulls_from_index(self) -> Self: - return self - - @property - def is_empty(self) -> cupy.ndarray: - """ - Indicates if an interval is empty, meaning it contains no points. - """ - return self._column.is_empty.values - - @property - def is_non_overlapping_monotonic(self) -> bool: - """ - Return a True if the IntervalIndex is non-overlapping and monotonic. - """ - return self._column.is_non_overlapping_monotonic - - @property - def is_overlapping(self) -> bool: - """ - Return True if the IntervalIndex has overlapping intervals, else False. - - Currently not implemented - """ - return self._column.is_overlapping - - @property - def length(self) -> Index: - """ - Return an Index with entries denoting the length of each Interval. - """ - return _index_from_data({None: self._column.length}) - - @property - def left(self) -> Index: - """ - Return left bounds of the intervals in the IntervalIndex. - - The left bounds of each interval in the IntervalIndex are - returned as an Index. The datatype of the left bounds is the - same as the datatype of the endpoints of the intervals. - """ - return _index_from_data({None: self._column.left}) - - @property - def mid(self) -> Index: - """ - Return the midpoint of each interval in the IntervalIndex as an Index. - - Each midpoint is calculated as the average of the left and right bounds - of each interval. - """ - return _index_from_data({None: self._column.mid}) - - @property - def right(self) -> Index: - """ - Return right bounds of the intervals in the IntervalIndex. - - The right bounds of each interval in the IntervalIndex are - returned as an Index. The datatype of the right bounds is the - same as the datatype of the endpoints of the intervals. 
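[Editor's note] A short sketch of the per-interval accessors documented above (left, right, mid, length, closed), assuming cudf with a GPU; the expected values are computed by hand from the breaks.

import cudf

ii = cudf.IntervalIndex.from_breaks([0, 1, 2, 4])  # (0, 1], (1, 2], (2, 4]

ii.left    # Index([0, 1, 2]), left endpoints
ii.right   # Index([1, 2, 4]), right endpoints
ii.mid     # Index([0.5, 1.5, 3.0]), averages of the endpoints
ii.length  # Index([1, 1, 2]), right minus left
ii.closed  # 'right'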
- """ - return _index_from_data({None: self._column.right}) - - def overlaps(self, other) -> cupy.ndarray: - """ - Check elementwise if an Interval overlaps the values in the IntervalIndex. - - Currently not supported. - """ - return self._column.overlaps(other).values - - def set_closed( - self, closed: Literal["left", "right", "both", "neither"] - ) -> Self: - """ - Return an identical IntervalArray closed on the specified side. - - Parameters - ---------- - closed : {'left', 'right', 'both', 'neither'} - Whether the intervals are closed on the left-side, right-side, both - or neither. - """ - return type(self)._from_column( - self._column.set_closed(closed), name=self.name - ) - - def to_tuples(self, na_tuple: bool = True) -> pd.Index: - """ - Return an Index of tuples of the form (left, right). - - Parameters - ---------- - na_tuple : bool, default True - If ``True``, return ``NA`` as a tuple ``(nan, nan)``. If ``False``, - just return ``NA`` as ``nan``. - """ - return self.to_pandas().to_tuples(na_tuple=na_tuple) - - -@_performance_tracking -def as_index( - arbitrary, nan_as_null=no_default, copy=False, name=no_default, dtype=None -) -> BaseIndex: - """Create an Index from an arbitrary object - - Parameters - ---------- - arbitrary : object - Object to construct the Index from. See *Notes*. - nan_as_null : bool, optional, default None - If None (default), treats NaN values in arbitrary as null. - If True, combines the mask and NaNs to - form a new validity mask. If False, leaves NaN values as is. - copy : bool, default False - If True, Make copies of `arbitrary` if possible and create an - Index out of it. - If False, `arbitrary` will be shallow-copied if it is a - device-object to construct an Index. - name : object, optional - Name of the index being created, by default it is `None`. - dtype : optional - Optionally typecast the constructed Index to the given - dtype. - - Returns - ------- - result : subclass of Index - - CategoricalIndex for Categorical input. - - DatetimeIndex for Datetime input. - - Index for all other inputs. - - Notes - ----- - Currently supported inputs are: - - * ``Column`` - * ``Buffer`` - * ``Series`` - * ``Index`` - * numba device array - * numpy array - * pyarrow array - * pandas.Categorical - - Returns - ------- - result : subclass of Index - - CategoricalIndex for Categorical input. - - DatetimeIndex for Datetime input. - - Index for all other inputs. 
- """ - if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) - - if name is no_default: - name = getattr(arbitrary, "name", None) - - if isinstance(arbitrary, cudf.MultiIndex): - if dtype is not None: - raise TypeError( - "dtype must be `None` for inputs of type: " - f"{type(arbitrary).__name__}, found {dtype=} " - ) - return arbitrary.copy(deep=copy) - elif isinstance(arbitrary, BaseIndex): - idx = arbitrary.copy(deep=copy).rename(name) - elif isinstance(arbitrary, ColumnBase): - raise ValueError("Use cudf.Index._from_column instead.") - elif isinstance(arbitrary, (pd.RangeIndex, range)): - idx = RangeIndex( - start=arbitrary.start, - stop=arbitrary.stop, - step=arbitrary.step, - name=name, - ) - elif isinstance(arbitrary, pd.MultiIndex): - if dtype is not None: - raise TypeError( - "dtype must be `None` for inputs of type: " - f"{type(arbitrary).__name__}, found {dtype=} " - ) - return cudf.MultiIndex.from_pandas( - arbitrary.copy(deep=copy), nan_as_null=nan_as_null - ) - elif isinstance(arbitrary, cudf.DataFrame) or is_scalar(arbitrary): - raise ValueError("Index data must be 1-dimensional and list-like") - else: - return Index._from_column( - column.as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null), - name=name, - ) - if dtype is not None: - idx = idx.astype(dtype) - return idx - - -def _getdefault_name(values, name): - if name is None: - return getattr(values, "name", None) - return name - - -@_performance_tracking -def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: - """ - An internal Utility function to concat RangeIndex objects. - """ - start = step = next_ = None - - # Filter the empty indexes - non_empty_indexes = [obj for obj in indexes if len(obj)] - - if not non_empty_indexes: - # Here all "indexes" had 0 length, i.e. were empty. - # In this case return an empty range index. - return RangeIndex(0, 0) - - for obj in non_empty_indexes: - if start is None: - # This is set by the first non-empty index - start = obj.start - if step is None and len(obj) > 1: - step = obj.step - elif step is None: - # First non-empty index had only one element - if obj.start == start: - result = Index._from_column( - concat_columns([x._column for x in indexes]) - ) - return result - step = obj.start - start - - non_consecutive = (step != obj.step and len(obj) > 1) or ( - next_ is not None and obj.start != next_ - ) - if non_consecutive: - result = Index._from_column( - concat_columns([x._column for x in indexes]) - ) - return result - if step is not None: - next_ = obj[-1] + step - - stop = non_empty_indexes[-1].stop if next_ is None else next_ - return RangeIndex(start, stop, step) - - -@_performance_tracking -def _extended_gcd(a: int, b: int) -> tuple[int, int, int]: - """ - Extended Euclidean algorithms to solve Bezout's identity: - a*x + b*y = gcd(x, y) - Finds one particular solution for x, y: s, t - Returns: gcd, s, t - """ - s, old_s = 0, 1 - t, old_t = 1, 0 - r, old_r = b, a - while r: - quotient = old_r // r - old_r, r = r, old_r - quotient * r - old_s, s = s, old_s - quotient * s - old_t, t = t, old_t - quotient * t - return old_r, old_s, old_t - - -def _get_indexer_basic(index, positions, method, target_col, tolerance): - # `positions` will be modified in-place, so it is the - # responsibility of the caller to decide whether or not - # to make a copy of it before passing it to this method. 
- nonexact = positions == -1 - positions[nonexact] = index.searchsorted( - target_col[nonexact], - side="left" if method in {"pad", "ffill"} else "right", - ) - if method in {"pad", "ffill"}: - # searchsorted returns "indices into a sorted array such that, - # if the corresponding elements in v were inserted before the - # indices, the order of a would be preserved". - # Thus, we need to subtract 1 to find values to the left. - positions[nonexact] -= 1 - # This also mapped not found values (values of 0 from - # np.searchsorted) to -1, which conveniently is also our - # sentinel for missing values - else: - # Mark indices to the right of the largest value as not found - positions[positions == len(index)] = np.int32(-1) - - if tolerance is not None: - distance = abs(index[positions] - target_col) - return positions.where(distance <= tolerance, -1) - return positions - - -def _get_nearest_indexer( - index: Index, - positions: cudf.Series, - target_col: cudf.core.column.ColumnBase, - tolerance: int | float, -): - """ - Get the indexer for the nearest index labels; requires an index with - values that can be subtracted from each other. - """ - left_indexer = _get_indexer_basic( - index=index, - positions=positions.copy(deep=True), - method="pad", - target_col=target_col, - tolerance=tolerance, - ) - right_indexer = _get_indexer_basic( - index=index, - # positions no longer used so don't copy - positions=positions, - method="backfill", - target_col=target_col, - tolerance=tolerance, - ) - - left_distances = abs(index[left_indexer] - target_col) - right_distances = abs(index[right_indexer] - target_col) - - op = operator.lt if index.is_monotonic_increasing else operator.le - indexer = left_indexer.where( - op(left_distances, right_distances) | (right_indexer == -1), - right_indexer, - ) - - if tolerance is not None: - distance = abs(index[indexer] - target_col) - return indexer.where(distance <= tolerance, -1) - return indexer - - -def _validate_freq(freq: Any) -> cudf.DateOffset | None: - if isinstance(freq, str): - return cudf.DateOffset._from_freqstr(freq) - elif freq is None: - return freq - elif freq is not None and not isinstance(freq, cudf.DateOffset): - raise ValueError(f"Invalid frequency: {freq}") - return cast(cudf.DateOffset, freq) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py deleted file mode 100644 index 5952815deef..00000000000 --- a/python/cudf/cudf/core/indexed_frame.py +++ /dev/null @@ -1,6684 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
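[Editor's note] A simplified NumPy analogy of the pad/backfill/nearest logic implemented above: for each target, take the nearest existing label, or -1 when it lies outside the tolerance. This only mirrors the idea; the deleted code operates on cudf columns and indexes.

import numpy as np

index = np.array([0, 10, 20, 30])
targets = np.array([2, 19, 28])

pad = np.searchsorted(index, targets, side="right") - 1  # label at or before each target
backfill = np.searchsorted(index, targets, side="left")  # label at or after each target
backfill = np.clip(backfill, 0, len(index) - 1)

left_dist = np.abs(index[pad] - targets)
right_dist = np.abs(index[backfill] - targets)
nearest = np.where(left_dist < right_dist, pad, backfill)
# positions [0, 2, 3] -> nearest labels 0, 20, 30

tolerance = 1
nearest = np.where(np.abs(index[nearest] - targets) <= tolerance, nearest, -1)
# [-1, 2, -1]: only 19 is within 1 of an existing label (20)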
-"""Base class for Frame types that have an index.""" - -from __future__ import annotations - -import operator -import textwrap -import warnings -from collections import Counter, abc -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Literal, - MutableMapping, - TypeVar, - cast, -) -from uuid import uuid4 - -import cupy as cp -import numpy as np -import pandas as pd -from typing_extensions import Self - -import pylibcudf - -import cudf -import cudf._lib as libcudf -import cudf.core -import cudf.core.algorithms -from cudf.api.extensions import no_default -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - is_dict_like, - is_list_like, - is_scalar, -) -from cudf.core._base_index import BaseIndex -from cudf.core._compat import PANDAS_LT_300 -from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase, NumericalColumn, as_column -from cudf.core.column_accessor import ColumnAccessor -from cudf.core.copy_types import BooleanMask, GatherMap -from cudf.core.dtypes import ListDtype -from cudf.core.frame import Frame -from cudf.core.groupby.groupby import GroupBy -from cudf.core.index import RangeIndex, _index_from_data, ensure_index -from cudf.core.missing import NA -from cudf.core.multiindex import MultiIndex -from cudf.core.resample import _Resampler -from cudf.core.udf.utils import ( - _compile_or_get, - _get_input_args_from_frame, - _post_process_output_col, - _return_arr_from_dtype, -) -from cudf.core.window import ExponentialMovingWindow, Rolling -from cudf.utils import docutils, ioutils -from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.docutils import copy_docstring -from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import _warn_no_dask_cudf - -if TYPE_CHECKING: - from cudf._typing import ( - ColumnLike, - DataFrameOrSeries, - Dtype, - NotImplementedType, - ) - - -doc_reset_index_template = """ - Reset the index of the {klass}, or a level of it. - - Parameters - ---------- - level : int, str, tuple, or list, default None - Only remove the given levels from the index. Removes all levels by - default. - drop : bool, default False - Do not try to insert index into dataframe columns. This resets - the index to the default integer index. -{argument} - inplace : bool, default False - Modify the DataFrame in place (do not create a new object). - allow_duplicates : bool, default False - Allow duplicate column labels to be created. - Currently not supported. - - Returns - ------- - {return_type} - {klass} with the new index or None if ``inplace=True``.{return_doc} - - Examples - -------- - {example} -""" - - -doc_binop_template = textwrap.dedent( - """ - Get {operation} of DataFrame or Series and other, element-wise (binary - operator `{op_name}`). - - Equivalent to ``frame + other``, but with support to substitute a - ``fill_value`` for missing data in one of the inputs. - - Parameters - ---------- - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - axis : int or string - Only ``0`` is supported for series, ``1`` or ``columns`` supported - for dataframe - level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level. Not yet supported. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. 
If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame or Series - Result of the arithmetic operation. - - Examples - -------- - - **DataFrame** - - >>> df = cudf.DataFrame( - ... {{'angles': [0, 3, 4], 'degrees': [360, 180, 360]}}, - ... index=['circle', 'triangle', 'rectangle'] - ... ) - {df_op_example} - - **Series** - - >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd']) - >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e']) - {ser_op_example} - """ -) - - -def _get_unique_drop_labels(array): - """Return labels to be dropped for IndexFrame.drop.""" - if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)): - yield from np.unique(as_column(array).values_host) - elif is_scalar(array): - yield array - else: - yield from set(array) - - -def _indices_from_labels(obj, labels): - if not isinstance(labels, cudf.MultiIndex): - labels = cudf.core.column.as_column(labels) - labels = labels.astype(obj.index.dtype) - idx_labels = cudf.Index._from_column(labels) - else: - idx_labels = labels - - # join is not guaranteed to maintain the index ordering - # so we will sort it with its initial ordering which is stored - # in column "__" - lhs = cudf.DataFrame( - {"__": as_column(range(len(idx_labels)))}, index=idx_labels - ) - rhs = cudf.DataFrame({"_": as_column(range(len(obj)))}, index=obj.index) - return lhs.join(rhs).sort_values(by=["__", "_"])["_"] - - -def _get_label_range_or_mask(index, start, stop, step): - if ( - not (start is None and stop is None) - and type(index) is cudf.core.index.DatetimeIndex - ): - start = pd.to_datetime(start) - stop = pd.to_datetime(stop) - if start is not None and stop is not None: - if start > stop: - return slice(0, 0, None) - if (start in index) and (stop in index): - # when we have a non-monotonic datetime index, return - # values in the slice defined by index_of(start) and - # index_of(end) - start_loc = index.get_loc(start) - stop_loc = index.get_loc(stop) + 1 - return slice(start_loc, stop_loc) - else: - raise KeyError( - "Value based partial slicing on non-monotonic " - "DatetimeIndexes with non-existing keys is not allowed.", - ) - elif start is not None: - if index.is_monotonic_increasing: - return index >= start - elif index.is_monotonic_decreasing: - return index <= start - else: - return index.find_label_range(slice(start, stop, step)) - else: - if index.is_monotonic_increasing: - return index <= stop - elif index.is_monotonic_decreasing: - return index >= stop - else: - return index.find_label_range(slice(start, stop, step)) - else: - return index.find_label_range(slice(start, stop, step)) - - -class _FrameIndexer: - """Parent class for indexers.""" - - def __init__(self, frame): - self._frame = frame - - -_LocIndexerClass = TypeVar("_LocIndexerClass", bound="_FrameIndexer") -_IlocIndexerClass = TypeVar("_IlocIndexerClass", bound="_FrameIndexer") - - -class IndexedFrame(Frame): - """A frame containing an index. - - This class encodes the common behaviors for core user-facing classes like - DataFrame and Series that consist of a sequence of columns along with a - special set of index columns. - - Parameters - ---------- - data : dict - An dict mapping column names to Columns - index : Table - A Frame representing the (optional) index columns. 
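[Editor's note] A hedged illustration of the label-slicing rules encoded in _get_label_range_or_mask above; it assumes cudf with a GPU and pandas-style .loc semantics.

import cudf

s = cudf.Series(
    [1, 2, 3, 4],
    index=cudf.DatetimeIndex(
        ["2020-01-01", "2020-01-02", "2020-01-03", "2020-01-04"]
    ),
)

# Monotonic increasing index: a partial date slice is resolved as a boolean
# mask (index >= start) & (index <= stop), so endpoints need not be present.
s.loc["2020-01-02":"2020-01-03"]

# Non-monotonic DatetimeIndex: both endpoints must exist in the index; the
# result is the positional span between index_of(start) and index_of(stop),
# otherwise a KeyError is raised.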
- """ - - # mypy can't handle bound type variables as class members - _loc_indexer_type: type[_LocIndexerClass] # type: ignore - _iloc_indexer_type: type[_IlocIndexerClass] # type: ignore - _groupby = GroupBy - _resampler = _Resampler - - _VALID_SCANS = { - "cumsum", - "cumprod", - "cummin", - "cummax", - } - - # Necessary because the function names don't directly map to the docs. - _SCAN_DOCSTRINGS = { - "cumsum": {"op_name": "cumulative sum"}, - "cumprod": {"op_name": "cumulative product"}, - "cummin": {"op_name": "cumulative min"}, - "cummax": {"op_name": "cumulative max"}, - } - - def __init__( - self, - data: ColumnAccessor | MutableMapping[Any, ColumnBase], - index: BaseIndex, - ): - super().__init__(data=data) - if not isinstance(index, cudf.core._base_index.BaseIndex): - raise ValueError( - f"index must be a cudf index not {type(index).__name__}" - ) - self._index = index - - @property - def _num_rows(self) -> int: - # Important to use the index because the data may be empty. - return len(self.index) - - @property - def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? - return self.index._column_names - - @classmethod - def _from_data( - cls, - data: MutableMapping, - index: BaseIndex | None = None, - ): - out = super()._from_data(data) - if not (index is None or isinstance(index, BaseIndex)): - raise ValueError( - f"index must be None or a cudf.Index not {type(index).__name__}" - ) - # out._num_rows requires .index to be defined - out._index = RangeIndex(out._data.nrows) if index is None else index - return out - - @_performance_tracking - def _from_data_like_self(self, data: MutableMapping): - out = super()._from_data_like_self(data) - out.index = self.index - return out - - @_performance_tracking - def _from_columns_like_self( - self, - columns: list[ColumnBase], - column_names: abc.Iterable[str] | None = None, - index_names: list[str] | None = None, - ) -> Self: - """Construct a `Frame` from a list of columns with metadata from self. - - If `index_names` is set, the first `len(index_names)` columns are - used to construct the index of the frame. - """ - if column_names is None: - column_names = self._column_names - - data_columns = columns - index = None - - if index_names is not None: - n_index_columns = len(index_names) - data_columns = columns[n_index_columns:] - index = _index_from_data( - dict(enumerate(columns[:n_index_columns])) - ) - index = index._copy_type_metadata(self.index) - # TODO: Should this if statement be handled in Index._copy_type_metadata? - if ( - isinstance(self.index, cudf.CategoricalIndex) - and not isinstance(index, cudf.CategoricalIndex) - ) or ( - isinstance(self.index, cudf.MultiIndex) - and not isinstance(index, cudf.MultiIndex) - ): - index = type(self.index)._from_data(index._data) - if isinstance(index, cudf.MultiIndex): - index.names = index_names - else: - index.name = index_names[0] - - data = dict(zip(column_names, data_columns)) - frame = type(self)._from_data(data, index) - return frame._copy_type_metadata(self) - - def __round__(self, digits=0): - # Shouldn't be added to BinaryOperand - # because pandas Index doesn't implement - # this method. - return self.round(decimals=digits) - - def _mimic_inplace( - self, result: Self, inplace: bool = False - ) -> Self | None: - if inplace: - self._index = result.index - return super()._mimic_inplace(result, inplace) - - @_performance_tracking - def _scan(self, op, axis=None, skipna=True): - """ - Return {op_name} of the {cls}. 
- - Parameters - ---------- - axis: {{index (0), columns(1)}} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, - the result will be NA. - - Returns - ------- - {cls} - - Examples - -------- - **Series** - - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.cumsum() - 0 1 - 1 6 - 2 8 - 3 12 - 4 15 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({{'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}}) - >>> s.cumsum() - a b - 0 1 7 - 1 3 15 - 2 6 24 - 3 10 34 - """ - cast_to_int = op in ("cumsum", "cumprod") - skipna = True if skipna is None else skipna - - results = [] - for col in self._columns: - if skipna: - result_col = col.nans_to_nulls() - else: - if col.has_nulls(include_nan=True): - first_index = col.isnull().find_first_value(True) - result_col = col.copy() - result_col[first_index:] = None - else: - result_col = col - - if cast_to_int and result_col.dtype.kind in "uib": - # For reductions that accumulate a value (e.g. sum, not max) - # pandas returns an int64 dtype for all int or bool dtypes. - result_col = result_col.astype(np.int64) - results.append(getattr(result_col, op)()) - return self._from_data_like_self( - self._data._from_columns_like_self(results) - ) - - def _check_data_index_length_match(self) -> None: - # Validate that the number of rows in the data matches the index if the - # data is not empty. This is a helper for the constructor. - # TODO: Use self._num_rows once DataFrame.__init__ is cleaned up - if self._data.nrows > 0 and self._data.nrows != len(self.index): - raise ValueError( - f"Length of values ({self._data.nrows}) does not " - f"match length of index ({len(self.index)})" - ) - - @property - @_performance_tracking - def empty(self): - """ - Indicator whether DataFrame or Series is empty. - - True if DataFrame/Series is entirely empty (no items), - meaning any of the axes are of length 0. - - Returns - ------- - out : bool - If DataFrame/Series is empty, return True, if not return False. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'A' : []}) - >>> df - Empty DataFrame - Columns: [A] - Index: [] - >>> df.empty - True - - If we only have `null` values in our DataFrame, it is - not considered empty! We will need to drop - the `null`'s to make the DataFrame empty: - - >>> df = cudf.DataFrame({'A' : [None, None]}) - >>> df - A - 0 - 1 - >>> df.empty - False - >>> df.dropna().empty - True - - Non-empty and empty Series example: - - >>> s = cudf.Series([1, 2, None]) - >>> s - 0 1 - 1 2 - 2 - dtype: int64 - >>> s.empty - False - >>> s = cudf.Series([]) - >>> s - Series([], dtype: float64) - >>> s.empty - True - - .. pandas-compat:: - :attr:`pandas.DataFrame.empty`, :attr:`pandas.Series.empty` - - If DataFrame/Series contains only `null` values, it is still not - considered empty. See the example above. - """ - return self.size == 0 - - @_performance_tracking - @ioutils.doc_to_json() - def to_json(self, path_or_buf=None, *args, **kwargs): - """{docstring}""" - - return cudf.io.json.to_json( - self, path_or_buf=path_or_buf, *args, **kwargs - ) - - @_performance_tracking - @ioutils.doc_to_hdf() - def to_hdf(self, path_or_buf, key, *args, **kwargs): - """{docstring}""" - - cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - - @_performance_tracking - def to_string(self): - r""" - Convert to string - - cuDF uses Pandas internals for efficient string formatting. 
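[Editor's note] A hedged sketch of the scan behavior implemented in _scan above, assuming cudf with a GPU: cumsum/cumprod on integer or boolean columns are accumulated as int64, and skipna controls whether nulls poison the rest of the scan.

import cudf

s = cudf.Series([1, 2, None, 4], dtype="int8")

s.cumsum()              # [1, 3, <NA>, 7], upcast to int64; nulls are skipped
s.cumsum(skipna=False)  # [1, 3, <NA>, <NA>]; everything after the first null is null
s.cummax()              # stays int8; only cumsum/cumprod are upcast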
- Set formatting options using pandas string formatting options and - cuDF objects will print identically to Pandas objects. - - cuDF supports `null/None` as a value in any column type, which - is transparently supported during this output process. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2] - >>> df['val'] = [float(i + 10) for i in range(3)] - >>> df.to_string() - ' key val\n0 0 10.0\n1 1 11.0\n2 2 12.0' - """ - return str(self) - - def copy(self, deep: bool = True) -> Self: - """Make a copy of this object's indices and data. - - When ``deep=True`` (default), a new object will be created with a - copy of the calling object's data and indices. Modifications to - the data or indices of the copy will not be reflected in the - original object (see notes below). - When ``deep=False``, a new object will be created without copying - the calling object's data or index (only references to the data - and index are copied). Any changes to the data of the original - will be reflected in the shallow copy (and vice versa). - - Parameters - ---------- - deep : bool, default True - Make a deep copy, including a copy of the data and the indices. - With ``deep=False`` neither the indices nor the data are copied. - - Returns - ------- - copy : Series or DataFrame - Object type matches caller. - - Examples - -------- - >>> s = cudf.Series([1, 2], index=["a", "b"]) - >>> s - a 1 - b 2 - dtype: int64 - >>> s_copy = s.copy() - >>> s_copy - a 1 - b 2 - dtype: int64 - - **Shallow copy versus default (deep) copy:** - - >>> s = cudf.Series([1, 2], index=["a", "b"]) - >>> deep = s.copy() - >>> shallow = s.copy(deep=False) - - Updates to the data shared by shallow copy and original is reflected - in both; deep copy remains unchanged. - - >>> s['a'] = 3 - >>> shallow['b'] = 4 - >>> s - a 3 - b 4 - dtype: int64 - >>> shallow - a 3 - b 4 - dtype: int64 - >>> deep - a 1 - b 2 - dtype: int64 - """ - return self._from_data( - self._data.copy(deep=deep), - # Indexes are immutable so copies can always be shallow. - self.index.copy(deep=False), - ) - - @_performance_tracking - def equals(self, other) -> bool: # noqa: D102 - return super().equals(other) and self.index.equals(other.index) - - @property - def index(self): - """Get the labels for the rows.""" - return self._index - - @index.setter - def index(self, value): - old_length = len(self) - new_length = len(value) - - # A DataFrame with 0 columns can have an index of arbitrary length. - if self._num_columns > 0 and new_length != old_length: - raise ValueError( - f"Length mismatch: Expected axis has {old_length} elements, " - f"new values have {len(value)} elements" - ) - # avoid unnecessary cast to Index - value = ensure_index(value) - self._index = value - - @_performance_tracking - def replace( - self, - to_replace=None, - value=no_default, - inplace=False, - limit=None, - regex=False, - method=no_default, - ): - """Replace values given in ``to_replace`` with ``value``. - - Parameters - ---------- - to_replace : numeric, str or list-like - Value(s) to replace. - - * numeric or str: - - values equal to ``to_replace`` will be replaced - with ``value`` - * list of numeric or str: - - If ``value`` is also list-like, ``to_replace`` and - ``value`` must be of same length. - * dict: - - Dicts can be used to specify different replacement values - for different existing values. For example, {'a': 'b', - 'y': 'z'} replaces the value 'a' with 'b' and - 'y' with 'z'. 
- To use a dict in this way the ``value`` parameter should - be ``None``. - value : scalar, dict, list-like, str, default None - Value to replace any values matching ``to_replace`` with. - inplace : bool, default False - If True, in place. - - See Also - -------- - Series.fillna - - Raises - ------ - TypeError - - If ``to_replace`` is not a scalar, array-like, dict, or None - - If ``to_replace`` is a dict and value is not a list, dict, - or Series - ValueError - - If a list is passed to ``to_replace`` and ``value`` but they - are not the same length. - - Returns - ------- - result : Series - Series after replacement. The mask and index are preserved. - - Examples - -------- - **Series** - - Scalar ``to_replace`` and ``value`` - - >>> import cudf - >>> s = cudf.Series([0, 1, 2, 3, 4]) - >>> s - 0 0 - 1 1 - 2 2 - 3 3 - 4 4 - dtype: int64 - >>> s.replace(0, 5) - 0 5 - 1 1 - 2 2 - 3 3 - 4 4 - dtype: int64 - - List-like ``to_replace`` - - >>> s.replace([1, 2], 10) - 0 0 - 1 10 - 2 10 - 3 3 - 4 4 - dtype: int64 - - dict-like ``to_replace`` - - >>> s.replace({1:5, 3:50}) - 0 0 - 1 5 - 2 2 - 3 50 - 4 4 - dtype: int64 - >>> s = cudf.Series(['b', 'a', 'a', 'b', 'a']) - >>> s - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - >>> s.replace({'a': None}) - 0 b - 1 - 2 - 3 b - 4 - dtype: object - - If there is a mismatch in types of the values in - ``to_replace`` & ``value`` with the actual series, then - cudf exhibits different behavior with respect to pandas - and the pairs are ignored silently: - - >>> s = cudf.Series(['b', 'a', 'a', 'b', 'a']) - >>> s - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - >>> s.replace('a', 1) - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - >>> s.replace(['a', 'c'], [1, 2]) - 0 b - 1 a - 2 a - 3 b - 4 a - dtype: object - - **DataFrame** - - Scalar ``to_replace`` and ``value`` - - >>> import cudf - >>> df = cudf.DataFrame({'A': [0, 1, 2, 3, 4], - ... 'B': [5, 6, 7, 8, 9], - ... 'C': ['a', 'b', 'c', 'd', 'e']}) - >>> df - A B C - 0 0 5 a - 1 1 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - >>> df.replace(0, 5) - A B C - 0 5 5 a - 1 1 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - - List-like ``to_replace`` - - >>> df.replace([0, 1, 2, 3], 4) - A B C - 0 4 5 a - 1 4 6 b - 2 4 7 c - 3 4 8 d - 4 4 9 e - >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1]) - A B C - 0 4 5 a - 1 3 6 b - 2 2 7 c - 3 1 8 d - 4 4 9 e - - dict-like ``to_replace`` - - >>> df.replace({0: 10, 1: 100}) - A B C - 0 10 5 a - 1 100 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - >>> df.replace({'A': 0, 'B': 5}, 100) - A B C - 0 100 100 a - 1 1 6 b - 2 2 7 c - 3 3 8 d - 4 4 9 e - - .. pandas-compat:: - :meth:`pandas.DataFrame.replace`, :meth:`pandas.Series.replace` - - Parameters that are currently not supported are: `limit`, `regex`, - `method` - """ - if limit is not None: - raise NotImplementedError("limit parameter is not implemented yet") - - if regex: - raise NotImplementedError("regex parameter is not implemented yet") - - if method is not no_default: - warnings.warn( - "The 'method' keyword in " - f"{type(self).__name__}.replace is deprecated and " - "will be removed in a future version.", - FutureWarning, - ) - elif method not in {"pad", None, no_default}: - raise NotImplementedError("method parameter is not implemented") - - if ( - value is no_default - and method is no_default - and not is_dict_like(to_replace) - and regex is False - ): - warnings.warn( - f"{type(self).__name__}.replace without 'value' and with " - "non-dict-like 'to_replace' is deprecated " - "and will raise in a future version. 
" - "Explicitly specify the new values instead.", - FutureWarning, - ) - if not (to_replace is None and value is no_default): - ( - all_na_per_column, - to_replace_per_column, - replacements_per_column, - ) = _get_replacement_values_for_columns( - to_replace=to_replace, - value=value, - columns_dtype_map=dict(self._dtypes), - ) - copy_data = [] - for name, col in self._column_labels_and_values: - try: - replaced = col.find_and_replace( - to_replace_per_column[name], - replacements_per_column[name], - all_na_per_column[name], - ) - except (KeyError, OverflowError): - # We need to create a deep copy if: - # i. `find_and_replace` was not successful or any of - # `to_replace_per_column`, `replacements_per_column`, - # `all_na_per_column` don't contain the `name` - # that exists in `copy_data`. - # ii. There is an OverflowError while trying to cast - # `to_replace_per_column` to `replacements_per_column`. - replaced = col.copy(deep=True) - copy_data.append(replaced) - result = self._from_data_like_self( - self._data._from_columns_like_self(copy_data) - ) - else: - result = self.copy() - - return self._mimic_inplace(result, inplace=inplace) - - @_performance_tracking - def clip(self, lower=None, upper=None, axis=1, inplace=False): - """ - Trim values at input threshold(s). - - Assigns values outside boundary to boundary values. - Thresholds can be singular values or array like, - and in the latter case the clipping is performed - element-wise in the specified axis. Currently only - `axis=1` is supported. - - Parameters - ---------- - lower : scalar or array_like, default None - Minimum threshold value. All values below this - threshold will be set to it. If it is None, - there will be no clipping based on lower. - In case of Series/Index, lower is expected to be - a scalar or an array of size 1. - upper : scalar or array_like, default None - Maximum threshold value. All values below this - threshold will be set to it. If it is None, - there will be no clipping based on upper. - In case of Series, upper is expected to be - a scalar or an array of size 1. 
- inplace : bool, default False - - Returns - ------- - Clipped DataFrame/Series/Index/MultiIndex - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"a":[1, 2, 3, 4], "b":['a', 'b', 'c', 'd']}) - >>> df.clip(lower=[2, 'b'], upper=[3, 'c']) - a b - 0 2 b - 1 2 b - 2 3 c - 3 3 c - - >>> df.clip(lower=None, upper=[3, 'c']) - a b - 0 1 a - 1 2 b - 2 3 c - 3 3 c - - >>> df.clip(lower=[2, 'b'], upper=None) - a b - 0 2 b - 1 2 b - 2 3 c - 3 4 d - - >>> df.clip(lower=2, upper=3, inplace=True) - >>> df - a b - 0 2 2 - 1 2 3 - 2 3 3 - 3 3 3 - - >>> import cudf - >>> sr = cudf.Series([1, 2, 3, 4]) - >>> sr.clip(lower=2, upper=3) - 0 2 - 1 2 - 2 3 - 3 3 - dtype: int64 - - >>> sr.clip(lower=None, upper=3) - 0 1 - 1 2 - 2 3 - 3 3 - dtype: int64 - - >>> sr.clip(lower=2, upper=None, inplace=True) - >>> sr - 0 2 - 1 2 - 2 3 - 3 4 - dtype: int64 - """ - if axis != 1: - raise NotImplementedError("`axis is not yet supported in clip`") - - if lower is None and upper is None: - return None if inplace is True else self.copy(deep=True) - - if is_scalar(lower): - lower = np.full(self._num_columns, lower) - if is_scalar(upper): - upper = np.full(self._num_columns, upper) - - if len(lower) != len(upper): - raise ValueError("Length of lower and upper should be equal") - - if len(lower) != self._num_columns: - raise ValueError( - "Length of lower/upper should be equal to number of columns" - ) - - if self.ndim == 1: - # In case of series and Index, - # swap lower and upper if lower > upper - if ( - lower[0] is not None - and upper[0] is not None - and (lower[0] > upper[0]) - ): - lower[0], upper[0] = upper[0], lower[0] - - data = ( - col.clip(low, high) - for col, low, high in zip(self._columns, lower, upper) - ) - output = self._from_data_like_self( - self._data._from_columns_like_self(data) - ) - return self._mimic_inplace(output, inplace=inplace) - - @_performance_tracking - def abs(self): - """ - Return a Series/DataFrame with absolute numeric value of each element. - - This function only applies to elements that are all numeric. - - Returns - ------- - DataFrame/Series - Absolute value of each element. - - Examples - -------- - Absolute numeric values in a Series - - >>> s = cudf.Series([-1.10, 2, -3.33, 4]) - >>> s.abs() - 0 1.10 - 1 2.00 - 2 3.33 - 3 4.00 - dtype: float64 - """ - return self._unaryop("abs") - - @_performance_tracking - def dot(self, other, reflect=False): - """ - Get dot product of frame and other, (binary operator `dot`). - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - other : Sequence, Series, or DataFrame - Any multiple element data structure, or list-like object. - reflect : bool, default False - If ``True``, swap the order of the operands. See - https://docs.python.org/3/reference/datamodel.html#object.__ror__ - for more information on when this is necessary. - - Returns - ------- - scalar, Series, or DataFrame - The result of the operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame([[1, 2, 3, 4], - ... [5, 6, 7, 8]]) - >>> df @ df.T - 0 1 - 0 30 70 - 1 70 174 - >>> s = cudf.Series([1, 1, 1, 1]) - >>> df @ s - 0 10 - 1 26 - dtype: int64 - >>> [1, 2, 3, 4] @ s - 10 - """ - # TODO: This function does not currently support nulls. 
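The alignment step performed by the deleted `dot` implementation (take the union of the two indexes, reject mismatched label sets, then reindex both operands before multiplying) can be sketched with pandas, which applies the same rule; the variable names here are illustrative only:

import pandas as pd

s1 = pd.Series([1, 2, 3], index=["a", "b", "c"])
s2 = pd.Series([10, 20, 30], index=["c", "a", "b"])

# Align both operands on the union of their indexes; if either index is
# missing labels from the union, the operands are "not aligned".
common = s1.index.union(s2.index)
if len(common) > len(s1.index) or len(common) > len(s2.index):
    raise ValueError("matrices are not aligned")
lhs = s1.reindex(common).to_numpy()
rhs = s2.reindex(common).to_numpy()
print(lhs @ rhs)  # 1*20 + 2*30 + 3*10 = 110
print(s1 @ s2)    # the @ operator applies the same alignment: 110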
- lhs = self.values - result_index = None - result_cols = None - if isinstance(self, cudf.Series) and isinstance( - other, (cudf.Series, cudf.DataFrame) - ): - common = self.index.union(other.index) - if len(common) > len(self.index) or len(common) > len(other.index): - raise ValueError("matrices are not aligned") - - lhs = self.reindex(index=common, copy=False).values - rhs = other.reindex(index=common, copy=False).values - if isinstance(other, cudf.DataFrame): - result_index = other._data.to_pandas_index() - elif isinstance(self, cudf.DataFrame) and isinstance( - other, (cudf.Series, cudf.DataFrame) - ): - common = self._data.to_pandas_index().union( - other.index.to_pandas() - ) - if len(common) > self._num_columns or len(common) > len( - other.index - ): - raise ValueError("matrices are not aligned") - - lhs = self.reindex(columns=common, copy=False) - result_index = lhs.index - - rhs = other.reindex(index=common, copy=False).values - lhs = lhs.values - if isinstance(other, cudf.DataFrame): - result_cols = other._data.to_pandas_index() - - elif isinstance( - other, (cp.ndarray, np.ndarray) - ) or cudf.utils.dtypes.can_convert_to_column(other): - rhs = cp.asarray(other) - else: - # TODO: This should raise an exception, not return NotImplemented, - # but __matmul__ relies on the current behavior. We should either - # move this implementation to __matmul__ and call it from here - # (checking for NotImplemented and raising NotImplementedError if - # that's what's returned), or __matmul__ should catch a - # NotImplementedError from here and return NotImplemented. The - # latter feels cleaner (putting the implementation in this method - # rather than in the operator) but will be slower in the (highly - # unlikely) case that we're multiplying a cudf object with another - # type of object that somehow supports this behavior. - return NotImplemented - if reflect: - lhs, rhs = rhs, lhs - - result = lhs.dot(rhs) - if len(result.shape) == 1: - return cudf.Series( - result, - index=self.index if result_index is None else result_index, - ) - if len(result.shape) == 2: - return cudf.DataFrame( - result, - index=self.index if result_index is None else result_index, - columns=result_cols, - ) - return result.item() - - @_performance_tracking - def __matmul__(self, other): - return self.dot(other) - - @_performance_tracking - def __rmatmul__(self, other): - return self.dot(other, reflect=True) - - @_performance_tracking - def head(self, n=5): - """ - Return the first `n` rows. - This function returns the first `n` rows for the object based - on position. It is useful for quickly testing if your object - has the right type of data in it. - For negative values of `n`, this function returns all rows except - the last `n` rows, equivalent to ``df[:-n]``. - - Parameters - ---------- - n : int, default 5 - Number of rows to select. - - Returns - ------- - DataFrame or Series - The first `n` rows of the caller object. - - Examples - -------- - **Series** - - >>> ser = cudf.Series(['alligator', 'bee', 'falcon', - ... 
'lion', 'monkey', 'parrot', 'shark', 'whale', 'zebra']) - >>> ser - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - 5 parrot - 6 shark - 7 whale - 8 zebra - dtype: object - - Viewing the first 5 lines - - >>> ser.head() - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - dtype: object - - Viewing the first `n` lines (three in this case) - - >>> ser.head(3) - 0 alligator - 1 bee - 2 falcon - dtype: object - - For negative values of `n` - - >>> ser.head(-3) - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - 5 parrot - dtype: object - - **DataFrame** - - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df.head(2) - key val - 0 0 10.0 - 1 1 11.0 - """ - return self.iloc[:n] - - @_performance_tracking - def tail(self, n=5): - """ - Returns the last n rows as a new DataFrame or Series - - Examples - -------- - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df.tail(2) - key val - 3 3 13.0 - 4 4 14.0 - - **Series** - - >>> import cudf - >>> ser = cudf.Series([4, 3, 2, 1, 0]) - >>> ser.tail(2) - 3 1 - 4 0 - """ - if n == 0: - return self.iloc[0:0] - - return self.iloc[-n:] - - @_performance_tracking - def pipe(self, func, *args, **kwargs): - """ - Apply ``func(self, *args, **kwargs)``. - - Parameters - ---------- - func : function - Function to apply to the Series/DataFrame. - ``args``, and ``kwargs`` are passed into ``func``. - Alternatively a ``(callable, data_keyword)`` tuple where - ``data_keyword`` is a string indicating the keyword of - ``callable`` that expects the Series/DataFrame. - args : iterable, optional - Positional arguments passed into ``func``. - kwargs : mapping, optional - A dictionary of keyword arguments passed into ``func``. - - Returns - ------- - object : the return type of ``func``. - - Examples - -------- - Use ``.pipe`` when chaining together functions that expect - Series, DataFrames or GroupBy objects. Instead of writing - - >>> func(g(h(df), arg1=a), arg2=b, arg3=c) - - You can write - - >>> (df.pipe(h) - ... .pipe(g, arg1=a) - ... .pipe(func, arg2=b, arg3=c) - ... ) - - If you have a function that takes the data as (say) the second - argument, pass a tuple indicating which keyword expects the - data. For example, suppose ``f`` takes its data as ``arg2``: - - >>> (df.pipe(h) - ... .pipe(g, arg1=a) - ... .pipe((func, 'arg2'), arg1=a, arg3=c) - ... ) - """ - return cudf.core.common.pipe(self, func, *args, **kwargs) - - @_performance_tracking - def sum( - self, - axis=no_default, - skipna=True, - dtype=None, - numeric_only=False, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. 
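As a rough sketch of the `min_count` rule quoted above for `sum`/`product` (plain NumPy is used only to illustrate the documented semantics; `sum_with_min_count` is a hypothetical helper, not cuDF code):

import numpy as np

def sum_with_min_count(values, min_count=0):
    # If fewer than min_count non-NA values are present, the result is NA;
    # with the default min_count=0, an all-NA or empty input sums to 0.
    arr = np.asarray(values, dtype="float64")
    valid = ~np.isnan(arr)
    if valid.sum() < min_count:
        return np.nan
    return arr[valid].sum()

print(sum_with_min_count([np.nan, np.nan]))                 # 0.0
print(sum_with_min_count([np.nan, np.nan], min_count=1))    # nan
print(sum_with_min_count([1.0, np.nan, 2.0], min_count=2))  # 3.0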
- - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.sum() - a 10 - b 34 - dtype: int64 - """ - return self._reduce( - "sum", - axis=axis, - skipna=skipna, - dtype=dtype, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - @_performance_tracking - def product( - self, - axis=no_default, - skipna=True, - dtype=None, - numeric_only=False, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.product() - a 24 - b 5040 - dtype: int64 - """ - - return self._reduce( - # cuDF columns use "product" as the op name, but cupy uses "prod" - # and we need cupy if axis == 1. - "prod" if axis in {1, "columns"} else "product", - axis=axis, - skipna=skipna, - dtype=dtype, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - # Alias for pandas compatibility. - prod = product - - @_performance_tracking - def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs): - """ - Return the mean of the values for the requested axis. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'} - Axis for the function to be applied on. - skipna : bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - **kwargs - Additional keyword arguments to be passed to the function. - - Returns - ------- - mean : Series or DataFrame (if level specified) - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.mean() - a 2.5 - b 8.5 - dtype: float64 - """ - return self._reduce( - "mean", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - def median( - self, axis=no_default, skipna=True, numeric_only=None, **kwargs - ): - """ - Return the median of the values for the requested axis. - - Parameters - ---------- - axis : {index (0), columns (1)} - Axis for the function to be applied on. For Series this - parameter is unused and defaults to 0. - skipna : bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. 
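A short illustration of the `numeric_only` flag described in these reduction docstrings, shown with pandas purely because the semantics match; with `numeric_only=False` a non-numeric column is rejected rather than silently dropped:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# Only the numeric column "a" participates in the reduction.
print(df.mean(numeric_only=True))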
- - Returns - ------- - scalar - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser - 0 10 - 1 25 - 2 3 - 3 25 - 4 24 - 5 6 - dtype: int64 - >>> ser.median() - 17.0 - """ - return self._reduce( - "median", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - @_performance_tracking - def std( - self, - axis=no_default, - skipna=True, - ddof=1, - numeric_only=False, - **kwargs, - ): - """ - Return sample standard deviation of the DataFrame. - - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.std() - a 1.290994 - b 1.290994 - dtype: float64 - """ - - return self._reduce( - "std", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - - @_performance_tracking - def var( - self, - axis=no_default, - skipna=True, - ddof=1, - numeric_only=False, - **kwargs, - ): - """ - Return unbiased variance of the DataFrame. - - Normalized by N-1 by default. This can be changed using the - ddof argument. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - - Returns - ------- - scalar - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.var() - a 1.666667 - b 1.666667 - dtype: float64 - """ - return self._reduce( - "var", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - - @_performance_tracking - def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs): - """ - Return Fisher's unbiased kurtosis of a sample. - - Kurtosis obtained using Fisher's definition of - kurtosis (kurtosis of normal == 0.0). Normalized by N-1. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. 
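The `ddof` parameter documented above for `std`/`var` controls the divisor `N - ddof`; a quick NumPy check reproduces the `1.290994` value shown in the `std` example (NumPy itself defaults to `ddof=0`, the population estimate):

import numpy as np

x = np.array([1.0, 2.0, 3.0, 4.0])

# Sample standard deviation: divide the squared deviations by N - 1.
print(np.sqrt(((x - x.mean()) ** 2).sum() / (len(x) - 1)))  # 1.2909944...
print(x.std(ddof=1))  # same value
print(x.std(ddof=0))  # 1.1180339..., the population estimate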
- - Returns - ------- - Series or scalar - - Examples - -------- - **Series** - - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4]) - >>> series.kurtosis() - -1.1999999999999904 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.kurt() - a -1.2 - b -1.2 - dtype: float64 - """ - if axis not in (0, "index", None, no_default): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._reduce( - "kurtosis", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - # Alias for kurtosis. - kurt = kurtosis - - @_performance_tracking - def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): - """ - Return unbiased Fisher-Pearson skew of a sample. - - Parameters - ---------- - skipna: bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - If True, includes only float, int, boolean columns. - If False, will raise error in-case there are - non-numeric columns. - - Returns - ------- - Series - - Examples - -------- - **Series** - - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6]) - >>> series - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - 6 6 - dtype: int64 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 8, 10, 10]}) - >>> df.skew() - a 0.00000 - b -0.37037 - dtype: float64 - - .. pandas-compat:: - :meth:`pandas.DataFrame.skew`, :meth:`pandas.Series.skew` - - The `axis` parameter is not currently supported. - """ - if axis not in (0, "index", None, no_default): - raise NotImplementedError("Only axis=0 is currently supported.") - - return self._reduce( - "skew", - axis=axis, - skipna=skipna, - numeric_only=numeric_only, - **kwargs, - ) - - @_performance_tracking - def mask( - self, - cond, - other=None, - inplace: bool = False, - axis=None, - level=None, - ) -> Self | None: - """ - Replace values where the condition is True. - - Parameters - ---------- - cond : bool Series/DataFrame, array-like - Where cond is False, keep the original value. - Where True, replace with corresponding value from other. - Callables are not supported. - other: scalar, list of scalars, Series/DataFrame - Entries where cond is True are replaced with - corresponding value from other. Callables are not - supported. Default is None. - - DataFrame expects only Scalar or array like with scalars or - dataframe with same dimension as self. - - Series expects only scalar or series like with same length - inplace : bool, default False - Whether to perform the operation in place on the data. 
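The `mask` semantics described above reduce to `where` with an inverted condition, which is what the deleted implementation does; a small pandas check of the same equivalence (cuDF follows pandas here):

import pandas as pd

s = pd.Series([4, 3, 2, 1, 0])

# mask(cond, other) replaces values where cond is True, i.e. it keeps
# values where ~cond holds -- the same as where(~cond, other).
print(s.mask(s > 2, 10).equals(s.where(~(s > 2), 10)))  # True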
- - Returns - ------- - Same type as caller - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]}) - >>> df.mask(df % 2 == 0, [-1, -1]) - A B - 0 1 3 - 1 -1 5 - 2 5 -1 - - >>> ser = cudf.Series([4, 3, 2, 1, 0]) - >>> ser.mask(ser > 2, 10) - 0 10 - 1 10 - 2 2 - 3 1 - 4 0 - dtype: int64 - >>> ser.mask(ser > 2) - 0 - 1 - 2 2 - 3 1 - 4 0 - dtype: int64 - """ - if axis is not None: - raise NotImplementedError("axis is not supported.") - elif level is not None: - raise NotImplementedError("level is not supported.") - - if not hasattr(cond, "__invert__"): - # We Invert `cond` below and call `where`, so - # making sure the object supports - # `~`(inversion) operator or `__invert__` method - cond = cp.asarray(cond) - - return self.where(cond=~cond, other=other, inplace=inplace) - - @_performance_tracking - @copy_docstring(Rolling) - def rolling( - self, - window, - min_periods=None, - center: bool = False, - win_type: str | None = None, - on=None, - axis=0, - closed: str | None = None, - step: int | None = None, - method: str = "single", - ): - return Rolling( - self, - window, - min_periods=min_periods, - center=center, - axis=axis, - on=on, - win_type=win_type, - closed=closed, - step=step, - method=method, - ) - - @copy_docstring(ExponentialMovingWindow) - def ewm( - self, - com: float | None = None, - span: float | None = None, - halflife: float | None = None, - alpha: float | None = None, - min_periods: int | None = 0, - adjust: bool = True, - ignore_na: bool = False, - axis: int = 0, - times: str | np.ndarray | None = None, - method: Literal["single", "table"] = "single", - ): - return ExponentialMovingWindow( - self, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na, - axis=axis, - times=times, - method=method, - ) - - @_performance_tracking - def nans_to_nulls(self): - """ - Convert nans (if any) to nulls - - Returns - ------- - DataFrame or Series - - Examples - -------- - **Series** - - >>> import cudf, numpy as np - >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False) - >>> series - 0 1.0 - 1 2.0 - 2 NaN - 3 - 4 10.0 - dtype: float64 - >>> series.nans_to_nulls() - 0 1.0 - 1 2.0 - 2 - 3 - 4 10.0 - dtype: float64 - - **DataFrame** - - >>> df = cudf.DataFrame() - >>> df['a'] = cudf.Series([1, None, np.nan], nan_as_null=False) - >>> df['b'] = cudf.Series([None, 3.14, np.nan], nan_as_null=False) - >>> df - a b - 0 1.0 - 1 3.14 - 2 NaN NaN - >>> df.nans_to_nulls() - a b - 0 1.0 - 1 3.14 - 2 - """ - result = [] - for col in self._columns: - converted = col.nans_to_nulls() - if converted is col: - converted = converted.copy() - result.append(converted) - return self._from_data_like_self( - self._data._from_columns_like_self(result) - ) - - @_performance_tracking - def interpolate( - self, - method="linear", - axis=0, - limit=None, - inplace=False, - limit_direction=None, - limit_area=None, - downcast=None, - **kwargs, - ): - """ - Interpolate data values between some points. - - Parameters - ---------- - method : str, default 'linear' - Interpolation technique to use. Currently, - only 'linear` is supported. - * 'linear': Ignore the index and treat the values as - equally spaced. This is the only method supported on MultiIndexes. - * 'index', 'values': linearly interpolate using the index as - an x-axis. Unsorted indices can lead to erroneous results. - axis : int, default 0 - Axis to interpolate along. Currently, - only 'axis=0' is supported. 
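For the `'linear'` interpolation method described above (ignore the index and treat the values as equally spaced), a rough NumPy sketch of the idea; `linear_interpolate` is a hypothetical helper and only fills interior gaps:

import numpy as np

def linear_interpolate(values):
    # Treat positions 0..n-1 as the x-axis and fill NaNs from their neighbours.
    arr = np.asarray(values, dtype="float64")
    idx = np.arange(len(arr))
    missing = np.isnan(arr)
    arr[missing] = np.interp(idx[missing], idx[~missing], arr[~missing])
    return arr

print(linear_interpolate([1.0, np.nan, 3.0, np.nan, 5.0]))  # [1. 2. 3. 4. 5.]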
- inplace : bool, default False - Update the data in place if possible. - - Returns - ------- - Series or DataFrame - Returns the same object type as the caller, interpolated at - some or all ``NaN`` values - - """ - if method in {"pad", "ffill"} and limit_direction != "forward": - raise ValueError( - f"`limit_direction` must be 'forward' for method `{method}`" - ) - if method in {"backfill", "bfill"} and limit_direction != "backward": - raise ValueError( - f"`limit_direction` must be 'backward' for method `{method}`" - ) - - if method.lower() in {"ffill", "bfill", "pad", "backfill"}: - warnings.warn( - f"{type(self).__name__}.interpolate with method={method} is " - "deprecated and will raise in a future version. " - "Use obj.ffill() or obj.bfill() instead.", - FutureWarning, - ) - elif method not in {"linear", "values", "index"}: - raise ValueError(f"Interpolation method `{method}` not found") - - data = self - - if not isinstance(data.index, cudf.RangeIndex): - perm_sort = data.index.argsort() - data = data._gather( - GatherMap.from_column_unchecked( - cudf.core.column.as_column(perm_sort), - len(data), - nullify=False, - ) - ) - - if method == "linear": - interp_index = RangeIndex(self._num_rows) - else: - interp_index = data.index - columns = [] - for col in data._columns: - if isinstance(col, cudf.core.column.StringColumn): - warnings.warn( - f"{type(self).__name__}.interpolate with object dtype is " - "deprecated and will raise in a future version.", - FutureWarning, - ) - if col.nullable: - col = col.astype("float64").fillna(np.nan) - - columns.append( - cudf.core.algorithms._interpolation(col, index=interp_index) - ) - - result = self._from_data_like_self( - self._data._from_columns_like_self(columns) - ) - result.index = data.index - - return ( - result - if isinstance(data.index, cudf.RangeIndex) - # TODO: This should be a scatter, avoiding an argsort. - else result._gather( - GatherMap.from_column_unchecked( - cudf.core.column.as_column(perm_sort.argsort()), - len(result), - nullify=False, - ) - ) - ) - - @_performance_tracking - def shift( - self, - periods=1, - freq=None, - axis=0, - fill_value=None, - suffix: str | None = None, - ): - """Shift values by `periods` positions.""" - axis = self._get_axis_from_axis_arg(axis) - if axis != 0: - raise NotImplementedError("Only axis=0 is supported.") - if freq is not None: - raise NotImplementedError( - "The freq argument is not yet supported." - ) - if suffix is not None: - raise NotImplementedError( - "The suffix argument is not yet supported." - ) - - data_columns = ( - col.shift(periods, fill_value) for col in self._columns - ) - return self._from_data_like_self( - self._data._from_columns_like_self(data_columns) - ) - - @_performance_tracking - def truncate(self, before=None, after=None, axis=0, copy=True): - """ - Truncate a Series or DataFrame before and after some index value. - - This is a useful shorthand for boolean indexing based on index - values above or below certain thresholds. - - Parameters - ---------- - before : date, str, int - Truncate all rows before this index value. - after : date, str, int - Truncate all rows after this index value. - axis : {0 or 'index', 1 or 'columns'}, optional - Axis to truncate. Truncates the index (rows) by default. - copy : bool, default is True, - Return a copy of the truncated section. - - Returns - ------- - The truncated Series or DataFrame. 
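The one-line `shift` docstring above leaves the behaviour implicit, so here is a brief pandas illustration (cuDF's column-wise shift along `axis=0` follows the same semantics): values move by `periods` positions and the vacated slots take `fill_value`, or null when none is given:

import pandas as pd

s = pd.Series([1, 2, 3, 4])

print(s.shift(1).tolist())                 # [nan, 1.0, 2.0, 3.0]
print(s.shift(-1, fill_value=0).tolist())  # [2, 3, 4, 0]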
- - Notes - ----- - If the index being truncated contains only datetime values, - `before` and `after` may be specified as strings instead of - Timestamps. - - Examples - -------- - **Series** - - >>> import cudf - >>> cs1 = cudf.Series([1, 2, 3, 4]) - >>> cs1 - 0 1 - 1 2 - 2 3 - 3 4 - dtype: int64 - - >>> cs1.truncate(before=1, after=2) - 1 2 - 2 3 - dtype: int64 - - >>> import cudf - >>> dates = cudf.date_range( - ... '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s' - ... ) - >>> cs2 = cudf.Series(range(len(dates)), index=dates) - >>> cs2 - 2021-01-01 23:45:00 0 - 2021-01-01 23:45:01 1 - 2021-01-01 23:45:02 2 - 2021-01-01 23:45:03 3 - 2021-01-01 23:45:04 4 - 2021-01-01 23:45:05 5 - 2021-01-01 23:45:06 6 - 2021-01-01 23:45:07 7 - 2021-01-01 23:45:08 8 - 2021-01-01 23:45:09 9 - 2021-01-01 23:45:10 10 - 2021-01-01 23:45:11 11 - 2021-01-01 23:45:12 12 - 2021-01-01 23:45:13 13 - 2021-01-01 23:45:14 14 - 2021-01-01 23:45:15 15 - 2021-01-01 23:45:16 16 - 2021-01-01 23:45:17 17 - 2021-01-01 23:45:18 18 - 2021-01-01 23:45:19 19 - 2021-01-01 23:45:20 20 - 2021-01-01 23:45:21 21 - 2021-01-01 23:45:22 22 - 2021-01-01 23:45:23 23 - 2021-01-01 23:45:24 24 - ... - 2021-01-01 23:45:56 56 - 2021-01-01 23:45:57 57 - 2021-01-01 23:45:58 58 - 2021-01-01 23:45:59 59 - dtype: int64 - - - >>> cs2.truncate( - ... before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ... ) - 2021-01-01 23:45:18 18 - 2021-01-01 23:45:19 19 - 2021-01-01 23:45:20 20 - 2021-01-01 23:45:21 21 - 2021-01-01 23:45:22 22 - 2021-01-01 23:45:23 23 - 2021-01-01 23:45:24 24 - 2021-01-01 23:45:25 25 - 2021-01-01 23:45:26 26 - 2021-01-01 23:45:27 27 - dtype: int64 - - >>> cs3 = cudf.Series({'A': 1, 'B': 2, 'C': 3, 'D': 4}) - >>> cs3 - A 1 - B 2 - C 3 - D 4 - dtype: int64 - - >>> cs3.truncate(before='B', after='C') - B 2 - C 3 - dtype: int64 - - **DataFrame** - - >>> df = cudf.DataFrame({ - ... 'A': ['a', 'b', 'c', 'd', 'e'], - ... 'B': ['f', 'g', 'h', 'i', 'j'], - ... 'C': ['k', 'l', 'm', 'n', 'o'] - ... }, index=[1, 2, 3, 4, 5]) - >>> df - A B C - 1 a f k - 2 b g l - 3 c h m - 4 d i n - 5 e j o - - >>> df.truncate(before=2, after=4) - A B C - 2 b g l - 3 c h m - 4 d i n - - >>> df.truncate(before="A", after="B", axis="columns") - A B - 1 a f - 2 b g - 3 c h - 4 d i - 5 e j - - >>> import cudf - >>> dates = cudf.date_range( - ... '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s' - ... ) - >>> df2 = cudf.DataFrame(data={'A': 1, 'B': 2}, index=dates) - >>> df2.head() - A B - 2021-01-01 23:45:00 1 2 - 2021-01-01 23:45:01 1 2 - 2021-01-01 23:45:02 1 2 - 2021-01-01 23:45:03 1 2 - 2021-01-01 23:45:04 1 2 - - >>> df2.truncate( - ... before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ... ) - A B - 2021-01-01 23:45:18 1 2 - 2021-01-01 23:45:19 1 2 - 2021-01-01 23:45:20 1 2 - 2021-01-01 23:45:21 1 2 - 2021-01-01 23:45:22 1 2 - 2021-01-01 23:45:23 1 2 - 2021-01-01 23:45:24 1 2 - 2021-01-01 23:45:25 1 2 - 2021-01-01 23:45:26 1 2 - 2021-01-01 23:45:27 1 2 - - .. pandas-compat:: - :meth:`pandas.DataFrame.truncate`, :meth:`pandas.Series.truncate` - - The ``copy`` parameter is only present for API compatibility, but - ``copy=False`` is not supported. This method always generates a - copy. 
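On a sorted index, `truncate(before, after)` as documented above is label-based slicing with both ends inclusive (the deleted implementation builds a `loc` slice); a pandas check of that equivalence:

import pandas as pd

df = pd.DataFrame({"A": list("abcde")}, index=[1, 2, 3, 4, 5])

# Both ends of the label slice are inclusive, matching truncate.
print(df.truncate(before=2, after=4).equals(df.loc[2:4]))  # True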
- """ - if not copy: - raise ValueError("Truncating with copy=False is not supported.") - axis = self._get_axis_from_axis_arg(axis) - ax = self.index if axis == 0 else self._data.to_pandas_index() - - if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing: - raise ValueError("truncate requires a sorted index") - - if type(ax) is cudf.core.index.DatetimeIndex: - before = pd.to_datetime(before) - after = pd.to_datetime(after) - - if before is not None and after is not None and before > after: - raise ValueError(f"Truncate: {after} must be after {before}") - - if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1: - before, after = after, before - - slicer = [slice(None, None)] * self.ndim - slicer[axis] = slice(before, after) - return self.loc[tuple(slicer)].copy() - - @property - def loc(self): - """Select rows and columns by label or boolean mask. - - Examples - -------- - **Series** - - >>> import cudf - >>> series = cudf.Series([10, 11, 12], index=['a', 'b', 'c']) - >>> series - a 10 - b 11 - c 12 - dtype: int64 - >>> series.loc['b'] - 11 - - **DataFrame** - - DataFrame with string index. - - >>> df - a b - a 0 5 - b 1 6 - c 2 7 - d 3 8 - e 4 9 - - Select a single row by label. - - >>> df.loc['a'] - a 0 - b 5 - Name: a, dtype: int64 - - Select multiple rows and a single column. - - >>> df.loc[['a', 'c', 'e'], 'b'] - a 5 - c 7 - e 9 - Name: b, dtype: int64 - - Selection by boolean mask. - - >>> df.loc[df.a > 2] - a b - d 3 8 - e 4 9 - - Setting values using loc. - - >>> df.loc[['a', 'c', 'e'], 'a'] = 0 - >>> df - a b - a 0 5 - b 1 6 - c 0 7 - d 3 8 - e 0 9 - - """ - return self._loc_indexer_type(self) - - @property - def iloc(self): - """Select values by position. - - Examples - -------- - **Series** - - >>> import cudf - >>> s = cudf.Series([10, 20, 30]) - >>> s - 0 10 - 1 20 - 2 30 - dtype: int64 - >>> s.iloc[2] - 30 - - **DataFrame** - - Selecting rows and column by position. - - >>> df = cudf.DataFrame({'a': range(20), - ... 'b': range(20), - ... 'c': range(20)}) - - Select a single row using an integer index. - - >>> df.iloc[1] - a 1 - b 1 - c 1 - Name: 1, dtype: int64 - - Select multiple rows using a list of integers. - - >>> df.iloc[[0, 2, 9, 18]] - a b c - 0 0 0 0 - 2 2 2 2 - 9 9 9 9 - 18 18 18 18 - - Select rows using a slice. - - >>> df.iloc[3:10:2] - a b c - 3 3 3 3 - 5 5 5 5 - 7 7 7 7 - 9 9 9 9 - - Select both rows and columns. - - >>> df.iloc[[1, 3, 5, 7], 2] - 1 1 - 3 3 - 5 5 - 7 7 - Name: c, dtype: int64 - - Setting values in a column using iloc. - - >>> df.iloc[:4] = 0 - >>> df - a b c - 0 0 0 0 - 1 0 0 0 - 2 0 0 0 - 3 0 0 0 - 4 4 4 4 - 5 5 5 5 - 6 6 6 6 - 7 7 7 7 - 8 8 8 8 - 9 9 9 9 - [10 more rows] - - """ - return self._iloc_indexer_type(self) - - @property # type:ignore - @_performance_tracking - def axes(self): - """ - Return a list representing the axes of the Series. - - Series.axes returns a list containing the row index. - - Examples - -------- - >>> import cudf - >>> csf1 = cudf.Series([1, 2, 3, 4]) - >>> csf1.axes - [RangeIndex(start=0, stop=4, step=1)] - - """ - return [self.index] - - def squeeze(self, axis: Literal["index", "columns", 0, 1, None] = None): - """ - Squeeze 1 dimensional axis objects into scalars. - - Series or DataFrames with a single element are squeezed to a scalar. - DataFrames with a single column or a single row are squeezed to a - Series. Otherwise the object is unchanged. - - This method is most useful when you don't know if your - object is a Series or DataFrame, but you do know it has just a single - column. 
In that case you can safely call `squeeze` to ensure you have a - Series. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns', None}, default None - A specific axis to squeeze. By default, all length-1 axes are - squeezed. For `Series` this parameter is unused and defaults - to `None`. - - Returns - ------- - DataFrame, Series, or scalar - The projection after squeezing `axis` or all the axes. - - See Also - -------- - Series.iloc : Integer-location based indexing for selecting scalars. - DataFrame.iloc : Integer-location based indexing for selecting Series. - Series.to_frame : Inverse of DataFrame.squeeze for a - single-column DataFrame. - - Examples - -------- - >>> primes = cudf.Series([2, 3, 5, 7]) - - Slicing might produce a Series with a single value: - - >>> even_primes = primes[primes % 2 == 0] - >>> even_primes - 0 2 - dtype: int64 - - >>> even_primes.squeeze() - 2 - - Squeezing objects with more than one value in every axis does nothing: - - >>> odd_primes = primes[primes % 2 == 1] - >>> odd_primes - 1 3 - 2 5 - 3 7 - dtype: int64 - - >>> odd_primes.squeeze() - 1 3 - 2 5 - 3 7 - dtype: int64 - - Squeezing is even more effective when used with DataFrames. - - >>> df = cudf.DataFrame([[1, 2], [3, 4]], columns=["a", "b"]) - >>> df - a b - 0 1 2 - 1 3 4 - - Slicing a single column will produce a DataFrame with the columns - having only one value: - - >>> df_a = df[["a"]] - >>> df_a - a - 0 1 - 1 3 - - So the columns can be squeezed down, resulting in a Series: - - >>> df_a.squeeze("columns") - 0 1 - 1 3 - Name: a, dtype: int64 - - Slicing a single row from a single column will produce a single - scalar DataFrame: - - >>> df_0a = df.loc[df.index < 1, ["a"]] - >>> df_0a - a - 0 1 - - Squeezing the rows produces a single scalar Series: - - >>> df_0a.squeeze("rows") - a 1 - Name: 0, dtype: int64 - - Squeezing all axes will project directly into a scalar: - - >>> df_0a.squeeze() - 1 - """ - axes = ( - range(len(self.axes)) - if axis is None - else (self._get_axis_from_axis_arg(axis),) - ) - indexer = tuple( - 0 if i in axes and len(a) == 1 else slice(None) - for i, a in enumerate(self.axes) - ) - return self.iloc[indexer] - - @_performance_tracking - def scale(self): - """ - Scale values to [0, 1] in float64 - - Returns - ------- - DataFrame or Series - Values scaled to [0, 1]. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 0.5, 1]) - >>> series - 0 10.0 - 1 11.0 - 2 12.0 - 3 0.5 - 4 1.0 - dtype: float64 - >>> series.scale() - 0 0.826087 - 1 0.913043 - 2 1.000000 - 3 0.000000 - 4 0.043478 - dtype: float64 - """ - vmin = self.min() - vmax = self.max() - scaled = (self - vmin) / (vmax - vmin) - scaled.index = self.index.copy(deep=False) - return scaled - - @_performance_tracking - def sort_index( - self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind=None, - na_position="last", - sort_remaining=True, - ignore_index=False, - key=None, - ): - """Sort object by labels (along an axis). - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis along which to sort. The value 0 identifies the rows, - and 1 identifies the columns. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - This is only useful in the case of MultiIndex. - ascending : bool, default True - Sort ascending vs. descending. - inplace : bool, default False - If True, perform operation in-place. - kind : sorting method such as `quick sort` and others. 
- Not yet supported. - na_position : {'first', 'last'}, default 'last' - Puts NaNs at the beginning if first; last puts NaNs at the end. - sort_remaining : bool, default True - When sorting a multiindex on a subset of its levels, - should entries be lexsorted by the remaining - (non-specified) levels as well? - ignore_index : bool, default False - if True, index will be replaced with RangeIndex. - key : callable, optional - If not None, apply the key function to the index values before - sorting. This is similar to the key argument in the builtin - sorted() function, with the notable difference that this key - function should be vectorized. It should expect an Index and return - an Index of the same shape. For MultiIndex inputs, the key is - applied per level. - - Returns - ------- - Frame or None - - Examples - -------- - **Series** - - >>> import cudf - >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) - >>> series - 3 a - 2 b - 1 c - 4 d - dtype: object - >>> series.sort_index() - 1 c - 2 b - 3 a - 4 d - dtype: object - - Sort Descending - - >>> series.sort_index(ascending=False) - 4 d - 3 a - 2 b - 1 c - dtype: object - - **DataFrame** - - >>> df = cudf.DataFrame( - ... {"b":[3, 2, 1], "a":[2, 1, 3]}, index=[1, 3, 2]) - >>> df.sort_index(axis=0) - b a - 1 3 2 - 2 1 3 - 3 2 1 - >>> df.sort_index(axis=1) - a b - 1 2 3 - 3 1 2 - 2 3 1 - - .. pandas-compat:: - :meth:`pandas.DataFrame.sort_index`, :meth:`pandas.Series.sort_index` - - * Not supporting: kind, sort_remaining=False - """ - if kind is not None: - raise NotImplementedError("kind is not yet supported") - - if key is not None: - raise NotImplementedError("key is not yet supported.") - - if na_position not in {"first", "last"}: - raise ValueError(f"invalid na_position: {na_position}") - - if axis in (0, "index"): - idx = self.index - if isinstance(idx, MultiIndex): - if level is not None: - if not is_list_like(level): - level = [level] - by = list(map(idx._get_level_label, level)) - if sort_remaining: - handled = set(by) - by.extend( - filter( - lambda n: n not in handled, - self.index._column_names, - ) - ) - else: - by = list(idx._column_names) - - inds = idx._get_sorted_inds( - by=by, ascending=ascending, na_position=na_position - ) - out = self._gather( - GatherMap.from_column_unchecked( - inds, len(self), nullify=False - ) - ) - # TODO: frame factory function should handle multilevel column - # names - if ( - isinstance(self, cudf.core.dataframe.DataFrame) - and self._data.multiindex - ): - out._set_columns_like(self._data) - elif (ascending and idx.is_monotonic_increasing) or ( - not ascending and idx.is_monotonic_decreasing - ): - out = self.copy() - else: - inds = idx.argsort( - ascending=ascending, na_position=na_position - ) - out = self._gather( - GatherMap.from_column_unchecked( - cudf.core.column.as_column(inds), - len(self), - nullify=False, - ) - ) - if ( - isinstance(self, cudf.core.dataframe.DataFrame) - and self._data.multiindex - ): - out._set_columns_like(self._data) - if ignore_index: - out = out.reset_index(drop=True) - else: - labels = sorted(self._column_names, reverse=not ascending) - result_columns = (self._data[label] for label in labels) - if ignore_index: - ca = ColumnAccessor( - dict(enumerate(result_columns)), - rangeindex=True, - verify=False, - ) - else: - ca = ColumnAccessor( - dict(zip(labels, result_columns)), - rangeindex=self._data.rangeindex, - multiindex=self._data.multiindex, - level_names=self._data.level_names, - label_dtype=self._data.label_dtype, - verify=False, - ) - out 
= self._from_data_like_self(ca) - - return self._mimic_inplace(out, inplace=inplace) - - def memory_usage(self, index=True, deep=False): - """Return the memory usage of an object. - - Parameters - ---------- - index : bool, default True - Specifies whether to include the memory usage of the index. - deep : bool, default False - The deep parameter is ignored and is only included for pandas - compatibility. - - Returns - ------- - Series or scalar - For DataFrame, a Series whose index is the original column names - and whose values is the memory usage of each column in bytes. For a - Series the total memory usage. - - Examples - -------- - **DataFrame** - - >>> dtypes = ['int64', 'float64', 'object', 'bool'] - >>> data = dict([(t, np.ones(shape=5000).astype(t)) - ... for t in dtypes]) - >>> df = cudf.DataFrame(data) - >>> df.head() - int64 float64 object bool - 0 1 1.0 1.0 True - 1 1 1.0 1.0 True - 2 1 1.0 1.0 True - 3 1 1.0 1.0 True - 4 1 1.0 1.0 True - >>> df.memory_usage(index=False) - int64 40000 - float64 40000 - object 40000 - bool 5000 - dtype: int64 - - Use a Categorical for efficient storage of an object-dtype column with - many repeated values. - - >>> df['object'].astype('category').memory_usage(deep=True) - 5008 - - **Series** - >>> s = cudf.Series(range(3), index=['a','b','c']) - >>> s.memory_usage() - 43 - - Not including the index gives the size of the rest of the data, which - is necessarily smaller: - - >>> s.memory_usage(index=False) - 24 - """ - raise NotImplementedError - - def hash_values(self, method="murmur3", seed=None): - """Compute the hash of values in this column. - - Parameters - ---------- - method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3' - Hash function to use: - - * murmur3: MurmurHash3 hash function - * md5: MD5 hash function - * xxhash64: xxHash64 hash function - - seed : int, optional - Seed value to use for the hash function. This parameter is only - supported for 'murmur3' and 'xxhash64'. - - - Returns - ------- - Series - A Series with hash values. - - Examples - -------- - **Series** - - >>> import cudf - >>> series = cudf.Series([10, 120, 30]) - >>> series - 0 10 - 1 120 - 2 30 - dtype: int64 - >>> series.hash_values(method="murmur3") - 0 -1930516747 - 1 422619251 - 2 -941520876 - dtype: int32 - >>> series.hash_values(method="md5") - 0 7be4bbacbfdb05fb3044e36c22b41e8b - 1 947ca8d2c5f0f27437f156cfbfab0969 - 2 d0580ef52d27c043c8e341fd5039b166 - dtype: object - >>> series.hash_values(method="murmur3", seed=42) - 0 2364453205 - 1 422621911 - 2 3353449140 - dtype: uint32 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]}) - >>> df - a b - 0 10 0.00 - 1 120 0.25 - 2 30 0.50 - >>> df.hash_values(method="murmur3") - 0 -330519225 - 1 -397962448 - 2 -1345834934 - dtype: int32 - >>> df.hash_values(method="md5") - 0 57ce879751b5169c525907d5c563fae1 - 1 948d6221a7c4963d4be411bcead7e32b - 2 fe061786ea286a515b772d91b0dfcd70 - dtype: object - """ - seed_hash_methods = {"murmur3", "xxhash64"} - if seed is None: - seed = 0 - elif method not in seed_hash_methods: - warnings.warn( - "Provided seed value has no effect for the hash method " - f"`{method}`. Only {seed_hash_methods} support seeds." - ) - # Note that both Series and DataFrame return Series objects from this - # calculation, necessitating the unfortunate circular reference to the - # child class here. 
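The `sort_index` path above is essentially an argsort of the index labels followed by a gather of the rows; a small sketch of that idea with pandas/NumPy (cuDF performs the gather on the GPU):

import numpy as np
import pandas as pd

s = pd.Series(["a", "b", "c", "d"], index=[3, 2, 1, 4])

order = np.argsort(s.index.to_numpy())  # positions that sort the labels
print(s.take(order).equals(s.sort_index()))  # True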
- return cudf.Series._from_column( - libcudf.hash.hash([*self._columns], method, seed), - index=self.index, - ) - - def _gather( - self, - gather_map: GatherMap, - keep_index=True, - ): - """Gather rows of frame specified by indices in `gather_map`. - - Maintain the index if keep_index is True. - - This function does no expensive bounds checking, but does - check that the number of rows of self matches the validated - number of rows. - """ - if not gather_map.nullify and len(self) != gather_map.nrows: - raise IndexError("Gather map is out of bounds") - return self._from_columns_like_self( - libcudf.copying.gather( - list(self.index._columns + self._columns) - if keep_index - else list(self._columns), - gather_map.column, - nullify=gather_map.nullify, - ), - self._column_names, - self.index.names if keep_index else None, - ) - - def _slice(self, arg: slice, keep_index: bool = True) -> Self: - """Slice a frame. - - Parameters - ---------- - arg - The slice - keep_index - Preserve the index when slicing? - - Returns - ------- - Sliced frame - - Notes - ----- - This slicing has normal python semantics. - """ - num_rows = len(self) - if num_rows == 0: - return self - start, stop, stride = arg.indices(num_rows) - index = self.index - has_range_index = isinstance(index, RangeIndex) - if len(range(start, stop, stride)) == 0: - # Avoid materialising the range index column - result = self._empty_like( - keep_index=keep_index and not has_range_index - ) - if keep_index and has_range_index: - lo = index.start + start * index.step - hi = index.start + stop * index.step - step = index.step * stride - result.index = RangeIndex( - start=lo, stop=hi, step=step, name=index.name - ) - return result - if start < 0: - start = start + num_rows - - # At this point, we have converted slice arguments into - # indices that no longer wrap around. - # For example slice(4, None, -1) will produce the - # start, stop, stride tuple (4, -1, -1) - # This check makes sure -1 is not wrapped (again) to - # produce -1 + num_rows. - if stop < 0 and not (stride < 0 and stop == -1): - stop = stop + num_rows - stride = 1 if stride is None else stride - - if (stop - start) * stride <= 0: - return self._empty_like(keep_index=True) - - start = min(start, num_rows) - stop = min(stop, num_rows) - - if stride != 1: - return self._gather( - GatherMap.from_column_unchecked( - cast( - NumericalColumn, - as_column( - range(start, stop, stride), - dtype=libcudf.types.size_type_dtype, - ), - ), - len(self), - nullify=False, - ), - keep_index=keep_index, - ) - - columns_to_slice = [ - *( - self.index._columns - if keep_index and not has_range_index - else [] - ), - *self._columns, - ] - result = self._from_columns_like_self( - libcudf.copying.columns_slice(columns_to_slice, [start, stop])[0], - self._column_names, - None if has_range_index or not keep_index else self.index.names, - ) - - if keep_index and has_range_index: - result.index = self.index[start:stop] - return result - - def _positions_from_column_names( - self, column_names, offset_by_index_columns=False - ): - """Map each column name into their positions in the frame. - - Return positions of the provided column names, offset by the number of - index columns if `offset_by_index_columns` is True. The order of - indices returned corresponds to the column order in this Frame. 
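The `_slice` helper above relies on `slice.indices` to turn negative and open-ended bounds into concrete `(start, stop, step)` values, including the `(4, -1, -1)` case called out in its comments; a quick standalone check:

num_rows = 5
print(slice(4, None, -1).indices(num_rows))             # (4, -1, -1)
print(slice(-3, None).indices(num_rows))                # (2, 5, 1)
print(list(range(*slice(1, 10, 2).indices(num_rows))))  # [1, 3]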
- """ - num_index_columns = ( - len(self.index._data) if offset_by_index_columns else 0 - ) - return [ - i + num_index_columns - for i, name in enumerate(self._column_names) - if name in set(column_names) - ] - - def drop_duplicates( - self, - subset=None, - keep="first", - nulls_are_equal=True, - ignore_index=False, - ): - """ - Drop duplicate rows in frame. - - subset : list, optional - List of columns to consider when dropping rows. - keep : ["first", "last", False] - "first" will keep the first duplicate entry, "last" will keep the - last duplicate entry, and False will drop all duplicates. - nulls_are_equal: bool, default True - Null elements are considered equal to other null elements. - ignore_index: bool, default False - If True, the resulting axis will be labeled 0, 1, ..., n - 1. - """ - if not isinstance(ignore_index, (np.bool_, bool)): - raise ValueError( - f"{ignore_index=} must be bool, " - f"not {type(ignore_index).__name__}" - ) - subset = self._preprocess_subset(subset) - subset_cols = [name for name in self._column_names if name in subset] - if len(subset_cols) == 0: - return self.copy(deep=True) - - keys = self._positions_from_column_names( - subset, offset_by_index_columns=not ignore_index - ) - return self._from_columns_like_self( - libcudf.stream_compaction.drop_duplicates( - list(self._columns) - if ignore_index - else list(self.index._columns + self._columns), - keys=keys, - keep=keep, - nulls_are_equal=nulls_are_equal, - ), - self._column_names, - self.index.names if not ignore_index else None, - ) - - @_performance_tracking - def duplicated(self, subset=None, keep="first"): - """ - Return boolean Series denoting duplicate rows. - - Considering certain columns is optional. - - Parameters - ---------- - subset : column label or sequence of labels, optional - Only consider certain columns for identifying duplicates, by - default use all of the columns. - keep : {'first', 'last', False}, default 'first' - Determines which duplicates (if any) to mark. - - - ``'first'`` : Mark duplicates as ``True`` except for the first - occurrence. - - ``'last'`` : Mark duplicates as ``True`` except for the last - occurrence. - - ``False`` : Mark all duplicates as ``True``. - - Returns - ------- - Series - Boolean series indicating duplicated rows. - - See Also - -------- - Index.duplicated : Equivalent method on index. - Series.duplicated : Equivalent method on Series. - Series.drop_duplicates : Remove duplicate values from Series. - DataFrame.drop_duplicates : Remove duplicate values from DataFrame. - - Examples - -------- - Consider a dataset containing ramen product ratings. - - >>> import cudf - >>> df = cudf.DataFrame({ - ... 'brand': ['Yum Yum', 'Yum Yum', 'Maggie', 'Maggie', 'Maggie'], - ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], - ... 'rating': [4, 4, 3.5, 15, 5] - ... }) - >>> df - brand style rating - 0 Yum Yum cup 4.0 - 1 Yum Yum cup 4.0 - 2 Maggie cup 3.5 - 3 Maggie pack 15.0 - 4 Maggie pack 5.0 - - By default, for each set of duplicated values, the first occurrence - is set to False and all others to True. - - >>> df.duplicated() - 0 False - 1 True - 2 False - 3 False - 4 False - dtype: bool - - By using 'last', the last occurrence of each set of duplicated values - is set to False and all others to True. - - >>> df.duplicated(keep='last') - 0 True - 1 False - 2 False - 3 False - 4 False - dtype: bool - - By setting ``keep`` to False, all duplicates are True. 
- - >>> df.duplicated(keep=False) - 0 True - 1 True - 2 False - 3 False - 4 False - dtype: bool - - To find duplicates on specific column(s), use ``subset``. - - >>> df.duplicated(subset=['brand']) - 0 False - 1 True - 2 False - 3 True - 4 True - dtype: bool - """ - subset = self._preprocess_subset(subset) - - name = None - if isinstance(self, cudf.Series): - columns = [self._column] - name = self.name - else: - columns = [self._data[n] for n in subset] - distinct = libcudf.stream_compaction.distinct_indices( - columns, keep=keep - ) - result = libcudf.copying.scatter( - [cudf.Scalar(False, dtype=bool)], - distinct, - [as_column(True, length=len(self), dtype=bool)], - bounds_check=False, - )[0] - return cudf.Series._from_column(result, index=self.index, name=name) - - @_performance_tracking - def _empty_like(self, keep_index=True) -> Self: - result = self._from_columns_like_self( - libcudf.copying.columns_empty_like( - [ - *(self.index._columns if keep_index else ()), - *self._columns, - ] - ), - self._column_names, - self.index.names if keep_index else None, - ) - result._data.label_dtype = self._data.label_dtype - result._data.rangeindex = self._data.rangeindex - return result - - def _split(self, splits, keep_index=True): - if self._num_rows == 0: - return [] - - columns_split = libcudf.copying.columns_split( - [ - *(self.index._columns if keep_index else []), - *self._columns, - ], - splits, - ) - - return [ - self._from_columns_like_self( - columns_split[i], - self._column_names, - self.index.names if keep_index else None, - ) - for i in range(len(splits) + 1) - ] - - @_performance_tracking - def bfill( - self, value=None, axis=None, inplace=None, limit=None, limit_area=None - ): - """ - Synonym for :meth:`Series.fillna` with ``method='bfill'``. - - Returns - ------- - Object with missing values filled or None if ``inplace=True``. - """ - if limit_area is not None: - raise NotImplementedError("limit_area is currently not supported.") - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - return self.fillna( - method="bfill", - value=value, - axis=axis, - inplace=inplace, - limit=limit, - ) - - @_performance_tracking - def backfill(self, value=None, axis=None, inplace=None, limit=None): - """ - Synonym for :meth:`Series.fillna` with ``method='bfill'``. - - .. deprecated:: 23.06 - Use `DataFrame.bfill/Series.bfill` instead. - - Returns - ------- - Object with missing values filled or None if ``inplace=True``. - """ - # Do not remove until pandas removes this. - warnings.warn( - "DataFrame.backfill/Series.backfill is deprecated. Use " - "DataFrame.bfill/Series.bfill instead", - FutureWarning, - ) - return self.bfill(value=value, axis=axis, inplace=inplace, limit=limit) - - @_performance_tracking - def ffill( - self, - value=None, - axis=None, - inplace=None, - limit=None, - limit_area: Literal["inside", "outside", None] = None, - ): - """ - Synonym for :meth:`Series.fillna` with ``method='ffill'``. - - Returns - ------- - Object with missing values filled or None if ``inplace=True``. - """ - if limit_area is not None: - raise NotImplementedError("limit_area is currently not supported.") - - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - return self.fillna( - method="ffill", - value=value, - axis=axis, - inplace=inplace, - limit=limit, - ) - - @_performance_tracking - def pad(self, value=None, axis=None, inplace=None, limit=None): - """ - Synonym for :meth:`Series.fillna` with ``method='ffill'``. - - .. 
deprecated:: 23.06 - Use `DataFrame.ffill/Series.ffill` instead. - - Returns - ------- - Object with missing values filled or None if ``inplace=True``. - """ - # Do not remove until pandas removes this. - warnings.warn( - "DataFrame.pad/Series.pad is deprecated. Use " - "DataFrame.ffill/Series.ffill instead", - FutureWarning, - ) - return self.ffill(value=value, axis=axis, inplace=inplace, limit=limit) - - def add_prefix(self, prefix, axis=None): - """ - Prefix labels with string `prefix`. - - For Series, the row labels are prefixed. - For DataFrame, the column labels are prefixed. - - Parameters - ---------- - prefix : str - The string to add before each label. - - Returns - ------- - Series or DataFrame - New Series with updated labels or DataFrame with updated labels. - - See Also - -------- - Series.add_suffix: Suffix row labels with string 'suffix'. - DataFrame.add_suffix: Suffix column labels with string 'suffix'. - - Examples - -------- - **Series** - - >>> s = cudf.Series([1, 2, 3, 4]) - >>> s - 0 1 - 1 2 - 2 3 - 3 4 - dtype: int64 - >>> s.add_prefix('item_') - item_0 1 - item_1 2 - item_2 3 - item_3 4 - dtype: int64 - - **DataFrame** - - >>> df = cudf.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) - >>> df - A B - 0 1 3 - 1 2 4 - 2 3 5 - 3 4 6 - >>> df.add_prefix('col_') - col_A col_B - 0 1 3 - 1 2 4 - 2 3 5 - 3 4 6 - """ - raise NotImplementedError( - "`IndexedFrame.add_prefix` not currently implemented. \ - Use `Series.add_prefix` or `DataFrame.add_prefix`" - ) - - def add_suffix(self, suffix, axis=None): - """ - Suffix labels with string `suffix`. - - For Series, the row labels are suffixed. - For DataFrame, the column labels are suffixed. - - Parameters - ---------- - prefix : str - The string to add after each label. - - Returns - ------- - Series or DataFrame - New Series with updated labels or DataFrame with updated labels. - - See Also - -------- - Series.add_prefix: prefix row labels with string 'prefix'. - DataFrame.add_prefix: Prefix column labels with string 'prefix'. - - Examples - -------- - **Series** - - >>> s = cudf.Series([1, 2, 3, 4]) - >>> s - 0 1 - 1 2 - 2 3 - 3 4 - dtype: int64 - >>> s.add_suffix('_item') - 0_item 1 - 1_item 2 - 2_item 3 - 3_item 4 - dtype: int64 - - **DataFrame** - - >>> df = cudf.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) - >>> df - A B - 0 1 3 - 1 2 4 - 2 3 5 - 3 4 6 - >>> df.add_suffix('_col') - A_col B_col - 0 1 3 - 1 2 4 - 2 3 5 - 3 4 6 - """ - raise NotImplementedError - - @acquire_spill_lock() - @_performance_tracking - def _apply(self, func, kernel_getter, *args, **kwargs): - """Apply `func` across the rows of the frame.""" - if kwargs: - raise ValueError("UDFs using **kwargs are not yet supported.") - try: - kernel, retty = _compile_or_get( - self, func, args, kernel_getter=kernel_getter - ) - except Exception as e: - raise ValueError( - "user defined function compilation failed." 
- ) from e - - # Mask and data column preallocated - ans_col = _return_arr_from_dtype(retty, len(self)) - ans_mask = as_column(True, length=len(self), dtype="bool") - output_args = [(ans_col, ans_mask), len(self)] - input_args = _get_input_args_from_frame(self) - launch_args = output_args + input_args + list(args) - try: - with _CUDFNumbaConfig(): - kernel.forall(len(self))(*launch_args) - except Exception as e: - raise RuntimeError("UDF kernel execution failed.") from e - - col = _post_process_output_col(ans_col, retty) - - col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) - result = cudf.Series._from_column(col, index=self.index) - - return result - - def sort_values( - self, - by, - axis=0, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ignore_index=False, - key=None, - ): - """Sort by the values along either axis. - - Parameters - ---------- - by : str or list of str - Name or list of names to sort by. - ascending : bool or list of bool, default True - Sort ascending vs. descending. Specify list for multiple sort - orders. If this is a list of bools, must match the length of the - by. - na_position : {'first', 'last'}, default 'last' - 'first' puts nulls at the beginning, 'last' puts nulls at the end - ignore_index : bool, default False - If True, index will not be sorted. - key : callable, optional - Apply the key function to the values - before sorting. This is similar to the ``key`` argument in the - builtin ``sorted`` function, with the notable difference that - this ``key`` function should be *vectorized*. It should expect a - ``Series`` and return a Series with the same shape as the input. - It will be applied to each column in `by` independently. - Currently not supported. - - Returns - ------- - Frame : Frame with sorted values. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['a'] = [0, 1, 2] - >>> df['b'] = [-3, 2, 0] - >>> df.sort_values('b') - a b - 0 0 -3 - 2 2 0 - 1 1 2 - - .. pandas-compat:: - :meth:`pandas.DataFrame.sort_values`, :meth:`pandas.Series.sort_values` - - * Support axis='index' only. - * Not supporting: inplace, kind - """ - if na_position not in {"first", "last"}: - raise ValueError(f"invalid na_position: {na_position}") - if inplace: - raise NotImplementedError("`inplace` not currently implemented.") - if kind != "quicksort": - if kind not in {"mergesort", "heapsort", "stable"}: - raise AttributeError( - f"{kind} is not a valid sorting algorithm for " - f"'DataFrame' object" - ) - warnings.warn( - f"GPU-accelerated {kind} is currently not supported, " - f"defaulting to quicksort." - ) - if axis != 0: - raise NotImplementedError("`axis` not currently implemented.") - if key is not None: - raise NotImplementedError("key is not currently supported.") - - if len(self) == 0: - return self - - try: - by_in_columns = self._get_columns_by_label(by) - except KeyError: - by_in_columns = None - if self.ndim == 1: - # For Series case, we're never selecting an index level. - by_in_index = None - else: - try: - by_in_index = self.index._get_columns_by_label(by) - except KeyError: - by_in_index = None - - if by_in_columns is not None and by_in_index is not None: - raise ValueError( - f"{by=} appears in the {type(self).__name__} columns " - "and as an index level which is ambiguous." 
- ) - elif by_in_columns is not None: - by_columns = by_in_columns - elif by_in_index is not None: - by_columns = by_in_index - else: - raise KeyError(by) - # argsort the `by` column - out = self._gather( - GatherMap.from_column_unchecked( - by_columns._get_sorted_inds( - ascending=ascending, na_position=na_position - ), - len(self), - nullify=False, - ), - keep_index=not ignore_index, - ) - return out - - def _n_largest_or_smallest( - self, largest: bool, n: int, columns, keep: Literal["first", "last"] - ): - # Get column to operate on - if isinstance(columns, str): - columns = [columns] - - method = "nlargest" if largest else "nsmallest" - for col in columns: - if isinstance(self._data[col], cudf.core.column.StringColumn): - if isinstance(self, cudf.DataFrame): - error_msg = ( - f"Column '{col}' has dtype {self._data[col].dtype}, " - f"cannot use method '{method}' with this dtype" - ) - else: - error_msg = ( - f"Cannot use method '{method}' with " - f"dtype {self._data[col].dtype}" - ) - raise TypeError(error_msg) - if len(self) == 0: - return self - - if keep == "first": - if n < 0: - n = 0 - - # argsort the `by` column - return self._gather( - GatherMap.from_column_unchecked( - self._get_columns_by_label(columns) - ._get_sorted_inds(ascending=not largest) - .slice(*slice(None, n).indices(len(self))), - len(self), - nullify=False, - ), - keep_index=True, - ) - elif keep == "last": - indices = self._get_columns_by_label(columns)._get_sorted_inds( - ascending=largest - ) - - if n <= 0: - # Empty slice. - indices = indices.slice(0, 0) - else: - indices = indices.slice( - *slice(None, -n - 1, -1).indices(len(self)) - ) - return self._gather( - GatherMap.from_column_unchecked( - indices, len(self), nullify=False - ), - keep_index=True, - ) - else: - raise ValueError('keep must be either "first", "last"') - - def _align_to_index( - self, - index: ColumnLike, - how: str = "outer", - sort: bool = True, - allow_non_unique: bool = False, - ) -> Self: - index = ensure_index(index) - - if self.index.equals(index): - return self - if not allow_non_unique: - if not self.index.is_unique or not index.is_unique: - raise ValueError("Cannot align indices with non-unique values") - - lhs = cudf.DataFrame._from_data(self._data, index=self.index) - rhs = cudf.DataFrame._from_data({}, index=index) - - # create a temporary column that we will later sort by - # to recover ordering after index alignment. - sort_col_id = str(uuid4()) - if how == "left": - lhs[sort_col_id] = as_column(range(len(lhs))) - elif how == "right": - rhs[sort_col_id] = as_column(range(len(rhs))) - - result = lhs.join(rhs, how=how, sort=sort) - if how in ("left", "right"): - result = result.sort_values(sort_col_id) - del result[sort_col_id] - - out = self._from_data( - self._data._from_columns_like_self(result._columns) - ) - out.index = result.index - out.index.names = self.index.names - return out - - @_performance_tracking - def _reindex( - self, - column_names, - dtypes=None, - deep=False, - index=None, - inplace=False, - fill_value=NA, - level=None, - method=None, - limit=None, - tolerance=None, - ): - """ - Helper for `.reindex` - - Parameters - ---------- - columns_names : array-like - array-like of columns to select from the Frame, - if ``columns`` is a superset of ``Frame.columns`` new - columns are created. - dtypes : dict - Mapping of dtypes for the empty columns being created. - deep : boolean, optional, default False - Whether to make deep copy or shallow copy of the columns. 
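# The _n_largest_or_smallest helper above appears to back the public
# nlargest/nsmallest methods; a small usage sketch, assuming cudf is installed:
import cudf

df = cudf.DataFrame({"pop": [30, 30, 20, 10], "gdp": [4, 3, 2, 1]})
print(df.nlargest(2, "pop"))                # the two largest rows by "pop"
print(df.nlargest(2, "pop", keep="last"))   # prefer later duplicates on ties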
- index : Index or array-like, default None - The ``index`` to be used to reindex the Frame with. - inplace : bool, default False - Whether to perform the operation in place on the data. - fill_value : value with which to replace nulls in the result - - Returns - ------- - Series or DataFrame - """ - if method is not None: - raise NotImplementedError("method is not currently supported.") - if level is not None: - raise NotImplementedError("level is not currently supported.") - if limit is not None: - raise NotImplementedError("limit is not currently supported.") - if tolerance is not None: - raise NotImplementedError("tolerance is not currently supported.") - - if dtypes is None: - dtypes = {} - - df = self - if index is not None: - if not df.index.is_unique: - raise ValueError( - "cannot reindex on an axis with duplicate labels" - ) - index = cudf.Index( - index, name=getattr(index, "name", self.index.name) - ) - - idx_dtype_match = (df.index.nlevels == index.nlevels) and all( - _is_same_dtype(left_dtype, right_dtype) - for left_dtype, right_dtype in zip( - (dtype for _, dtype in df.index._dtypes), - (dtype for _, dtype in index._dtypes), - ) - ) - - if not idx_dtype_match: - column_names = ( - column_names - if column_names is not None - else list(df._column_names) - ) - df = cudf.DataFrame() - else: - lhs = cudf.DataFrame._from_data({}, index=index) - rhs = cudf.DataFrame._from_data( - { - # bookkeeping workaround for unnamed series - (name or 0) - if isinstance(self, cudf.Series) - else name: col - for name, col in df._column_labels_and_values - }, - index=df.index, - ) - df = lhs.join(rhs, how="left", sort=True) - # double-argsort to map back from sorted to unsorted positions - df = df.take(index.argsort(ascending=True).argsort()) - - index = index if index is not None else df.index - - if column_names is None: - names = list(df._column_names) - level_names = self._data.level_names - multiindex = self._data.multiindex - rangeindex = self._data.rangeindex - elif isinstance(column_names, (pd.Index, cudf.Index)): - if isinstance(column_names, (pd.MultiIndex, cudf.MultiIndex)): - multiindex = True - if isinstance(column_names, cudf.MultiIndex): - names = list(iter(column_names.to_pandas())) - else: - names = list(iter(column_names)) - rangeindex = False - else: - multiindex = False - names = column_names - if isinstance(names, cudf.Index): - names = names.to_pandas() - rangeindex = isinstance( - column_names, (pd.RangeIndex, cudf.RangeIndex) - ) - level_names = tuple(column_names.names) - else: - names = column_names - level_names = None - multiindex = False - rangeindex = False - - cols = { - name: ( - df._data[name].copy(deep=deep) - if name in df._data - else cudf.core.column.column.column_empty( - dtype=dtypes.get(name, np.float64), - masked=True, - row_count=len(index), - ) - ) - for name in names - } - - result = self.__class__._from_data( - data=cudf.core.column_accessor.ColumnAccessor( - cols, - multiindex=multiindex, - level_names=level_names, - rangeindex=rangeindex, - ), - index=index, - ) - - result.fillna(fill_value, inplace=True) - return self._mimic_inplace(result, inplace=inplace) - - def round(self, decimals=0, how="half_even"): - """ - Round to a variable number of decimal places. - - Parameters - ---------- - decimals : int, dict, Series - Number of decimal places to round each column to. This parameter - must be an int for a Series. For a DataFrame, a dict or a Series - are also valid inputs. If an int is given, round each column to the - same number of places. 
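# The _reindex helper above services the public reindex(); a minimal sketch of
# row reindexing with and without fill_value, assuming cudf is installed:
import cudf

sr = cudf.Series([10, 20, 30], index=["a", "b", "c"])
print(sr.reindex(["a", "c", "d"]))                  # the new label "d" is null
print(sr.reindex(["a", "c", "d"], fill_value=0))    # or filled with fill_value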
Otherwise dict and Series round to variable - numbers of places. Column names should be in the keys if - `decimals` is a dict-like, or in the index if `decimals` is a - Series. Any columns not included in `decimals` will be left as is. - Elements of `decimals` which are not columns of the input will be - ignored. - how : str, optional - Type of rounding. Can be either "half_even" (default) - or "half_up" rounding. - - Returns - ------- - Series or DataFrame - A Series or DataFrame with the affected columns rounded to the - specified number of decimal places. - - Examples - -------- - **Series** - - >>> s = cudf.Series([0.1, 1.4, 2.9]) - >>> s.round() - 0 0.0 - 1 1.0 - 2 3.0 - dtype: float64 - - **DataFrame** - - >>> df = cudf.DataFrame( - ... [(.21, .32), (.01, .67), (.66, .03), (.21, .18)], - ... columns=['dogs', 'cats'], - ... ) - >>> df - dogs cats - 0 0.21 0.32 - 1 0.01 0.67 - 2 0.66 0.03 - 3 0.21 0.18 - - By providing an integer each column is rounded to the same number - of decimal places. - - >>> df.round(1) - dogs cats - 0 0.2 0.3 - 1 0.0 0.7 - 2 0.7 0.0 - 3 0.2 0.2 - - With a dict, the number of places for specific columns can be - specified with the column names as keys and the number of decimal - places as values. - - >>> df.round({'dogs': 1, 'cats': 0}) - dogs cats - 0 0.2 0.0 - 1 0.0 1.0 - 2 0.7 0.0 - 3 0.2 0.0 - - Using a Series, the number of places for specific columns can be - specified with the column names as the index and the number of - decimal places as the values. - - >>> decimals = cudf.Series([0, 1], index=['cats', 'dogs']) - >>> df.round(decimals) - dogs cats - 0 0.2 0.0 - 1 0.0 1.0 - 2 0.7 0.0 - 3 0.2 0.0 - """ - if isinstance(decimals, cudf.Series): - decimals = decimals.to_pandas() - - if isinstance(decimals, pd.Series): - if not decimals.index.is_unique: - raise ValueError("Index of decimals must be unique") - decimals = decimals.to_dict() - elif isinstance(decimals, int): - decimals = {name: decimals for name in self._column_names} - elif not isinstance(decimals, abc.Mapping): - raise TypeError( - "decimals must be an integer, a dict-like or a Series" - ) - - cols = ( - col.round(decimals[name], how=how) - if name in decimals and col.dtype.kind in "fiu" - else col.copy(deep=True) - for name, col in self._column_labels_and_values - ) - return self._from_data_like_self( - self._data._from_columns_like_self(cols) - ) - - def resample( - self, - rule, - axis=0, - closed: Literal["right", "left"] | None = None, - label: Literal["right", "left"] | None = None, - convention: Literal["start", "end", "s", "e"] = "start", - kind=None, - on=None, - level=None, - origin="start_day", - offset=None, - group_keys: bool = False, - ): - """ - Convert the frequency of ("resample") the given time series data. - - Parameters - ---------- - rule: str - The offset string representing the frequency to use. - Note that DateOffset objects are not yet supported. - closed: {"right", "left"}, default None - Which side of bin interval is closed. The default is - "left" for all frequency offsets except for "M" and "W", - which have a default of "right". - label: {"right", "left"}, default None - Which bin edge label to label bucket with. The default is - "left" for all frequency offsets except for "M" and "W", - which have a default of "right". - on: str, optional - For a DataFrame, column to use instead of the index for - resampling. Column must be a datetime-like. - level: str or int, optional - For a MultiIndex, level to use instead of the index for - resampling. 
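# A small sketch of the three accepted forms of `decimals` described above
# (int, dict, Series), assuming cudf is installed:
import cudf

df = cudf.DataFrame({"dogs": [0.21, 0.01, 0.66], "cats": [0.32, 0.67, 0.03]})
print(df.round(1))                                            # same precision everywhere
print(df.round({"dogs": 1, "cats": 0}))                       # per-column via a dict
print(df.round(cudf.Series([0, 1], index=["cats", "dogs"])))  # per-column via a Series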
The level must be a datetime-like. - - Returns - ------- - A Resampler object - - Examples - -------- - First, we create a time series with 1 minute intervals: - - >>> index = cudf.date_range(start="2001-01-01", periods=10, freq="1T") - >>> sr = cudf.Series(range(10), index=index) - >>> sr - 2001-01-01 00:00:00 0 - 2001-01-01 00:01:00 1 - 2001-01-01 00:02:00 2 - 2001-01-01 00:03:00 3 - 2001-01-01 00:04:00 4 - 2001-01-01 00:05:00 5 - 2001-01-01 00:06:00 6 - 2001-01-01 00:07:00 7 - 2001-01-01 00:08:00 8 - 2001-01-01 00:09:00 9 - dtype: int64 - - Downsampling to 3 minute intervals, followed by a "sum" aggregation: - - >>> sr.resample("3T").sum() - 2001-01-01 00:00:00 3 - 2001-01-01 00:03:00 12 - 2001-01-01 00:06:00 21 - 2001-01-01 00:09:00 9 - dtype: int64 - - Use the right side of each interval to label the bins: - - >>> sr.resample("3T", label="right").sum() - 2001-01-01 00:03:00 3 - 2001-01-01 00:06:00 12 - 2001-01-01 00:09:00 21 - 2001-01-01 00:12:00 9 - dtype: int64 - - Close the right side of the interval instead of the left: - - >>> sr.resample("3T", closed="right").sum() - 2000-12-31 23:57:00 0 - 2001-01-01 00:00:00 6 - 2001-01-01 00:03:00 15 - 2001-01-01 00:06:00 24 - dtype: int64 - - Upsampling to 30 second intervals: - - >>> sr.resample("30s").asfreq()[:5] # show the first 5 rows - 2001-01-01 00:00:00 0 - 2001-01-01 00:00:30 - 2001-01-01 00:01:00 1 - 2001-01-01 00:01:30 - 2001-01-01 00:02:00 2 - dtype: int64 - - Upsample and fill nulls using the "bfill" method: - - >>> sr.resample("30s").bfill()[:5] - 2001-01-01 00:00:00 0 - 2001-01-01 00:00:30 1 - 2001-01-01 00:01:00 1 - 2001-01-01 00:01:30 2 - 2001-01-01 00:02:00 2 - dtype: int64 - - Resampling by a specified column of a Dataframe: - - >>> df = cudf.DataFrame({ - ... "price": [10, 11, 9, 13, 14, 18, 17, 19], - ... "volume": [50, 60, 40, 100, 50, 100, 40, 50], - ... "week_starting": cudf.date_range( - ... "2018-01-01", periods=8, freq="7D" - ... ) - ... }) - >>> df - price volume week_starting - 0 10 50 2018-01-01 - 1 11 60 2018-01-08 - 2 9 40 2018-01-15 - 3 13 100 2018-01-22 - 4 14 50 2018-01-29 - 5 18 100 2018-02-05 - 6 17 40 2018-02-12 - 7 19 50 2018-02-19 - >>> df.resample("M", on="week_starting").mean() - price volume - week_starting - 2018-01-31 11.4 60.000000 - 2018-02-28 18.0 63.333333 - - - .. pandas-compat:: - :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` - - Note that the dtype of the index (or the 'on' column if using - 'on=') in the result will be of a frequency closest to the - resampled frequency. For example, if resampling from - nanoseconds to milliseconds, the index will be of dtype - 'datetime64[ms]'. - """ - import cudf.core.resample - - if kind is not None: - warnings.warn( - "The 'kind' keyword in is " - "deprecated and will be removed in a future version. ", - FutureWarning, - ) - raise NotImplementedError("kind is currently not supported.") - if axis != 0: - warnings.warn( - "The 'axis' keyword in is " - "deprecated and will be removed in a future version. ", - FutureWarning, - ) - raise NotImplementedError("axis is currently not supported.") - if convention != "start": - warnings.warn( - "The 'convention' keyword in is " - "deprecated and will be removed in a future version. 
", - FutureWarning, - ) - raise NotImplementedError("convention is currently not supported.") - if origin != "start_day": - raise NotImplementedError("origin is currently not supported.") - if offset is not None: - raise NotImplementedError("offset is currently not supported.") - if group_keys is not False: - raise NotImplementedError("group_keys is currently not supported.") - by = cudf.Grouper( - key=on, freq=rule, closed=closed, label=label, level=level - ) - return ( - cudf.core.resample.SeriesResampler(self, by=by) - if isinstance(self, cudf.Series) - else cudf.core.resample.DataFrameResampler(self, by=by) - ) - - def dropna( - self, - axis=0, - how="any", - thresh=None, - subset=None, - inplace=False, - ignore_index: bool = False, - ): - """ - Drop rows (or columns) containing nulls from a Column. - - Parameters - ---------- - axis : {0, 1}, optional - Whether to drop rows (axis=0, default) or columns (axis=1) - containing nulls. - how : {"any", "all"}, optional - Specifies how to decide whether to drop a row (or column). - any (default) drops rows (or columns) containing at least - one null value. all drops only rows (or columns) containing - *all* null values. - thresh: int, optional - If specified, then drops every row (or column) containing - less than `thresh` non-null values - subset : list, optional - List of columns to consider when dropping rows (all columns - are considered by default). Alternatively, when dropping - columns, subset is a list of rows to consider. - inplace : bool, default False - If True, do operation inplace and return None. - ignore_index : bool, default ``False`` - If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. - - Returns - ------- - Copy of the DataFrame with rows/columns containing nulls dropped. - - See Also - -------- - cudf.DataFrame.isna - Indicate null values. - cudf.DataFrame.notna - Indicate non-null values. - cudf.DataFrame.fillna - Replace null values. - cudf.Series.dropna - Drop null values. - cudf.Index.dropna - Drop null indices. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], - ... "toy": ['Batmobile', None, 'Bullwhip'], - ... "born": [np.datetime64("1940-04-25"), - ... np.datetime64("NaT"), - ... np.datetime64("NaT")]}) - >>> df - name toy born - 0 Alfred Batmobile 1940-04-25 00:00:00 - 1 Batman - 2 Catwoman Bullwhip - - Drop the rows where at least one element is null. - - >>> df.dropna() - name toy born - 0 Alfred Batmobile 1940-04-25 - - Drop the columns where at least one element is null. - - >>> df.dropna(axis='columns') - name - 0 Alfred - 1 Batman - 2 Catwoman - - Drop the rows where all elements are null. - - >>> df.dropna(how='all') - name toy born - 0 Alfred Batmobile 1940-04-25 00:00:00 - 1 Batman - 2 Catwoman Bullwhip - - Keep only the rows with at least 2 non-null values. - - >>> df.dropna(thresh=2) - name toy born - 0 Alfred Batmobile 1940-04-25 00:00:00 - 2 Catwoman Bullwhip - - Define in which columns to look for null values. - - >>> df.dropna(subset=['name', 'born']) - name toy born - 0 Alfred Batmobile 1940-04-25 - - Keep the DataFrame with valid entries in the same variable. 
- - >>> df.dropna(inplace=True) - >>> df - name toy born - 0 Alfred Batmobile 1940-04-25 - """ - if axis == 0: - result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) - if ignore_index: - result.index = RangeIndex(len(result)) - else: - result = self._drop_na_columns( - how=how, subset=subset, thresh=thresh - ) - - return self._mimic_inplace(result, inplace=inplace) - - @_performance_tracking - def _drop_na_columns(self, how="any", subset=None, thresh=None): - """ - Drop columns containing nulls - """ - out_cols = [] - - if subset is None: - df = self - else: - df = self.take(subset) - - if thresh is None: - if how == "all": - thresh = 1 - else: - thresh = len(df) - - for name, col in df._column_labels_and_values: - check_col = col.nans_to_nulls() - no_threshold_valid_count = ( - len(col) - check_col.null_count - ) < thresh - if no_threshold_valid_count: - continue - out_cols.append(name) - - return self[out_cols] - - def _drop_na_rows(self, how="any", subset=None, thresh=None): - """ - Drop null rows from `self`. - - how : {"any", "all"}, optional - Specifies how to decide whether to drop a row. - any (default) drops rows containing at least - one null value. all drops only rows containing - *all* null values. - subset : list, optional - List of columns to consider when dropping rows. - thresh : int, optional - If specified, then drops every row containing - less than `thresh` non-null values. - """ - subset = self._preprocess_subset(subset) - - if len(subset) == 0: - return self.copy(deep=True) - - data_columns = [col.nans_to_nulls() for col in self._columns] - - return self._from_columns_like_self( - libcudf.stream_compaction.drop_nulls( - [*self.index._columns, *data_columns], - how=how, - keys=self._positions_from_column_names( - subset, offset_by_index_columns=True - ), - thresh=thresh, - ), - self._column_names, - self.index.names, - ) - - def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): - """Apply boolean mask to each row of `self`. - - Rows corresponding to `False` is dropped. - - If keep_index is False, the index is not preserved. - """ - if len(boolean_mask.column) != len(self): - raise IndexError( - "Boolean mask has wrong length: " - f"{len(boolean_mask.column)} not {len(self)}" - ) - return self._from_columns_like_self( - libcudf.stream_compaction.apply_boolean_mask( - list(self.index._columns + self._columns) - if keep_index - else list(self._columns), - boolean_mask.column, - ), - column_names=self._column_names, - index_names=self.index.names if keep_index else None, - ) - - def take(self, indices, axis=0): - """Return a new frame containing the rows specified by *indices*. - - Parameters - ---------- - indices : array-like - Array of ints indicating which positions to take. - axis : Unsupported - - Returns - ------- - out : Series or DataFrame - New object with desired subset of rows. - - Examples - -------- - **Series** - >>> s = cudf.Series(['a', 'b', 'c', 'd', 'e']) - >>> s.take([2, 0, 4, 3]) - 2 c - 0 a - 4 e - 3 d - dtype: object - - **DataFrame** - - >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0], - ... 
'b': cudf.Series(['a', 'b', 'c'])}) - >>> a.take([0, 2, 2]) - a b - 0 1.0 a - 2 3.0 c - 2 3.0 c - >>> a.take([True, False, True]) - a b - 0 1.0 a - 2 3.0 c - """ - if self._get_axis_from_axis_arg(axis) != 0: - raise NotImplementedError("Only axis=0 is supported.") - - return self._gather(GatherMap(indices, len(self), nullify=False)) - - def _reset_index( - self, - level, - drop, - col_level=0, - col_fill="", - allow_duplicates: bool = False, - names: abc.Hashable | abc.Sequence[abc.Hashable] | None = None, - ): - """Shared path for DataFrame.reset_index and Series.reset_index.""" - if allow_duplicates is not False: - raise NotImplementedError( - "allow_duplicates is not currently supported." - ) - elif names is not None: - raise NotImplementedError("names is not currently supported.") - if level is not None: - if ( - isinstance(level, int) - and level > 0 - and not isinstance(self.index, MultiIndex) - ): - raise IndexError( - f"Too many levels: Index has only 1 level, not {level + 1}" - ) - if not isinstance(level, (tuple, list)): - level = (level,) - _check_duplicate_level_names(level, self.index.names) - - index = self.index._new_index_for_reset_index(level, self.index.name) - if index is None: - index = RangeIndex(len(self)) - if drop: - return self._data, index - - new_column_data = {} - for name, col in self.index._columns_for_reset_index(level): - if name == "index" and "index" in self._data: - name = "level_0" - name = ( - tuple( - name if i == col_level else col_fill - for i in range(self._data.nlevels) - ) - if self._data.multiindex - else name - ) - new_column_data[name] = col - # This is to match pandas where the new data columns are always - # inserted to the left of existing data columns. - return ( - ColumnAccessor( - {**new_column_data, **self._data}, - self._data.multiindex, - self._data._level_names, - ), - index, - ) - - def _first_or_last( - self, offset, idx: int, op: Callable, side: str, slice_func: Callable - ) -> "IndexedFrame": - """Shared code path for ``first`` and ``last``.""" - if not isinstance(self.index, cudf.core.index.DatetimeIndex): - raise TypeError("'first' only supports a DatetimeIndex index.") - if not isinstance(offset, str): - raise NotImplementedError( - f"Unsupported offset type {type(offset)}." - ) - - if len(self) == 0: - return self.copy() - - pd_offset = pd.tseries.frequencies.to_offset(offset) - to_search = op( - pd.Timestamp(self.index._column.element_indexing(idx)), pd_offset - ) - if ( - idx == 0 - and not isinstance(pd_offset, pd.tseries.offsets.Tick) - and pd_offset.is_on_offset(pd.Timestamp(self.index[0])) - ): - # Special handle is required when the start time of the index - # is on the end of the offset. See pandas gh29623 for detail. - to_search = to_search - pd_offset.base - return self.loc[:to_search] - needle = as_column(to_search, dtype=self.index.dtype) - end_point = int( - self.index._column.searchsorted( - needle, side=side - ).element_indexing(0) - ) - return slice_func(end_point) - - def first(self, offset): - """Select initial periods of time series data based on a date offset. - - When having a DataFrame with **sorted** dates as index, this function - can select the first few rows based on a date offset. - - Parameters - ---------- - offset: str - The offset length of the data that will be selected. For instance, - '1M' will display all rows having their index within the first - month. - - Returns - ------- - Series or DataFrame - A subset of the caller. 
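# A short sketch of the positional take() documented above, together with the
# RangeIndex reset used throughout this section, assuming cudf is installed:
import cudf

sr = cudf.Series(["a", "b", "c", "d"], index=[10, 11, 12, 13])
print(sr.take([2, 0, 3]))          # rows selected by position, labels preserved
print(sr.reset_index(drop=True))   # drop=True swaps the index for a RangeIndex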
- - Raises - ------ - TypeError - If the index is not a ``DatetimeIndex`` - - Examples - -------- - >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D') - >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i) - >>> ts - A - 2018-04-09 1 - 2018-04-11 2 - 2018-04-13 3 - 2018-04-15 4 - >>> ts.first('3D') - A - 2018-04-09 1 - 2018-04-11 2 - """ - # Do not remove until pandas 3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." - warnings.warn( - "first is deprecated and will be removed in a future version. " - "Please create a mask and filter using `.loc` instead", - FutureWarning, - ) - return self._first_or_last( - offset, - idx=0, - op=operator.__add__, - side="left", - slice_func=lambda i: self.iloc[:i], - ) - - def last(self, offset): - """Select final periods of time series data based on a date offset. - - When having a DataFrame with **sorted** dates as index, this function - can select the last few rows based on a date offset. - - Parameters - ---------- - offset: str - The offset length of the data that will be selected. For instance, - '3D' will display all rows having their index within the last 3 - days. - - Returns - ------- - Series or DataFrame - A subset of the caller. - - Raises - ------ - TypeError - If the index is not a ``DatetimeIndex`` - - Examples - -------- - >>> i = cudf.date_range('2018-04-09', periods=4, freq='2D') - >>> ts = cudf.DataFrame({'A': [1, 2, 3, 4]}, index=i) - >>> ts - A - 2018-04-09 1 - 2018-04-11 2 - 2018-04-13 3 - 2018-04-15 4 - >>> ts.last('3D') - A - 2018-04-13 3 - 2018-04-15 4 - """ - # Do not remove until pandas 3.0 support is added. - assert PANDAS_LT_300, "Need to drop after pandas-3.0 support is added." - warnings.warn( - "last is deprecated and will be removed in a future version. " - "Please create a mask and filter using `.loc` instead", - FutureWarning, - ) - return self._first_or_last( - offset, - idx=-1, - op=operator.__sub__, - side="right", - slice_func=lambda i: self.iloc[i:], - ) - - @_performance_tracking - def sample( - self, - n=None, - frac=None, - replace=False, - weights=None, - random_state=None, - axis=None, - ignore_index=False, - ): - """Return a random sample of items from an axis of object. - - If reproducible results are required, a random number generator may be - provided via the `random_state` parameter. This function will always - produce the same sample given an identical `random_state`. - - Parameters - ---------- - n : int, optional - Number of items from axis to return. Cannot be used with `frac`. - Default = 1 if frac = None. - frac : float, optional - Fraction of axis items to return. Cannot be used with n. - replace : bool, default False - Allow or disallow sampling of the same row more than once. - `replace == True` is not supported for axis = 1/"columns". - `replace == False` is not supported for axis = 0/"index" given - `random_state` is `None` or a cupy random state, and `weights` is - specified. - weights : ndarray-like, optional - Default `None` for uniform probability distribution over rows to - sample from. If `ndarray` is passed, the length of `weights` should - equal to the number of rows to sample from, and will be normalized - to have a sum of 1. Unlike pandas, index alignment is not currently - not performed. - random_state : int, numpy/cupy RandomState, or None, default None - If None, default cupy random state is chosen. - If int, the seed for the default cupy random state. - If RandomState, rows-to-sample are generated from the RandomState. 
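# first()/last() above are deprecated in favour of explicit label-based
# filtering; a small sketch of the suggested replacement, assuming cudf and
# pandas are installed:
import pandas as pd
import cudf

idx = cudf.date_range("2018-04-09", periods=4, freq="2D")
ts = cudf.DataFrame({"A": [1, 2, 3, 4]}, index=idx)
cutoff = pd.Timestamp("2018-04-09") + pd.Timedelta(days=3)
print(ts.loc[:cutoff])   # rows within the first 3 days, like the deprecated ts.first("3D")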
- axis : {0 or `index`, 1 or `columns`, None}, default None - Axis to sample. Accepts axis number or name. - Default is stat axis for given data type - (0 for Series and DataFrames). Series doesn't support axis=1. - ignore_index : bool, default False - If True, the resulting index will be labeled 0, 1, …, n - 1. - - Returns - ------- - Series or DataFrame - A new object of same type as caller containing n items - randomly sampled from the caller object. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"a":{1, 2, 3, 4, 5}}) - >>> df.sample(3) - a - 1 2 - 3 4 - 0 1 - - >>> sr = cudf.Series([1, 2, 3, 4, 5]) - >>> sr.sample(10, replace=True) - 1 4 - 3 1 - 2 4 - 0 5 - 0 1 - 4 5 - 4 1 - 0 2 - 0 3 - 3 2 - dtype: int64 - - >>> df = cudf.DataFrame( - ... {"a": [1, 2], "b": [2, 3], "c": [3, 4], "d": [4, 5]} - ... ) - >>> df.sample(2, axis=1) - a c - 0 1 3 - 1 2 4 - - .. pandas-compat:: - :meth:`pandas.DataFrame.sample`, :meth:`pandas.Series.sample` - - When sampling from ``axis=0/'index'``, ``random_state`` can be - either a numpy random state (``numpy.random.RandomState``) - or a cupy random state (``cupy.random.RandomState``). When a numpy - random state is used, the output is guaranteed to match the output - of the corresponding pandas method call, but generating the sample - maybe slow. If exact pandas equivalence is not required, using a - cupy random state will achieve better performance, - especially when sampling large number of - items. It's advised to use the matching `ndarray` type to - the random state for the `weights` array. - """ - axis = 0 if axis is None else self._get_axis_from_axis_arg(axis) - size = self.shape[axis] - - # Compute `n` from parameter `frac`. - if frac is None: - n = 1 if n is None else n - else: - if frac > 1 and not replace: - raise ValueError( - "Replace has to be set to `True` when upsampling the " - "population `frac` > 1." - ) - if n is not None: - raise ValueError( - "Please enter a value for `frac` OR `n`, not both." - ) - n = int(round(size * frac)) - - if n > 0 and size == 0: - raise ValueError( - "Cannot take a sample larger than 0 when axis is empty." - ) - - if isinstance(random_state, cp.random.RandomState): - lib = cp - elif isinstance(random_state, np.random.RandomState): - lib = np - else: - # Construct random state if `random_state` parameter is None or a - # seed. By default, cupy random state is used to sample rows - # and numpy is used to sample columns. This is because row data - # is stored on device, and the column objects are stored on host. - lib = cp if axis == 0 else np - random_state = lib.random.RandomState(seed=random_state) - - # Normalize `weights` array. - if weights is not None: - if isinstance(weights, str): - raise NotImplementedError( - "Weights specified by string is unsupported yet." - ) - - if size != len(weights): - raise ValueError( - "Weights and axis to be sampled must be of same length." - ) - - weights = lib.asarray(weights) - weights = weights / weights.sum() - - if axis == 0: - return self._sample_axis_0( - n, weights, replace, random_state, ignore_index - ) - else: - if isinstance(random_state, cp.random.RandomState): - raise ValueError( - "Sampling from `axis=1`/`columns` with cupy random state" - "isn't supported." 
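# A short sketch of the sample() reproducibility contract described above
# (an identical random_state yields an identical sample), assuming cudf:
import cudf

sr = cudf.Series([1, 2, 3, 4, 5])
a = sr.sample(3, random_state=0)
b = sr.sample(3, random_state=0)
assert a.to_pandas().equals(b.to_pandas())    # same seed, same rows
print(sr.sample(frac=0.4, random_state=0))    # frac sizing: round(5 * 0.4) = 2 rows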
- ) - return self._sample_axis_1( - n, weights, replace, random_state, ignore_index - ) - - def _sample_axis_0( - self, - n: int, - weights: ColumnLike | None, - replace: bool, - random_state: np.random.RandomState | cp.random.RandomState, - ignore_index: bool, - ): - try: - gather_map = GatherMap.from_column_unchecked( - cast( - NumericalColumn, - cudf.core.column.as_column( - random_state.choice( - len(self), size=n, replace=replace, p=weights - ) - ), - ), - len(self), - nullify=False, - ) - except NotImplementedError as e: - raise NotImplementedError( - "Random sampling with cupy does not support these inputs." - ) from e - - return self._gather(gather_map, keep_index=not ignore_index) - - def _sample_axis_1( - self, - n: int, - weights: ColumnLike | None, - replace: bool, - random_state: np.random.RandomState, - ignore_index: bool, - ): - raise NotImplementedError( - f"Sampling from axis 1 is not implemented for {self.__class__}." - ) - - def _binaryop( - self, - other: Any, - op: str, - fill_value: Any = None, - can_reindex: bool = False, - *args, - **kwargs, - ): - reflect, op = self._check_reflected_op(op) - ( - operands, - out_index, - can_use_self_column_name, - ) = self._make_operands_and_index_for_binop( - other, op, fill_value, reflect, can_reindex - ) - if operands is NotImplemented: - return NotImplemented - - level_names = ( - self._data._level_names if can_use_self_column_name else None - ) - return self._from_data( - ColumnAccessor( - type(self)._colwise_binop(operands, op), - level_names=level_names, - ), - index=out_index, - ) - - def _make_operands_and_index_for_binop( - self, - other: Any, - fn: str, - fill_value: Any = None, - reflect: bool = False, - can_reindex: bool = False, - ) -> tuple[ - dict[str | None, tuple[ColumnBase, Any, bool, Any]] - | NotImplementedType, - cudf.BaseIndex | None, - bool, - ]: - raise NotImplementedError( - f"Binary operations are not supported for {self.__class__}" - ) - - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs) - fname = ufunc.__name__ - - if ret is not None: - return ret - - # Attempt to dispatch all other functions to cupy. - cupy_func = getattr(cp, fname) - if cupy_func: - if ufunc.nin == 2: - other = inputs[self is inputs[0]] - inputs, index, _ = self._make_operands_and_index_for_binop( - other, fname - ) - else: - # This works for Index too - inputs = { - name: (col, None, False, None) - for name, col in self._column_labels_and_values - } - index = self.index - - data = self._apply_cupy_ufunc_to_operands( - ufunc, cupy_func, inputs, **kwargs - ) - - out = tuple(self._from_data(out, index=index) for out in data) - return out[0] if ufunc.nout == 1 else out - - return NotImplemented - - @_performance_tracking - def repeat(self, repeats, axis=None): - """Repeats elements consecutively. - - Returns a new object of caller type(DataFrame/Series) where each - element of the current object is repeated consecutively a given - number of times. - - Parameters - ---------- - repeats : int, or array of ints - The number of repetitions for each element. This should - be a non-negative integer. Repeating 0 times will return - an empty object. - - Returns - ------- - Series/DataFrame - A newly created object of same type as caller - with repeated elements. 
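# __array_ufunc__ above dispatches NumPy ufuncs to the matching CuPy routine;
# a minimal sketch, assuming cudf (and its cupy dependency) is installed:
import numpy as np
import cudf

sr = cudf.Series([1.0, 4.0, 9.0])
print(np.sqrt(sr))       # unary ufunc, evaluated on the GPU via cupy
print(np.add(sr, sr))    # binary ufuncs reuse the binop operand machinery above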
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3], 'b': [10, 20, 30]}) - >>> df - a b - 0 1 10 - 1 2 20 - 2 3 30 - >>> df.repeat(3) - a b - 0 1 10 - 0 1 10 - 0 1 10 - 1 2 20 - 1 2 20 - 1 2 20 - 2 3 30 - 2 3 30 - 2 3 30 - - Repeat on Series - - >>> s = cudf.Series([0, 2]) - >>> s - 0 0 - 1 2 - dtype: int64 - >>> s.repeat([3, 4]) - 0 0 - 0 0 - 0 0 - 1 2 - 1 2 - 1 2 - 1 2 - dtype: int64 - >>> s.repeat(2) - 0 0 - 0 0 - 1 2 - 1 2 - dtype: int64 - """ - res = self._from_columns_like_self( - Frame._repeat( - [*self.index._columns, *self._columns], repeats, axis - ), - self._column_names, - self._index_names, - ) - if isinstance(res.index, cudf.DatetimeIndex): - res.index._freq = None - return res - - def astype( - self, - dtype: dict[Any, Dtype], - copy: bool = False, - errors: Literal["raise", "ignore"] = "raise", - ) -> Self: - """Cast the object to the given dtype. - - Parameters - ---------- - dtype : data type, or dict of column name -> data type - Use a :class:`numpy.dtype` or Python type to cast entire DataFrame - object to the same type. Alternatively, use ``{col: dtype, ...}``, - where col is a column label and dtype is a :class:`numpy.dtype` - or Python type to cast one or more of the DataFrame's columns to - column-specific types. - copy : bool, default False - Return a deep-copy when ``copy=True``. Note by default - ``copy=False`` setting is used and hence changes to - values then may propagate to other cudf objects. - errors : {'raise', 'ignore', 'warn'}, default 'raise' - Control raising of exceptions on invalid data for provided dtype. - - - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original - object. - - Returns - ------- - DataFrame/Series - - Examples - -------- - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({'a': [10, 20, 30], 'b': [1, 2, 3]}) - >>> df - a b - 0 10 1 - 1 20 2 - 2 30 3 - >>> df.dtypes - a int64 - b int64 - dtype: object - - Cast all columns to `int32`: - - >>> df.astype('int32').dtypes - a int32 - b int32 - dtype: object - - Cast `a` to `float32` using a dictionary: - - >>> df.astype({'a': 'float32'}).dtypes - a float32 - b int64 - dtype: object - >>> df.astype({'a': 'float32'}) - a b - 0 10.0 1 - 1 20.0 2 - 2 30.0 3 - - **Series** - - >>> import cudf - >>> series = cudf.Series([1, 2], dtype='int32') - >>> series - 0 1 - 1 2 - dtype: int32 - >>> series.astype('int64') - 0 1 - 1 2 - dtype: int64 - - Convert to categorical type: - - >>> series.astype('category') - 0 1 - 1 2 - dtype: category - Categories (2, int64): [1, 2] - - Convert to ordered categorical type with custom ordering: - - >>> cat_dtype = cudf.CategoricalDtype(categories=[2, 1], ordered=True) - >>> series.astype(cat_dtype) - 0 1 - 1 2 - dtype: category - Categories (2, int64): [2 < 1] - - Note that using ``copy=False`` (enabled by default) - and changing data on a new Series will - propagate changes: - - >>> s1 = cudf.Series([1, 2]) - >>> s1 - 0 1 - 1 2 - dtype: int64 - >>> s2 = s1.astype('int64', copy=False) - >>> s2[0] = 10 - >>> s1 - 0 10 - 1 2 - dtype: int64 - """ - if errors not in ("ignore", "raise"): - raise ValueError("invalid error value specified") - - try: - return super().astype(dtype, copy) - except Exception as e: - if errors == "raise": - raise e - return self - - @_performance_tracking - def drop( - self, - labels=None, - axis=0, - index=None, - columns=None, - level=None, - inplace=False, - errors="raise", - ): - """Drop specified labels from rows or columns. 
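# A small sketch of the astype() behaviour documented above: per-column casts
# via a dict, and errors="ignore" returning the original object when a cast
# fails (assuming cudf is installed; that the string cast below fails is an
# assumption about this particular input):
import cudf

df = cudf.DataFrame({"a": [10, 20, 30], "b": [1, 2, 3]})
print(df.astype({"a": "float32"}).dtypes)     # "a" becomes float32, "b" stays int64

sr = cudf.Series(["1", "2", "x"])
print(sr.astype("int64", errors="ignore"))    # cast of "x" fails, original returned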
- - Remove rows or columns by specifying label names and corresponding - axis, or by specifying directly index or column names. When using a - multi-index, labels on different levels can be removed by specifying - the level. - - Parameters - ---------- - labels : single label or list-like - Index or column labels to drop. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Whether to drop labels from the index (0 or 'index') or - columns (1 or 'columns'). - index : single label or list-like - Alternative to specifying axis (``labels, axis=0`` - is equivalent to ``index=labels``). - columns : single label or list-like - Alternative to specifying axis (``labels, axis=1`` - is equivalent to ``columns=labels``). - level : int or level name, optional - For MultiIndex, level from which the labels will be removed. - inplace : bool, default False - If False, return a copy. Otherwise, do operation - inplace and return None. - errors : {'ignore', 'raise'}, default 'raise' - If 'ignore', suppress error and only existing labels are - dropped. - - Returns - ------- - DataFrame or Series - DataFrame or Series without the removed index or column labels. - - Raises - ------ - KeyError - If any of the labels is not found in the selected axis. - - See Also - -------- - DataFrame.loc : Label-location based indexer for selection by label. - DataFrame.dropna : Return DataFrame with labels on given axis omitted - where (all or any) data are missing. - DataFrame.drop_duplicates : Return DataFrame with duplicate rows - removed, optionally only considering certain columns. - Series.reindex - Return only specified index labels of Series - Series.dropna - Return series without null values - Series.drop_duplicates - Return series with duplicate values removed - - Examples - -------- - **Series** - - >>> s = cudf.Series([1,2,3], index=['x', 'y', 'z']) - >>> s - x 1 - y 2 - z 3 - dtype: int64 - - Drop labels x and z - - >>> s.drop(labels=['x', 'z']) - y 2 - dtype: int64 - - Drop a label from the second level in MultiIndex Series. - - >>> midx = cudf.MultiIndex.from_product([[0, 1, 2], ['x', 'y']]) - >>> s = cudf.Series(range(6), index=midx) - >>> s - 0 x 0 - y 1 - 1 x 2 - y 3 - 2 x 4 - y 5 - dtype: int64 - >>> s.drop(labels='y', level=1) - 0 x 0 - 1 x 2 - 2 x 4 - Name: 2, dtype: int64 - - **DataFrame** - - >>> import cudf - >>> df = cudf.DataFrame({"A": [1, 2, 3, 4], - ... "B": [5, 6, 7, 8], - ... "C": [10, 11, 12, 13], - ... "D": [20, 30, 40, 50]}) - >>> df - A B C D - 0 1 5 10 20 - 1 2 6 11 30 - 2 3 7 12 40 - 3 4 8 13 50 - - Drop columns - - >>> df.drop(['B', 'C'], axis=1) - A D - 0 1 20 - 1 2 30 - 2 3 40 - 3 4 50 - >>> df.drop(columns=['B', 'C']) - A D - 0 1 20 - 1 2 30 - 2 3 40 - 3 4 50 - - Drop a row by index - - >>> df.drop([0, 1]) - A B C D - 2 3 7 12 40 - 3 4 8 13 50 - - Drop columns and/or rows of MultiIndex DataFrame - - >>> midx = cudf.MultiIndex(levels=[['lama', 'cow', 'falcon'], - ... ['speed', 'weight', 'length']], - ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], - ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) - >>> df = cudf.DataFrame(index=midx, columns=['big', 'small'], - ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], - ... [250, 150], [1.5, 0.8], [320, 250], - ... 
[1, 0.8], [0.3, 0.2]]) - >>> df - big small - lama speed 45.0 30.0 - weight 200.0 100.0 - length 1.5 1.0 - cow speed 30.0 20.0 - weight 250.0 150.0 - length 1.5 0.8 - falcon speed 320.0 250.0 - weight 1.0 0.8 - length 0.3 0.2 - >>> df.drop(index='cow', columns='small') - big - lama speed 45.0 - weight 200.0 - length 1.5 - falcon speed 320.0 - weight 1.0 - length 0.3 - >>> df.drop(index='length', level=1) - big small - lama speed 45.0 30.0 - weight 200.0 100.0 - cow speed 30.0 20.0 - weight 250.0 150.0 - falcon speed 320.0 250.0 - weight 1.0 0.8 - """ - if labels is not None: - if index is not None or columns is not None: - raise ValueError( - "Cannot specify both 'labels' and 'index'/'columns'" - ) - target = labels - elif index is not None: - target = index - axis = 0 - elif columns is not None: - target = columns - axis = 1 - else: - raise ValueError( - "Need to specify at least one of 'labels', " - "'index' or 'columns'" - ) - - if inplace: - out = self - else: - out = self.copy() - - if axis in (1, "columns"): - for label in _get_unique_drop_labels(target): - out._drop_column(label, errors=errors) - elif axis in (0, "index"): - dropped = _drop_rows_by_labels(out, target, level, errors) - - if columns is not None: - for label in _get_unique_drop_labels(columns): - dropped._drop_column(label, errors=errors) - - out._mimic_inplace(dropped, inplace=True) - - if not inplace: - return out - - @_performance_tracking - def _explode(self, explode_column: Any, ignore_index: bool): - # Helper function for `explode` in `Series` and `Dataframe`, explodes a - # specified nested column. Other columns' corresponding rows are - # duplicated. If ignore_index is set, the original index is not - # exploded and will be replaced with a `RangeIndex`. - if not isinstance(self._data[explode_column].dtype, ListDtype): - result = self.copy() - if ignore_index: - result.index = RangeIndex(len(result)) - return result - - column_index = self._column_names.index(explode_column) - if not ignore_index: - idx_cols = self.index._columns - else: - idx_cols = () - - exploded = libcudf.lists.explode_outer( - [*idx_cols, *self._columns], - column_index + len(idx_cols), - ) - # We must copy inner datatype of the exploded list column to - # maintain struct dtype key names - element_type = cast( - ListDtype, self._columns[column_index].dtype - ).element_type - exploded = [ - column._with_type_metadata(element_type) - if i == column_index - else column - for i, column in enumerate(exploded, start=-len(idx_cols)) - ] - return self._from_columns_like_self( - exploded, - self._column_names, - self.index.names if not ignore_index else None, - ) - - @_performance_tracking - def tile(self, count): - """Repeats the rows `count` times to form a new Frame. - - Parameters - ---------- - self : input Table containing columns to interleave. - count : Number of times to tile "rows". Must be non-negative. - - Examples - -------- - >>> import cudf - >>> df = cudf.Dataframe([[8, 4, 7], [5, 2, 3]]) - >>> count = 2 - >>> df.tile(df, count) - 0 1 2 - 0 8 4 7 - 1 5 2 3 - 0 8 4 7 - 1 5 2 3 - - Returns - ------- - The indexed frame containing the tiled "rows". 
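# _explode above backs the public explode() for list columns; a short usage
# sketch, assuming cudf is installed:
import cudf

df = cudf.DataFrame({"a": [[1, 2], [3]], "b": ["x", "y"]})
print(df.explode("a"))                      # one row per list element
print(df.explode("a", ignore_index=True))   # optionally renumber with a RangeIndex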
- """ - return self._from_columns_like_self( - libcudf.reshape.tile( - [*self.index._columns, *self._columns], count - ), - column_names=self._column_names, - index_names=self._index_names, - ) - - @_performance_tracking - def groupby( - self, - by=None, - axis=0, - level=None, - as_index=True, - sort=no_default, - group_keys=False, - observed=True, - dropna=True, - ): - if sort is no_default: - sort = cudf.get_option("mode.pandas_compatible") - - if axis not in (0, "index"): - raise NotImplementedError("axis parameter is not yet implemented") - - if not observed: - raise NotImplementedError( - "observed parameter is not yet implemented" - ) - - if by is None and level is None: - raise TypeError( - "groupby() requires either by or level to be specified." - ) - if group_keys is None: - group_keys = False - - return ( - self.__class__._resampler(self, by=by) - if isinstance(by, cudf.Grouper) and by.freq - else self.__class__._groupby( - self, - by=by, - level=level, - as_index=as_index, - dropna=dropna, - sort=sort, - group_keys=group_keys, - ) - ) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Addition", - op_name="add", - equivalent_op="frame + other", - df_op_example=textwrap.dedent( - """ - >>> df.add(1) - angles degrees - circle 1 361 - triangle 4 181 - rectangle 5 361 - """, - ), - ser_op_example=textwrap.dedent( - """ - >>> a.add(b) - a 2 - b - c - d - e - dtype: int64 - >>> a.add(b, fill_value=0) - a 2 - b 1 - c 1 - d 1 - e - dtype: int64 - """ - ), - ) - ) - def add(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__add__", fill_value) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Addition", - op_name="radd", - equivalent_op="other + frame", - df_op_example=textwrap.dedent( - """ - >>> df.radd(1) - angles degrees - circle 1 361 - triangle 4 181 - rectangle 5 361 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.radd(b) - a 2 - b - c - d - e - dtype: int64 - >>> a.radd(b, fill_value=0) - a 2 - b 1 - c 1 - d 1 - e - dtype: int64 - """ - ), - ) - ) - def radd(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__radd__", fill_value) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Subtraction", - op_name="sub", - equivalent_op="frame - other", - df_op_example=textwrap.dedent( - """ - >>> df.sub(1) - angles degrees - circle -1 359 - triangle 2 179 - rectangle 3 359 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.sub(b) - a 0 - b - c - d - e - dtype: int64 - >>> a.sub(b, fill_value=0) - a 2 - b 1 - c 1 - d -1 - e - dtype: int64 - """ - ), - ) - ) - def subtract(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__sub__", fill_value) - - sub = subtract - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Subtraction", - op_name="rsub", - equivalent_op="other - frame", - df_op_example=textwrap.dedent( - """ - >>> df.rsub(1) - angles degrees - circle 1 -359 - triangle -2 -179 - rectangle -3 -359 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.rsub(b) - a 0 - b - c - d - e - dtype: 
int64 - >>> a.rsub(b, fill_value=0) - a 0 - b -1 - c -1 - d 1 - e - dtype: int64 - """ - ), - ) - ) - def rsub(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__rsub__", fill_value) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Multiplication", - op_name="mul", - equivalent_op="frame * other", - df_op_example=textwrap.dedent( - """ - >>> df.multiply(1) - angles degrees - circle 0 360 - triangle 3 180 - rectangle 4 360 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.multiply(b) - a 1 - b - c - d - e - dtype: int64 - >>> a.multiply(b, fill_value=0) - a 1 - b 0 - c 0 - d 0 - e - dtype: int64 - """ - ), - ) - ) - def multiply(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__mul__", fill_value) - - mul = multiply - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Multiplication", - op_name="rmul", - equivalent_op="other * frame", - df_op_example=textwrap.dedent( - """ - >>> df.rmul(1) - angles degrees - circle 0 360 - triangle 3 180 - rectangle 4 360 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.rmul(b) - a 1 - b - c - d - e - dtype: int64 - >>> a.rmul(b, fill_value=0) - a 1 - b 0 - c 0 - d 0 - e - dtype: int64 - """ - ), - ) - ) - def rmul(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__rmul__", fill_value) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Modulo", - op_name="mod", - equivalent_op="frame % other", - df_op_example=textwrap.dedent( - """ - >>> df.mod(1) - angles degrees - circle 0 0 - triangle 0 0 - rectangle 0 0 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.mod(b) - a 0 - b - c - d - e - dtype: int64 - >>> a.mod(b, fill_value=0) - a 0 - b 4294967295 - c 4294967295 - d 0 - e - dtype: int64 - """ - ), - ) - ) - def mod(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__mod__", fill_value) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Modulo", - op_name="rmod", - equivalent_op="other % frame", - df_op_example=textwrap.dedent( - """ - >>> df.rmod(1) - angles degrees - circle 4294967295 1 - triangle 1 1 - rectangle 1 1 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.rmod(b) - a 0 - b - c - d - e - dtype: int64 - >>> a.rmod(b, fill_value=0) - a 0 - b 0 - c 0 - d 4294967295 - e - dtype: int64 - """ - ), - ) - ) - def rmod(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__rmod__", fill_value) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Exponential", - op_name="pow", - equivalent_op="frame ** other", - df_op_example=textwrap.dedent( - """ - >>> df.pow(1) - angles degrees - circle 0 360 - triangle 2 180 - rectangle 4 360 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.pow(b) - a 1 - b - c - d - e - dtype: int64 - >>> 
a.pow(b, fill_value=0) - a 1 - b 1 - c 1 - d 0 - e - dtype: int64 - """ - ), - ) - ) - def pow(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__pow__", fill_value) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Exponential", - op_name="rpow", - equivalent_op="other ** frame", - df_op_example=textwrap.dedent( - """ - >>> df.rpow(1) - angles degrees - circle 1 1 - triangle 1 1 - rectangle 1 1 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.rpow(b) - a 1 - b - c - d - e - dtype: int64 - >>> a.rpow(b, fill_value=0) - a 1 - b 0 - c 0 - d 1 - e - dtype: int64 - """ - ), - ) - ) - def rpow(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__rpow__", fill_value) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Integer division", - op_name="floordiv", - equivalent_op="frame // other", - df_op_example=textwrap.dedent( - """ - >>> df.floordiv(1) - angles degrees - circle 0 360 - triangle 3 180 - rectangle 4 360 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.floordiv(b) - a 1 - b - c - d - e - dtype: int64 - >>> a.floordiv(b, fill_value=0) - a 1 - b 9223372036854775807 - c 9223372036854775807 - d 0 - e - dtype: int64 - """ - ), - ) - ) - def floordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__floordiv__", fill_value) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Integer division", - op_name="rfloordiv", - equivalent_op="other // frame", - df_op_example=textwrap.dedent( - """ - >>> df.rfloordiv(1) - angles degrees - circle 9223372036854775807 0 - triangle 0 0 - rectangle 0 0 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.rfloordiv(b) - a 1 - b - c - d - e - dtype: int64 - >>> a.rfloordiv(b, fill_value=0) - a 1 - b 0 - c 0 - d 9223372036854775807 - e - dtype: int64 - """ - ), - ) - ) - def rfloordiv(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__rfloordiv__", fill_value) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Floating division", - op_name="truediv", - equivalent_op="frame / other", - df_op_example=textwrap.dedent( - """ - >>> df.truediv(1) - angles degrees - circle 0.0 360.0 - triangle 3.0 180.0 - rectangle 4.0 360.0 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.truediv(b) - a 1.0 - b - c - d - e - dtype: float64 - >>> a.truediv(b, fill_value=0) - a 1.0 - b Inf - c Inf - d 0.0 - e - dtype: float64 - """ - ), - ) - ) - def truediv(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__truediv__", fill_value) - - # Alias for truediv - div = truediv - divide = truediv - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Floating division", - op_name="rtruediv", - equivalent_op="other / frame", - df_op_example=textwrap.dedent( - """ - >>> 
df.rtruediv(1) - angles degrees - circle inf 0.002778 - triangle 0.333333 0.005556 - rectangle 0.250000 0.002778 - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.rtruediv(b) - a 1.0 - b - c - d - e - dtype: float64 - >>> a.rtruediv(b, fill_value=0) - a 1.0 - b 0.0 - c 0.0 - d Inf - e - dtype: float64 - """ - ), - ) - ) - def rtruediv(self, other, axis, level=None, fill_value=None): # noqa: D102 - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "__rtruediv__", fill_value) - - # Alias for rtruediv - rdiv = rtruediv - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Equal to", - op_name="eq", - equivalent_op="frame == other", - df_op_example=textwrap.dedent( - """ - >>> df.eq(1) - angles degrees - circle False False - triangle False False - rectangle False False - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.eq(b) - a True - b - c - d - e - dtype: bool - >>> a.eq(b, fill_value=0) - a True - b False - c False - d False - e - dtype: bool - """ - ), - ) - ) - def eq(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 - return self._binaryop( - other=other, op="__eq__", fill_value=fill_value, can_reindex=True - ) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Not equal to", - op_name="ne", - equivalent_op="frame != other", - df_op_example=textwrap.dedent( - """ - >>> df.ne(1) - angles degrees - circle True True - triangle True True - rectangle True True - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.ne(b) - a False - b - c - d - e - dtype: bool - >>> a.ne(b, fill_value=0) - a False - b True - c True - d True - e - dtype: bool - """ - ), - ) - ) - def ne(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 - return self._binaryop( - other=other, op="__ne__", fill_value=fill_value, can_reindex=True - ) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Less than", - op_name="lt", - equivalent_op="frame < other", - df_op_example=textwrap.dedent( - """ - >>> df.lt(1) - angles degrees - circle True False - triangle False False - rectangle False False - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.lt(b) - a False - b - c - d - e - dtype: bool - >>> a.lt(b, fill_value=0) - a False - b False - c False - d True - e - dtype: bool - """ - ), - ) - ) - def lt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 - return self._binaryop( - other=other, op="__lt__", fill_value=fill_value, can_reindex=True - ) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Less than or equal to", - op_name="le", - equivalent_op="frame <= other", - df_op_example=textwrap.dedent( - """ - >>> df.le(1) - angles degrees - circle True False - triangle False False - rectangle False False - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.le(b) - a True - b - c - d - e - dtype: bool - >>> a.le(b, fill_value=0) - a True - b False - c False - d True - e - dtype: bool - """ - ), - ) - ) - def le(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 - return self._binaryop( - other=other, op="__le__", fill_value=fill_value, can_reindex=True - ) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Greater than", - op_name="gt", - equivalent_op="frame > other", - df_op_example=textwrap.dedent( - """ - >>> df.gt(1) - angles degrees - 
circle False True - triangle True True - rectangle True True - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.gt(b) - a False - b - c - d - e - dtype: bool - >>> a.gt(b, fill_value=0) - a False - b True - c True - d False - e - dtype: bool - """ - ), - ) - ) - def gt(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 - return self._binaryop( - other=other, op="__gt__", fill_value=fill_value, can_reindex=True - ) - - @_performance_tracking - @docutils.doc_apply( - doc_binop_template.format( - operation="Greater than or equal to", - op_name="ge", - equivalent_op="frame >= other", - df_op_example=textwrap.dedent( - """ - >>> df.ge(1) - angles degrees - circle False True - triangle True True - rectangle True True - """ - ), - ser_op_example=textwrap.dedent( - """ - >>> a.ge(b) - a True - b - c - d - e - dtype: bool - >>> a.ge(b, fill_value=0) - a True - b True - c True - d False - e - dtype: bool - """ - ), - ) - ) - def ge(self, other, axis="columns", level=None, fill_value=None): # noqa: D102 - return self._binaryop( - other=other, op="__ge__", fill_value=fill_value, can_reindex=True - ) - - def _preprocess_subset(self, subset): - if subset is None: - subset = self._column_names - elif ( - not np.iterable(subset) - or isinstance(subset, str) - or isinstance(subset, tuple) - and subset in self._column_names - ): - subset = (subset,) - diff = set(subset) - set(self._data) - if len(diff) != 0: - raise KeyError(f"columns {diff} do not exist") - return subset - - @_performance_tracking - def rank( - self, - axis=0, - method="average", - numeric_only=False, - na_option="keep", - ascending=True, - pct=False, - ): - """ - Compute numerical data ranks (1 through n) along axis. - - By default, equal values are assigned a rank that is the average of the - ranks of those values. - - Parameters - ---------- - axis : {0 or 'index'}, default 0 - Index to direct ranking. - method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - How to rank the group of records that have the same value - (i.e. ties): - * average: average rank of the group - * min: lowest rank in the group - * max: highest rank in the group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups. - numeric_only : bool, default False - For DataFrame objects, rank only numeric columns if set to True. - na_option : {'keep', 'top', 'bottom'}, default 'keep' - How to rank NaN values: - * keep: assign NaN rank to NaN values - * top: assign smallest rank to NaN values if ascending - * bottom: assign highest rank to NaN values if ascending. - ascending : bool, default True - Whether or not the elements should be ranked in ascending order. - pct : bool, default False - Whether or not to display the returned rankings in percentile - form. - - Returns - ------- - same type as caller - Return a Series or DataFrame with data ranks as values. 
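The tie-handling and ordering options documented above are easiest to see on a tiny Series. A minimal sketch with hypothetical data, assuming a working cudf installation (the printed ranks mirror pandas):

    import cudf

    s = cudf.Series([1, 1, 3])
    print(s.rank())                 # average (default): 1.5, 1.5, 3.0
    print(s.rank(method="min"))     # ties share the lowest rank: 1.0, 1.0, 3.0
    print(s.rank(method="dense"))   # rank rises by 1 between groups: 1.0, 1.0, 2.0
    print(s.rank(ascending=False))  # 2.5, 2.5, 1.0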
- """ - if method not in {"average", "min", "max", "first", "dense"}: - raise KeyError(method) - - method_enum = pylibcudf.aggregation.RankMethod[method.upper()] - if na_option not in {"keep", "top", "bottom"}: - raise ValueError( - "na_option must be one of 'keep', 'top', or 'bottom'" - ) - - if axis not in (0, "index"): - raise NotImplementedError( - f"axis must be `0`/`index`, " - f"axis={axis} is not yet supported in rank" - ) - - num_cols = self._num_columns - dropped_cols = False - source = self - if numeric_only: - if isinstance( - source, cudf.Series - ) and not _is_non_decimal_numeric_dtype(self.dtype): - raise TypeError( - "Series.rank does not allow numeric_only=True with " - "non-numeric dtype." - ) - numeric_cols = ( - name - for name, dtype in self._dtypes - if _is_non_decimal_numeric_dtype(dtype) - ) - source = self._get_columns_by_label(numeric_cols) - if source.empty: - return source.astype("float64") - elif source._num_columns != num_cols: - dropped_cols = True - - result_columns = libcudf.sort.rank_columns( - [*source._columns], method_enum, na_option, ascending, pct - ) - - if dropped_cols: - result = type(source)._from_data( - ColumnAccessor( - dict(zip(source._column_names, result_columns)), - multiindex=self._data.multiindex, - level_names=self._data.level_names, - label_dtype=self._data.label_dtype, - verify=False, - ), - ) - else: - result = source._from_data_like_self( - self._data._from_columns_like_self(result_columns) - ) - result.index = source.index - return result.astype(np.float64) - - def convert_dtypes( - self, - infer_objects: bool = True, - convert_string: bool = True, - convert_integer: bool = True, - convert_boolean: bool = True, - convert_floating: bool = True, - dtype_backend=None, - ) -> Self: - """ - Convert columns to the best possible nullable dtypes. - - If the dtype is numeric, and consists of all integers, convert - to an appropriate integer extension type. Otherwise, convert - to an appropriate floating type. - - All other dtypes are always returned as-is as all dtypes in - cudf are nullable. - """ - if not (convert_floating and convert_integer): - return self.copy() - else: - cols = [] - for col in self._columns: - if col.dtype.kind == "f": - col = col.fillna(0) - as_int = col.astype("int64") - if cp.allclose(col, as_int): - cols.append(as_int) - continue - cols.append(col) - return self._from_data_like_self( - self._data._from_columns_like_self(cols, verify=False) - ) - - @_warn_no_dask_cudf - def __dask_tokenize__(self): - from dask.base import normalize_token - - return [ - type(self), - str(dict(self._dtypes)), - *[ - normalize_token(col.dtype.categories) - for col in self._columns - if col.dtype == "category" - ], - normalize_token(self.index), - normalize_token(self.hash_values().values_host), - ] - - -def _check_duplicate_level_names(specified, level_names): - """Raise if any of `specified` has duplicates in `level_names`.""" - if specified is None: - return - if len(set(level_names)) == len(level_names): - return - duplicates = {key for key, val in Counter(level_names).items() if val > 1} - - duplicates_specified = [spec for spec in specified if spec in duplicates] - if not len(duplicates_specified) == 0: - # Note: pandas raises first encountered duplicates, cuDF raises all. 
- raise ValueError( - f"The names {duplicates_specified} occurs multiple times, use a" - " level number" - ) - - -@_performance_tracking -def _get_replacement_values_for_columns( - to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any] -) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]: - """ - Returns a per column mapping for the values to be replaced, new - values to be replaced with and if all the values are empty. - - Parameters - ---------- - to_replace : numeric, str, list-like or dict - Contains the values to be replaced. - value : numeric, str, list-like, or dict - Contains the values to replace `to_replace` with. - columns_dtype_map : dict - A column to dtype mapping representing dtype of columns. - - Returns - ------- - all_na_columns : dict - A dict mapping of all columns if they contain all na values - to_replace_columns : dict - A dict mapping of all columns and the existing values that - have to be replaced. - values_columns : dict - A dict mapping of all columns and the corresponding values - to be replaced with. - """ - to_replace_columns: dict[Any, Any] = {} - values_columns: dict[Any, Any] = {} - all_na_columns: dict[Any, Any] = {} - - if is_scalar(to_replace) and is_scalar(value): - to_replace_columns = {col: [to_replace] for col in columns_dtype_map} - values_columns = {col: [value] for col in columns_dtype_map} - elif cudf.api.types.is_list_like(to_replace) or isinstance( - to_replace, (ColumnBase, BaseIndex) - ): - if is_scalar(value): - to_replace_columns = {col: to_replace for col in columns_dtype_map} - values_columns = { - col: [value] - if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) - else as_column( - value, - length=len(to_replace), - dtype=cudf.dtype(type(value)), - ) - for col in columns_dtype_map - } - elif cudf.api.types.is_list_like( - value - ) or cudf.utils.dtypes.is_column_like(value): - if len(to_replace) != len(value): - raise ValueError( - f"Replacement lists must be " - f"of same length." - f" Expected {len(to_replace)}, got {len(value)}." 
- ) - else: - to_replace_columns = { - col: to_replace for col in columns_dtype_map - } - values_columns = {col: value for col in columns_dtype_map} - else: - raise TypeError( - "value argument must be scalar, list-like or Series" - ) - elif _is_series(to_replace): - if value is None or value is no_default: - to_replace_columns = { - col: as_column(to_replace.index) for col in columns_dtype_map - } - values_columns = {col: to_replace for col in columns_dtype_map} - elif is_dict_like(value): - to_replace_columns = { - col: to_replace[col] - for col in columns_dtype_map - if col in to_replace - } - values_columns = { - col: value[col] for col in to_replace_columns if col in value - } - elif is_scalar(value) or _is_series(value): - to_replace_columns = { - col: to_replace[col] - for col in columns_dtype_map - if col in to_replace - } - values_columns = { - col: [value] if is_scalar(value) else value[col] - for col in to_replace_columns - if col in value - } - else: - raise ValueError( - "Series.replace cannot use dict-like to_replace and non-None " - "value" - ) - elif is_dict_like(to_replace): - if value is None or value is no_default: - to_replace_columns = { - col: list(to_replace.keys()) for col in columns_dtype_map - } - values_columns = { - col: list(to_replace.values()) for col in columns_dtype_map - } - elif is_dict_like(value): - to_replace_columns = { - col: to_replace[col] - for col in columns_dtype_map - if col in to_replace - } - values_columns = { - col: value[col] for col in columns_dtype_map if col in value - } - elif is_scalar(value) or _is_series(value): - to_replace_columns = { - col: to_replace[col] - for col in columns_dtype_map - if col in to_replace - } - values_columns = { - col: [value] if is_scalar(value) else value - for col in columns_dtype_map - if col in to_replace - } - else: - raise TypeError("value argument must be scalar, dict, or Series") - else: - raise TypeError( - "Expecting 'to_replace' to be either a scalar, array-like, " - "dict or None, got invalid type " - f"'{type(to_replace).__name__}'" - ) - - to_replace_columns = { - key: [value] if is_scalar(value) else value - for key, value in to_replace_columns.items() - } - values_columns = { - key: [value] if is_scalar(value) else value - for key, value in values_columns.items() - } - - for i in to_replace_columns: - if i in values_columns: - if isinstance(values_columns[i], list): - all_na = values_columns[i].count(None) == len( - values_columns[i] - ) - else: - all_na = False - all_na_columns[i] = all_na - - return all_na_columns, to_replace_columns, values_columns - - -def _is_series(obj: Any) -> bool: - """ - Checks if the `obj` is of type `cudf.Series` - instead of checking for isinstance(obj, cudf.Series) - to avoid circular imports. - """ - return isinstance(obj, IndexedFrame) and obj.ndim == 1 - - -@_performance_tracking -def _drop_rows_by_labels( - obj: DataFrameOrSeries, - labels: ColumnLike | abc.Iterable | str, - level: int | str, - errors: str, -) -> DataFrameOrSeries: - """Remove rows specified by `labels`. - - If `errors="raise"`, an error is raised if some items in `labels` do not - exist in `obj.index`. - - Will raise if level(int) is greater or equal to index nlevels. 
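The `_get_replacement_values_for_columns` helper above is what normalizes the arguments of the public `replace` API into per-column mappings. The accepted shapes, sketched with hypothetical data and assuming a working cudf installation:

    import cudf

    s = cudf.Series([1, 2, 3, 2])
    print(s.replace(2, 20))             # scalar -> scalar
    print(s.replace([1, 3], [10, 30]))  # list -> list; lengths must match
    print(s.replace({1: 100, 3: 300}))  # dict keys are replaced by dict values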
- """ - if isinstance(level, int) and level >= obj.index.nlevels: - raise ValueError("Param level out of bounds.") - - if not isinstance(labels, cudf.core.single_column_frame.SingleColumnFrame): - labels = as_column(labels) - - if isinstance(obj.index, cudf.MultiIndex): - if level is None: - level = 0 - - levels_index = obj.index.get_level_values(level) - if errors == "raise" and not labels.isin(levels_index).all(): # type: ignore[union-attr] - raise KeyError("One or more values not found in axis") - - if isinstance(level, int): - ilevel = level - else: - ilevel = obj.index.names.index(level) - - # 1. Merge Index df and data df along column axis: - # | id | .index df | data column(s) | - idx_nlv = obj.index.nlevels - working_df = obj.index.to_frame(index=False) - working_df.columns = list(range(idx_nlv)) - for i, col in enumerate(obj._data): - working_df[idx_nlv + i] = obj._data[col] - # 2. Set `level` as common index: - # | level | .index df w/o level | data column(s) | - working_df = working_df.set_index(level) - - # 3. Use "leftanti" join to drop - # TODO: use internal API with "leftanti" and specify left and right - # join keys to bypass logic check - if isinstance(labels, ColumnBase): - join_index = cudf.Index._from_column(labels, name=level) - else: - join_index = cudf.Index(labels, name=level) - to_join = cudf.DataFrame(index=join_index) - join_res = working_df.join(to_join, how="leftanti") - - # 4. Reconstruct original layout, and rename - join_res._insert( - ilevel, name=join_res.index.name, value=join_res.index - ) - - midx = cudf.MultiIndex.from_frame( - join_res.iloc[:, 0:idx_nlv], names=obj.index.names - ) - - if isinstance(obj, cudf.Series): - return obj.__class__._from_data( - join_res.iloc[:, idx_nlv:]._data, index=midx, name=obj.name - ) - else: - return obj.__class__._from_data( - join_res.iloc[:, idx_nlv:]._data, - index=midx, - columns=obj._data.to_pandas_index(), - ) - - else: - if errors == "raise" and not labels.isin(obj.index).all(): # type: ignore[union-attr] - raise KeyError("One or more values not found in axis") - - if isinstance(labels, ColumnBase): - idx = cudf.Index._from_column(labels, name=obj.index.name) - else: - idx = cudf.Index(labels, name=labels.name) - key_df = cudf.DataFrame._from_data(data={}, index=idx) - if isinstance(obj, cudf.DataFrame): - res = obj.join(key_df, how="leftanti") - else: - res = obj.to_frame(name="tmp").join(key_df, how="leftanti")["tmp"] - res.name = obj.name - # Join changes the index to common type, - # but we need to preserve the type of - # index being returned, Hence this type-cast. - res.index = res.index.astype(obj.index.dtype) - return res - - -def _is_same_dtype(lhs_dtype, rhs_dtype): - # Utility specific to `_reindex` to check - # for matching column dtype. 
- if lhs_dtype == rhs_dtype: - return True - elif ( - isinstance(lhs_dtype, cudf.CategoricalDtype) - and isinstance(rhs_dtype, cudf.CategoricalDtype) - and lhs_dtype.categories.dtype == rhs_dtype.categories.dtype - ): - # OK if categories are not all the same - return True - elif ( - isinstance(lhs_dtype, cudf.CategoricalDtype) - and not isinstance(rhs_dtype, cudf.CategoricalDtype) - and lhs_dtype.categories.dtype == rhs_dtype - ): - return True - elif ( - isinstance(rhs_dtype, cudf.CategoricalDtype) - and not isinstance(lhs_dtype, cudf.CategoricalDtype) - and rhs_dtype.categories.dtype == lhs_dtype - ): - return True - else: - return False diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py deleted file mode 100644 index 8182e5cede2..00000000000 --- a/python/cudf/cudf/core/indexing_utils.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -from dataclasses import dataclass -from typing import Any, List, Union - -from typing_extensions import TypeAlias - -import cudf -from cudf.api.types import _is_scalar_or_zero_d_array, is_integer -from cudf.core.copy_types import BooleanMask, GatherMap - - -class EmptyIndexer: - """An indexer that will produce an empty result.""" - - pass - - -@dataclass -class MapIndexer: - """An indexer for a gather map.""" - - key: GatherMap - - -@dataclass -class MaskIndexer: - """An indexer for a boolean mask.""" - - key: BooleanMask - - -@dataclass -class SliceIndexer: - """An indexer for a slice.""" - - key: slice - - -@dataclass -class ScalarIndexer: - """An indexer for a scalar value.""" - - key: GatherMap - - -IndexingSpec: TypeAlias = Union[ - EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer -] - -ColumnLabels: TypeAlias = List[str] - - -def destructure_iloc_key( - key: Any, frame: cudf.Series | cudf.DataFrame -) -> tuple[Any, ...]: - """ - Destructure a potentially tuple-typed key into row and column indexers. - - Tuple arguments to iloc indexing are treated specially. They are - picked apart into indexers for the row and column. If the number - of entries is less than the number of modes of the frame, missing - entries are slice-expanded. - - If the user-provided key is not a tuple, it is treated as if it - were a singleton tuple, and then slice-expanded. - - Once this destructuring has occurred, any entries that are - callables are then called with the indexed frame. This should - return a valid indexing object for the rows (respectively - columns), namely one of: - - - A boolean mask of the same length as the frame in the given - dimension - - A scalar integer that indexes the frame - - An array-like of integers that index the frame - - A slice that indexes the frame - - Integer and slice-based indexing follows usual Python conventions. - - Parameters - ---------- - key - The key to destructure - frame - DataFrame or Series to provide context - - Returns - ------- - tuple - Indexers with length equal to the dimension of the frame - - Raises - ------ - IndexError - If there are too many indexers, or any individual indexer is a tuple. 
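The destructuring rules above translate into the following user-visible `iloc` behaviour; a short sketch with hypothetical data, assuming a working cudf installation:

    import cudf

    df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    print(df.iloc[1])                 # a single key indexes rows; columns are slice-expanded
    print(df.iloc[1, 0])              # a 2-tuple indexes rows and columns explicitly
    print(df.iloc[lambda f: [0, 2]])  # callables are evaluated against the frame first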
- """ - n = len(frame.shape) - if isinstance(key, tuple): - # Key potentially indexes rows and columns, slice-expand to - # shape of frame - indexers = key + (slice(None),) * (n - len(key)) - if len(indexers) > n: - raise IndexError( - f"Too many indexers: got {len(indexers)} expected {n}" - ) - else: - # Key indexes rows, slice-expand to shape of frame - indexers = (key, *(slice(None),) * (n - 1)) - indexers = tuple(k(frame) if callable(k) else k for k in indexers) - if any(isinstance(k, tuple) for k in indexers): - raise IndexError( - "Too many indexers: can't have nested tuples in iloc indexing" - ) - return indexers - - -def destructure_dataframe_iloc_indexer( - key: Any, frame: cudf.DataFrame -) -> tuple[Any, tuple[bool, ColumnLabels]]: - """Destructure an index key for DataFrame iloc getitem. - - Parameters - ---------- - key - Key to destructure - frame - DataFrame to provide context context - - Returns - ------- - tuple - 2-tuple of a key for the rows and tuple of - (column_index_is_scalar, column_names) for the columns - - Raises - ------ - TypeError - If the column indexer is invalid - IndexError - If the provided key does not destructure correctly - NotImplementedError - If the requested column indexer repeats columns - """ - rows, cols = destructure_iloc_key(key, frame) - if cols is Ellipsis: - cols = slice(None) - scalar = is_integer(cols) - try: - column_names: ColumnLabels = list( - frame._data.get_labels_by_index(cols) - ) - except TypeError: - raise TypeError( - "Column indices must be integers, slices, " - "or list-like of integers" - ) - if scalar: - assert ( - len(column_names) == 1 - ), "Scalar column indexer should not produce more than one column" - - return rows, (scalar, column_names) - - -def destructure_series_iloc_indexer(key: Any, frame: cudf.Series) -> Any: - """Destructure an index key for Series iloc getitem. - - Parameters - ---------- - key - Key to destructure - frame - Series for unpacking context - - Returns - ------- - Single key that will index the rows - """ - (rows,) = destructure_iloc_key(key, frame) - return rows - - -def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec: - """ - Normalize and produce structured information about a row indexer. - - Given a row indexer that has already been destructured by - :func:`destructure_iloc_key`, inspect further and produce structured - information for indexing operations to act upon. - - Parameters - ---------- - key - Suitably destructured key for row indexing - n - Length of frame to index - - Returns - ------- - IndexingSpec - Structured data for indexing. A tag + parsed data. - - Raises - ------ - IndexError - If a valid type of indexer is provided, but it is out of - bounds - TypeError - If the indexing key is otherwise invalid. 
- """ - if key is Ellipsis: - return SliceIndexer(slice(None)) - elif isinstance(key, slice): - return SliceIndexer(key) - elif _is_scalar_or_zero_d_array(key): - return ScalarIndexer(GatherMap(key, n, nullify=False)) - else: - key = cudf.core.column.as_column(key) - if isinstance(key, cudf.core.column.CategoricalColumn): - key = key.astype(key.codes.dtype) - if key.dtype.kind == "b": - return MaskIndexer(BooleanMask(key, n)) - elif len(key) == 0: - return EmptyIndexer() - elif key.dtype.kind in "iu": - return MapIndexer(GatherMap(key, n, nullify=False)) - else: - raise TypeError( - "Cannot index by location " - f"with non-integer key of type {type(key)}" - ) diff --git a/python/cudf/cudf/core/join/__init__.py b/python/cudf/cudf/core/join/__init__.py deleted file mode 100644 index 71a91c398ad..00000000000 --- a/python/cudf/cudf/core/join/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. - -from cudf.core.join.join import Merge, MergeSemi diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py deleted file mode 100644 index 854c44ff1a1..00000000000 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import warnings -from collections import abc -from typing import TYPE_CHECKING, Any, cast - -import numpy as np - -import cudf -from cudf.api.types import is_decimal_dtype, is_dtype_equal, is_numeric_dtype -from cudf.core.column import CategoricalColumn -from cudf.core.dtypes import CategoricalDtype - -if TYPE_CHECKING: - from cudf.core.column import ColumnBase - - -class _Indexer: - # Indexer into a column (either a data column or index level). - # - # >>> df - # a - # b - # 4 1 - # 5 2 - # 6 3 - # >>> _Indexer("a", column=True).get(df) # returns column "a" of df - # >>> _Indexer("b", index=True).get(df) # returns index level "b" of df - - def __init__(self, name: Any): - self.name = name - - -class _ColumnIndexer(_Indexer): - def get(self, obj: cudf.DataFrame) -> ColumnBase: - return obj._data[self.name] - - def set(self, obj: cudf.DataFrame, value: ColumnBase): - obj._data.set_by_label(self.name, value) - - -class _IndexIndexer(_Indexer): - def get(self, obj: cudf.DataFrame) -> ColumnBase: - return obj.index._data[self.name] - - def set(self, obj: cudf.DataFrame, value: ColumnBase): - obj.index._data.set_by_label(self.name, value) - - -def _match_join_keys( - lcol: ColumnBase, rcol: ColumnBase, how: str -) -> tuple[ColumnBase, ColumnBase]: - # Casts lcol and rcol to a common dtype for use as join keys. If no casting - # is necessary, they are returned as is. 
- - common_type = None - - # cast the keys lcol and rcol to a common dtype - ltype = lcol.dtype - rtype = rcol.dtype - - # if either side is categorical, different logic - left_is_categorical = isinstance(ltype, CategoricalDtype) - right_is_categorical = isinstance(rtype, CategoricalDtype) - if left_is_categorical and right_is_categorical: - return _match_categorical_dtypes_both( - cast(CategoricalColumn, lcol), cast(CategoricalColumn, rcol), how - ) - elif left_is_categorical or right_is_categorical: - if left_is_categorical: - if how in {"left", "leftsemi", "leftanti"}: - return lcol, rcol.astype(ltype) - common_type = ltype.categories.dtype - else: - common_type = rtype.categories.dtype - common_type = cudf.utils.dtypes._dtype_pandas_compatible(common_type) - return lcol.astype(common_type), rcol.astype(common_type) - - if is_dtype_equal(ltype, rtype): - return lcol, rcol - - if is_decimal_dtype(ltype) or is_decimal_dtype(rtype): - raise TypeError( - "Decimal columns can only be merged with decimal columns " - "of the same precision and scale" - ) - - if ( - is_numeric_dtype(ltype) - and is_numeric_dtype(rtype) - and not (ltype.kind == "m" or rtype.kind == "m") - ): - common_type = ( - max(ltype, rtype) - if ltype.kind == rtype.kind - else np.result_type(ltype, rtype) - ) - elif (ltype.kind == "M" and rtype.kind == "M") or ( - ltype.kind == "m" and rtype.kind == "m" - ): - common_type = max(ltype, rtype) - elif ltype.kind in "mM" and not rcol.fillna(0).can_cast_safely(ltype): - raise TypeError( - f"Cannot join between {ltype} and {rtype}, please type-cast both " - "columns to the same type." - ) - elif rtype.kind in "mM" and not lcol.fillna(0).can_cast_safely(rtype): - raise TypeError( - f"Cannot join between {rtype} and {ltype}, please type-cast both " - "columns to the same type." - ) - - if how == "left" and rcol.fillna(0).can_cast_safely(ltype): - return lcol, rcol.astype(ltype) - - return lcol.astype(common_type), rcol.astype(common_type) - - -def _match_categorical_dtypes_both( - lcol: CategoricalColumn, rcol: CategoricalColumn, how: str -) -> tuple[ColumnBase, ColumnBase]: - ltype, rtype = lcol.dtype, rcol.dtype - - # when both are ordered and both have the same categories, - # no casting required: - if ltype == rtype: - return lcol, rcol - - # Merging categorical variables when only one side is ordered is - # ambiguous and not allowed. - if ltype.ordered != rtype.ordered: - raise TypeError( - "Merging on categorical variables with mismatched" - " ordering is ambiguous" - ) - - if ltype.ordered and rtype.ordered: - # if we get to here, categories must be what causes the - # dtype equality check to fail. 
And we can never merge - # two ordered categoricals with different categories - raise TypeError( - f"{how} merge between categoricals with " - "different categories is only valid when " - "neither side is ordered" - ) - - if how == "inner": - # cast to category types -- we must cast them back later - return _match_join_keys( - lcol._get_decategorized_column(), - rcol._get_decategorized_column(), - how, - ) - elif how in {"left", "leftanti", "leftsemi"}: - # always cast to left type - return lcol, rcol.astype(ltype) - else: - # merge categories - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - merged_categories = cudf.concat( - [ltype.categories, rtype.categories] - ).unique() - common_type = cudf.CategoricalDtype( - categories=merged_categories, ordered=False - ) - return lcol.astype(common_type), rcol.astype(common_type) - - -def _coerce_to_tuple(obj): - if isinstance(obj, abc.Iterable) and not isinstance(obj, str): - return tuple(obj) - else: - return (obj,) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py deleted file mode 100644 index cfeaca00888..00000000000 --- a/python/cudf/cudf/core/join/join.py +++ /dev/null @@ -1,553 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import itertools -from typing import Any, ClassVar - -import cudf -from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype -from cudf.core.copy_types import GatherMap -from cudf.core.join._join_helpers import ( - _coerce_to_tuple, - _ColumnIndexer, - _IndexIndexer, - _match_join_keys, -) - - -class Merge: - # The joiner function must have the following signature: - # - # def joiner( - # lhs: Frame, - # rhs: Frame - # ) -> Tuple[Optional[Column], Optional[Column]]: - # ... - # - # where `lhs` and `rhs` are Frames composed of the left and right - # join key. The `joiner` returns a tuple of two Columns - # representing the rows to gather from the left- and right- side - # tables respectively. - _joiner: ClassVar[staticmethod] = staticmethod(libcudf.join.join) - - def __init__( - self, - lhs, - rhs, - *, - on, - left_on, - right_on, - left_index, - right_index, - how, - sort, - indicator, - suffixes, - ): - """ - Manage the merging of two Frames. - - Parameters - ---------- - lhs : DataFrame - The left operand of the merge - rhs : DataFrame - The right operand of the merge - on : string or list like - A set of key columns in the left and right operands - elements must be common to both frames - left_on : string or list like - A set of key columns in the left operand. Must be - specified with right_on or right_index concurrently - right_on : string or list like - A set of key columns in the right operand. Must be - specified with left_on or left_index concurrently - left_index : bool - Boolean flag indicating the left index column or columns - are to be used as join keys in order. - right_index : bool - Boolean flag indicating the right index column or columns - are to be used as join keys in order. - how : string - The type of join. Possible values are - 'inner', 'outer', 'left', 'leftsemi' and 'leftanti' - sort : bool - Boolean flag indicating if the output Frame is to be - sorted on the output's join keys, in left to right order. - suffixes : list like - Left and right suffixes specified together, unpacked into lsuffix - and rsuffix. 
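A short sketch of how the suffixes surface in the merged result, with hypothetical data and assuming a working cudf installation: overlapping non-key columns receive the suffixes, while the shared key column appears once.

    import cudf

    left = cudf.DataFrame({"key": [1, 2], "val": [10, 20]})
    right = cudf.DataFrame({"key": [1, 2], "val": [100, 200]})
    merged = left.merge(right, on="key", suffixes=("_l", "_r"))
    print(merged.columns.tolist())  # expected: ['key', 'val_l', 'val_r']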
- """ - self._validate_merge_params( - lhs, - rhs, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - how=how, - suffixes=suffixes, - ) - - self.lhs = lhs.copy(deep=False) - self.rhs = rhs.copy(deep=False) - self.how = how - # If the user requests that the result is sorted or we're in - # pandas-compatible mode we have various obligations on the - # output order: - # - # compat-> | False | True - # sort | | - # ---------+--------------------------+------------------------------- - # False| no obligation | ordering as per pandas docs(*) - # True | sorted lexicographically | sorted lexicographically(*) - # - # (*) If two keys are equal, tiebreak is to use input table order. - # - # In pandas-compat mode, we have obligations on the order to - # match pandas (even if sort=False), see - # pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html. - # The ordering requirements differ depending on which join - # type is specified: - # - # - left: preserve key order (only keeping left keys) - # - right: preserve key order (only keeping right keys) - # - inner: preserve key order (of left keys) - # - outer: sort keys lexicographically - # - cross (not supported): preserve key order (of left keys) - # - # Moreover, in all cases, whenever there is a tiebreak - # situation (for sorting or otherwise), the deciding order is - # "input table order" - self.sort = sort or ( - cudf.get_option("mode.pandas_compatible") and how == "outer" - ) - self.preserve_key_order = cudf.get_option( - "mode.pandas_compatible" - ) and how in { - "inner", - "outer", - "left", - "right", - } - self.lsuffix, self.rsuffix = suffixes - - # At this point validation guarantees that if on is not None we - # don't have any other args, so we can apply it directly to left_on and - # right_on. 
- self._using_left_index = bool(left_index) - left_on = ( - lhs.index._column_names - if left_index - else left_on - if left_on - else on - ) - self._using_right_index = bool(right_index) - right_on = ( - rhs.index._column_names - if right_index - else right_on - if right_on - else on - ) - - if left_on or right_on: - self._left_keys = [ - _ColumnIndexer(name=on) - if not self._using_left_index and on in lhs._data - else _IndexIndexer(name=on) - for on in (_coerce_to_tuple(left_on) if left_on else []) - ] - self._right_keys = [ - _ColumnIndexer(name=on) - if not self._using_right_index and on in rhs._data - else _IndexIndexer(name=on) - for on in (_coerce_to_tuple(right_on) if right_on else []) - ] - if len(self._left_keys) != len(self._right_keys): - raise ValueError( - "Merge operands must have same number of join key columns" - ) - self._using_left_index = any( - isinstance(idx, _IndexIndexer) for idx in self._left_keys - ) - self._using_right_index = any( - isinstance(idx, _IndexIndexer) for idx in self._right_keys - ) - else: - # if `on` is not provided and we're not merging - # index with column or on both indexes, then use - # the intersection of columns in both frames - on_names = set(lhs._data) & set(rhs._data) - self._left_keys = [_ColumnIndexer(name=on) for on in on_names] - self._right_keys = [_ColumnIndexer(name=on) for on in on_names] - self._using_left_index = False - self._using_right_index = False - - self._key_columns_with_same_name = ( - set(_coerce_to_tuple(on)) - if on - else { - lkey.name - for lkey, rkey in zip(self._left_keys, self._right_keys) - if lkey.name == rkey.name - and not ( - isinstance(lkey, _IndexIndexer) - or isinstance(rkey, _IndexIndexer) - ) - } - ) - - def _gather_maps(self, left_cols, right_cols): - # Produce gather maps for the join, optionally reordering to - # match pandas-order in compat mode. - maps = self._joiner( - left_cols, - right_cols, - how=self.how, - ) - if not self.preserve_key_order: - return maps - # We should only get here if we're in a join on which - # pandas-compat places some ordering obligation (which - # precludes a semi-join) - # We must perform this reordering even if sort=True since the - # obligation to ensure tiebreaks appear in input table order - # means that the gather maps must be permuted into an original - # order. - assert self.how in {"inner", "outer", "left", "right"} - # And hence both maps returned from the libcudf join should be - # non-None. - assert all(m is not None for m in maps) - lengths = [len(left_cols[0]), len(right_cols[0])] - # Only nullify those maps that need it. - nullify = [ - self.how not in {"inner", "left"}, - self.how not in {"inner", "right"}, - ] - # To reorder maps so that they are in order of the input - # tables, we gather from iota on both right and left, and then - # sort the gather maps with those two columns as key. - key_order = list( - itertools.chain.from_iterable( - libcudf.copying.gather( - [ - cudf.core.column.as_column( - range(n), dtype=size_type_dtype - ) - ], - map_, - nullify=null, - ) - for map_, n, null in zip(maps, lengths, nullify) - ) - ) - return libcudf.sort.sort_by_key( - list(maps), - # If how is right, right map is primary sort key. 
- key_order[:: -1 if self.how == "right" else 1], - [True] * len(key_order), - ["last"] * len(key_order), - stable=True, - ) - - def perform_merge(self) -> cudf.DataFrame: - left_join_cols = [] - right_join_cols = [] - - for left_key, right_key in zip(self._left_keys, self._right_keys): - lcol = left_key.get(self.lhs) - rcol = right_key.get(self.rhs) - lcol_casted, rcol_casted = _match_join_keys(lcol, rcol, self.how) - left_join_cols.append(lcol_casted) - right_join_cols.append(rcol_casted) - - # Categorical dtypes must be cast back from the underlying codes - # type that was returned by _match_join_keys. - if ( - self.how == "inner" - and isinstance(lcol.dtype, cudf.CategoricalDtype) - and isinstance(rcol.dtype, cudf.CategoricalDtype) - ): - lcol_casted = lcol_casted.astype("category") - rcol_casted = rcol_casted.astype("category") - - left_key.set(self.lhs, lcol_casted) - right_key.set(self.rhs, rcol_casted) - - left_rows, right_rows = self._gather_maps( - left_join_cols, right_join_cols - ) - gather_kwargs = { - "keep_index": self._using_left_index or self._using_right_index, - } - left_result = ( - self.lhs._gather( - GatherMap.from_column_unchecked( - left_rows, len(self.lhs), nullify=True - ), - **gather_kwargs, - ) - if left_rows is not None - else cudf.DataFrame._from_data({}) - ) - del left_rows - right_result = ( - self.rhs._gather( - GatherMap.from_column_unchecked( - right_rows, len(self.rhs), nullify=True - ), - **gather_kwargs, - ) - if right_rows is not None - else cudf.DataFrame._from_data({}) - ) - del right_rows - result = cudf.DataFrame._from_data( - *self._merge_results(left_result, right_result) - ) - - if self.sort: - result = self._sort_result(result) - return result - - def _merge_results( - self, left_result: cudf.DataFrame, right_result: cudf.DataFrame - ): - # Merge the DataFrames `left_result` and `right_result` into a single - # `DataFrame`, suffixing column names if necessary. - - # If two key columns have the same name, a single output column appears - # in the result. For all non-outer join types, the key column from the - # rhs is simply dropped. For outer joins, the two key columns are - # combined by filling nulls in the left key column with corresponding - # values from the right key column: - if self.how == "outer": - for lkey, rkey in zip(self._left_keys, self._right_keys): - if lkey.name == rkey.name: - # fill nulls in lhs from values in the rhs - lkey.set( - left_result, - lkey.get(left_result).fillna(rkey.get(right_result)), - ) - - # All columns from the left table make it into the output. Non-key - # columns that share a name with a column in the right table are - # suffixed with the provided suffix. - common_names = set(left_result._column_names) & set( - right_result._column_names - ) - cols_to_suffix = common_names - self._key_columns_with_same_name - data = { - (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col - for name, col in left_result._column_labels_and_values - } - - # The right table follows the same rule as the left table except that - # key columns from the right table are removed. - for name, col in right_result._column_labels_and_values: - if name in common_names: - if name not in self._key_columns_with_same_name: - data[f"{name}{self.rsuffix}"] = col - else: - data[name] = col - - # determine if the result has multiindex columns. 
The result - # of a join has a MultiIndex as its columns if: - # - both the `lhs` and `rhs` have a MultiIndex columns - # OR - # - either one of `lhs` or `rhs` have a MultiIndex columns, - # and the other is empty (i.e., no columns) - if self.lhs._data and self.rhs._data: - multiindex_columns = ( - self.lhs._data.multiindex and self.rhs._data.multiindex - ) - elif self.lhs._data: - multiindex_columns = self.lhs._data.multiindex - elif self.rhs._data: - multiindex_columns = self.rhs._data.multiindex - else: - multiindex_columns = False - - index: cudf.BaseIndex | None - if self._using_right_index: - # right_index and left_on - index = left_result.index - elif self._using_left_index: - # left_index and right_on - index = right_result.index - else: - index = None - - # Construct result from data and index: - return ( - left_result._data.__class__( - data=data, multiindex=multiindex_columns - ), - index, - ) - - def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: - # Pandas sorts on the key columns in the - # same order as given in 'on'. If the indices are used as - # keys, the index will be sorted. If one index is specified, - # the key columns on the other side will be used to sort. - # In pandas-compatible mode, tie-breaking for multiple equal - # sort keys is to produce output in input dataframe order. - # This is taken care of by using a stable sort here, and (in - # pandas-compat mode) reordering the gather maps before - # producing the input result. - by: list[Any] = [] - if self._using_left_index and self._using_right_index: - by.extend(result.index._columns) - if not self._using_left_index: - by.extend([result._data[col.name] for col in self._left_keys]) - if not self._using_right_index: - by.extend([result._data[col.name] for col in self._right_keys]) - if by: - keep_index = self._using_left_index or self._using_right_index - if keep_index: - to_sort = [*result.index._columns, *result._columns] - index_names = result.index.names - else: - to_sort = [*result._columns] - index_names = None - result_columns = libcudf.sort.sort_by_key( - to_sort, - by, - [True] * len(by), - ["last"] * len(by), - stable=True, - ) - result = result._from_columns_like_self( - result_columns, result._column_names, index_names - ) - return result - - @staticmethod - def _validate_merge_params( - lhs, - rhs, - on, - left_on, - right_on, - left_index, - right_index, - how, - suffixes, - ): - # Error for various invalid combinations of merge input parameters - - # We must actually support the requested merge type - if how not in {"left", "inner", "outer", "leftanti", "leftsemi"}: - raise NotImplementedError(f"{how} merge not supported yet") - - if on: - if left_on or right_on: - # Passing 'on' with 'left_on' or 'right_on' is ambiguous - raise ValueError( - 'Can only pass argument "on" OR "left_on" ' - 'and "right_on", not a combination of both.' - ) - elif left_index or right_index: - # Passing 'on' with 'left_index' or 'right_index' is ambiguous - raise ValueError( - 'Can only pass argument "on" OR "left_index" ' - 'and "right_index", not a combination of both.' - ) - else: - # the validity of 'on' being checked by _Indexer - return - elif left_on and left_index: - raise ValueError( - 'Can only pass argument "left_on" OR "left_index" not both.' - ) - elif right_on and right_index: - raise ValueError( - 'Can only pass argument "right_on" OR "right_index" not both.' - ) - - # Can't merge on a column name that is present in both a frame and its - # indexes. 
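A sketch of the ambiguity check described above, using hypothetical data and assuming a working cudf installation: a key that names both a column and an index level on the same side is rejected.

    import cudf

    idx = cudf.Index([1, 2], name="key")
    left = cudf.DataFrame({"key": [1, 2], "lv": [10, 20]}, index=idx)
    right = cudf.DataFrame({"key": [1, 2], "rv": [30, 40]})
    try:
        left.merge(right, left_on=["key"], right_on=["key"])
    except ValueError as err:
        print(err)  # "key is both an index level and a column label, ..."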
- if on: - for key in on: - if (key in lhs._data and key in lhs.index._data) or ( - key in rhs._data and key in rhs.index._data - ): - raise ValueError( - f"{key} is both an index level and a " - "column label, which is ambiguous." - ) - if left_on: - for key in left_on: - if key in lhs._data and key in lhs.index._data: - raise ValueError( - f"{key} is both an index level and a " - "column label, which is ambiguous." - ) - if right_on: - for key in right_on: - if key in rhs._data and key in rhs.index._data: - raise ValueError( - f"{key} is both an index level and a " - "column label, which is ambiguous." - ) - - # Can't merge on unnamed Series - if (isinstance(lhs, cudf.Series) and not lhs.name) or ( - isinstance(rhs, cudf.Series) and not rhs.name - ): - raise ValueError("Cannot merge on unnamed Series") - - # If nothing specified, must have common cols to use implicitly - same_named_columns = set(lhs._data) & set(rhs._data) - if ( - not (left_index or right_index) - and not (left_on or right_on) - and len(same_named_columns) == 0 - ): - raise ValueError("No common columns to perform merge on") - - lsuffix, rsuffix = suffixes - for name in same_named_columns: - if name == left_on == right_on: - continue - elif left_on and right_on: - if (name in left_on and name in right_on) and ( - left_on.index(name) == right_on.index(name) - ): - continue - else: - if not (lsuffix or rsuffix): - raise ValueError( - "there are overlapping columns but " - "lsuffix and rsuffix are not defined" - ) - - if ( - isinstance(lhs, cudf.DataFrame) - and isinstance(rhs, cudf.DataFrame) - # An empty column is considered to have 1 level by pandas (can be - # seen by using lhs.columns.nlevels, but we don't want to use - # columns internally because it's expensive). - # TODO: Investigate whether ColumnAccessor.nlevels should be - # modified in the size 0 case. - and max(lhs._data.nlevels, 1) != max(rhs._data.nlevels, 1) - ): - raise ValueError( - "Not allowed to merge between different levels. " - f"({lhs._data.nlevels} levels on " - f"the left, {rhs._data.nlevels} on the right)" - ) - - -class MergeSemi(Merge): - _joiner: ClassVar[staticmethod] = staticmethod(libcudf.join.semi_join) - - def _merge_results(self, lhs: cudf.DataFrame, rhs: cudf.DataFrame): - # semi-join result includes only lhs columns - return lhs._data, lhs.index diff --git a/python/cudf/cudf/core/missing.py b/python/cudf/cudf/core/missing.py deleted file mode 100644 index 0d48a1d4136..00000000000 --- a/python/cudf/cudf/core/missing.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. - - -# Pandas NAType enforces a single instance exists at a time -# instantiating this class will yield the existing instance -# of pandas._libs.missing.NAType, id(cudf.NA) == id(pd.NA). -from pandas import NA, NaT - -__all__ = ["NA", "NaT"] diff --git a/python/cudf/cudf/core/mixins/__init__.py b/python/cudf/cudf/core/mixins/__init__.py deleted file mode 100644 index 8306f3f11b3..00000000000 --- a/python/cudf/cudf/core/mixins/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -from .binops import BinaryOperand -from .reductions import Reducible -from .scans import Scannable - -__all__ = ["BinaryOperand", "Reducible", "Scannable"] diff --git a/python/cudf/cudf/core/mixins/binops.py b/python/cudf/cudf/core/mixins/binops.py deleted file mode 100644 index eaabc00f266..00000000000 --- a/python/cudf/cudf/core/mixins/binops.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. 
- -from .mixin_factory import _create_delegating_mixin - -BinaryOperand = _create_delegating_mixin( - "BinaryOperand", - "Mixin encapsulating binary operations.", - "BINARY_OPERATION", - "_binaryop", - { - # Numeric operations. - "__add__", - "__sub__", - "__mul__", - "__matmul__", - "__truediv__", - "__floordiv__", - "__mod__", - # "__divmod__", # Not yet implemented - "__pow__", - # "__lshift__", # Not yet implemented - # "__rshift__", # Not yet implemented - "__and__", - "__xor__", - "__or__", - # Reflected numeric operations. - "__radd__", - "__rsub__", - "__rmul__", - "__rmatmul__", - "__rtruediv__", - "__rfloordiv__", - "__rmod__", - # "__rdivmod__", # Not yet implemented - "__rpow__", - # "__rlshift__", # Not yet implemented - # "__rrshift__", # Not yet implemented - "__rand__", - "__rxor__", - "__ror__", - # Rich comparison operations. - "__lt__", - "__le__", - "__eq__", - "__ne__", - "__gt__", - "__ge__", - }, -) - -# TODO: See if there is a better approach to these two issues: 1) The mixin -# assumes a single standard parameter, whereas binops have two, and 2) we need -# a way to determine reflected vs normal ops. - - -def _binaryop(self, other, op: str): - """The core binary_operation function. - - Must be overridden by subclasses, the default implementation raises a - NotImplementedError. - """ - raise NotImplementedError - - -def _check_reflected_op(op): - if reflect := op[2] == "r" and op != "__rshift__": - op = op[:2] + op[3:] - return reflect, op - - -BinaryOperand._binaryop = _binaryop -BinaryOperand._check_reflected_op = staticmethod(_check_reflected_op) diff --git a/python/cudf/cudf/core/mixins/binops.pyi b/python/cudf/cudf/core/mixins/binops.pyi deleted file mode 100644 index 6be73e25332..00000000000 --- a/python/cudf/cudf/core/mixins/binops.pyi +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -from typing import Any, TypeVar - -# Note: It may be possible to define a narrower bound here eventually. -BinaryOperandType = TypeVar("BinaryOperandType", bound="Any") - -class BinaryOperand: - _SUPPORTED_BINARY_OPERATIONS: set - - def _binaryop(self, other: BinaryOperandType, op: str): ... - def __add__(self, other): ... - def __sub__(self, other): ... - def __mul__(self, other): ... - def __truediv__(self, other): ... - def __floordiv__(self, other): ... - def __mod__(self, other): ... - def __pow__(self, other): ... - def __and__(self, other): ... - def __xor__(self, other): ... - def __or__(self, other): ... - def __radd__(self, other): ... - def __rsub__(self, other): ... - def __rmul__(self, other): ... - def __rtruediv__(self, other): ... - def __rfloordiv__(self, other): ... - def __rmod__(self, other): ... - def __rpow__(self, other): ... - def __rand__(self, other): ... - def __rxor__(self, other): ... - def __ror__(self, other): ... - def __lt__(self, other): ... - def __le__(self, other): ... - def __eq__(self, other): ... - def __ne__(self, other): ... - def __gt__(self, other): ... - def __ge__(self, other): ... - @staticmethod - def _check_reflected_op(op) -> tuple[bool, str]: ... diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py deleted file mode 100644 index 7bbb299d643..00000000000 --- a/python/cudf/cudf/core/mixins/mixin_factory.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -import inspect - - -# `functools.partialmethod` does not allow setting attributes such as -# __doc__ on the resulting method. 
So we use a simple alternative to -# it here: -def _partialmethod(method, *args1, **kwargs1): - def wrapper(self, *args2, **kwargs2): - return method(self, *args1, *args2, **kwargs1, **kwargs2) - - return wrapper - - -class Operation: - """Descriptor used to define operations for delegating mixins. - - This class is designed to be assigned to the attributes (the delegating - methods) defined by the OperationMixin. This class will create the method - and mimic all the expected attributes for that method to appear as though - it was originally designed on the class. The use of the descriptor pattern - ensures that the method is only created the first time it is invoked, after - which all further calls use the callable generated on the first invocation. - - Parameters - ---------- - name : str - The name of the operation. - docstring_format_args : str - The attribute of the owning class from which to pull format parameters - for this operation's docstring. - base_operation : str - The underlying operation function to be invoked when operation `name` - is called on the owning class. - """ - - def __init__(self, name, docstring_format_args, base_operation): - self._name = name - self._docstring_format_args = docstring_format_args - self._base_operation = base_operation - - def __get__(self, obj, owner=None): - retfunc = _partialmethod(self._base_operation, op=self._name) - - # Required attributes that will exist. - retfunc.__name__ = self._name - retfunc.__qualname__ = ".".join([owner.__name__, self._name]) - retfunc.__module__ = self._base_operation.__module__ - - if self._base_operation.__doc__ is not None: - retfunc.__doc__ = self._base_operation.__doc__.format( - cls=owner.__name__, - op=self._name, - **self._docstring_format_args, - ) - - retfunc.__annotations__ = self._base_operation.__annotations__.copy() - retfunc.__annotations__.pop("op", None) - retfunc_params = [ - v - for k, v in inspect.signature( - self._base_operation - ).parameters.items() - if k != "op" - ] - retfunc.__signature__ = inspect.Signature(retfunc_params) - - setattr(owner, self._name, retfunc) - - if obj is None: - return getattr(owner, self._name) - else: - return getattr(obj, self._name) - - -def _should_define_operation(cls, operation, base_operation_name): - if operation not in dir(cls): - return True - - # If the class doesn't override the base operation we stick to whatever - # parent implementation exists. - if base_operation_name not in cls.__dict__: - return False - - # At this point we know that the class has the operation defined but it - # also overrides the base operation. Since this function is called before - # the operation is defined on the current class, we know that it inherited - # the operation from a parent. We therefore have three possibilities: - # 1. A parent class manually defined the operation. That override takes - # precedence even if the current class defined the base operation. - # 2. A parent class has an auto-generated operation, i.e. it is of type - # Operation and was created by OperationMixin.__init_subclass__. The - # current class must override it so that its base operation is used - # rather than the parent's base operation. - # 3. The method is defined for all classes, i.e. it is a method of object. - for base_cls in cls.__mro__: - # We always override methods defined for object. - if base_cls is object: - return True - # The first attribute in the MRO is the one that will be used. 
- if operation in base_cls.__dict__: - return isinstance(base_cls.__dict__[operation], Operation) - - # This line should be unreachable since we know the attribute exists - # somewhere in the MRO if the for loop was entered. - assert False, "Operation attribute not found in hierarchy." - - -def _create_delegating_mixin( - mixin_name, - docstring, - category_name, - base_operation_name, - supported_operations, -): - """Factory for mixins defining collections of delegated operations. - - This function generates mixins based on two common paradigms in cuDF: - - 1. libcudf groups many operations into categories using a common API. These - APIs usually accept an enum to delineate the specific operation to - perform, e.g. binary operations use the `binary_operator` enum when - calling the `binary_operation` function. cuDF Python mimics this - structure by having operations within a category delegate to a common - internal function (e.g. DataFrame.__add__ calls DataFrame._binaryop). - 2. Many cuDF classes implement similar operations (e.g. `sum`) via - delegation to lower-level APIs before reaching a libcudf C++ function - call. As a result, many API function calls actually involve multiple - delegations to lower-level APIs that can look essentially identical. An - example of such a sequence would be DataFrame.sum -> DataFrame._reduce - -> Column.sum -> Column._reduce -> libcudf. - - This factory creates mixins for a category of operations implemented by via - this delegator pattern. The resulting mixins make it easy to share common - functions across various classes while also providing a common entrypoint - for implementing the centralized logic for a given category of operations. - Its usage is best demonstrated by example below. - - Parameters - ---------- - mixin_name : str - The name of the class. This argument should be the same as the object - that this function's output is assigned to, e.g. - :code:`Baz = _create_delegating_mixin("Baz", ...)`. - docstring : str - The documentation string for the mixin class. - category_name : str - The category of operations for which a mixin is being created. This - name will be used to define or access the following attributes as shown - in the example below: - - f'_{category_name}_DOCSTRINGS' - - f'_VALID_{category_name}S' # The subset of ops a subclass allows - - f'_SUPPORTED_{category_name}S' # The ops supported by the mixin - base_operation_name : str - The name given to the core function implementing this category of - operations. The corresponding function is the entrypoint for child - classes. - supported_ops : List[str] - The list of valid operations that subclasses of the resulting mixin may - request to be implemented. - - Examples - -------- - >>> # The class below: - >>> class Person: - ... def _greet(self, op): - ... print(op) - ... - ... def hello(self): - ... self._greet("hello") - ... - ... def goodbye(self): - ... self._greet("goodbye") - >>> # can be rewritten using a delegating mixin as follows: - >>> Greeter = _create_delegating_mixin( - ... "Greeter", "", "GREETING", "_greet", {"hello", "goodbye", "hey"} - ... ) - >>> # The `hello` and `goodbye` methods will now be automatically generated - >>> # for the Person class below. - >>> class Person(Greeter): - ... _VALID_GREETINGS = {"hello", "goodbye"} - ... - ... def _greet(self, op: str): - ... '''Say {op}.''' - ... 
print(op) - >>> mom = Person() - >>> mom.hello() - hello - >>> # The Greeter class could also enable the `hey` method, but Person did - >>> # not include it in the _VALID_GREETINGS set so it will not exist. - >>> mom.hey() - Traceback (most recent call last): - ... - AttributeError: 'Person' object has no attribute 'hey' - >>> # The docstrings for each method are generated by formatting the _greet - >>> # docstring with the operation name as well as any additional keys - >>> # provided via the _GREETING_DOCSTRINGS parameter. - >>> print(mom.hello.__doc__) - Say hello. - """ - # The first two attributes may be defined on subclasses of the generated - # OperationMixin to indicate valid attributes and parameters to use when - # formatting docstrings. The supported_attr will be defined on the - # OperationMixin itself to indicate what operations its subclass may - # inherit from it. - validity_attr = f"_VALID_{category_name}S" - docstring_attr = f"_{category_name}_DOCSTRINGS" - supported_attr = f"_SUPPORTED_{category_name}S" - - class OperationMixin: - @classmethod - def __init_subclass__(cls): - # Support composition of various OperationMixins. Note that since - # this __init_subclass__ is defined on mixins, it does not prohibit - # classes that inherit it from implementing this method as well as - # long as those implementations also include this super call. - super().__init_subclass__() - - # Only add the valid set of operations for a particular class. - valid_operations = set() - for base_cls in cls.__mro__: - # Check for sentinel indicating that all operations are valid. - valid_operations |= getattr(base_cls, validity_attr, set()) - - invalid_operations = valid_operations - supported_operations - assert ( - len(invalid_operations) == 0 - ), f"Invalid requested operations: {invalid_operations}" - - base_operation = getattr(cls, base_operation_name) - for operation in valid_operations: - if _should_define_operation( - cls, operation, base_operation_name - ): - docstring_format_args = getattr( - cls, docstring_attr, {} - ).get(operation, {}) - op_attr = Operation( - operation, docstring_format_args, base_operation - ) - setattr(cls, operation, op_attr) - - OperationMixin.__name__ = mixin_name - OperationMixin.__qualname__ = mixin_name - OperationMixin.__doc__ = docstring - - def _operation(self, op: str, *args, **kwargs): - raise NotImplementedError - - _operation.__name__ = base_operation_name - _operation.__qualname__ = ".".join([mixin_name, base_operation_name]) - _operation.__doc__ = ( - f"The core {category_name.lower()} function. Must be overridden by " - "subclasses, the default implementation raises a NotImplementedError." - ) - - setattr(OperationMixin, base_operation_name, _operation) - # Making this attribute available makes it easy for subclasses to indicate - # that all supported operations for this mixin are valid. - setattr(OperationMixin, supported_attr, supported_operations) - - return OperationMixin diff --git a/python/cudf/cudf/core/mixins/reductions.py b/python/cudf/cudf/core/mixins/reductions.py deleted file mode 100644 index f73f0e8fbc6..00000000000 --- a/python/cudf/cudf/core/mixins/reductions.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. 
- -from .mixin_factory import _create_delegating_mixin - -Reducible = _create_delegating_mixin( - "Reducible", - "Mixin encapsulating reduction operations.", - "REDUCTION", - "_reduce", - { - "sum", - "product", - "min", - "max", - "count", - "any", - "all", - "sum_of_squares", - "mean", - "var", - "std", - "median", - "argmax", - "argmin", - "nunique", - "nth", - "collect", - "unique", - "prod", - "idxmin", - "idxmax", - "first", - "last", - }, -) diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi deleted file mode 100644 index 1c2126002ad..00000000000 --- a/python/cudf/cudf/core/mixins/reductions.pyi +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -class Reducible: - _SUPPORTED_REDUCTIONS: set - - def sum(self): ... - def product(self): ... - def min(self): ... - def max(self): ... - def count(self): ... - def any(self): ... - def all(self): ... - def sum_of_squares(self): ... - def mean(self): ... - def var(self): ... - def std(self): ... - def median(self): ... - def argmax(self): ... - def argmin(self): ... - def nunique(self): ... - def nth(self): ... - def collect(self): ... - def prod(self): ... - def idxmin(self): ... - def idxmax(self): ... - def first(self): ... - def last(self): ... diff --git a/python/cudf/cudf/core/mixins/scans.py b/python/cudf/cudf/core/mixins/scans.py deleted file mode 100644 index b0f606e32e6..00000000000 --- a/python/cudf/cudf/core/mixins/scans.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -from .mixin_factory import _create_delegating_mixin - -Scannable = _create_delegating_mixin( - "Scannable", - "Mixin encapsulating scan operations.", - "SCAN", - "_scan", - { - "cumsum", - "cumprod", - "cummin", - "cummax", - }, # noqa: E231 -) diff --git a/python/cudf/cudf/core/mixins/scans.pyi b/python/cudf/cudf/core/mixins/scans.pyi deleted file mode 100644 index 5190750c698..00000000000 --- a/python/cudf/cudf/core/mixins/scans.pyi +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. - -class Scannable: - _SUPPORTED_SCANS: set - - def cumsum(self): ... - def cumprod(self): ... - def cummin(self): ... - def cummax(self): ... diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py deleted file mode 100644 index 92d094d9de5..00000000000 --- a/python/cudf/cudf/core/multiindex.py +++ /dev/null @@ -1,2167 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
- -from __future__ import annotations - -import itertools -import numbers -import operator -import pickle -import warnings -from functools import cached_property -from typing import TYPE_CHECKING, Any, MutableMapping - -import cupy as cp -import numpy as np -import pandas as pd - -import cudf -import cudf._lib as libcudf -from cudf._lib.types import size_type_dtype -from cudf.api.extensions import no_default -from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar -from cudf.core import column -from cudf.core._base_index import _return_get_indexer_result -from cudf.core.algorithms import factorize -from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame -from cudf.core.index import ( - BaseIndex, - _get_indexer_basic, - _lexsorted_equal_range, - ensure_index, -) -from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import is_column_like -from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name - -if TYPE_CHECKING: - from collections.abc import Generator, Hashable - - from typing_extensions import Self - - from cudf._typing import DataFrameOrSeries - - -def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: - """Makes best effort to convert an array of indices into a python slice. - If the conversion is not possible, return input. `indices` are expected - to be valid. - """ - # TODO: improve efficiency by avoiding sync. - if len(indices) == 1: - x = indices[0].item() - return slice(x, x + 1) - if len(indices) == 2: - x1, x2 = indices[0].item(), indices[1].item() - return slice(x1, x2 + 1, x2 - x1) - start, step = indices[0].item(), (indices[1] - indices[0]).item() - stop = start + step * len(indices) - if (indices == cp.arange(start, stop, step)).all(): - return slice(start, stop, step) - return indices - - -def _compute_levels_and_codes( - data: MutableMapping, -) -> tuple[list[cudf.Index], list[column.ColumnBase]]: - """Return MultiIndex level and codes from a ColumnAccessor-like mapping.""" - levels = [] - codes = [] - for col in data.values(): - code, cats = factorize(col) - codes.append(column.as_column(code.astype(np.int64))) - levels.append(cats) - - return levels, codes - - -class MultiIndex(Frame, BaseIndex, NotIterable): - """A multi-level or hierarchical index. - - Provides N-Dimensional indexing into Series and DataFrame objects. - - Parameters - ---------- - levels : sequence of arrays - The unique labels for each level. - codes: sequence of arrays - Integers for each level designating which label at each location. - sortorder : optional int - Not yet supported - names: optional sequence of objects - Names for each of the index levels. - copy : bool, default False - Copy the levels and codes. - verify_integrity : bool, default True - Check that the levels/codes are consistent and valid. - Not yet supported - - Attributes - ---------- - names - nlevels - dtypes - levels - codes - - Methods - ------- - from_arrays - from_tuples - from_product - from_frame - set_levels - set_codes - to_frame - to_flat_index - sortlevel - droplevel - swaplevel - reorder_levels - remove_unused_levels - get_level_values - get_loc - drop - - Returns - ------- - MultiIndex - - Examples - -------- - >>> import cudf - >>> cudf.MultiIndex( - ... 
levels=[[1, 2], ['blue', 'red']], codes=[[0, 0, 1, 1], [1, 0, 1, 0]]) - MultiIndex([(1, 'red'), - (1, 'blue'), - (2, 'red'), - (2, 'blue')], - ) - """ - - @_performance_tracking - def __init__( - self, - levels=None, - codes=None, - sortorder=None, - names=None, - dtype=None, - copy=False, - name=None, - verify_integrity=True, - ): - if sortorder is not None: - raise NotImplementedError("sortorder is not yet supported") - if name is not None: - raise NotImplementedError( - "Use `names`, `name` is not yet supported" - ) - if levels is None or codes is None: - raise TypeError("Must pass both levels and codes") - elif not (is_list_like(levels) and len(levels) > 0): - raise ValueError("Must pass non-zero length sequence of levels") - elif not (is_list_like(codes) and len(codes) > 0): - raise ValueError("Must pass non-zero length sequence of codes") - elif len(codes) != len(levels): - raise ValueError( - f"levels must have the same length ({len(levels)}) " - f"as codes ({len(codes)})." - ) - - new_levels = [] - for level in levels: - new_level = ensure_index(level) - if copy and new_level is level: - new_level = new_level.copy(deep=True) - new_levels.append(new_level) - - new_codes = [] - for code in codes: - if not (is_list_like(code) or is_column_like(code)): - raise TypeError("Each code must be list-like") - new_code = column.as_column(code).astype("int64") - if copy and new_code is code: - new_code = new_code.copy(deep=True) - new_codes.append(new_code) - - source_data = {} - for i, (code, level) in enumerate(zip(new_codes, new_levels)): - if len(code): - lo, hi = libcudf.reduce.minmax(code) - if lo.value < -1 or hi.value > len(level) - 1: - raise ValueError( - f"Codes must be -1 <= codes <= {len(level) - 1}" - ) - if lo.value == -1: - # Now we can gather and insert null automatically - code[code == -1] = np.iinfo(size_type_dtype).min - result_col = libcudf.copying.gather( - [level._column], code, nullify=True - ) - source_data[i] = result_col[0]._with_type_metadata(level.dtype) - - super().__init__(ColumnAccessor(source_data)) - self._levels = new_levels - self._codes = new_codes - self._name = None - self.names = names - - @property # type: ignore - @_performance_tracking - def names(self): - return self._names - - @names.setter # type: ignore - @_performance_tracking - def names(self, value): - if value is None: - value = [None] * self.nlevels - elif not is_list_like(value): - raise ValueError("Names should be list-like for a MultiIndex") - elif len(value) != self.nlevels: - raise ValueError( - "Length of names must match number of levels in MultiIndex." - ) - - if len(value) == len(set(value)): - # IMPORTANT: if the provided names are unique, - # we reconstruct self._data with the names as keys. - # If they are not unique, the keys of self._data - # and self._names will be different, which can lead - # to unexpected behavior in some cases. This is - # definitely buggy, but we can't disallow non-unique - # names either... - self._data = type(self._data)( - dict(zip(value, self._columns)), - level_names=self._data.level_names, - verify=False, - ) - self._names = pd.core.indexes.frozen.FrozenList(value) - - @_performance_tracking - def to_series(self, index=None, name=None): - raise NotImplementedError( - "MultiIndex.to_series isn't implemented yet." 
- ) - - @_performance_tracking - def astype(self, dtype, copy: bool = True) -> Self: - if not is_object_dtype(dtype): - raise TypeError( - "Setting a MultiIndex dtype to anything other than object is " - "not supported" - ) - return self - - @_performance_tracking - def rename(self, names, inplace: bool = False) -> Self | None: - """ - Alter MultiIndex level names - - Parameters - ---------- - names : list of label - Names to set, length must be the same as number of levels - inplace : bool, default False - If True, modifies objects directly, otherwise returns a new - ``MultiIndex`` instance - - Returns - ------- - None or MultiIndex - - Examples - -------- - Renaming each levels of a MultiIndex to specified name: - - >>> midx = cudf.MultiIndex.from_product( - ... [('A', 'B'), (2020, 2021)], names=['c1', 'c2']) - >>> midx.rename(['lv1', 'lv2']) - MultiIndex([('A', 2020), - ('A', 2021), - ('B', 2020), - ('B', 2021)], - names=['lv1', 'lv2']) - >>> midx.rename(['lv1', 'lv2'], inplace=True) - >>> midx - MultiIndex([('A', 2020), - ('A', 2021), - ('B', 2020), - ('B', 2021)], - names=['lv1', 'lv2']) - - ``names`` argument must be a list, and must have same length as - ``MultiIndex.levels``: - - >>> midx.rename(['lv0']) - Traceback (most recent call last): - ValueError: Length of names must match number of levels in MultiIndex. - - """ - return self.set_names(names, level=None, inplace=inplace) - - @_performance_tracking - def set_names( - self, names, level=None, inplace: bool = False - ) -> Self | None: - names_is_list_like = is_list_like(names) - level_is_list_like = is_list_like(level) - - if level is not None and not level_is_list_like and names_is_list_like: - raise TypeError( - "Names must be a string when a single level is provided." - ) - - if not names_is_list_like and level is None and self.nlevels > 1: - raise TypeError("Must pass list-like as `names`.") - - if not names_is_list_like: - names = [names] - if level is not None and not level_is_list_like: - level = [level] - - if level is not None and len(names) != len(level): - raise ValueError("Length of names must match length of level.") - if level is None and len(names) != self.nlevels: - raise ValueError( - "Length of names must match number of levels in MultiIndex." - ) - - if level is None: - level = range(self.nlevels) - else: - level = [self._level_index_from_level(lev) for lev in level] - - existing_names = list(self.names) - for i, lev in enumerate(level): - existing_names[lev] = names[i] - names = existing_names - - return self._set_names(names=names, inplace=inplace) - - @classmethod - @_performance_tracking - def _from_data( - cls, - data: MutableMapping, - name: Any = None, - ) -> Self: - """ - Use when you have a ColumnAccessor-like mapping but no codes and levels. - """ - levels, codes = _compute_levels_and_codes(data) - return cls._simple_new( - data=ColumnAccessor(data), - levels=levels, - codes=codes, - names=pd.core.indexes.frozen.FrozenList(data.keys()), - name=name, - ) - - @classmethod - def _simple_new( - cls, - data: ColumnAccessor, - levels: list[cudf.Index], - codes: list[column.ColumnBase], - names: pd.core.indexes.frozen.FrozenList, - name: Any = None, - ) -> Self: - """ - Use when you have a ColumnAccessor-like mapping, codes, and levels. 
- """ - mi = object.__new__(cls) - mi._data = data - mi._levels = levels - mi._codes = codes - mi._names = names - mi._name = name - return mi - - @property # type: ignore - @_performance_tracking - def name(self): - return self._name - - @name.setter # type: ignore - @_performance_tracking - def name(self, value): - self._name = value - - @_performance_tracking - def copy( - self, - names=None, - deep=False, - name=None, - ) -> Self: - """Returns copy of MultiIndex object. - - Returns a copy of `MultiIndex`. The `levels` and `codes` value can be - set to the provided parameters. When they are provided, the returned - MultiIndex is always newly constructed. - - Parameters - ---------- - names : sequence of objects, optional (default None) - Names for each of the index levels. - deep : Bool (default False) - If True, `._data`, `._levels`, `._codes` will be copied. Ignored if - `levels` or `codes` are specified. - name : object, optional (default None) - Kept for compatibility with 1-dimensional Index. Should not - be used. - - Returns - ------- - Copy of MultiIndex Instance - - Examples - -------- - >>> df = cudf.DataFrame({'Close': [3400.00, 226.58, 3401.80, 228.91]}) - >>> idx1 = cudf.MultiIndex( - ... levels=[['2020-08-27', '2020-08-28'], ['AMZN', 'MSFT']], - ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]], - ... names=['Date', 'Symbol']) - >>> idx2 = idx1.copy( - ... names=['col1', 'col2']) - - >>> df.index = idx1 - >>> df - Close - Date Symbol - 2020-08-27 AMZN 3400.00 - MSFT 226.58 - 2020-08-28 AMZN 3401.80 - MSFT 228.91 - - >>> df.index = idx2 - >>> df - Close - col1 col2 - 2020-08-27 AMZN 3400.00 - MSFT 226.58 - 2020-08-28 AMZN 3401.80 - MSFT 228.91 - """ - if names is not None: - names = pd.core.indexes.frozen.FrozenList(names) - else: - names = self.names - return type(self)._simple_new( - data=self._data.copy(deep=deep), - levels=[idx.copy(deep=deep) for idx in self._levels], - codes=[code.copy(deep=deep) for code in self._codes], - names=names, - name=name, - ) - - @_performance_tracking - def __repr__(self) -> str: - max_seq_items = pd.get_option("display.max_seq_items") or len(self) - - if len(self) > max_seq_items: - n = int(max_seq_items / 2) + 1 - # TODO: Update the following two arange calls to - # a single arange call once arange has support for - # a vector start/end points. - indices = column.as_column(range(n)) - indices = indices.append( - column.as_column(range(len(self) - n, len(self), 1)) - ) - preprocess = self.take(indices) - else: - preprocess = self - - arrays = [] - for name, col in zip(self.names, preprocess._columns): - try: - pd_idx = col.to_pandas(nullable=True) - except NotImplementedError: - pd_idx = col.to_pandas(nullable=False) - pd_idx.name = name - arrays.append(pd_idx) - - preprocess_pd = pd.MultiIndex.from_arrays(arrays) - - output = repr(preprocess_pd) - output_prefix = self.__class__.__name__ + "(" - output = output.lstrip(output_prefix) - lines = output.split("\n") - - if len(lines) > 1: - if "length=" in lines[-1] and len(self) != len(preprocess_pd): - last_line = lines[-1] - length_index = last_line.index("length=") - last_line = last_line[:length_index] + f"length={len(self)})" - lines = lines[:-1] - lines.append(last_line) - - data_output = "\n".join(lines) - return output_prefix + data_output - - @property # type: ignore - @_external_only_api("Use ._codes instead") - @_performance_tracking - def codes(self) -> pd.core.indexes.frozen.FrozenList: - """ - Returns the codes of the underlying MultiIndex. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 11, 12]}) - >>> midx = cudf.MultiIndex.from_frame(df) - >>> midx - MultiIndex([(1, 10), - (2, 11), - (3, 12)], - names=['a', 'b']) - >>> midx.codes - FrozenList([[0, 1, 2], [0, 1, 2]]) - """ - return pd.core.indexes.frozen.FrozenList( - col.values for col in self._codes - ) - - def get_slice_bound(self, label, side): - raise NotImplementedError( - "get_slice_bound is not currently implemented." - ) - - @property # type: ignore - @_performance_tracking - def nlevels(self) -> int: - """Integer number of levels in this MultiIndex.""" - return self._num_columns - - @property # type: ignore - @_performance_tracking - def levels(self) -> list[cudf.Index]: - """ - Returns list of levels in the MultiIndex - - Returns - ------- - List of Index objects - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 11, 12]}) - >>> cudf.MultiIndex.from_frame(df) - MultiIndex([(1, 10), - (2, 11), - (3, 12)], - names=['a', 'b']) - >>> midx = cudf.MultiIndex.from_frame(df) - >>> midx - MultiIndex([(1, 10), - (2, 11), - (3, 12)], - names=['a', 'b']) - >>> midx.levels - [Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')] - """ # noqa: E501 - return [ - idx.rename(name) for idx, name in zip(self._levels, self.names) - ] - - @property # type: ignore - @_performance_tracking - def ndim(self) -> int: - """Dimension of the data. For MultiIndex ndim is always 2.""" - return 2 - - @_performance_tracking - def _get_level_label(self, level): - """Get name of the level. - - Parameters - ---------- - level : int or level name - if level is name, it will be returned as it is - else if level is index of the level, then level - label will be returned as per the index. - """ - if level in self.names: - return level - else: - return self.names[level] - - @_performance_tracking - def isin(self, values, level=None) -> cp.ndarray: - """Return a boolean array where the index values are in values. - - Compute boolean array of whether each index value is found in - the passed set of values. The length of the returned boolean - array matches the length of the index. - - Parameters - ---------- - values : set, list-like, Index or Multi-Index - Sought values. - level : str or int, optional - Name or position of the index level to use (if the index - is a MultiIndex). - - Returns - ------- - is_contained : cupy array - CuPy array of boolean values. - - Notes - ----- - When `level` is None, `values` can only be MultiIndex, or a - set/list-like tuples. - When `level` is provided, `values` can be Index or MultiIndex, - or a set/list-like tuples. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> midx = cudf.from_pandas(pd.MultiIndex.from_arrays([[1,2,3], - ... ['red', 'blue', 'green']], - ... names=('number', 'color'))) - >>> midx - MultiIndex([(1, 'red'), - (2, 'blue'), - (3, 'green')], - names=['number', 'color']) - - Check whether the strings in the 'color' level of the MultiIndex - are in a list of colors. 
- - >>> midx.isin(['red', 'orange', 'yellow'], level='color') - array([ True, False, False]) - - To check across the levels of a MultiIndex, pass a list of tuples: - - >>> midx.isin([(1, 'red'), (3, 'red')]) - array([ True, False, False]) - """ - if level is None: - if isinstance(values, cudf.MultiIndex): - values_idx = values - elif ( - ( - isinstance( - values, - ( - cudf.Series, - cudf.Index, - cudf.DataFrame, - column.ColumnBase, - ), - ) - ) - or (not is_list_like(values)) - or ( - is_list_like(values) - and len(values) > 0 - and not isinstance(values[0], tuple) - ) - ): - raise TypeError( - "values need to be a Multi-Index or set/list-like tuple " - "squences when `level=None`." - ) - else: - values_idx = cudf.MultiIndex.from_tuples( - values, names=self.names - ) - self_df = self.to_frame(index=False).reset_index() - values_df = values_idx.to_frame(index=False) - idx = self_df.merge(values_df, how="leftsemi")._data["index"] - res = column.as_column(False, length=len(self)) - res[idx] = True - result = res.values - else: - level_series = self.get_level_values(level) - result = level_series.isin(values) - - return result - - def where(self, cond, other=None, inplace=False): - raise NotImplementedError( - ".where is not supported for MultiIndex operations" - ) - - @_performance_tracking - def _compute_validity_mask(self, index, row_tuple, max_length): - """Computes the valid set of indices of values in the lookup""" - lookup_dict = {} - for i, row in enumerate(row_tuple): - if isinstance(row, slice) and row == slice(None): - continue - lookup_dict[i] = row - lookup = cudf.DataFrame(lookup_dict) - frame = cudf.DataFrame._from_data( - ColumnAccessor( - dict(enumerate(index._columns)), - verify=False, - ) - ) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - data_table = cudf.concat( - [ - frame, - cudf.DataFrame._from_data( - ColumnAccessor( - {"idx": column.as_column(range(len(frame)))}, - verify=False, - ) - ), - ], - axis=1, - ) - # Sort indices in pandas compatible mode - # because we want the indices to be fetched - # in a deterministic order. - # TODO: Remove this after merge/join - # obtain deterministic ordering. - if cudf.get_option("mode.pandas_compatible"): - lookup_order = "_" + "_".join(map(str, lookup._column_names)) - lookup[lookup_order] = column.as_column(range(len(lookup))) - postprocess = operator.methodcaller( - "sort_values", by=[lookup_order, "idx"] - ) - else: - postprocess = lambda r: r # noqa: E731 - result = postprocess(lookup.merge(data_table))["idx"] - # Avoid computing levels unless the result of the merge is empty, - # which suggests that a KeyError should be raised. 
- if len(result) == 0: - for idx, row in enumerate(row_tuple): - if row == slice(None): - continue - if row not in index.levels[idx]._column: - raise KeyError(row) - return result - - @_performance_tracking - def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): - # Instructions for Slicing - # if tuple, get first and last elements of tuple - # if open beginning tuple, get 0 to highest valid_index - # if open ending tuple, get highest valid_index to len() - # if not open end or beginning, get range lowest beginning index - # to highest ending index - if isinstance(row_tuple, slice): - if ( - isinstance(row_tuple.start, numbers.Number) - or isinstance(row_tuple.stop, numbers.Number) - or row_tuple == slice(None) - ): - stop = row_tuple.stop or max_length - start, stop, step = row_tuple.indices(stop) - return column.as_column(range(start, stop, step)) - start_values = self._compute_validity_mask( - index, row_tuple.start, max_length - ) - stop_values = self._compute_validity_mask( - index, row_tuple.stop, max_length - ) - return column.as_column( - range(start_values.min(), stop_values.max() + 1) - ) - elif isinstance(row_tuple, numbers.Number): - return row_tuple - return self._compute_validity_mask(index, row_tuple, max_length) - - @_performance_tracking - def _index_and_downcast(self, result, index, index_key): - if isinstance(index_key, (numbers.Number, slice)): - index_key = [index_key] - if ( - len(index_key) > 0 and not isinstance(index_key, tuple) - ) or isinstance(index_key[0], slice): - index_key = index_key[0] - - slice_access = isinstance(index_key, slice) - # Count the last n-k columns where n is the number of columns and k is - # the length of the indexing tuple - size = 0 - if not isinstance(index_key, (numbers.Number, slice)): - size = len(index_key) - num_selected = max(0, index.nlevels - size) - - # determine if we should downcast from a DataFrame to a Series - need_downcast = ( - isinstance(result, cudf.DataFrame) - and len(result) == 1 # only downcast if we have a single row - and not slice_access # never downcast if we sliced - and ( - size == 0 # index_key was an integer - # we indexed into a single row directly, using its label: - or len(index_key) == self.nlevels - ) - ) - if need_downcast: - result = result.T - return result[result._column_names[0]] - - if len(result) == 0 and not slice_access: - # Pandas returns an empty Series with a tuple as name - # the one expected result column - result = cudf.Series._from_data( - {}, name=tuple(col[0] for col in index._columns) - ) - elif num_selected == 1: - # If there's only one column remaining in the output index, convert - # it into an Index and name the final index values according - # to that column's name. - *_, last_column = index._data.columns - index = cudf.Index._from_column(last_column, name=index.names[-1]) - elif num_selected > 1: - # Otherwise pop the leftmost levels, names, and codes from the - # source index until it has the correct number of columns (n-k) - result.reset_index(drop=True) - if index.names is not None: - result.names = index.names[size:] - index = MultiIndex( - levels=index.levels[size:], - codes=index._codes[size:], - names=index.names[size:], - ) - - if isinstance(index_key, tuple): - result.index = index - return result - - @_performance_tracking - def _get_row_major( - self, - df: DataFrameOrSeries, - row_tuple: numbers.Number - | slice - | tuple[Any, ...] 
- | list[tuple[Any, ...]], - ) -> DataFrameOrSeries: - if isinstance(row_tuple, slice): - if row_tuple.start is None: - row_tuple = slice(self[0], row_tuple.stop, row_tuple.step) - if row_tuple.stop is None: - row_tuple = slice(row_tuple.start, self[-1], row_tuple.step) - self._validate_indexer(row_tuple) - valid_indices = self._get_valid_indices_by_tuple( - df.index, row_tuple, len(df.index) - ) - if isinstance(valid_indices, column.ColumnBase): - indices = cudf.Series._from_column(valid_indices) - else: - indices = cudf.Series(valid_indices) - result = df.take(indices) - final = self._index_and_downcast(result, result.index, row_tuple) - return final - - @_performance_tracking - def _validate_indexer( - self, - indexer: numbers.Number - | slice - | tuple[Any, ...] - | list[tuple[Any, ...]], - ) -> None: - if isinstance(indexer, numbers.Number): - return - if isinstance(indexer, tuple): - # drop any slice(None) from the end: - indexer = tuple( - itertools.dropwhile( - lambda x: x == slice(None), reversed(indexer) - ) - )[::-1] - - # now check for size - if len(indexer) > self.nlevels: - raise IndexError("Indexer size exceeds number of levels") - elif isinstance(indexer, slice): - self._validate_indexer(indexer.start) - self._validate_indexer(indexer.stop) - else: - for i in indexer: - self._validate_indexer(i) - - @_performance_tracking - def __eq__(self, other): - if isinstance(other, MultiIndex): - return np.array( - [ - self_col.equals(other_col) - for self_col, other_col in zip( - self._columns, other._columns - ) - ] - ) - return NotImplemented - - @property # type: ignore - @_performance_tracking - def size(self) -> int: - # The size of a MultiIndex is only dependent on the number of rows. - return self._num_rows - - @_performance_tracking - def take(self, indices) -> Self: - if isinstance(indices, cudf.Series) and indices.has_nulls: - raise ValueError("Column must have no nulls.") - obj = super().take(indices) - obj.names = self.names - return obj - - @_performance_tracking - def serialize(self): - header, frames = super().serialize() - # Overwrite the names in _data with the true names. - header["column_names"] = pickle.dumps(self.names) - return header, frames - - @classmethod - @_performance_tracking - def deserialize(cls, header, frames): - # Spoof the column names to construct the frame, then set manually. - column_names = pickle.loads(header["column_names"]) - header["column_names"] = pickle.dumps(range(0, len(column_names))) - obj = super().deserialize(header, frames) - return obj._set_names(column_names) - - @_performance_tracking - def __getitem__(self, index): - flatten = isinstance(index, int) - - if isinstance(index, slice): - start, stop, step = index.indices(len(self)) - idx = range(start, stop, step) - elif is_scalar(index): - idx = [index] - else: - idx = index - - indexer = column.as_column(idx) - ca = self._data._from_columns_like_self( - (col.take(indexer) for col in self._columns), verify=False - ) - codes = [code.take(indexer) for code in self._codes] - result = type(self)._simple_new( - data=ca, codes=codes, levels=self._levels, names=self.names - ) - - # we are indexing into a single row of the MultiIndex, - # return that row as a tuple: - if flatten: - return result.to_pandas()[0] - else: - return result - - @_performance_tracking - def to_frame( - self, - index: bool = True, - name=no_default, - allow_duplicates: bool = False, - ) -> cudf.DataFrame: - """ - Create a DataFrame with the levels of the MultiIndex as columns. 
- - Column ordering is determined by the DataFrame constructor with data as - a dict. - - Parameters - ---------- - index : bool, default True - Set the index of the returned DataFrame as the original MultiIndex. - name : list / sequence of str, optional - The passed names should substitute index level names. - allow_duplicates : bool, optional default False - Allow duplicate column labels to be created. Note - that this parameter is non-functional because - duplicates column labels aren't supported in cudf. - - Returns - ------- - DataFrame - - Examples - -------- - >>> import cudf - >>> mi = cudf.MultiIndex.from_tuples([('a', 'c'), ('b', 'd')]) - >>> mi - MultiIndex([('a', 'c'), - ('b', 'd')], - ) - - >>> df = mi.to_frame() - >>> df - 0 1 - a c a c - b d b d - - >>> df = mi.to_frame(index=False) - >>> df - 0 1 - 0 a c - 1 b d - - >>> df = mi.to_frame(name=['x', 'y']) - >>> df - x y - a c a c - b d b d - """ - if name is no_default: - column_names = [ - level if name is None else name - for level, name in enumerate(self.names) - ] - elif not is_list_like(name): - raise TypeError( - "'name' must be a list / sequence of column names." - ) - elif len(name) != len(self.levels): - raise ValueError( - "'name' should have the same length as " - "number of levels on index." - ) - else: - column_names = name - - if len(column_names) != len(set(column_names)): - raise ValueError("Duplicate column names are not allowed") - ca = ColumnAccessor( - dict(zip(column_names, (col.copy() for col in self._columns))), - verify=False, - ) - return cudf.DataFrame._from_data( - data=ca, index=self if index else None - ) - - @_performance_tracking - def _level_to_ca_label(self, level) -> tuple[Hashable, int]: - """ - Convert a level to a ColumAccessor label and an integer position. - - Useful if self._column_names != self.names. - - Parameters - ---------- - level : int or label - - Returns - ------- - tuple[Hashable, int] - (ColumnAccessor label corresponding to level, integer position of the level) - """ - colnames = self._column_names - try: - level_idx = colnames.index(level) - except ValueError: - if isinstance(level, int): - if level < 0: - level = level + len(colnames) - if level < 0 or level >= len(colnames): - raise IndexError(f"Invalid level number: '{level}'") - level_idx = level - level = colnames[level_idx] - elif level in self.names: - level_idx = list(self.names).index(level) - level = colnames[level_idx] - else: - raise KeyError(f"Level not found: '{level}'") - return level, level_idx - - @_performance_tracking - def get_level_values(self, level) -> cudf.Index: - """ - Return the values at the requested level - - Parameters - ---------- - level : int or label - - Returns - ------- - An Index containing the values at the requested level. - """ - level, level_idx = self._level_to_ca_label(level) - level_values = cudf.Index._from_column( - self._data[level], name=self.names[level_idx] - ) - return level_values - - def _is_numeric(self) -> bool: - return False - - def _is_boolean(self) -> bool: - return False - - def _is_integer(self) -> bool: - return False - - def _is_floating(self) -> bool: - return False - - def _is_object(self) -> bool: - return False - - def _is_categorical(self) -> bool: - return False - - def _is_interval(self) -> bool: - return False - - @classmethod - @_performance_tracking - def _concat(cls, objs) -> Self: - source_data = [o.to_frame(index=False) for o in objs] - - # TODO: Verify if this is really necessary or if we can rely on - # DataFrame._concat. 
- if len(source_data) > 1: - colnames = source_data[0]._data.to_pandas_index() - for obj in source_data[1:]: - obj.columns = colnames - - source_df = cudf.DataFrame._concat(source_data) - try: - # Only set names if all objs have the same names - (names,) = {o.names for o in objs} - {None} - except ValueError: - names = [None] * source_df._num_columns - return cudf.MultiIndex.from_frame(source_df, names=names) - - @classmethod - @_performance_tracking - def from_tuples( - cls, tuples, sortorder: int | None = None, names=None - ) -> Self: - """ - Convert list of tuples to MultiIndex. - - Parameters - ---------- - tuples : list / sequence of tuple-likes - Each tuple is the index of one row/column. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level). - names : list / sequence of str, optional - Names for the levels in the index. - - Returns - ------- - MultiIndex - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables. - MultiIndex.from_frame : Make a MultiIndex from a DataFrame. - - Examples - -------- - >>> tuples = [(1, 'red'), (1, 'blue'), - ... (2, 'red'), (2, 'blue')] - >>> cudf.MultiIndex.from_tuples(tuples, names=('number', 'color')) - MultiIndex([(1, 'red'), - (1, 'blue'), - (2, 'red'), - (2, 'blue')], - names=['number', 'color']) - """ - # Use Pandas for handling Python host objects - pdi = pd.MultiIndex.from_tuples( - tuples, sortorder=sortorder, names=names - ) - return cls.from_pandas(pdi) - - @_performance_tracking - def to_numpy(self) -> np.ndarray: - return self.values_host - - def to_flat_index(self): - """ - Convert a MultiIndex to an Index of Tuples containing the level values. - - This is not currently implemented - """ - # TODO: Could implement as Index of ListDtype? - raise NotImplementedError("to_flat_index is not currently supported.") - - @property # type: ignore - @_performance_tracking - def values_host(self) -> np.ndarray: - """ - Return a numpy representation of the MultiIndex. - - Only the values in the MultiIndex will be returned. - - Returns - ------- - out : numpy.ndarray - The values of the MultiIndex. - - Examples - -------- - >>> import cudf - >>> midx = cudf.MultiIndex( - ... levels=[[1, 3, 4, 5], [1, 2, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> midx.values_host - array([(1, 1), (1, 5), (3, 2), (4, 2), (5, 1)], dtype=object) - >>> type(midx.values_host) - - """ - return self.to_pandas().values - - @property # type: ignore - @_performance_tracking - def values(self) -> cp.ndarray: - """ - Return a CuPy representation of the MultiIndex. - - Only the values in the MultiIndex will be returned. - - Returns - ------- - out: cupy.ndarray - The values of the MultiIndex. - - Examples - -------- - >>> import cudf - >>> midx = cudf.MultiIndex( - ... levels=[[1, 3, 4, 5], [1, 2, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> midx.values - array([[1, 1], - [1, 5], - [3, 2], - [4, 2], - [5, 1]]) - >>> type(midx.values) - - """ - if cudf.get_option("mode.pandas_compatible"): - raise NotImplementedError( - "Unable to create a cupy array with tuples." - ) - return self.to_frame(index=False).values - - @classmethod - @_performance_tracking - def from_frame( - cls, - df: pd.DataFrame | cudf.DataFrame, - sortorder: int | None = None, - names=None, - ) -> Self: - """ - Make a MultiIndex from a DataFrame. 
- - Parameters - ---------- - df : DataFrame - DataFrame to be converted to MultiIndex. - sortorder : int, optional - Level of sortedness (must be lexicographically sorted by that - level). - names : list-like, optional - If no names are provided, use the column names, or tuple of column - names if the columns is a MultiIndex. If a sequence, overwrite - names with the given sequence. - - Returns - ------- - MultiIndex - The MultiIndex representation of the given DataFrame. - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame([['HI', 'Temp'], ['HI', 'Precip'], - ... ['NJ', 'Temp'], ['NJ', 'Precip']], - ... columns=['a', 'b']) - >>> df - a b - 0 HI Temp - 1 HI Precip - 2 NJ Temp - 3 NJ Precip - >>> cudf.MultiIndex.from_frame(df) - MultiIndex([('HI', 'Temp'), - ('HI', 'Precip'), - ('NJ', 'Temp'), - ('NJ', 'Precip')], - names=['a', 'b']) - - Using explicit names, instead of the column names - - >>> cudf.MultiIndex.from_frame(df, names=['state', 'observation']) - MultiIndex([('HI', 'Temp'), - ('HI', 'Precip'), - ('NJ', 'Temp'), - ('NJ', 'Precip')], - names=['state', 'observation']) - """ - if isinstance(df, pd.DataFrame): - source_data = cudf.DataFrame.from_pandas(df) - else: - source_data = df - names = names if names is not None else source_data._column_names - return cls.from_arrays( - source_data._columns, sortorder=sortorder, names=names - ) - - @classmethod - @_performance_tracking - def from_product( - cls, iterables, sortorder: int | None = None, names=None - ) -> Self: - """ - Make a MultiIndex from the cartesian product of multiple iterables. - - Parameters - ---------- - iterables : list / sequence of iterables - Each iterable has unique labels for each level of the index. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level). - names : list / sequence of str, optional - Names for the levels in the index. - If not explicitly provided, names will be inferred from the - elements of iterables if an element has a name attribute - - Returns - ------- - MultiIndex - - See Also - -------- - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. - MultiIndex.from_frame : Make a MultiIndex from a DataFrame. - - Examples - -------- - >>> numbers = [0, 1, 2] - >>> colors = ['green', 'purple'] - >>> cudf.MultiIndex.from_product([numbers, colors], - ... names=['number', 'color']) - MultiIndex([(0, 'green'), - (0, 'purple'), - (1, 'green'), - (1, 'purple'), - (2, 'green'), - (2, 'purple')], - names=['number', 'color']) - """ - # Use Pandas for handling Python host objects - pdi = pd.MultiIndex.from_product( - iterables, sortorder=sortorder, names=names - ) - return cls.from_pandas(pdi) - - @classmethod - @_performance_tracking - def from_arrays( - cls, - arrays, - sortorder=None, - names=None, - ) -> Self: - """ - Convert arrays to MultiIndex. - - Parameters - ---------- - arrays : list / sequence of array-likes - Each array-like gives one level's value for each data point. - len(arrays) is the number of levels. - sortorder : optional int - Not yet supported - names : list / sequence of str, optional - Names for the levels in the index. - - Returns - ------- - MultiIndex - - See Also - -------- - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. 
- MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables. - MultiIndex.from_frame : Make a MultiIndex from a DataFrame. - - Examples - -------- - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> cudf.MultiIndex.from_arrays(arrays, names=('number', 'color')) - MultiIndex([(1, 'red'), - (1, 'blue'), - (2, 'red'), - (2, 'blue')], - names=['number', 'color']) - """ - error_msg = "Input must be a list / sequence of array-likes." - if not is_list_like(arrays): - raise TypeError(error_msg) - codes = [] - levels = [] - names_from_arrays = [] - for array in arrays: - if not (is_list_like(array) or is_column_like(array)): - raise TypeError(error_msg) - code, level = factorize(array, sort=True) - codes.append(code) - levels.append(level) - names_from_arrays.append(getattr(array, "name", None)) - if names is None: - names = names_from_arrays - return cls( - codes=codes, levels=levels, sortorder=sortorder, names=names - ) - - @_performance_tracking - def swaplevel(self, i=-2, j=-1) -> Self: - """ - Swap level i with level j. - Calling this method does not change the ordering of the values. - - Parameters - ---------- - i : int or str, default -2 - First level of index to be swapped. - j : int or str, default -1 - Second level of index to be swapped. - - Returns - ------- - MultiIndex - A new MultiIndex. - - Examples - -------- - >>> import cudf - >>> mi = cudf.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']], - ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]]) - >>> mi - MultiIndex([('a', 'bb'), - ('a', 'aa'), - ('b', 'bb'), - ('b', 'aa')], - ) - >>> mi.swaplevel(0, 1) - MultiIndex([('bb', 'a'), - ('aa', 'a'), - ('bb', 'b'), - ('aa', 'b')], - ) - """ - name_i = self._column_names[i] if isinstance(i, int) else i - name_j = self._column_names[j] if isinstance(j, int) else j - new_data = {} - for k, v in self._column_labels_and_values: - if k not in (name_i, name_j): - new_data[k] = v - elif k == name_i: - new_data[name_j] = self._data[name_j] - elif k == name_j: - new_data[name_i] = self._data[name_i] - midx = MultiIndex._from_data(new_data) - if all(n is None for n in self.names): - midx = midx.set_names(self.names) - return midx - - @_performance_tracking - def droplevel(self, level=-1) -> Self | cudf.Index: - """ - Removes the specified levels from the MultiIndex. - - Parameters - ---------- - level : level name or index, list-like - Integer, name or list of such, specifying one or more - levels to drop from the MultiIndex - - Returns - ------- - A MultiIndex or Index object, depending on the number of remaining - levels. - - Examples - -------- - >>> import cudf - >>> idx = cudf.MultiIndex.from_frame( - ... cudf.DataFrame( - ... { - ... "first": ["a", "a", "a", "b", "b", "b"], - ... "second": [1, 1, 2, 2, 3, 3], - ... "third": [0, 1, 2, 0, 1, 2], - ... } - ... ) - ... 
) - - Dropping level by index: - - >>> idx.droplevel(0) - MultiIndex([(1, 0), - (1, 1), - (2, 2), - (2, 0), - (3, 1), - (3, 2)], - names=['second', 'third']) - - Dropping level by name: - - >>> idx.droplevel("first") - MultiIndex([(1, 0), - (1, 1), - (2, 2), - (2, 0), - (3, 1), - (3, 2)], - names=['second', 'third']) - - Dropping multiple levels: - - >>> idx.droplevel(["first", "second"]) - Index([0, 1, 2, 0, 1, 2], dtype='int64', name='third') - """ - if is_scalar(level): - level = (level,) - elif len(level) == 0: - return self - - new_names = list(self.names) - new_data = self._data.copy(deep=False) - for i in sorted( - (self._level_index_from_level(lev) for lev in level), reverse=True - ): - new_names.pop(i) - new_data.pop(self._data.names[i]) - - if len(new_data) == 1: - return cudf.core.index._index_from_data(new_data) - else: - mi = MultiIndex._from_data(new_data) - mi.names = new_names - return mi - - @_performance_tracking - def to_pandas( - self, *, nullable: bool = False, arrow_type: bool = False - ) -> pd.MultiIndex: - # cudf uses np.iinfo(size_type_dtype).min as missing code - # pandas uses -1 as missing code - pd_codes = ( - code.find_and_replace( - column.as_column(np.iinfo(size_type_dtype).min, length=1), - column.as_column(-1, length=1), - ) - for code in self._codes - ) - return pd.MultiIndex( - levels=[ - level.to_pandas(nullable=nullable, arrow_type=arrow_type) - for level in self.levels - ], - codes=[col.values_host for col in pd_codes], - names=self.names, - ) - - @classmethod - @_performance_tracking - def from_pandas( - cls, multiindex: pd.MultiIndex, nan_as_null=no_default - ) -> Self: - """ - Convert from a Pandas MultiIndex - - Raises - ------ - TypeError for invalid input type. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> pmi = pd.MultiIndex(levels=[['a', 'b'], ['c', 'd']], - ... codes=[[0, 1], [1, 1]]) - >>> cudf.from_pandas(pmi) - MultiIndex([('a', 'd'), - ('b', 'd')], - ) - """ - if not isinstance(multiindex, pd.MultiIndex): - raise TypeError("not a pandas.MultiIndex") - if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) - levels = [ - cudf.Index.from_pandas(level, nan_as_null=nan_as_null) - for level in multiindex.levels - ] - return cls( - levels=levels, codes=multiindex.codes, names=multiindex.names - ) - - @cached_property # type: ignore - @_performance_tracking - def is_unique(self) -> bool: - return len(self) == len(self.unique()) - - @property - def dtype(self) -> np.dtype: - return np.dtype("O") - - @_performance_tracking - def _is_sorted(self, ascending=None, null_position=None) -> bool: - """ - Returns a boolean indicating whether the data of the MultiIndex are sorted - based on the parameters given. Does not account for the index. - - Parameters - ---------- - self : MultiIndex - MultiIndex whose columns are to be checked for sort order - ascending : None or list-like of booleans - None or list-like of boolean values indicating expected sort order - of each column. If list-like, size of list-like must be - len(columns). If None, all columns expected sort order is set to - ascending. False (0) - ascending, True (1) - descending. - null_position : None or list-like of booleans - None or list-like of boolean values indicating desired order of - nulls compared to other elements. If list-like, size of list-like - must be len(columns). If None, null order is set to before. False - (0) - before, True (1) - after. 
- - Returns - ------- - returns : boolean - Returns True, if sorted as expected by ``ascending`` and - ``null_position``, False otherwise. - """ - if ascending is not None and not cudf.api.types.is_list_like( - ascending - ): - raise TypeError( - f"Expected a list-like or None for `ascending`, got " - f"{type(ascending)}" - ) - if null_position is not None and not cudf.api.types.is_list_like( - null_position - ): - raise TypeError( - f"Expected a list-like or None for `null_position`, got " - f"{type(null_position)}" - ) - return libcudf.sort.is_sorted( - [*self._columns], ascending=ascending, null_position=null_position - ) - - @cached_property # type: ignore - @_performance_tracking - def is_monotonic_increasing(self) -> bool: - """ - Return if the index is monotonic increasing - (only equal or increasing) values. - """ - return self._is_sorted(ascending=None, null_position=None) - - @cached_property # type: ignore - @_performance_tracking - def is_monotonic_decreasing(self) -> bool: - """ - Return if the index is monotonic decreasing - (only equal or decreasing) values. - """ - return self._is_sorted( - ascending=[False] * len(self.levels), null_position=None - ) - - @_performance_tracking - def fillna(self, value) -> Self: - """ - Fill null values with the specified value. - - Parameters - ---------- - value : scalar - Scalar value to use to fill nulls. This value cannot be a - list-likes. - - Returns - ------- - filled : MultiIndex - - Examples - -------- - >>> import cudf - >>> index = cudf.MultiIndex( - ... levels=[["a", "b", "c", None], ["1", None, "5"]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], - ... ) - >>> index - MultiIndex([( 'a', '1'), - ( 'a', '5'), - ( 'b', ), - ( 'c', ), - (, '1')], - names=['x', 'y']) - >>> index.fillna('hello') - MultiIndex([( 'a', '1'), - ( 'a', '5'), - ( 'b', 'hello'), - ( 'c', 'hello'), - ('hello', '1')], - names=['x', 'y']) - """ - - return super().fillna(value=value) - - @_performance_tracking - def unique(self, level: int | None = None) -> Self | cudf.Index: - if level is None: - return self.drop_duplicates(keep="first") - else: - return self.get_level_values(level).unique() - - @_performance_tracking - def nunique(self, dropna: bool = True) -> int: - mi = self.dropna(how="all") if dropna else self - return len(mi.unique()) - - def _clean_nulls_from_index(self) -> Self: - """ - Convert all na values(if any) in MultiIndex object - to `` as a preprocessing step to `__repr__` methods. - """ - index_df = self.to_frame(index=False, name=list(range(self.nlevels))) - return MultiIndex.from_frame( - index_df._clean_nulls_from_dataframe(index_df), names=self.names - ) - - @_performance_tracking - def memory_usage(self, deep: bool = False) -> int: - usage = sum(col.memory_usage for col in self._columns) - usage += sum(level.memory_usage(deep=deep) for level in self._levels) - usage += sum(code.memory_usage for code in self._codes) - return usage - - @_performance_tracking - def difference(self, other, sort=None) -> Self: - if hasattr(other, "to_pandas"): - other = other.to_pandas() - return cudf.from_pandas(self.to_pandas().difference(other, sort)) - - @_performance_tracking - def append(self, other) -> Self: - """ - Append a collection of MultiIndex objects together - - Parameters - ---------- - other : MultiIndex or list/tuple of MultiIndex objects - - Returns - ------- - appended : Index - - Examples - -------- - >>> import cudf - >>> idx1 = cudf.MultiIndex( - ... levels=[[1, 2], ['blue', 'red']], - ... 
codes=[[0, 0, 1, 1], [1, 0, 1, 0]] - ... ) - >>> idx2 = cudf.MultiIndex( - ... levels=[[3, 4], ['blue', 'red']], - ... codes=[[0, 0, 1, 1], [1, 0, 1, 0]] - ... ) - >>> idx1 - MultiIndex([(1, 'red'), - (1, 'blue'), - (2, 'red'), - (2, 'blue')], - ) - >>> idx2 - MultiIndex([(3, 'red'), - (3, 'blue'), - (4, 'red'), - (4, 'blue')], - ) - >>> idx1.append(idx2) - MultiIndex([(1, 'red'), - (1, 'blue'), - (2, 'red'), - (2, 'blue'), - (3, 'red'), - (3, 'blue'), - (4, 'red'), - (4, 'blue')], - ) - """ - if isinstance(other, (list, tuple)): - to_concat = [self] - to_concat.extend(other) - else: - to_concat = [self, other] - - for obj in to_concat: - if not isinstance(obj, MultiIndex): - raise TypeError( - f"all objects should be of type " - f"MultiIndex for MultiIndex.append, " - f"found object of type: {type(obj)}" - ) - - return MultiIndex._concat(to_concat) - - @_performance_tracking - def __array_function__(self, func, types, args, kwargs): - cudf_df_module = MultiIndex - - for submodule in func.__module__.split(".")[1:]: - # point cudf to the correct submodule - if hasattr(cudf_df_module, submodule): - cudf_df_module = getattr(cudf_df_module, submodule) - else: - return NotImplemented - - fname = func.__name__ - - handled_types = [cudf_df_module, np.ndarray] - - for t in types: - if t not in handled_types: - return NotImplemented - - if hasattr(cudf_df_module, fname): - cudf_func = getattr(cudf_df_module, fname) - # Handle case if cudf_func is same as numpy function - if cudf_func is func: - return NotImplemented - else: - return cudf_func(*args, **kwargs) - else: - return NotImplemented - - def _level_index_from_level(self, level) -> int: - """ - Return level index from given level name or index - """ - try: - return self.names.index(level) - except ValueError: - if not is_integer(level): - raise KeyError(f"Level {level} not found") - if level < 0: - level += self.nlevels - if level >= self.nlevels: - raise IndexError( - f"Level {level} out of bounds. " - f"Index has {self.nlevels} levels." - ) from None - return level - - @_performance_tracking - def get_indexer(self, target, method=None, limit=None, tolerance=None): - if tolerance is not None: - raise NotImplementedError( - "Parameter tolerance is not supported yet." - ) - if method == "nearest": - raise NotImplementedError( - f"{method=} is not supported yet for MultiIndex." 
- ) - if method in {"ffill", "bfill", "pad", "backfill"} and not ( - self.is_monotonic_increasing or self.is_monotonic_decreasing - ): - raise ValueError( - "index must be monotonic increasing or decreasing" - ) - - result = column.as_column( - -1, - length=len(target), - dtype=libcudf.types.size_type_dtype, - ) - if not len(self): - return _return_get_indexer_result(result.values) - try: - target = cudf.MultiIndex.from_tuples(target) - except TypeError: - return _return_get_indexer_result(result.values) - - join_keys = [ - _match_join_keys(lcol, rcol, "inner") - for lcol, rcol in zip(target._columns, self._columns) - ] - join_keys = map(list, zip(*join_keys)) - scatter_map, indices = libcudf.join.join( - *join_keys, - how="inner", - ) - result = libcudf.copying.scatter([indices], scatter_map, [result])[0] - result_series = cudf.Series._from_column(result) - - if method in {"ffill", "bfill", "pad", "backfill"}: - result_series = _get_indexer_basic( - index=self, - positions=result_series, - method=method, - target_col=target.to_frame(index=False)[ - list(range(0, self.nlevels)) - ], - tolerance=tolerance, - ) - elif method is not None: - raise ValueError( - f"{method=} is unsupported, only supported values are: " - "{['ffill'/'pad', 'bfill'/'backfill', None]}" - ) - - return _return_get_indexer_result(result_series.to_cupy()) - - @_performance_tracking - def get_loc(self, key): - is_sorted = ( - self.is_monotonic_increasing or self.is_monotonic_decreasing - ) - is_unique = self.is_unique - key = (key,) if not isinstance(key, tuple) else key - - # Handle partial key search. If length of `key` is less than `nlevels`, - # Only search levels up to `len(key)` level. - partial_index = self.__class__._from_data( - data=self._data.select_by_index(slice(len(key))) - ) - ( - lower_bound, - upper_bound, - sort_inds, - ) = _lexsorted_equal_range( - partial_index, - [column.as_column(k, length=1) for k in key], - is_sorted, - ) - - if lower_bound == upper_bound: - raise KeyError(key) - - if is_unique and lower_bound + 1 == upper_bound: - # Indices are unique (Pandas constraint), search result is unique, - # return int. - return ( - lower_bound - if is_sorted - else sort_inds.element_indexing(lower_bound) - ) - - if is_sorted: - # In monotonic index, lex search result is continuous. A slice for - # the range is returned. - return slice(lower_bound, upper_bound) - - true_inds = sort_inds.slice(lower_bound, upper_bound).values - true_inds = _maybe_indices_to_slice(true_inds) - if isinstance(true_inds, slice): - return true_inds - - # Not sorted and not unique. Return a boolean mask - mask = cp.full(len(self), False) - mask[true_inds] = True - return mask - - def _get_reconciled_name_object(self, other) -> Self: - """ - If the result of a set operation will be self, - return self, unless the names change, in which - case make a shallow copy of self. - """ - names = self._maybe_match_names(other) - if self.names != names: - return self.rename(names) - return self - - def _maybe_match_names(self, other): - """ - Try to find common names to attach to the result of an operation - between a and b. Return a consensus list of names if they match - at least partly or list of None if they have completely - different names. 
- """ - if len(self.names) != len(other.names): - return [None] * len(self.names) - return [ - self_name if _is_same_name(self_name, other_name) else None - for self_name, other_name in zip(self.names, other.names) - ] - - @_performance_tracking - def union(self, other, sort=None) -> Self: - if not isinstance(other, MultiIndex): - msg = "other must be a MultiIndex or a list of tuples" - try: - other = MultiIndex.from_tuples(other, names=self.names) - except (ValueError, TypeError) as err: - # ValueError raised by tuples_to_object_array if we - # have non-object dtype - raise TypeError(msg) from err - - if sort not in {None, False}: - raise ValueError( - f"The 'sort' keyword only takes the values of " - f"None or False; {sort} was passed." - ) - - if not len(other) or self.equals(other): - return self._get_reconciled_name_object(other) - elif not len(self): - return other._get_reconciled_name_object(self) - - return self._union(other, sort=sort) - - @_performance_tracking - def _union(self, other, sort=None) -> Self: - # TODO: When to_frame is refactored to return a - # deep copy in future, we should push most of the common - # logic between MultiIndex._union & BaseIndex._union into - # Index._union. - other_df = other.copy(deep=True).to_frame(index=False) - self_df = self.copy(deep=True).to_frame(index=False) - col_names = list(range(0, self.nlevels)) - self_df.columns = col_names - other_df.columns = col_names - self_df["order"] = self_df.index - other_df["order"] = other_df.index - - result_df = self_df.merge(other_df, on=col_names, how="outer") - result_df = result_df.sort_values( - by=result_df._data.to_pandas_index()[self.nlevels :], - ignore_index=True, - ) - - midx = type(self)._from_data(result_df.iloc[:, : self.nlevels]._data) - midx.names = self.names if self.names == other.names else None - if sort in {None, True} and len(other): - return midx.sort_values() - return midx - - @_performance_tracking - def _intersection(self, other, sort=None) -> Self: - if self.names != other.names: - deep = True - col_names = list(range(0, self.nlevels)) - res_name = (None,) * self.nlevels - else: - deep = False - col_names = None - res_name = self.names - - other_df = other.copy(deep=deep).to_frame(index=False) - self_df = self.copy(deep=deep).to_frame(index=False) - if col_names is not None: - other_df.columns = col_names - self_df.columns = col_names - - result_df = cudf.merge(self_df, other_df, how="inner") - midx = type(self)._from_data(result_df._data) - midx.names = res_name - if sort in {None, True} and len(other): - return midx.sort_values() - return midx - - @_performance_tracking - def _copy_type_metadata(self: Self, other: Self) -> Self: - res = super()._copy_type_metadata(other) - if isinstance(other, MultiIndex): - res._names = other._names - self._levels, self._codes = _compute_levels_and_codes(res._data) - return res - - @_performance_tracking - def _split_columns_by_levels( - self, levels: tuple, *, in_levels: bool - ) -> Generator[tuple[Any, column.ColumnBase], None, None]: - # This function assumes that for levels with duplicate names, they are - # specified by indices, not name by ``levels``. E.g. [None, None] can - # only be specified by 0, 1, not "None". 
- level_names = list(self.names) - level_indices = { - lv if isinstance(lv, int) else level_names.index(lv) - for lv in levels - } - for i, (name, col) in enumerate(zip(self.names, self._columns)): - if in_levels and i in level_indices: - name = f"level_{i}" if name is None else name - yield name, col - elif not in_levels and i not in level_indices: - yield name, col - - @_performance_tracking - def _new_index_for_reset_index( - self, levels: tuple | None, name - ) -> None | BaseIndex: - """Return the new index after .reset_index""" - if levels is None: - return None - - index_columns, index_names = [], [] - for name, col in self._split_columns_by_levels( - levels, in_levels=False - ): - index_columns.append(col) - index_names.append(name) - - if not index_columns: - # None is caught later to return RangeIndex - return None - - index = cudf.core.index._index_from_data( - dict(enumerate(index_columns)), - name=name, - ) - if isinstance(index, type(self)): - index.names = index_names - else: - index.name = index_names[0] - return index - - def _columns_for_reset_index( - self, levels: tuple | None - ) -> Generator[tuple[Any, column.ColumnBase], None, None]: - """Return the columns and column names for .reset_index""" - if levels is None: - for i, (col, name) in enumerate(zip(self._columns, self.names)): - yield f"level_{i}" if name is None else name, col - else: - yield from self._split_columns_by_levels(levels, in_levels=True) - - def repeat(self, repeats, axis=None) -> Self: - return self._from_data( - self._data._from_columns_like_self( - super()._repeat([*self._columns], repeats, axis) - ) - ) diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py deleted file mode 100644 index e0aee28bfeb..00000000000 --- a/python/cudf/cudf/core/resample.py +++ /dev/null @@ -1,443 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
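The `get_loc` implementation above distinguishes three result shapes: an integer position when the MultiIndex is unique, a slice when it is monotonic, and a boolean mask otherwise. A minimal illustrative sketch of those three cases (hand-built indexes; the exact return representations are assumptions inferred from the code above and pandas semantics):

import cudf

# Unique and monotonic: a single integer position is returned.
mi = cudf.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
mi.get_loc(("a", 2))        # e.g. 1

# Monotonic but not unique: a contiguous slice over the matching run.
mi = cudf.MultiIndex.from_tuples([("a", 1), ("a", 1), ("b", 2)])
mi.get_loc(("a", 1))        # e.g. slice(0, 2, None)

# Neither unique nor monotonic: a boolean mask marking every match.
mi = cudf.MultiIndex.from_tuples([("a", 1), ("a", 1), ("b", 2), ("a", 1)])
mi.get_loc(("a", 1))        # e.g. array([ True,  True, False,  True])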
-from __future__ import annotations - -import pickle -import warnings -from typing import TYPE_CHECKING - -import numpy as np -import pandas as pd - -import cudf -import cudf._lib.labeling -import cudf.core.index -from cudf.core.groupby.groupby import ( - DataFrameGroupBy, - GroupBy, - SeriesGroupBy, - _Grouping, -) - -if TYPE_CHECKING: - from cudf._typing import DataFrameOrSeries - - -class _Resampler(GroupBy): - grouping: "_ResampleGrouping" - - def __init__(self, obj, by, axis=None, kind=None): - by = _ResampleGrouping(obj, by) - super().__init__(obj, by=by) - - def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): - result = super().agg( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) - if len(self.grouping.bin_labels) != len(result): - index = cudf.core.index.Index( - self.grouping.bin_labels, name=self.grouping.names[0] - ) - return result._align_to_index( - index, how="right", sort=False, allow_non_unique=True - ) - else: - return result.sort_index() - - def asfreq(self): - return self.obj._align_to_index( - self.grouping.bin_labels, - how="right", - sort=False, - allow_non_unique=True, - ) - - def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: - # TODO: can this be more efficient? - - # first, compute the outer join between `self.obj` and the `bin_labels` - # to get the sampling "gaps": - upsampled = self.obj._align_to_index( - self.grouping.bin_labels, - how="outer", - sort=True, - allow_non_unique=True, - ) - - # fill the gaps: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - filled = upsampled.fillna(method=method) - - # filter the result to only include the values corresponding - # to the bin labels: - return filled._align_to_index( - self.grouping.bin_labels, - how="right", - sort=False, - allow_non_unique=True, - ) - - def serialize(self): - header, frames = super().serialize() - grouping_head, grouping_frames = self.grouping.serialize() - header["grouping"] = grouping_head - header["resampler_type"] = pickle.dumps(type(self)) - header["grouping_frames_count"] = len(grouping_frames) - frames.extend(grouping_frames) - return header, frames - - @classmethod - def deserialize(cls, header, frames): - obj_type = pickle.loads(header["obj_type"]) - obj = obj_type.deserialize( - header["obj"], frames[: header["num_obj_frames"]] - ) - grouping = _ResampleGrouping.deserialize( - header["grouping"], frames[header["num_obj_frames"] :] - ) - resampler_cls = pickle.loads(header["resampler_type"]) - out = resampler_cls.__new__(resampler_cls) - out.grouping = grouping - super().__init__(out, obj, by=grouping) - return out - - -class DataFrameResampler(_Resampler, DataFrameGroupBy): - pass - - -class SeriesResampler(_Resampler, SeriesGroupBy): - pass - - -class _ResampleGrouping(_Grouping): - bin_labels: cudf.core.index.Index - - def __init__(self, obj, by=None, level=None): - self._freq = getattr(by, "freq", None) - super().__init__(obj, by, level) - - def copy(self, deep=True): - out = super().copy(deep=deep) - result = _ResampleGrouping.__new__(_ResampleGrouping) - result.names = out.names - result._named_columns = out._named_columns - result._key_columns = out._key_columns - result.bin_labels = self.bin_labels.copy(deep=deep) - result._freq = self._freq - return result - - @property - def keys(self): - index = super().keys - if self._freq is not None and isinstance(index, cudf.DatetimeIndex): - return cudf.DatetimeIndex._from_column( - index._column, name=index.name, freq=self._freq - ) - return index - - def 
serialize(self): - header, frames = super().serialize() - labels_head, labels_frames = self.bin_labels.serialize() - header["__bin_labels"] = labels_head - header["__bin_labels_count"] = len(labels_frames) - header["_freq"] = self._freq - frames.extend(labels_frames) - return header, frames - - @classmethod - def deserialize(cls, header, frames): - names = pickle.loads(header["names"]) - _named_columns = pickle.loads(header["_named_columns"]) - key_columns = cudf.core.column.deserialize_columns( - header["columns"], frames[: -header["__bin_labels_count"]] - ) - out = _ResampleGrouping.__new__(_ResampleGrouping) - out.names = names - out._named_columns = _named_columns - out._key_columns = key_columns - out.bin_labels = cudf.core.index.Index.deserialize( - header["__bin_labels"], frames[-header["__bin_labels_count"] :] - ) - out._freq = header["_freq"] - return out - - def _handle_frequency_grouper(self, by): - # if `by` is a time frequency grouper, we bin the key column - # using bin intervals specified by `by.freq`, then use *that* - # as the groupby key - - freq = by.freq - label = by.label - closed = by.closed - - if isinstance(freq, (cudf.DateOffset, pd.DateOffset)): - raise NotImplementedError( - "Resampling by DateOffset objects is not yet supported." - ) - if not isinstance(freq, str): - raise TypeError( - f"Unsupported type for freq: {type(freq).__name__}" - ) - # convert freq to a pd.DateOffset: - offset = pd.tseries.frequencies.to_offset(freq) - - if offset.freqstr == "M" or offset.freqstr.startswith("W-"): - label = "right" if label is None else label - closed = "right" if closed is None else closed - else: - label = "left" if label is None else label - closed = "left" if closed is None else closed - - # determine the key column - if by.key is None and by.level is None: - # then assume that the key is the index of `self._obj`: - self._handle_index(self._obj.index) - elif by.key: - self._handle_label(by.key) - elif by.level: - self._handle_level(by.level) - - if not len(self._key_columns) == 1: - raise ValueError("Must resample on exactly one column") - - key_column = self._key_columns[0] - - if not isinstance(key_column, cudf.core.column.DatetimeColumn): - raise TypeError( - f"Can only resample on a DatetimeIndex or datetime column, " - f"got column of type {key_column.dtype}" - ) - - # get the start and end values that will be used to generate - # the bin labels - min_date = key_column._reduce("min") - max_date = key_column._reduce("max") - start, end = _get_timestamp_range_edges( - pd.Timestamp(min_date), - pd.Timestamp(max_date), - offset, - closed=closed, - ) - - # in some cases, an extra time stamp is required in order to - # bin all the values. It's OK if we generate more labels than - # we need, as we remove any unused labels below - end += offset - - # generate the labels for binning the key column: - bin_labels = cudf.date_range( - start=start, - end=end, - freq=freq, - ) - - # We want the (resampled) column of timestamps in the result - # to have a resolution closest to the resampling - # frequency. For example, if resampling from '1T' to '1s', we - # want the resulting timestamp column to by of dtype - # 'datetime64[s]'. libcudf requires the bin labels and key - # column to have the same dtype, so we compute a `result_type` - # and cast them both to that type. - try: - result_type = np.dtype(f"datetime64[{offset.rule_code}]") - # TODO: Ideally, we can avoid one cast by having `date_range` - # generate timestamps of a given dtype. 
Currently, it can - # only generate timestamps with 'ns' precision - cast_key_column = key_column.astype(result_type) - cast_bin_labels = bin_labels.astype(result_type) - except TypeError: - # unsupported resolution (we don't support resolutions >s) - # fall back to using datetime64[s] - result_type = np.dtype("datetime64[s]") - cast_key_column = key_column.astype(result_type) - cast_bin_labels = bin_labels.astype(result_type) - - # bin the key column: - bin_numbers = cudf._lib.labeling.label_bins( - cast_key_column, - left_edges=cast_bin_labels[:-1]._column, - left_inclusive=(closed == "left"), - right_edges=cast_bin_labels[1:]._column, - right_inclusive=(closed == "right"), - ) - - if label == "right": - cast_bin_labels = cast_bin_labels[1:] - else: - cast_bin_labels = cast_bin_labels[:-1] - - # if we have more labels than bins, remove the extras labels: - nbins = bin_numbers.max() + 1 - if len(cast_bin_labels) > nbins: - cast_bin_labels = cast_bin_labels[:nbins] - - cast_bin_labels.name = self.names[0] - self.bin_labels = cast_bin_labels - - # replace self._key_columns with the binned key column: - self._key_columns = [ - cast_bin_labels._gather( - bin_numbers, check_bounds=False - )._column.astype(result_type) - ] - - -# NOTE: this function is vendored from Pandas -def _get_timestamp_range_edges( - first, last, freq, closed="left", origin="start_day", offset=None -): - """ - Adjust the `first` Timestamp to the preceding Timestamp that resides on - the provided offset. Adjust the `last` Timestamp to the following - Timestamp that resides on the provided offset. Input Timestamps that - already reside on the offset will be adjusted depending on the type of - offset and the `closed` parameter. - - Parameters - ---------- - first : pd.Timestamp - The beginning Timestamp of the range to be adjusted. - last : pd.Timestamp - The ending Timestamp of the range to be adjusted. - freq : pd.DateOffset - The dateoffset to which the Timestamps will be adjusted. - closed : {'right', 'left'}, default None - Which side of bin interval is closed. - origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day' - The timestamp on which to adjust the grouping. The timezone of origin - must match the timezone of the index. If a timestamp is not used, - these values are also supported: - - - 'epoch': `origin` is 1970-01-01 - - 'start': `origin` is the first value of the timeseries - - 'start_day': `origin` is the first day at midnight of the timeseries - offset : pd.Timedelta, default is None - An offset timedelta added to the origin. - - Returns - ------- - A tuple of length 2, containing the adjusted pd.Timestamp objects. - """ - from pandas.tseries.offsets import Day, Tick - - if isinstance(freq, Tick): - index_tz = first.tz - if isinstance(origin, pd.Timestamp) and (origin.tz is None) != ( - index_tz is None - ): - raise ValueError( - "The origin must have the same timezone as the index." - ) - elif origin == "epoch": - # set the epoch based on the timezone to have similar bins results - # when resampling on the same kind of indexes on different - # timezones - origin = pd.Timestamp("1970-01-01", tz=index_tz) - - if isinstance(freq, Day): - # _adjust_dates_anchored assumes 'D' means 24H, but first/last - # might contain a DST transition (23H, 24H, or 25H). 
- # So "pretend" the dates are naive when adjusting the endpoints - first = first.tz_localize(None) - last = last.tz_localize(None) - if isinstance(origin, pd.Timestamp): - origin = origin.tz_localize(None) - - first, last = _adjust_dates_anchored( - first, last, freq, closed=closed, origin=origin, offset=offset - ) - if isinstance(freq, Day): - first = first.tz_localize(index_tz) - last = last.tz_localize(index_tz) - else: - first = first.normalize() - last = last.normalize() - - if closed == "left": - first = pd.Timestamp(freq.rollback(first)) - else: - first = pd.Timestamp(first - freq) - - last = pd.Timestamp(last + freq) - - return first, last - - -# NOTE: this function is vendored from Pandas -def _adjust_dates_anchored( - first, last, freq, closed="right", origin="start_day", offset=None -): - # First and last offsets should be calculated from the start day to fix an - # error cause by resampling across multiple days when a one day period is - # not a multiple of the frequency. See GH 8683 - # To handle frequencies that are not multiple or divisible by a day we let - # the possibility to define a fixed origin timestamp. See GH 31809 - origin_nanos = 0 # origin == "epoch" - if origin == "start_day": - origin_nanos = first.normalize().value - elif origin == "start": - origin_nanos = first.value - elif isinstance(origin, pd.Timestamp): - origin_nanos = origin.value - origin_nanos += offset.value if offset else 0 - - # GH 10117 & GH 19375. If first and last contain timezone information, - # Perform the calculation in UTC in order to avoid localizing on an - # Ambiguous or Nonexistent time. - first_tzinfo = first.tzinfo - last_tzinfo = last.tzinfo - if first_tzinfo is not None: - first = first.tz_convert("UTC") - if last_tzinfo is not None: - last = last.tz_convert("UTC") - - foffset = (first.value - origin_nanos) % freq.nanos - loffset = (last.value - origin_nanos) % freq.nanos - - if closed == "right": - if foffset > 0: - # roll back - fresult = first.value - foffset - else: - fresult = first.value - freq.nanos - - if loffset > 0: - # roll forward - lresult = last.value + (freq.nanos - loffset) - else: - # already the end of the road - lresult = last.value - else: # closed == 'left' - if foffset > 0: - fresult = first.value - foffset - else: - # start of the road - fresult = first.value - - if loffset > 0: - # roll forward - lresult = last.value + (freq.nanos - loffset) - else: - lresult = last.value + freq.nanos - fresult = pd.Timestamp(fresult) - lresult = pd.Timestamp(lresult) - if first_tzinfo is not None: - fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo) - if last_tzinfo is not None: - lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo) - return fresult, lresult diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py deleted file mode 100644 index 6e5abb2b82b..00000000000 --- a/python/cudf/cudf/core/reshape.py +++ /dev/null @@ -1,1567 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
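The vendored helpers above anchor the resample bin edges: `first` is rolled back and `last` rolled forward to the nearest multiples of the frequency, measured from an origin (midnight of `first`'s day by default). A minimal sketch of the closed='left' arithmetic, assuming a plain fixed-width ("Tick") frequency and using only pandas:

import pandas as pd

first = pd.Timestamp("2024-01-01 09:03")
last = pd.Timestamp("2024-01-01 09:17")
freq = pd.tseries.frequencies.to_offset("5min")

origin_nanos = first.normalize().value                 # origin == "start_day"
foffset = (first.value - origin_nanos) % freq.nanos    # 3 minutes past an edge
loffset = (last.value - origin_nanos) % freq.nanos     # 2 minutes past an edge

left_edge = pd.Timestamp(first.value - foffset)                  # 2024-01-01 09:00
right_edge = pd.Timestamp(last.value + (freq.nanos - loffset))   # 2024-01-01 09:20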
-from __future__ import annotations - -import itertools -import warnings -from typing import TYPE_CHECKING, Literal - -import numpy as np -import pandas as pd - -import cudf -from cudf._lib.transform import one_hot_encode -from cudf._lib.types import size_type_dtype -from cudf.api.extensions import no_default -from cudf.api.types import is_scalar -from cudf.core._compat import PANDAS_LT_300 -from cudf.core.column import ColumnBase, as_column, column_empty_like -from cudf.core.column_accessor import ColumnAccessor -from cudf.utils.dtypes import min_unsigned_type - -if TYPE_CHECKING: - from cudf._typing import Dtype - -_AXIS_MAP = {0: 0, 1: 1, "index": 0, "columns": 1} - - -def _align_objs(objs, how="outer", sort=None): - """ - Align a set of Series or Dataframe objects. - - Parameters - ---------- - objs : list of DataFrame, Series, or Index - how : How to handle indexes on other axis (or axes), - similar to join in concat - sort : Whether to sort the resulting Index - - Returns - ------- - A list of reindexed and aligned objects - ready for concatenation - """ - # Check if multiindex then check if indexes match. Index - # returns ndarray tuple of bools requiring additional filter. - # Then check for duplicate index value. - i_objs = iter(objs) - first = next(i_objs) - - not_matching_index = any( - not first.index.equals(rest.index) for rest in i_objs - ) - - if not_matching_index: - if not all(o.index.is_unique for o in objs): - raise ValueError("cannot reindex on an axis with duplicate labels") - - index = objs[0].index - name = index.name - - final_index = _get_combined_index( - [obj.index for obj in objs], intersect=how == "inner", sort=sort - ) - - final_index.name = name - return [ - obj.reindex(final_index) - if not final_index.equals(obj.index) - else obj - for obj in objs - ] - else: - if sort: - if not first.index.is_monotonic_increasing: - final_index = first.index.sort_values() - return [obj.reindex(final_index) for obj in objs] - return objs - - -def _get_combined_index(indexes, intersect: bool = False, sort=None): - if len(indexes) == 0: - index = cudf.Index([]) - elif len(indexes) == 1: - index = indexes[0] - elif intersect: - sort = True - index = indexes[0] - for other in indexes[1:]: - # Don't sort for every intersection, - # let the sorting happen in the end. - index = index.intersection(other, sort=False) - else: - index = indexes[0] - if sort is None: - sort = not index._is_object() - for other in indexes[1:]: - index = index.union(other, sort=False) - - if sort: - if not index.is_monotonic_increasing: - index = index.sort_values() - - return index - - -def _normalize_series_and_dataframe( - objs: list[cudf.Series | cudf.DataFrame], axis: Literal[0, 1] -) -> None: - """Convert any cudf.Series objects in objs to DataFrames in place.""" - # Default to naming series by a numerical id if they are not named. - sr_name = 0 - for idx, obj in enumerate(objs): - if isinstance(obj, cudf.Series): - name = obj.name - if name is None: - if axis == 0: - name = 0 - else: - name = sr_name - sr_name += 1 - - objs[idx] = obj.to_frame(name=name) - - -def concat( - objs, - axis=0, - join="outer", - ignore_index=False, - keys=None, - levels=None, - names=None, - verify_integrity=False, - sort=None, -): - """Concatenate DataFrames, Series, or Indices row-wise. - - Parameters - ---------- - objs : list or dictionary of DataFrame, Series, or Index - axis : {0/'index', 1/'columns'}, default 0 - The axis to concatenate along. - `axis=1` must be passed if a dictionary is passed. 
- join : {'inner', 'outer'}, default 'outer' - How to handle indexes on other axis (or axes). - ignore_index : bool, default False - Set True to ignore the index of the *objs* and provide a - default range index instead. - keys : sequence, default None - If multiple levels passed, should contain tuples. Construct - hierarchical index using the passed keys as the outermost level. - Currently not supported. - levels : list of sequences, default None - Specific levels (unique values) to use for constructing a - MultiIndex. Otherwise they will be inferred from the keys. - Currently not supported. - names : list, default None - Names for the levels in the resulting hierarchical index. - Currently not supported. - verify_integrity : bool, default False - Check whether the new concatenated axis contains duplicates. This can - be very expensive relative to the actual data concatenation. - Currently not supported. - sort : bool, default False - Sort non-concatenation axis if it is not already aligned. - - Returns - ------- - A new object of like type with rows from each object in ``objs``. - - Examples - -------- - Combine two ``Series``. - - >>> import cudf - >>> s1 = cudf.Series(['a', 'b']) - >>> s2 = cudf.Series(['c', 'd']) - >>> s1 - 0 a - 1 b - dtype: object - >>> s2 - 0 c - 1 d - dtype: object - >>> cudf.concat([s1, s2]) - 0 a - 1 b - 0 c - 1 d - dtype: object - - Clear the existing index and reset it in the - result by setting the ``ignore_index`` option to ``True``. - - >>> cudf.concat([s1, s2], ignore_index=True) - 0 a - 1 b - 2 c - 3 d - dtype: object - - Combine two DataFrame objects with identical columns. - - >>> df1 = cudf.DataFrame([['a', 1], ['b', 2]], - ... columns=['letter', 'number']) - >>> df1 - letter number - 0 a 1 - 1 b 2 - >>> df2 = cudf.DataFrame([['c', 3], ['d', 4]], - ... columns=['letter', 'number']) - >>> df2 - letter number - 0 c 3 - 1 d 4 - >>> cudf.concat([df1, df2]) - letter number - 0 a 1 - 1 b 2 - 0 c 3 - 1 d 4 - - Combine DataFrame objects with overlapping columns and return - everything. Columns outside the intersection will - be filled with ``null`` values. - - >>> df3 = cudf.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], - ... columns=['letter', 'number', 'animal']) - >>> df3 - letter number animal - 0 c 3 cat - 1 d 4 dog - >>> cudf.concat([df1, df3], sort=False) - letter number animal - 0 a 1 - 1 b 2 - 0 c 3 cat - 1 d 4 dog - - Combine ``DataFrame`` objects with overlapping columns - and return only those that are shared by passing ``inner`` to - the ``join`` keyword argument. - - >>> cudf.concat([df1, df3], join="inner") - letter number - 0 a 1 - 1 b 2 - 0 c 3 - 1 d 4 - - Combine ``DataFrame`` objects horizontally along the - x axis by passing in ``axis=1``. - - >>> df4 = cudf.DataFrame([['bird', 'polly'], ['monkey', 'george']], - ... 
columns=['animal', 'name']) - >>> df4 - animal name - 0 bird polly - 1 monkey george - >>> cudf.concat([df1, df4], axis=1) - letter number animal name - 0 a 1 bird polly - 1 b 2 monkey george - - Combine a dictionary of DataFrame objects horizontally: - - >>> d = {'first': df1, 'second': df2} - >>> cudf.concat(d, axis=1) - first second - letter number letter number - 0 a 1 c 3 - 1 b 2 d 4 - """ - if keys is not None: - raise NotImplementedError("keys is currently not supported") - if levels is not None: - raise NotImplementedError("levels is currently not supported") - if names is not None: - raise NotImplementedError("names is currently not supported") - # TODO: Do we really need to have different error messages for an empty - # list and a list of None? - if not objs: - raise ValueError("No objects to concatenate") - - axis = _AXIS_MAP.get(axis, None) - if axis is None: - raise ValueError( - f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}' - ) - - if isinstance(objs, dict): - if axis != 1: - raise NotImplementedError( - f"Can only concatenate dictionary input along axis=1, not {axis}" - ) - objs = {k: obj for k, obj in objs.items() if obj is not None} - keys_objs = list(objs) - objs = list(objs.values()) - if any(isinstance(o, cudf.BaseIndex) for o in objs): - raise TypeError( - "cannot concatenate a dictionary containing indices" - ) - else: - objs = [obj for obj in objs if obj is not None] - keys_objs = None - - if not objs: - raise ValueError("All objects passed were None") - - # Retrieve the base types of `objs`. In order to support sub-types - # and object wrappers, we use `isinstance()` instead of comparing - # types directly - allowed_typs = { - cudf.Series, - cudf.DataFrame, - cudf.BaseIndex, - } - if not all(isinstance(o, tuple(allowed_typs)) for o in objs): - raise TypeError( - f"can only concatenate objects which are instances of " - f"{allowed_typs}, instead received {[type(o) for o in objs]}" - ) - - if any(isinstance(o, cudf.BaseIndex) for o in objs): - if not all(isinstance(o, cudf.BaseIndex) for o in objs): - raise TypeError( - "when concatenating indices you must provide ONLY indices" - ) - - only_series = all(isinstance(o, cudf.Series) for o in objs) - - # Return for single object - if len(objs) == 1: - obj = objs[0] - if ignore_index: - if axis == 1: - if isinstance(obj, cudf.Series): - result = obj.to_frame() - else: - result = obj.copy(deep=True) - result.columns = cudf.RangeIndex(len(result._data)) - else: - result = type(obj)._from_data( - data=obj._data.copy(deep=True), - index=cudf.RangeIndex(len(obj)), - ) - elif axis == 0: - result = obj.copy(deep=True) - else: - if isinstance(obj, cudf.Series): - result = obj.to_frame() - else: - result = obj.copy(deep=True) - if keys_objs is not None and isinstance(result, cudf.DataFrame): - k = keys_objs[0] - result.columns = pd.MultiIndex.from_tuples( - [ - (k, *c) if isinstance(c, tuple) else (k, c) - for c in result._column_names - ] - ) - - if isinstance(result, cudf.Series) and axis == 0: - # sort has no effect for series concatted along axis 0 - return result - else: - return result.sort_index(axis=(1 - axis)) if sort else result - - # when axis is 1 (column) we can concat with Series and Dataframes - if axis == 1: - if not all(isinstance(o, (cudf.Series, cudf.DataFrame)) for o in objs): - raise TypeError( - "Can only concatenate Series and DataFrame objects when axis=1" - ) - _normalize_series_and_dataframe(objs, axis=axis) - - any_empty = any(obj.empty for obj in objs) - if any_empty: - # Do not remove 
until pandas-3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.warn( - "The behavior of array concatenation with empty entries is " - "deprecated. In a future version, this will no longer exclude " - "empty items when determining the result dtype. " - "To retain the old behavior, exclude the empty entries before " - "the concat operation.", - FutureWarning, - ) - # Inner joins involving empty data frames always return empty dfs, but - # We must delay returning until we have set the column names. - empty_inner = any_empty and join == "inner" - - objs = [obj for obj in objs if obj.shape != (0, 0)] - - if len(objs) == 0: - # TODO: https://github.com/rapidsai/cudf/issues/16550 - return cudf.DataFrame() - - # Don't need to align indices of all `objs` since we - # would anyway return an empty dataframe below - if not empty_inner: - objs = _align_objs(objs, how=join, sort=sort) - result_index = objs[0].index - else: - result_index = None - - result_data = {} - result_columns = None - if keys_objs is None: - for o in objs: - for name, col in o._column_labels_and_values: - if name in result_data: - raise NotImplementedError( - f"A Column with duplicate name found: {name}, cuDF " - f"doesn't support having multiple columns with " - f"same names yet." - ) - if empty_inner: - # if join is inner and it contains an empty df - # we return an empty df, hence creating an empty - # column with dtype metadata retained. - result_data[name] = cudf.core.column.column_empty_like( - col, newsize=0 - ) - else: - result_data[name] = col - - result_columns = ( - objs[0] - ._data.to_pandas_index() - .append([obj._data.to_pandas_index() for obj in objs[1:]]) - .unique() - ) - - # need to create a MultiIndex column - else: - # All levels in the multiindex label must have the same type - has_multiple_level_types = ( - len({type(name) for o in objs for name in o._column_names}) > 1 - ) - if has_multiple_level_types: - raise NotImplementedError( - "Cannot construct a MultiIndex column with multiple " - "label types in cuDF at this time. You must convert " - "the labels to the same type." - ) - for k, o in zip(keys_objs, objs): - for name, col in o._column_labels_and_values: - # if only series, then only keep keys_objs as column labels - # if the existing column is multiindex, prepend it - # to handle cases where dfs and srs are concatenated - if only_series: - col_label = k - elif isinstance(name, tuple): - col_label = (k, *name) - else: - col_label = (k, name) - if empty_inner: - result_data[col_label] = ( - cudf.core.column.column_empty_like(col, newsize=0) - ) - else: - result_data[col_label] = col - - df = cudf.DataFrame._from_data( - ColumnAccessor(result_data, verify=False), index=result_index - ) - if ignore_index: - df.columns = cudf.RangeIndex(df._num_columns) - elif result_columns is not None: - df.columns = result_columns - elif not only_series: - df.columns = pd.MultiIndex.from_tuples(df._column_names) - - if empty_inner: - # if join is inner and it contains an empty df - # we return an empty df - return df.head(0) - - return df - - # If we get here, we are always concatenating along axis 0 (the rows). - typ = type(objs[0]) - if len({type(o) for o in objs}) > 1: - _normalize_series_and_dataframe(objs, axis=axis) - typ = cudf.DataFrame - - if typ is cudf.DataFrame: - old_objs = objs - objs = [obj for obj in objs if obj.shape != (0, 0)] - if len(objs) == 0: - # If objs is empty, that indicates all of - # objs are empty dataframes. 
- # TODO: https://github.com/rapidsai/cudf/issues/16550 - return cudf.DataFrame() - elif len(objs) == 1: - obj = objs[0] - result = cudf.DataFrame._from_data( - data={} if join == "inner" else obj._data.copy(deep=True), - index=cudf.RangeIndex(len(obj)) - if ignore_index - else obj.index.copy(deep=True), - ) - return result - else: - if join == "inner" and len(old_objs) != len(objs): - # don't filter out empty df's - objs = old_objs - result = cudf.DataFrame._concat( - objs, - axis=axis, - join=join, - ignore_index=ignore_index, - # Explicitly cast rather than relying on None being falsy. - sort=bool(sort), - ) - return result - - elif typ is cudf.Series: - new_objs = [obj for obj in objs if len(obj)] - if len(new_objs) == 1 and not ignore_index: - return new_objs[0] - else: - return cudf.Series._concat(objs, axis=axis, index=not ignore_index) - elif typ is cudf.MultiIndex: - return cudf.MultiIndex._concat(objs) - elif issubclass(typ, cudf.Index): - return cudf.Index._concat(objs) - else: - raise TypeError(f"cannot concatenate object of type {typ}") - - -def melt( - frame, - id_vars=None, - value_vars=None, - var_name=None, - value_name="value", - col_level=None, - ignore_index: bool = True, -): - """Unpivots a DataFrame from wide format to long format, - optionally leaving identifier variables set. - - Parameters - ---------- - frame : DataFrame - id_vars : tuple, list, or ndarray, optional - Column(s) to use as identifier variables. - default: None - value_vars : tuple, list, or ndarray, optional - Column(s) to unpivot. - default: all columns that are not set as `id_vars`. - var_name : scalar - Name to use for the `variable` column. - default: frame.columns.name or 'variable' - value_name : str - Name to use for the `value` column. - default: 'value' - - Returns - ------- - out : DataFrame - Melted result - - Difference from pandas: - * Does not support 'col_level' because cuDF does not have multi-index - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'A': ['a', 'b', 'c'], - ... 'B': [1, 3, 5], - ... 'C': [2, 4, 6]}) - >>> df - A B C - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - >>> cudf.melt(df, id_vars=['A'], value_vars=['B']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - >>> cudf.melt(df, id_vars=['A'], value_vars=['B', 'C']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 3 a C 2 - 4 b C 4 - 5 c C 6 - - The names of 'variable' and 'value' columns can be customized: - - >>> cudf.melt(df, id_vars=['A'], value_vars=['B'], - ... 
var_name='myVarname', value_name='myValname') - A myVarname myValname - 0 a B 1 - 1 b B 3 - 2 c B 5 - """ - if col_level is not None: - raise NotImplementedError("col_level != None is not supported yet.") - if ignore_index is not True: - raise NotImplementedError("ignore_index is currently not supported.") - - # Arg cleaning - - # id_vars - if id_vars is not None: - if cudf.api.types.is_scalar(id_vars): - id_vars = [id_vars] - id_vars = list(id_vars) - missing = set(id_vars) - set(frame._column_names) - if not len(missing) == 0: - raise KeyError( - f"The following 'id_vars' are not present" - f" in the DataFrame: {list(missing)}" - ) - else: - id_vars = [] - - # value_vars - if value_vars is not None: - if cudf.api.types.is_scalar(value_vars): - value_vars = [value_vars] - value_vars = list(value_vars) - missing = set(value_vars) - set(frame._column_names) - if not len(missing) == 0: - raise KeyError( - f"The following 'value_vars' are not present" - f" in the DataFrame: {list(missing)}" - ) - else: - # then all remaining columns in frame - unique_id = set(id_vars) - value_vars = [c for c in frame._column_names if c not in unique_id] - - # Error for unimplemented support for datatype - if any( - isinstance(frame[col].dtype, cudf.CategoricalDtype) - for col in id_vars + value_vars - ): - raise NotImplementedError( - "Categorical columns are not yet supported for function" - ) - - # Check dtype homogeneity in value_var - # Because heterogeneous concat is unimplemented - if len(value_vars) > 1: - dtype = frame[value_vars[0]].dtype - if any(frame[col].dtype != dtype for col in value_vars): - raise ValueError("all cols in value_vars must have the same dtype") - - # overlap - overlap = set(id_vars).intersection(set(value_vars)) - if not len(overlap) == 0: - raise KeyError( - f"'value_vars' and 'id_vars' cannot have overlap." - f" The following 'value_vars' are ALSO present" - f" in 'id_vars': {list(overlap)}" - ) - - N = len(frame) - K = len(value_vars) - - def _tile(A, reps): - series_list = [A] * reps - if reps > 0: - return cudf.Series._concat(objs=series_list, index=False) - else: - return cudf.Series([], dtype=A.dtype) - - # Step 1: tile id_vars - mdata = {col: _tile(frame[col], K) for col in id_vars} - - # Step 2: add variable - nval = len(value_vars) - dtype = min_unsigned_type(nval) - - if not var_name: - var_name = "variable" - - if not value_vars: - # TODO: Use frame._data.label_dtype when it's more consistently set - var_data = cudf.Series( - value_vars, dtype=frame._data.to_pandas_index().dtype - ) - else: - var_data = ( - cudf.Series(value_vars) - .take(np.repeat(np.arange(nval, dtype=dtype), N)) - .reset_index(drop=True) - ) - mdata[var_name] = var_data - - # Step 3: add values - mdata[value_name] = cudf.Series._concat( - objs=[frame[val] for val in value_vars], index=False - ) - - return cudf.DataFrame(mdata) - - -def get_dummies( - data, - prefix=None, - prefix_sep="_", - dummy_na=False, - columns=None, - cats=None, - sparse=False, - drop_first=False, - dtype="bool", -): - """Returns a dataframe whose columns are the one hot encodings of all - columns in `df` - - Parameters - ---------- - data : array-like, Series, or DataFrame - Data of which to get dummy indicators. - prefix : str, dict, or sequence, optional - Prefix to append. Either a str (to apply a constant prefix), dict - mapping column names to prefixes, or sequence of prefixes to apply with - the same length as the number of columns. 
If not supplied, defaults - to the empty string - prefix_sep : str, dict, or sequence, optional, default '_' - Separator to use when appending prefixes - dummy_na : boolean, optional - Add a column to indicate Nones, if False Nones are ignored. - cats : dict, optional - Dictionary mapping column names to sequences of values representing - that column's category. If not supplied, it is computed as the unique - values of the column. - sparse : boolean, optional - Right now this is NON-FUNCTIONAL argument in rapids. - drop_first : boolean, optional - Whether to get k-1 dummies out of k categorical levels by removing the - first level. - columns : sequence of str, optional - Names of columns to encode. If not provided, will attempt to encode all - columns. Note this is different from pandas default behavior, which - encodes all columns with dtype object or categorical - dtype : str, optional - Output dtype, default 'bool' - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({"a": ["value1", "value2", None], "b": [0, 0, 0]}) - >>> cudf.get_dummies(df) - b a_value1 a_value2 - 0 0 True False - 1 0 False True - 2 0 False False - - >>> cudf.get_dummies(df, dummy_na=True) - b a_ a_value1 a_value2 - 0 0 False True False - 1 0 False False True - 2 0 True False False - - >>> import numpy as np - >>> df = cudf.DataFrame({"a":cudf.Series([1, 2, np.nan, None], - ... nan_as_null=False)}) - >>> df - a - 0 1.0 - 1 2.0 - 2 NaN - 3 - - >>> cudf.get_dummies(df, dummy_na=True, columns=["a"]) - a_ a_1.0 a_2.0 a_nan - 0 False True False False - 1 False False True False - 2 False False False True - 3 True False False False - - >>> series = cudf.Series([1, 2, None, 2, 4]) - >>> series - 0 1 - 1 2 - 2 - 3 2 - 4 4 - dtype: int64 - >>> cudf.get_dummies(series, dummy_na=True) - 1 2 4 - 0 False True False False - 1 False False True False - 2 True False False False - 3 False False True False - 4 False False False True - """ - - if cats is None: - cats = {} - else: - warnings.warn( - "cats is deprecated and will be removed in a future version.", - FutureWarning, - ) - if sparse: - raise NotImplementedError("sparse is not supported yet") - - if isinstance(data, cudf.DataFrame): - encode_fallback_dtypes = ["object", "category"] - - if columns is None or len(columns) == 0: - columns = data.select_dtypes( - include=encode_fallback_dtypes - )._column_names - - _length_check_params(prefix, columns, "prefix") - _length_check_params(prefix_sep, columns, "prefix_sep") - - if prefix is None: - prefix = columns - - if isinstance(prefix, str): - prefix_map = {} - elif isinstance(prefix, dict): - prefix_map = prefix - else: - prefix_map = dict(zip(columns, prefix)) - - if isinstance(prefix_sep, str): - prefix_sep_map = {} - elif isinstance(prefix_sep, dict): - prefix_sep_map = prefix_sep - else: - prefix_sep_map = dict(zip(columns, prefix_sep)) - - # If we have no columns to encode, we need to drop - # fallback columns(if any) - if len(columns) == 0: - return data.select_dtypes(exclude=encode_fallback_dtypes) - else: - result_data = { - col_name: col - for col_name, col in data._column_labels_and_values - if col_name not in columns - } - - for name in columns: - if name not in cats: - unique = _get_unique( - column=data._data[name], dummy_na=dummy_na - ) - else: - unique = as_column(cats[name]) - - col_enc_data = _one_hot_encode_column( - column=data._data[name], - categories=unique, - prefix=prefix_map.get(name, prefix), - prefix_sep=prefix_sep_map.get(name, prefix_sep), - dtype=dtype, - drop_first=drop_first, - ) - 
result_data.update(col_enc_data) - return cudf.DataFrame._from_data(result_data, index=data.index) - else: - ser = cudf.Series(data) - unique = _get_unique(column=ser._column, dummy_na=dummy_na) - data = _one_hot_encode_column( - column=ser._column, - categories=unique, - prefix=prefix, - prefix_sep=prefix_sep, - dtype=dtype, - drop_first=drop_first, - ) - return cudf.DataFrame._from_data(data, index=ser.index) - - -def _merge_sorted( - objs, - keys=None, - by_index=False, - ignore_index=False, - ascending=True, - na_position="last", -): - """Merge a list of sorted DataFrame or Series objects. - - Dataframes/Series in objs list MUST be pre-sorted by columns - listed in `keys`, or by the index (if `by_index=True`). - - Parameters - ---------- - objs : list of DataFrame or Series - keys : list, default None - List of Column names to sort by. If None, all columns used - (Ignored if `by_index=True`) - by_index : bool, default False - Use index for sorting. `keys` input will be ignored if True - ignore_index : bool, default False - Drop and ignore index during merge. Default range index will - be used in the output dataframe. - ascending : bool, default True - Sorting is in ascending order, otherwise it is descending - na_position : {'first', 'last'}, default 'last' - 'first' nulls at the beginning, 'last' nulls at the end - - Returns - ------- - A new, lexicographically sorted, DataFrame/Series. - """ - if not pd.api.types.is_list_like(objs): - raise TypeError("objs must be a list-like of Frame-like objects") - - if len(objs) < 1: - raise ValueError("objs must be non-empty") - - if not all(isinstance(table, cudf.core.frame.Frame) for table in objs): - raise TypeError("Elements of objs must be Frame-like") - - if len(objs) == 1: - return objs[0] - - if by_index and ignore_index: - raise ValueError("`by_index` and `ignore_index` cannot both be True") - - if by_index: - key_columns_indices = list(range(0, objs[0].index.nlevels)) - else: - if keys is None: - key_columns_indices = list(range(0, objs[0]._num_columns)) - else: - key_columns_indices = [ - objs[0]._column_names.index(key) for key in keys - ] - if not ignore_index: - key_columns_indices = [ - idx + objs[0].index.nlevels for idx in key_columns_indices - ] - - columns = [ - [ - *(obj.index._columns if not ignore_index else ()), - *obj._columns, - ] - for obj in objs - ] - - return objs[0]._from_columns_like_self( - cudf._lib.merge.merge_sorted( - input_columns=columns, - key_columns_indices=key_columns_indices, - ascending=ascending, - na_position=na_position, - ), - column_names=objs[0]._column_names, - index_names=None if ignore_index else objs[0]._index_names, - ) - - -def _pivot(col_accessor: ColumnAccessor, index, columns) -> cudf.DataFrame: - """ - Reorganize the values of the DataFrame according to the given - index and columns. 
- - Parameters - ---------- - col_accessor : DataFrame - index : cudf.Index - Index labels of the result - columns : cudf.Index - Column labels of the result - """ - columns_labels, columns_idx = columns._encode() - index_labels, index_idx = index._encode() - column_labels = columns_labels.to_pandas().to_flat_index() - - result = {} - if len(index_labels) != 0 and len(columns_labels) != 0: - - def as_tuple(x): - return x if isinstance(x, tuple) else (x,) - - nrows = len(index_labels) - for col_label, col in col_accessor.items(): - names = [ - as_tuple(col_label) + as_tuple(name) for name in column_labels - ] - new_size = nrows * len(names) - scatter_map = (columns_idx * np.int32(nrows)) + index_idx - target_col = cudf.core.column.column_empty_like( - col, masked=True, newsize=new_size - ) - target_col[scatter_map] = col - target = cudf.Index._from_column(target_col) - result.update( - { - name: idx._column - for name, idx in zip( - names, target._split(range(nrows, new_size, nrows)) - ) - } - ) - - # the result of pivot always has a multicolumn - ca = ColumnAccessor( - result, - multiindex=True, - level_names=(None,) + columns._column_names, - verify=False, - ) - return cudf.DataFrame._from_data( - ca, index=cudf.Index(index_labels, name=index.name) - ) - - -def pivot(data, columns=None, index=no_default, values=no_default): - """ - Return reshaped DataFrame organized by the given index and column values. - - Reshape data (produce a "pivot" table) based on column values. Uses - unique values from specified `index` / `columns` to form axes of the - resulting DataFrame. - - Parameters - ---------- - columns : column name, optional - Column used to construct the columns of the result. - index : column name, optional - Column used to construct the index of the result. - values : column name or list of column names, optional - Column(s) whose values are rearranged to produce the result. - If not specified, all remaining columns of the DataFrame - are used. - - Returns - ------- - DataFrame - - Examples - -------- - >>> a = cudf.DataFrame() - >>> a['a'] = [1, 1, 2, 2] - >>> a['b'] = ['a', 'b', 'a', 'b'] - >>> a['c'] = [1, 2, 3, 4] - >>> a.pivot(index='a', columns='b') - c - b a b - a - 1 1 2 - 2 3 4 - - Pivot with missing values in result: - - >>> a = cudf.DataFrame() - >>> a['a'] = [1, 1, 2] - >>> a['b'] = [1, 2, 3] - >>> a['c'] = ['one', 'two', 'three'] - >>> a.pivot(index='a', columns='b') - c - b 1 2 3 - a - 1 one two - 2 three - - """ - values_is_list = True - if values is no_default: - cols_to_select = [ - col for col in data._column_names if col not in (index, columns) - ] - elif not isinstance(values, (list, tuple)): - cols_to_select = [values] - values_is_list = False - else: - cols_to_select = values - if index is no_default: - index = data.index - else: - index = cudf.Index(data.loc[:, index]) - columns = cudf.Index(data.loc[:, columns]) - - # Create a DataFrame composed of columns from both - # columns and index - ca = ColumnAccessor( - dict(enumerate(itertools.chain(index._columns, columns._columns))), - verify=False, - ) - columns_index = cudf.DataFrame._from_data(ca) - - # Check that each row is unique: - if len(columns_index) != len(columns_index.drop_duplicates()): - raise ValueError("Duplicate index-column pairs found. 
Cannot reshape.") - - result = _pivot(data._data.select_by_label(cols_to_select), index, columns) - - # MultiIndex to Index - if not values_is_list: - result._data.droplevel(0) - - return result - - -def unstack(df, level, fill_value=None, sort: bool = True): - """ - Pivot one or more levels of the (necessarily hierarchical) index labels. - - Pivots the specified levels of the index labels of df to the innermost - levels of the columns labels of the result. - - * If the index of ``df`` has multiple levels, returns a ``Dataframe`` with - specified level of the index pivoted to the column levels. - * If the index of ``df`` has single level, returns a ``Series`` with all - column levels pivoted to the index levels. - - Parameters - ---------- - df : DataFrame - level : level name or index, list-like - Integer, name or list of such, specifying one or more - levels of the index to pivot - fill_value - Non-functional argument provided for compatibility with Pandas. - sort : bool, default True - Sort the level(s) in the resulting MultiIndex columns. - - - Returns - ------- - Series or DataFrame - - Examples - -------- - >>> df = cudf.DataFrame() - >>> df['a'] = [1, 1, 1, 2, 2] - >>> df['b'] = [1, 2, 3, 1, 2] - >>> df['c'] = [5, 6, 7, 8, 9] - >>> df['d'] = ['a', 'b', 'a', 'd', 'e'] - >>> df = df.set_index(['a', 'b', 'd']) - >>> df - c - a b d - 1 1 a 5 - 2 b 6 - 3 a 7 - 2 1 d 8 - 2 e 9 - - Unstacking level 'a': - - >>> df.unstack('a') - c - a 1 2 - b d - 1 a 5 - d 8 - 2 b 6 - e 9 - 3 a 7 - - Unstacking level 'd' : - - >>> df.unstack('d') - c - d a b d e - a b - 1 1 5 - 2 6 - 3 7 - 2 1 8 - 2 9 - - Unstacking multiple levels: - - >>> df.unstack(['b', 'd']) - c - b 1 2 3 - d a d b e a - a - 1 5 6 7 - 2 8 9 - - Unstacking single level index dataframe: - - >>> df = cudf.DataFrame({('c', 1): [1, 2, 3], ('c', 2):[9, 8, 7]}) - >>> df.unstack() - c 1 0 1 - 1 2 - 2 3 - 2 0 9 - 1 8 - 2 7 - dtype: int64 - """ - if not isinstance(df, cudf.DataFrame): - raise ValueError("`df` should be a cudf Dataframe object.") - - if df.empty: - raise ValueError("Cannot unstack an empty dataframe.") - - if fill_value is not None: - raise NotImplementedError("fill_value is not supported.") - elif sort is False: - raise NotImplementedError(f"{sort=} is not supported.") - if pd.api.types.is_list_like(level): - if not level: - return df - if not isinstance(df.index, cudf.MultiIndex): - dtype = df._columns[0].dtype - for col in df._columns: - if not col.dtype == dtype: - raise ValueError( - "Calling unstack() on single index dataframe" - " with different column datatype is not supported." - ) - res = df.T.stack(future_stack=False) - # Result's index is a multiindex - res.index.names = ( - tuple(df._data.to_pandas_index().names) + df.index.names - ) - return res - else: - index = df.index.droplevel(level) - if is_scalar(level): - columns = df.index.get_level_values(level) - else: - new_names = [] - ca_data = {} - for lev in level: - ca_level, level_idx = df.index._level_to_ca_label(lev) - new_names.append(df.index.names[level_idx]) - ca_data[ca_level] = df.index._data[ca_level] - columns = type(df.index)._from_data( - ColumnAccessor(ca_data, verify=False) - ) - columns.names = new_names - result = _pivot(df, index, columns) - if result.index.nlevels == 1: - result.index = result.index.get_level_values(result.index.names[0]) - return result - - -def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase: - """ - Returns unique values in a column, if - dummy_na is False, nan's are also dropped. 
- """ - if isinstance(column.dtype, cudf.CategoricalDtype): - unique = column.categories # type: ignore[attr-defined] - else: - unique = column.unique().sort_values() - if not dummy_na: - unique = unique.nans_to_nulls().dropna() - return unique - - -def _one_hot_encode_column( - column: ColumnBase, - categories: ColumnBase, - prefix: str | None, - prefix_sep: str | None, - dtype: Dtype | None, - drop_first: bool, -) -> dict[str, ColumnBase]: - """Encode a single column with one hot encoding. The return dictionary - contains pairs of (category, encodings). The keys may be prefixed with - `prefix`, separated with category name with `prefix_sep`. The encoding - columns maybe coerced into `dtype`. - """ - if isinstance(column.dtype, cudf.CategoricalDtype): - if column.size == column.null_count: - column = column_empty_like(categories, newsize=column.size) - else: - column = column._get_decategorized_column() # type: ignore[attr-defined] - - if column.size * categories.size >= np.iinfo(size_type_dtype).max: - raise ValueError( - "Size limitation exceeded: column.size * category.size < " - f"np.iinfo({size_type_dtype}).max. Consider reducing " - "size of category" - ) - data = one_hot_encode(column, categories) - - if drop_first and len(data): - data.pop(next(iter(data))) - if prefix is not None and prefix_sep is not None: - data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()} - if dtype: - data = {k: v.astype(dtype) for k, v in data.items()} - return data - - -def _length_check_params(obj, columns, name): - if cudf.api.types.is_list_like(obj): - if len(obj) != len(columns): - raise ValueError( - f"Length of '{name}' ({len(obj)}) did not match the " - f"length of the columns being " - f"encoded ({len(columns)})." - ) - - -def _get_pivot_names(arrs, names, prefix): - """ - Generates unique names for rows/columns - """ - if names is None: - names = [] - for i, arr in enumerate(arrs): - if isinstance(arr, cudf.Series) and arr.name is not None: - names.append(arr.name) - else: - names.append(f"{prefix}_{i}") - else: - if len(names) != len(arrs): - raise ValueError("arrays and names must have the same length") - if not isinstance(names, list): - names = list(names) - - return names - - -def crosstab( - index, - columns, - values=None, - rownames=None, - colnames=None, - aggfunc=None, - margins=False, - margins_name="All", - dropna=None, - normalize=False, -): - """ - Compute a simple cross tabulation of two (or more) factors. By default - computes a frequency table of the factors unless an array of values and an - aggregation function are passed. - - Parameters - ---------- - index : array-like, Series, or list of arrays/Series - Values to group by in the rows. - columns : array-like, Series, or list of arrays/Series - Values to group by in the columns. - values : array-like, optional - Array of values to aggregate according to the factors. - Requires `aggfunc` be specified. - rownames : list of str, default None - If passed, must match number of row arrays passed. - colnames : list of str, default None - If passed, must match number of column arrays passed. - aggfunc : function, optional - If specified, requires `values` be specified as well. - margins : Not supported - margins_name : Not supported - dropna : Not supported - normalize : Not supported - - Returns - ------- - DataFrame - Cross tabulation of the data. - - Examples - -------- - >>> a = cudf.Series(["foo", "foo", "foo", "foo", "bar", "bar", - ... 
"bar", "bar", "foo", "foo", "foo"], dtype=object) - >>> b = cudf.Series(["one", "one", "one", "two", "one", "one", - ... "one", "two", "two", "two", "one"], dtype=object) - >>> c = cudf.Series(["dull", "dull", "shiny", "dull", "dull", "shiny", - ... "shiny", "dull", "shiny", "shiny", "shiny"], - ... dtype=object) - >>> cudf.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) - b one two - c dull shiny dull shiny - a - bar 1 2 1 0 - foo 2 2 1 2 - """ - if normalize is not False: - raise NotImplementedError("normalize is not supported yet") - - if values is None and aggfunc is not None: - raise ValueError("aggfunc cannot be used without values.") - - if values is not None and aggfunc is None: - raise ValueError("values cannot be used without an aggfunc.") - - if not isinstance(index, (list, tuple)): - index = [index] - if not isinstance(columns, (list, tuple)): - columns = [columns] - - if not rownames: - rownames = _get_pivot_names(index, rownames, prefix="row") - if not colnames: - colnames = _get_pivot_names(columns, colnames, prefix="col") - - if len(index) != len(rownames): - raise ValueError("index and rownames must have same length") - if len(columns) != len(colnames): - raise ValueError("columns and colnames must have same length") - - if len(set(rownames)) != len(rownames): - raise ValueError("rownames must be unique") - if len(set(colnames)) != len(colnames): - raise ValueError("colnames must be unique") - - data = { - **dict(zip(rownames, map(as_column, index))), - **dict(zip(colnames, map(as_column, columns))), - } - - df = cudf.DataFrame._from_data(data) - - if values is None: - df["__dummy__"] = 0 - kwargs = {"aggfunc": "count", "fill_value": 0} - else: - df["__dummy__"] = values - kwargs = {"aggfunc": aggfunc} - - table = pivot_table( - data=df, - index=rownames, - columns=colnames, - values="__dummy__", - margins=margins, - margins_name=margins_name, - dropna=dropna, - **kwargs, - ) - - return table - - -def pivot_table( - data, - values=None, - index=None, - columns=None, - aggfunc="mean", - fill_value=None, - margins=False, - dropna=None, - margins_name="All", - observed=False, - sort=True, -): - """ - Create a spreadsheet-style pivot table as a DataFrame. - - Parameters - ---------- - data : DataFrame - values : column name or list of column names to aggregate, optional - index : list of column names - Values to group by in the rows. - columns : list of column names - Values to group by in the columns. - aggfunc : str or dict, default "mean" - If dict is passed, the key is column to aggregate - and value is function name. - fill_value : scalar, default None - Value to replace missing values with - (in the resulting pivot table, after aggregation). - margins : Not supported - dropna : Not supported - margins_name : Not supported - observed : Not supported - sort : Not supported - - Returns - ------- - DataFrame - An Excel style pivot table. 
- """ - if margins is not False: - raise NotImplementedError("margins is not supported yet") - - if margins_name != "All": - raise NotImplementedError("margins_name is not supported yet") - - if dropna is not None: - raise NotImplementedError("dropna is not supported yet") - - if observed is not False: - raise NotImplementedError("observed is not supported yet") - - if sort is not True: - raise NotImplementedError("sort is not supported yet") - - keys = index + columns - - values_passed = values is not None - if values_passed: - if pd.api.types.is_list_like(values): - values_multi = True - values = list(values) - else: - values_multi = False - values = [values] - - for i in values: - if i not in data: - raise KeyError(i) - - to_filter = [] - for x in keys + values: - if isinstance(x, cudf.Grouper): - x = x.key - try: - if x in data: - to_filter.append(x) - except TypeError: - pass - if len(to_filter) < len(data._column_names): - data = data[to_filter] - - else: - values = data.columns - for key in keys: - try: - values = values.drop(key) - except (TypeError, ValueError, KeyError): - pass - values = list(values) - - grouped = data.groupby(keys) - agged = grouped.agg(aggfunc) - - table = agged - - if table.index.nlevels > 1 and index: - # If index_names are integers, determine whether the integers refer - # to the level position or name. - index_names = agged.index.names[: len(index)] - to_unstack = [] - for i in range(len(index), len(keys)): - name = agged.index.names[i] - if name is None or name in index_names: - to_unstack.append(i) - else: - to_unstack.append(name) - table = agged.unstack(to_unstack) - - if fill_value is not None: - table = table.fillna(fill_value) - - # discard the top level - if values_passed and not values_multi and table._data.multiindex: - column_names = table._data.level_names[1:] - table_columns = tuple( - map(lambda column: column[1:], table._column_names) - ) - table.columns = pd.MultiIndex.from_tuples( - tuples=table_columns, names=column_names - ) - - if len(index) == 0 and len(columns) > 0: - table = table.T - - return table diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py deleted file mode 100644 index f6331aa1f49..00000000000 --- a/python/cudf/cudf/core/scalar.py +++ /dev/null @@ -1,402 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import decimal -import operator -from collections import OrderedDict - -import numpy as np -import pyarrow as pa - -import cudf -from cudf.api.types import is_scalar -from cudf.core.dtypes import ListDtype, StructDtype -from cudf.core.missing import NA, NaT -from cudf.core.mixins import BinaryOperand -from cudf.utils.dtypes import ( - get_allowed_combinations_for_operator, - to_cudf_compatible_scalar, -) - - -# Note that the metaclass below can easily be generalized for use with -# other classes, if needed in the future. Simply replace the arguments -# of the `__call__` method with `*args` and `**kwargs`. This will -# result in additional overhead when constructing the cache key, as -# unpacking *args and **kwargs is not cheap. See the discussion in -# https://github.com/rapidsai/cudf/pull/11246#discussion_r955843532 -# for details. -class CachedScalarInstanceMeta(type): - """ - Metaclass for Scalar that caches `maxsize` instances. - - After `maxsize` is reached, evicts the least recently used - instances to make room for new values. 
- """ - - def __new__(cls, names, bases, attrs, **kwargs): - return type.__new__(cls, names, bases, attrs) - - # choose 128 because that's the default `maxsize` for - # `functools.lru_cache`: - def __init__(self, names, bases, attrs, maxsize=128): - self.__maxsize = maxsize - self.__instances = OrderedDict() - - def __call__(self, value, dtype=None): - # the cache key is constructed from the arguments, and also - # the _types_ of the arguments, since objects of different - # types can compare equal - cache_key = (value, type(value), dtype, type(dtype)) - try: - # try retrieving an instance from the cache: - self.__instances.move_to_end(cache_key) - return self.__instances[cache_key] - except KeyError: - # if an instance couldn't be found in the cache, - # construct it and add to cache: - obj = super().__call__(value, dtype=dtype) - try: - self.__instances[cache_key] = obj - except TypeError: - # couldn't hash the arguments, don't cache: - return obj - if len(self.__instances) > self.__maxsize: - self.__instances.popitem(last=False) - return obj - except TypeError: - # couldn't hash the arguments, don't cache: - return super().__call__(value, dtype=dtype) - - def _clear_instance_cache(self): - self.__instances.clear() - - -class Scalar(BinaryOperand, metaclass=CachedScalarInstanceMeta): - """ - A GPU-backed scalar object with NumPy scalar like properties - May be used in binary operations against other scalars, cuDF - Series, DataFrame, and Index objects. - - Examples - -------- - >>> import cudf - >>> cudf.Scalar(42, dtype='int64') - Scalar(42, dtype=int64) - >>> cudf.Scalar(42, dtype='int32') + cudf.Scalar(42, dtype='float64') - Scalar(84.0, dtype=float64) - >>> cudf.Scalar(42, dtype='int64') + np.int8(21) - Scalar(63, dtype=int64) - >>> x = cudf.Scalar(42, dtype='datetime64[s]') - >>> y = cudf.Scalar(21, dtype='timedelta64[ns]') - >>> x - y - Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) - >>> cudf.Series([1,2,3]) + cudf.Scalar(1) - 0 2 - 1 3 - 2 4 - dtype: int64 - >>> df = cudf.DataFrame({'a':[1,2,3], 'b':[4.5, 5.5, 6.5]}) - >>> slr = cudf.Scalar(10, dtype='uint8') - >>> df - slr - a b - 0 -9 -5.5 - 1 -8 -4.5 - 2 -7 -3.5 - - Parameters - ---------- - value : Python Scalar, NumPy Scalar, or cuDF Scalar - The scalar value to be converted to a GPU backed scalar object - dtype : np.dtype or string specifier - The data type - """ - - _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS - - def __init__(self, value, dtype=None): - self._host_value = None - self._host_dtype = None - self._device_value = None - - if isinstance(value, Scalar): - if value._is_host_value_current: - self._host_value = value._host_value - self._host_dtype = value._host_dtype - else: - self._device_value = value._device_value - else: - self._host_value, self._host_dtype = self._preprocess_host_value( - value, dtype - ) - - @classmethod - def from_device_scalar(cls, device_scalar): - if not isinstance(device_scalar, cudf._lib.scalar.DeviceScalar): - raise TypeError( - "Expected an instance of DeviceScalar, " - f"got {type(device_scalar).__name__}" - ) - obj = object.__new__(cls) - obj._host_value = None - obj._host_dtype = None - obj._device_value = device_scalar - return obj - - @property - def _is_host_value_current(self): - return self._host_value is not None - - @property - def _is_device_value_current(self): - return self._device_value is not None - - @property - def device_value(self): - if self._device_value is None: - self._device_value = cudf._lib.scalar.DeviceScalar( - 
self._host_value, self._host_dtype - ) - return self._device_value - - @property - def value(self): - if not self._is_host_value_current: - self._device_value_to_host() - return self._host_value - - # todo: change to cached property - @property - def dtype(self): - if self._is_host_value_current: - if isinstance(self._host_value, str): - return cudf.dtype("object") - else: - return self._host_dtype - else: - return self.device_value.dtype - - def is_valid(self): - if not self._is_host_value_current: - self._device_value_to_host() - return not cudf._lib.scalar._is_null_host_scalar(self._host_value) - - def _device_value_to_host(self): - self._host_value = self._device_value._to_host_scalar() - - def _preprocess_host_value(self, value, dtype): - valid = not cudf._lib.scalar._is_null_host_scalar(value) - - if isinstance(value, list): - if dtype is not None: - raise TypeError("Lists may not be cast to a different dtype") - else: - dtype = ListDtype.from_arrow( - pa.infer_type([value], from_pandas=True) - ) - return value, dtype - elif isinstance(dtype, ListDtype): - if value not in {None, NA}: - raise ValueError(f"Can not coerce {value} to ListDtype") - else: - return NA, dtype - - if isinstance(value, dict): - if dtype is None: - dtype = StructDtype.from_arrow( - pa.infer_type([value], from_pandas=True) - ) - return value, dtype - elif isinstance(dtype, StructDtype): - if value not in {None, NA}: - raise ValueError(f"Can not coerce {value} to StructDType") - else: - return NA, dtype - - if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - value = pa.scalar( - value, type=pa.decimal128(dtype.precision, dtype.scale) - ).as_py() - if isinstance(value, decimal.Decimal) and dtype is None: - dtype = cudf.Decimal128Dtype._from_decimal(value) - - value = to_cudf_compatible_scalar(value, dtype=dtype) - - if dtype is None: - if not valid: - if value is NaT: - value = value.to_numpy() - - if isinstance(value, (np.datetime64, np.timedelta64)): - unit, _ = np.datetime_data(value) - if unit == "generic": - raise TypeError( - "Cant convert generic NaT to null scalar" - ) - else: - dtype = value.dtype - else: - raise TypeError( - "dtype required when constructing a null scalar" - ) - else: - dtype = value.dtype - - if not isinstance(dtype, cudf.core.dtypes.DecimalDtype): - dtype = cudf.dtype(dtype) - - if not valid: - value = NaT if dtype.kind in "mM" else NA - - return value, dtype - - def _sync(self): - """ - If the cache is not synched, copy either the device or host value - to the host or device respectively. 
If cache is valid, do nothing - """ - if self._is_host_value_current and self._is_device_value_current: - return - elif self._is_host_value_current and not self._is_device_value_current: - self._device_value = cudf._lib.scalar.DeviceScalar( - self._host_value, self._host_dtype - ) - elif self._is_device_value_current and not self._is_host_value_current: - self._host_value = self._device_value.value - self._host_dtype = self._host_value.dtype - else: - raise ValueError("Invalid cudf.Scalar") - - def __index__(self): - if self.dtype.kind not in {"u", "i"}: - raise TypeError("Only Integer typed scalars may be used in slices") - return int(self) - - def __int__(self): - return int(self.value) - - def __float__(self): - return float(self.value) - - def __bool__(self): - return bool(self.value) - - def __round__(self, n): - return self._binaryop(n, "__round__") - - # Scalar Unary Operations - def __abs__(self): - return self._scalar_unaop("__abs__") - - def __ceil__(self): - return self._scalar_unaop("__ceil__") - - def __floor__(self): - return self._scalar_unaop("__floor__") - - def __invert__(self): - return self._scalar_unaop("__invert__") - - def __neg__(self): - return self._scalar_unaop("__neg__") - - def __repr__(self): - # str() fixes a numpy bug with NaT - # https://github.com/numpy/numpy/issues/17552 - return ( - f"{self.__class__.__name__}" - f"({str(self.value)}, dtype={self.dtype})" - ) - - def _binop_result_dtype_or_error(self, other, op): - if op in {"__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"}: - return np.bool_ - - out_dtype = get_allowed_combinations_for_operator( - self.dtype, other.dtype, op - ) - - # datetime handling - if out_dtype in {"M", "m"}: - if self.dtype.char in {"M", "m"} and other.dtype.char not in { - "M", - "m", - }: - return self.dtype - if other.dtype.char in {"M", "m"} and self.dtype.char not in { - "M", - "m", - }: - return other.dtype - else: - if ( - op == "__sub__" - and self.dtype.char == other.dtype.char == "M" - ): - res, _ = np.datetime_data(max(self.dtype, other.dtype)) - return cudf.dtype("m8" + f"[{res}]") - return np.result_type(self.dtype, other.dtype) - - return cudf.dtype(out_dtype) - - def _binaryop(self, other, op: str): - if is_scalar(other): - other = to_cudf_compatible_scalar(other) - out_dtype = self._binop_result_dtype_or_error(other, op) - valid = self.is_valid() and ( - isinstance(other, np.generic) or other.is_valid() - ) - if not valid: - return Scalar(None, dtype=out_dtype) - else: - result = self._dispatch_scalar_binop(other, op) - return Scalar(result, dtype=out_dtype) - else: - return NotImplemented - - def _dispatch_scalar_binop(self, other, op): - if isinstance(other, Scalar): - rhs = other.value - else: - rhs = other - lhs = self.value - reflect, op = self._check_reflected_op(op) - if reflect: - lhs, rhs = rhs, lhs - try: - return getattr(operator, op)(lhs, rhs) - except AttributeError: - return getattr(lhs, op)(rhs) - - def _unaop_result_type_or_error(self, op): - if op == "__neg__" and self.dtype == "bool": - raise TypeError( - "Boolean scalars in cuDF do not support" - " negation, use logical not" - ) - - if op in {"__ceil__", "__floor__"}: - if self.dtype.char in "bBhHf?": - return cudf.dtype("float32") - else: - return cudf.dtype("float64") - return self.dtype - - def _scalar_unaop(self, op): - out_dtype = self._unaop_result_type_or_error(op) - if not self.is_valid(): - result = None - else: - result = self._dispatch_scalar_unaop(op) - return Scalar(result, dtype=out_dtype) - - def 
_dispatch_scalar_unaop(self, op): - if op == "__floor__": - return np.floor(self.value) - if op == "__ceil__": - return np.ceil(self.value) - return getattr(self.value, op)() - - def astype(self, dtype): - if self.dtype == dtype: - return self - return Scalar(self.value, dtype) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py deleted file mode 100644 index acd97c2047c..00000000000 --- a/python/cudf/cudf/core/series.py +++ /dev/null @@ -1,5372 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import functools -import inspect -import pickle -import textwrap -import warnings -from collections import abc -from shutil import get_terminal_size -from typing import TYPE_CHECKING, Any, Literal, MutableMapping - -import cupy -import numpy as np -import pandas as pd -from typing_extensions import Self, assert_never - -import cudf -from cudf import _lib as libcudf -from cudf.api.extensions import no_default -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - _is_scalar_or_zero_d_array, - is_dict_like, - is_integer, - is_scalar, -) -from cudf.core import indexing_utils -from cudf.core._compat import PANDAS_LT_300 -from cudf.core.abc import Serializable -from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ( - ColumnBase, - DatetimeColumn, - IntervalColumn, - TimeDeltaColumn, - as_column, -) -from cudf.core.column.categorical import ( - _DEFAULT_CATEGORICAL_VALUE, - CategoricalAccessor as CategoricalAccessor, - CategoricalColumn, -) -from cudf.core.column.column import concat_columns -from cudf.core.column.lists import ListMethods -from cudf.core.column.string import StringMethods -from cudf.core.column.struct import StructMethods -from cudf.core.column_accessor import ColumnAccessor -from cudf.core.groupby.groupby import SeriesGroupBy, groupby_doc_template -from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, ensure_index -from cudf.core.indexed_frame import ( - IndexedFrame, - _FrameIndexer, - _get_label_range_or_mask, - _indices_from_labels, - doc_reset_index_template, -) -from cudf.core.resample import SeriesResampler -from cudf.core.single_column_frame import SingleColumnFrame -from cudf.core.udf.scalar_function import _get_scalar_kernel -from cudf.errors import MixedTypeError -from cudf.utils import docutils -from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import ( - can_convert_to_column, - find_common_type, - is_mixed_with_object_dtype, - to_cudf_compatible_scalar, -) -from cudf.utils.performance_tracking import _performance_tracking - -if TYPE_CHECKING: - import pyarrow as pa - - from cudf._typing import ( - ColumnLike, - DataFrameOrSeries, - NotImplementedType, - ScalarLike, - ) - - -def _format_percentile_names(percentiles): - return [f"{int(x * 100)}%" for x in percentiles] - - -def _describe_numeric(obj, percentiles): - # Helper for Series.describe with numerical data. - data = { - "count": obj.count(), - "mean": obj.mean(), - "std": obj.std(), - "min": obj.min(), - **dict( - zip( - _format_percentile_names(percentiles), - obj.quantile(percentiles).to_numpy(na_value=np.nan).tolist(), - ) - ), - "max": obj.max(), - } - return {k: round(v, 6) for k, v in data.items()} - - -def _describe_timetype(obj, percentiles, typ): - # Common helper for Series.describe with timedelta/timestamp data. 
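# Illustrative note (not part of the original source): these helpers shape
# Series.describe() output for time-like data, e.g.
#     cudf.Series([1, 2, 3], dtype="timedelta64[s]").describe()  # keeps a "std" row
#     cudf.Series(cudf.date_range("2001-01-01", periods=3, freq="D")).describe()  # drops "std"
# Timedelta summaries are rendered through pd.Timedelta, timestamps through pd.Timestamp.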
- data = { - "count": str(obj.count()), - "mean": str(typ(obj.mean())), - "std": "", - "min": str(typ(obj.min())), - **dict( - zip( - _format_percentile_names(percentiles), - obj.quantile(percentiles) - .astype("str") - .to_numpy(na_value=np.nan) - .tolist(), - ) - ), - "max": str(typ(obj.max())), - } - - if typ is pd.Timedelta: - data["std"] = str(obj.std()) - else: - data.pop("std") - return data - - -def _describe_timedelta(obj, percentiles): - # Helper for Series.describe with timedelta data. - return _describe_timetype(obj, percentiles, pd.Timedelta) - - -def _describe_timestamp(obj, percentiles): - # Helper for Series.describe with timestamp data. - return _describe_timetype(obj, percentiles, pd.Timestamp) - - -def _describe_categorical(obj, percentiles): - # Helper for Series.describe with categorical data. - data = { - "count": obj.count(), - "unique": len(obj.unique()), - "top": None, - "freq": None, - } - if data["count"] > 0: - # In case there's a tie, break the tie by sorting the index - # and take the top. - val_counts = obj.value_counts(ascending=False) - tied_val_counts = val_counts[ - val_counts == val_counts.iloc[0] - ].sort_index() - data.update( - { - "top": tied_val_counts.index[0], - "freq": tied_val_counts.iloc[0], - } - ) - return data - - -def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): - """Append a scalar `value` to the end of `col` inplace. - Cast to common type if possible - """ - to_type = find_common_type([type(value), col.dtype]) - val_col = as_column(value, dtype=to_type) - old_col = col.astype(to_type) - - col._mimic_inplace(concat_columns([old_col, val_col]), inplace=True) - - -class _SeriesIlocIndexer(_FrameIndexer): - """ - For integer-location based selection. - """ - - _frame: cudf.Series - - @_performance_tracking - def __getitem__(self, arg): - indexing_spec = indexing_utils.parse_row_iloc_indexer( - indexing_utils.destructure_series_iloc_indexer(arg, self._frame), - len(self._frame), - ) - return self._frame._getitem_preprocessed(indexing_spec) - - @_performance_tracking - def __setitem__(self, key, value): - if isinstance(key, tuple): - key = list(key) - - # coerce value into a scalar or column - if is_scalar(value): - value = to_cudf_compatible_scalar(value) - if ( - self._frame.dtype.kind not in "mM" - and cudf.utils.utils._isnat(value) - and not ( - self._frame.dtype == "object" and isinstance(value, str) - ) - ): - raise MixedTypeError( - f"Cannot assign {value=} to non-datetime/non-timedelta " - "columns" - ) - elif ( - not ( - self._frame.dtype.kind == "f" - or ( - isinstance(self._frame.dtype, cudf.CategoricalDtype) - and self._frame.dtype.categories.dtype.kind == "f" - ) - ) - and isinstance(value, np.floating) - and np.isnan(value) - ): - raise MixedTypeError( - f"Cannot assign {value=} to " - f"non-float dtype={self._frame.dtype}" - ) - elif self._frame.dtype.kind == "b" and not ( - value in {None, cudf.NA} - or isinstance(value, (np.bool_, bool)) - or (isinstance(value, cudf.Scalar) and value.dtype.kind == "b") - ): - raise MixedTypeError( - f"Cannot assign {value=} to " - f"bool dtype={self._frame.dtype}" - ) - elif not ( - isinstance(value, (list, dict)) - and isinstance( - self._frame.dtype, (cudf.ListDtype, cudf.StructDtype) - ) - ): - value = as_column(value) - - if ( - (self._frame.dtype.kind in "uifb" or self._frame.dtype == "object") - and hasattr(value, "dtype") - and value.dtype.kind in "uifb" - ): - # normalize types if necessary: - # In contrast to Column.__setitem__ (which downcasts the value to - # the 
dtype of the column) here we upcast the series to the - # larger data type mimicking pandas - to_dtype = np.result_type(value.dtype, self._frame.dtype) - value = value.astype(to_dtype) - if to_dtype != self._frame.dtype: - # Do not remove until pandas-3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.warn( - f"Setting an item of incompatible dtype is deprecated " - "and will raise in a future error of pandas. " - f"Value '{value}' has dtype incompatible with " - f"{self._frame.dtype}, " - "please explicitly cast to a compatible dtype first.", - FutureWarning, - ) - self._frame._column._mimic_inplace( - self._frame._column.astype(to_dtype), inplace=True - ) - - self._frame._column[key] = value - - -class _SeriesLocIndexer(_FrameIndexer): - """ - Label-based selection - """ - - @_performance_tracking - def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: - if isinstance(arg, pd.MultiIndex): - arg = cudf.from_pandas(arg) - - if isinstance(self._frame.index, cudf.MultiIndex) and not isinstance( - arg, cudf.MultiIndex - ): - if is_scalar(arg): - row_arg = (arg,) - else: - row_arg = arg - result = self._frame.index._get_row_major(self._frame, row_arg) - if ( - isinstance(arg, tuple) - and len(arg) == self._frame.index.nlevels - and not any(isinstance(x, slice) for x in arg) - ): - result = result.iloc[0] - return result - try: - arg = self._loc_to_iloc(arg) - except (TypeError, KeyError, IndexError, ValueError) as err: - raise KeyError(arg) from err - - return self._frame.iloc[arg] - - @_performance_tracking - def __setitem__(self, key, value): - try: - key = self._loc_to_iloc(key) - except KeyError as e: - if ( - is_scalar(key) - and not isinstance(self._frame.index, cudf.MultiIndex) - and is_scalar(value) - ): - idx = self._frame.index - if isinstance(idx, cudf.RangeIndex): - if isinstance(key, int) and (key == idx[-1] + idx.step): - idx_copy = cudf.RangeIndex( - start=idx.start, - stop=idx.stop + idx.step, - step=idx.step, - name=idx.name, - ) - else: - idx_copy = idx._as_int_index() - _append_new_row_inplace(idx_copy._column, key) - else: - # TODO: Modifying index in place is bad because - # our index are immutable, but columns are not (which - # means our index are mutable with internal APIs). - # Get rid of the deep copy once columns too are - # immutable. - idx_copy = idx.copy(deep=True) - _append_new_row_inplace(idx_copy._column, key) - - self._frame._index = idx_copy - _append_new_row_inplace(self._frame._column, value) - return - else: - raise e - if isinstance(value, (pd.Series, cudf.Series)): - value = cudf.Series(value) - value = value._align_to_index(self._frame.index, how="right") - self._frame.iloc[key] = value - - def _loc_to_iloc(self, arg): - if isinstance(arg, tuple) and arg and isinstance(arg[0], slice): - if len(arg) > 1: - raise IndexError("Too many Indexers") - arg = arg[0] - if _is_scalar_or_zero_d_array(arg): - index_dtype = self._frame.index.dtype - warn_msg = ( - "Series.__getitem__ treating keys as positions is deprecated. " - "In a future version, integer keys will always be treated " - "as labels (consistent with DataFrame behavior). 
To access " - "a value by position, use `ser.iloc[pos]`" - ) - if not _is_non_decimal_numeric_dtype(index_dtype) and not ( - isinstance(index_dtype, cudf.CategoricalDtype) - and index_dtype.categories.dtype.kind in "iu" - ): - # TODO: switch to cudf.utils.dtypes.is_integer(arg) - if isinstance(arg, cudf.Scalar) and arg.dtype.kind in "iu": - # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.warn(warn_msg, FutureWarning) - return arg.value - elif is_integer(arg): - # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.warn(warn_msg, FutureWarning) - return arg - try: - if isinstance(self._frame.index, RangeIndex): - indices = self._frame.index._indices_of(arg) - else: - indices = self._frame.index._column.indices_of(arg) - if (n := len(indices)) == 0: - raise KeyError("Label scalar is out of bounds") - elif n == 1: - return indices.element_indexing(0) - else: - return indices - except (TypeError, KeyError, IndexError, ValueError): - raise KeyError("Label scalar is out of bounds") - - elif isinstance(arg, slice): - return _get_label_range_or_mask( - self._frame.index, arg.start, arg.stop, arg.step - ) - elif isinstance(arg, (cudf.MultiIndex, pd.MultiIndex)): - if isinstance(arg, pd.MultiIndex): - arg = cudf.MultiIndex.from_pandas(arg) - - return _indices_from_labels(self._frame, arg) - - else: - arg = cudf.core.series.Series._from_column( - cudf.core.column.as_column(arg) - ) - if arg.dtype.kind == "b": - return arg - else: - indices = _indices_from_labels(self._frame, arg) - if indices.null_count > 0: - raise KeyError("label scalar is out of bound") - return indices - - -class Series(SingleColumnFrame, IndexedFrame, Serializable): - """ - One-dimensional GPU array (including time series). - - Labels need not be unique but must be a hashable type. The object - supports both integer- and label-based indexing and provides a - host of methods for performing operations involving the index. - Statistical methods from ndarray have been overridden to - automatically exclude missing data (currently represented - as null/NaN). - - Operations between Series (`+`, `-`, `/`, `*`, `**`) align - values based on their associated index values, they need - not be the same length. The result index will be the - sorted union of the two indexes. - - ``Series`` objects are used as columns of ``DataFrame``. - - Parameters - ---------- - data : array-like, Iterable, dict, or scalar value - Contains data stored in Series. - - index : array-like or Index (1d) - Values must be hashable and have the same length - as data. Non-unique index values are allowed. Will - default to RangeIndex (0, 1, 2, ..., n) if not provided. - If both a dict and index sequence are used, the index will - override the keys found in the dict. - - dtype : str, :class:`numpy.dtype`, or ExtensionDtype, optional - Data type for the output Series. If not specified, - this will be inferred from data. - - name : str, optional - The name to give to the Series. - - copy : bool, default False - Copy input data. Only affects Series or 1d ndarray input. - - nan_as_null : bool, Default True - If ``None``/``True``, converts ``np.nan`` values to - ``null`` values. - If ``False``, leaves ``np.nan`` values as is. 
- """ - - _accessors: set[Any] = set() - _loc_indexer_type = _SeriesLocIndexer - _iloc_indexer_type = _SeriesIlocIndexer - _groupby = SeriesGroupBy - _resampler = SeriesResampler - - # The `constructor*` properties are used by `dask` (and `dask_cudf`) - @property - def _constructor(self): - return Series - - @property - def _constructor_sliced(self): - raise NotImplementedError( - "_constructor_sliced not supported for Series!" - ) - - @property - def _constructor_expanddim(self): - return cudf.DataFrame - - @classmethod - @_performance_tracking - def from_categorical(cls, categorical, codes=None): - """Creates from a pandas.Categorical - - Parameters - ---------- - categorical : pandas.Categorical - Contains data stored in a pandas Categorical. - - codes : array-like, optional. - The category codes of this categorical. If ``codes`` are - defined, they are used instead of ``categorical.codes`` - - Returns - ------- - Series - A cudf categorical series. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> pd_categorical = pd.Categorical(pd.Series(['a', 'b', 'c', 'a'], dtype='category')) - >>> pd_categorical - ['a', 'b', 'c', 'a'] - Categories (3, object): ['a', 'b', 'c'] - >>> series = cudf.Series.from_categorical(pd_categorical) - >>> series - 0 a - 1 b - 2 c - 3 a - dtype: category - Categories (3, object): ['a', 'b', 'c'] - """ # noqa: E501 - col = as_column(categorical) - if codes is not None: - codes = as_column(codes) - - valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE) - - mask = None - if not valid_codes.all(): - mask = libcudf.transform.bools_to_mask(valid_codes) - col = CategoricalColumn( - data=col.data, - size=codes.size, - dtype=col.dtype, - mask=mask, - children=(codes,), - ) - return Series._from_column(col) - - @classmethod - @_performance_tracking - def from_arrow(cls, array: pa.Array) -> Self: - """Create from PyArrow Array/ChunkedArray. - - Parameters - ---------- - array : PyArrow Array/ChunkedArray - PyArrow Object which has to be converted. - - Raises - ------ - TypeError for invalid input type. - - Returns - ------- - SingleColumnFrame - - Examples - -------- - >>> import cudf - >>> import pyarrow as pa - >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) - 0 a - 1 b - 2 - dtype: object - """ - return cls._from_column(ColumnBase.from_arrow(array)) - - @classmethod - @_performance_tracking - def from_masked_array(cls, data, mask, null_count=None): - """Create a Series with null-mask. - This is equivalent to: - - Series(data).set_mask(mask, null_count=null_count) - - Parameters - ---------- - data : 1D array-like - The values. Null values must not be skipped. They can appear - as garbage values. - mask : 1D array-like - The null-mask. Valid values are marked as ``1``; otherwise ``0``. - The mask bit given the data index ``idx`` is computed as:: - - (mask[idx // 8] >> (idx % 8)) & 1 - null_count : int, optional - The number of null values. - If None, it is calculated automatically. 
- - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 4, None]) - >>> a - 0 1 - 1 2 - 2 3 - 3 - 4 4 - 5 - dtype: int64 - >>> b = cudf.Series([10, 11, 12, 13, 14]) - >>> cudf.Series.from_masked_array(data=b, mask=a._column.mask) - 0 10 - 1 11 - 2 12 - 3 - 4 14 - dtype: int64 - """ - return cls._from_column(as_column(data).set_mask(mask)) - - @_performance_tracking - def __init__( - self, - data=None, - index=None, - dtype=None, - name=None, - copy=False, - nan_as_null=no_default, - ): - if nan_as_null is no_default: - nan_as_null = not cudf.get_option("mode.pandas_compatible") - index_from_data = None - name_from_data = None - if data is None: - data = {} - - if isinstance(data, (pd.Series, pd.Index, BaseIndex, Series)): - if copy and not isinstance(data, (pd.Series, pd.Index)): - data = data.copy(deep=True) - name_from_data = data.name - column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) - if isinstance(data, (pd.Series, Series)): - index_from_data = ensure_index(data.index) - elif isinstance(data, (ColumnAccessor, ColumnBase)): - raise TypeError( - "Use cudf.Series._from_data for constructing a Series from " - "ColumnAccessor or a ColumnBase" - ) - elif isinstance(data, dict): - if not data: - column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) - index_from_data = RangeIndex(0) - else: - column = as_column( - list(data.values()), nan_as_null=nan_as_null, dtype=dtype - ) - index_from_data = cudf.Index(list(data.keys())) - else: - # Using `getattr_static` to check if - # `data` is on device memory and perform - # a deep copy later. This is different - # from `hasattr` because, it doesn't - # invoke the property we are looking - # for and the latter actually invokes - # the property, which in this case could - # be expensive or mark a buffer as - # unspillable. - has_cai = ( - type( - inspect.getattr_static( - data, "__cuda_array_interface__", None - ) - ) - is property - ) - column = as_column( - data, - nan_as_null=nan_as_null, - dtype=dtype, - length=len(index) if index is not None else None, - ) - if copy and has_cai: - column = column.copy(deep=True) - - assert isinstance(column, ColumnBase) - - if dtype is not None: - column = column.astype(dtype) - - if name_from_data is not None and name is None: - name = name_from_data - - if index is not None: - index = ensure_index(index) - - if index_from_data is not None: - first_index = index_from_data - second_index = index - elif index is None: - first_index = RangeIndex(len(column)) - second_index = None - else: - first_index = index - second_index = None - - super().__init__({name: column}, index=first_index) - if second_index is not None: - # TODO: This there a better way to do this? 
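# Illustrative note (not part of the original source): this branch is what lets
# construction from an already-indexed object honour an explicit `index`,
# mirroring pandas, e.g.
#     src = cudf.Series([1, 2, 3], index=["a", "b", "c"])
#     out = cudf.Series(src, index=["b", "c", "d"])
# yields values [2, 3, <NA>] labelled ["b", "c", "d"].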
- reindexed = self.reindex(index=second_index, copy=False) - self._data = reindexed._data - self._index = second_index - self._check_data_index_length_match() - - @classmethod - @_performance_tracking - def _from_column( - cls, - column: ColumnBase, - *, - name: abc.Hashable = None, - index: BaseIndex | None = None, - ) -> Self: - ca = ColumnAccessor({name: column}, verify=False) - return cls._from_data(ca, index=index) - - @classmethod - @_performance_tracking - def _from_data( - cls, - data: MutableMapping, - index: BaseIndex | None = None, - name: Any = no_default, - ) -> Series: - out = super()._from_data(data=data, index=index) - if name is not no_default: - out.name = name - return out - - @_performance_tracking - def _from_data_like_self(self, data: MutableMapping): - out = super()._from_data_like_self(data) - out.name = self.name - return out - - @_performance_tracking - def __contains__(self, item): - return item in self.index - - @classmethod - @_performance_tracking - def from_pandas(cls, s: pd.Series, nan_as_null=no_default): - """ - Convert from a Pandas Series. - - Parameters - ---------- - s : Pandas Series object - A Pandas Series object which has to be converted - to cuDF Series. - nan_as_null : bool, Default None - If ``None``/``True``, converts ``np.nan`` values to - ``null`` values. - If ``False``, leaves ``np.nan`` values as is. - - Raises - ------ - TypeError for invalid input type. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> import numpy as np - >>> data = [10, 20, 30, np.nan] - >>> pds = pd.Series(data, dtype='float64') - >>> cudf.Series.from_pandas(pds) - 0 10.0 - 1 20.0 - 2 30.0 - 3 - dtype: float64 - >>> cudf.Series.from_pandas(pds, nan_as_null=False) - 0 10.0 - 1 20.0 - 2 30.0 - 3 NaN - dtype: float64 - """ - if nan_as_null is no_default: - nan_as_null = ( - False if cudf.get_option("mode.pandas_compatible") else None - ) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - result = cls(s, nan_as_null=nan_as_null) - return result - - @property # type: ignore - @_performance_tracking - def is_unique(self): - """Return boolean if values in the object are unique. - - Returns - ------- - bool - """ - return self._column.is_unique - - @property # type: ignore - @_performance_tracking - def dt(self): - """ - Accessor object for datetime-like properties of the Series values. - - Examples - -------- - >>> s = cudf.Series(cudf.date_range( - ... start='2001-02-03 12:00:00', - ... end='2001-02-03 14:00:00', - ... freq='1H')) - >>> s.dt.hour - 0 12 - 1 13 - 2 14 - dtype: int16 - >>> s.dt.second - 0 0 - 1 0 - 2 0 - dtype: int16 - >>> s.dt.day - 0 3 - 1 3 - 2 3 - dtype: int16 - - Returns - ------- - A Series indexed like the original Series. - - Raises - ------ - TypeError if the Series does not contain datetimelike values. - """ - if self.dtype.kind == "M": - return DatetimeProperties(self) - elif self.dtype.kind == "m": - return TimedeltaProperties(self) - else: - raise AttributeError( - "Can only use .dt accessor with datetimelike values" - ) - - @property # type: ignore - @_performance_tracking - def hasnans(self): - """ - Return True if there are any NaNs or nulls. - - Returns - ------- - out : bool - If Series has at least one NaN or null value, return True, - if not return False. 
- - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> series = cudf.Series([1, 2, np.nan, 3, 4], nan_as_null=False) - >>> series - 0 1.0 - 1 2.0 - 2 NaN - 3 3.0 - 4 4.0 - dtype: float64 - >>> series.hasnans - True - - `hasnans` returns `True` for the presence of any `NA` values: - - >>> series = cudf.Series([1, 2, 3, None, 4]) - >>> series - 0 1 - 1 2 - 2 3 - 3 - 4 4 - dtype: int64 - >>> series.hasnans - True - """ - return self._column.has_nulls(include_nan=True) - - @_performance_tracking - def serialize(self): - header, frames = super().serialize() - - header["index"], index_frames = self.index.serialize() - header["index_frame_count"] = len(index_frames) - # For backwards compatibility with older versions of cuDF, index - # columns are placed before data columns. - frames = index_frames + frames - - return header, frames - - @classmethod - @_performance_tracking - def deserialize(cls, header, frames): - index_nframes = header["index_frame_count"] - obj = super().deserialize( - header, frames[header["index_frame_count"] :] - ) - - idx_typ = pickle.loads(header["index"]["type-serialized"]) - index = idx_typ.deserialize(header["index"], frames[:index_nframes]) - obj.index = index - - return obj - - @_performance_tracking - def drop( - self, - labels=None, - axis=0, - index=None, - columns=None, - level=None, - inplace=False, - errors="raise", - ): - if axis == 1: - raise ValueError("No axis named 1 for object type Series") - # Ignore columns for Series - if columns is not None: - columns = [] - return super().drop( - labels, axis, index, columns, level, inplace, errors - ) - - def tolist(self): # noqa: D102 - raise TypeError( - "cuDF does not support conversion to host memory " - "via the `tolist()` method. Consider using " - "`.to_arrow().to_pylist()` to construct a Python list." - ) - - to_list = tolist - - @_performance_tracking - def to_dict(self, into: type[dict] = dict) -> dict: - """ - Convert Series to {label -> value} dict or dict-like object. - - Parameters - ---------- - into : class, default dict - The collections.abc.Mapping subclass to use as the return - object. Can be the actual class or an empty - instance of the mapping type you want. If you want a - collections.defaultdict, you must pass it initialized. - - Returns - ------- - collections.abc.Mapping - Key-value representation of Series. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 2, 3, 4]) - >>> s - 0 1 - 1 2 - 2 3 - 3 4 - dtype: int64 - >>> s.to_dict() - {0: 1, 1: 2, 2: 3, 3: 4} - >>> from collections import OrderedDict, defaultdict - >>> s.to_dict(OrderedDict) # doctest: +SKIP - OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)]) - >>> dd = defaultdict(list) - >>> s.to_dict(dd) - defaultdict(, {0: 1, 1: 2, 2: 3, 3: 4}) - """ - return self.to_pandas().to_dict(into=into) - - @_performance_tracking - def reindex( - self, - index=None, - *, - axis=None, - method: str | None = None, - copy: bool = True, - level=None, - fill_value: ScalarLike | None = None, - limit: int | None = None, - tolerance=None, - ): - """ - Conform Series to new index. - - Parameters - ---------- - index : Index, Series-convertible, default None - New labels / index to conform to, - should be specified using keywords. - axis: int, default None - Unused. - method: Not Supported - copy : boolean, default True - level: Not Supported - fill_value : Value to use for missing values. - Defaults to ``NA``, but can be any "compatible" value. 
- limit: Not Supported - tolerance: Not Supported - - Returns - ------- - Series with changed index. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd']) - >>> series - a 10 - b 20 - c 30 - d 40 - dtype: int64 - >>> series.reindex(['a', 'b', 'y', 'z']) - a 10 - b 20 - y - z - dtype: int64 - - .. pandas-compat:: - :meth:`pandas.Series.reindex` - - Note: One difference from Pandas is that ``NA`` is used for rows - that do not match, rather than ``NaN``. One side effect of this is - that the series retains an integer dtype in cuDF - where it is cast to float in Pandas. - - """ - if index is None: - index = self.index - if fill_value is None: - fill_value = cudf.NA - - name = self.name or 0 - series = self._reindex( - deep=copy, - dtypes={name: self.dtype}, - index=index, - column_names=[name], - inplace=False, - fill_value=fill_value, - level=level, - method=method, - limit=limit, - tolerance=tolerance, - ) - series.name = self.name - return series - - @_performance_tracking - @docutils.doc_apply( - doc_reset_index_template.format( - klass="Series", - argument=""" - name : object, optional - The name to use for the column containing the original Series - values. Uses self.name by default. This argument is ignored when - ``drop`` is True.""", - return_type="Series or DataFrame or None", - return_doc=""" For Series, When drop is False (the default), a DataFrame - is returned. The newly created columns will come first in the - DataFrame, followed by the original Series values. When `drop` is - True, a `Series` is returned. In either case, if ``inplace=True``, - no value is returned. -""", # noqa: E501 - example=""" - >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) - >>> series - 10 a - 11 b - 12 c - 13 d - dtype: object - >>> series.reset_index() - index 0 - 0 10 a - 1 11 b - 2 12 c - 3 13 d - >>> series.reset_index(drop=True) - 0 a - 1 b - 2 c - 3 d - dtype: object - - You can also use ``reset_index`` with MultiIndex. - - >>> s2 = cudf.Series( - ... range(4), name='foo', - ... index=cudf.MultiIndex.from_tuples([ - ... ('bar', 'one'), ('bar', 'two'), - ... ('baz', 'one'), ('baz', 'two')], - ... names=['a', 'b'] - ... 
)) - >>> s2 - a b - bar one 0 - two 1 - baz one 2 - two 3 - Name: foo, dtype: int64 - >>> s2.reset_index(level='a') - a foo - b - one bar 0 - two bar 1 - one baz 2 - two baz 3 -""", - ) - ) - def reset_index( - self, - level=None, - drop=False, - name=no_default, - inplace=False, - allow_duplicates=False, - ): - if not drop and inplace: - raise TypeError( - "Cannot reset_index inplace on a Series " - "to create a DataFrame" - ) - data, index = self._reset_index( - level=level, drop=drop, allow_duplicates=allow_duplicates - ) - if not drop: - if name is no_default: - name = 0 if self.name is None else self.name - data[name] = data.pop(self.name) - return self._constructor_expanddim._from_data(data, index) - # For ``name`` behavior, see: - # https://github.com/pandas-dev/pandas/issues/44575 - # ``name`` has to be ignored when `drop=True` - return self._mimic_inplace( - Series._from_data(data, index, self.name), - inplace=inplace, - ) - - @_performance_tracking - def to_frame(self, name: abc.Hashable = no_default) -> cudf.DataFrame: - """Convert Series into a DataFrame - - Parameters - ---------- - name : str, default None - Name to be used for the column - - Returns - ------- - DataFrame - cudf DataFrame - - Examples - -------- - >>> import cudf - >>> series = cudf.Series(['a', 'b', 'c', None, 'd'], name='sample', index=[10, 11, 12, 13, 15]) - >>> series - 10 a - 11 b - 12 c - 13 - 15 d - Name: sample, dtype: object - >>> series.to_frame() - sample - 10 a - 11 b - 12 c - 13 - 15 d - """ # noqa: E501 - return self._to_frame(name=name, index=self.index) - - @_performance_tracking - def memory_usage(self, index=True, deep=False): - return self._column.memory_usage + ( - self.index.memory_usage() if index else 0 - ) - - @_performance_tracking - def __array_function__(self, func, types, args, kwargs): - if "out" in kwargs or not all(issubclass(t, Series) for t in types): - return NotImplemented - - try: - # Apply a Series method if one exists. - if cudf_func := getattr(Series, func.__name__, None): - result = cudf_func(*args, **kwargs) - if func.__name__ == "unique": - # NumPy expects a sorted result for `unique`, which is not - # guaranteed by cudf.Series.unique. - result = result.sort_values() - return result - - # Assume that cupy subpackages match numpy and search the - # corresponding cupy submodule based on the func's __module__. - numpy_submodule = func.__module__.split(".")[1:] - cupy_func = cupy - for name in (*numpy_submodule, func.__name__): - cupy_func = getattr(cupy_func, name, None) - - # Handle case if cupy does not implement the function or just - # aliases the numpy function. - if not cupy_func or cupy_func is func: - return NotImplemented - - # For now just fail on cases with mismatched indices. There is - # almost certainly no general solution for all array functions. - index = args[0].index - if not all(s.index.equals(index) for s in args): - return NotImplemented - out = cupy_func(*(s.values for s in args), **kwargs) - - # Return (host) scalar values immediately. - if not isinstance(out, cupy.ndarray): - return out - - # 0D array (scalar) - if out.ndim == 0: - return to_cudf_compatible_scalar(out) - # 1D array - elif ( - # Only allow 1D arrays - ((out.ndim == 1) or (out.ndim == 2 and out.shape[1] == 1)) - # If we have an index, it must be the same length as the - # output for cupy dispatching to be well-defined. - and len(index) == len(out) - ): - return Series(out, index=index) - except Exception: - # The rare instance where a "silent" failure is preferable. 
Except - # in the (highly unlikely) case that some other library - # interoperates with cudf objects, the result will be that numpy - # raises a TypeError indicating that the operation is not - # implemented, which is much friendlier than an arbitrary internal - # cudf error. - pass - - return NotImplemented - - @_performance_tracking - def map(self, arg, na_action=None) -> "Series": - """ - Map values of Series according to input correspondence. - - Used for substituting each value in a Series with another value, - that may be derived from a function, a ``dict`` or - a :class:`Series`. - - Parameters - ---------- - arg : function, collections.abc.Mapping subclass or Series - Mapping correspondence. - na_action : {None, 'ignore'}, default None - If 'ignore', propagate NaN values, without passing them to the - mapping correspondence. - - Returns - ------- - Series - Same index as caller. - - Examples - -------- - >>> s = cudf.Series(['cat', 'dog', np.nan, 'rabbit']) - >>> s - 0 cat - 1 dog - 2 - 3 rabbit - dtype: object - - ``map`` accepts a ``dict`` or a ``Series``. Values that are not found - in the ``dict`` are converted to ``NaN``, default values in dicts are - currently not supported.: - - >>> s.map({'cat': 'kitten', 'dog': 'puppy'}) - 0 kitten - 1 puppy - 2 - 3 - dtype: object - - It also accepts numeric functions: - - >>> s = cudf.Series([1, 2, 3, 4, np.nan]) - >>> s.map(lambda x: x ** 2) - 0 1 - 1 4 - 2 9 - 3 16 - 4 - dtype: int64 - - .. pandas-compat:: - :meth:`pandas.Series.map` - - Please note map currently only supports fixed-width numeric - type functions. - """ - if isinstance(arg, dict): - if hasattr(arg, "__missing__"): - raise NotImplementedError( - "default values in dicts are currently not supported." - ) - lhs = cudf.DataFrame( - {"x": self, "orig_order": as_column(range(len(self)))} - ) - rhs = cudf.DataFrame( - { - "x": arg.keys(), - "s": arg.values(), - "bool": as_column(True, length=len(arg), dtype=self.dtype), - } - ) - res = lhs.merge(rhs, on="x", how="left").sort_values( - by="orig_order" - ) - result = res["s"] - result.name = self.name - result.index = self.index - elif isinstance(arg, cudf.Series): - if not arg.index.is_unique: - raise ValueError( - "Reindexing only valid with" - " uniquely valued Index objects" - ) - lhs = cudf.DataFrame( - {"x": self, "orig_order": as_column(range(len(self)))} - ) - rhs = cudf.DataFrame( - { - "x": arg.keys(), - "s": arg, - "bool": as_column(True, length=len(arg), dtype=self.dtype), - } - ) - res = lhs.merge(rhs, on="x", how="left").sort_values( - by="orig_order" - ) - result = res["s"] - result.name = self.name - result.index = self.index - else: - result = self.apply(arg) - return result - - def _getitem_preprocessed( - self, - spec: indexing_utils.IndexingSpec, - ) -> Self | ScalarLike: - """Get subset of entries given structured data - - Parameters - ---------- - spec - Indexing specification - - Returns - ------- - Subsetted Series or else scalar (if a scalar entry is - requested) - - Notes - ----- - This function performs no bounds-checking or massaging of the - inputs. 
- """ - if isinstance(spec, indexing_utils.MapIndexer): - return self._gather(spec.key, keep_index=True) - elif isinstance(spec, indexing_utils.MaskIndexer): - return self._apply_boolean_mask(spec.key, keep_index=True) - elif isinstance(spec, indexing_utils.SliceIndexer): - return self._slice(spec.key) - elif isinstance(spec, indexing_utils.ScalarIndexer): - return self._gather( - spec.key, keep_index=False - )._column.element_indexing(0) - elif isinstance(spec, indexing_utils.EmptyIndexer): - return self._empty_like(keep_index=True) - assert_never(spec) - - @_performance_tracking - def __getitem__(self, arg): - if isinstance(arg, slice): - return self.iloc[arg] - else: - return self.loc[arg] - - iteritems = SingleColumnFrame.__iter__ - - items = SingleColumnFrame.__iter__ - - @_performance_tracking - def __setitem__(self, key, value): - if isinstance(key, slice): - self.iloc[key] = value - else: - self.loc[key] = value - - def __repr__(self): - _, height = get_terminal_size() - max_rows = ( - height - if pd.get_option("display.max_rows") == 0 - else pd.get_option("display.max_rows") - ) - if max_rows not in (0, None) and len(self) > max_rows: - top = self.head(int(max_rows / 2 + 1)) - bottom = self.tail(int(max_rows / 2 + 1)) - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - preprocess = cudf.concat([top, bottom]) - else: - preprocess = self.copy() - preprocess.index = preprocess.index._clean_nulls_from_index() - if ( - preprocess.nullable - and not isinstance( - preprocess.dtype, - ( - cudf.CategoricalDtype, - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype, - ), - ) - ) or preprocess.dtype.kind == "m": - fill_value = ( - str(cudf.NaT) - if preprocess.dtype.kind in "mM" - else str(cudf.NA) - ) - output = repr( - preprocess.astype("str").fillna(fill_value).to_pandas() - ) - elif isinstance(preprocess.dtype, cudf.CategoricalDtype): - min_rows = ( - height - if pd.get_option("display.min_rows") == 0 - else pd.get_option("display.min_rows") - ) - show_dimensions = pd.get_option("display.show_dimensions") - if preprocess.dtype.categories.dtype.kind == "f": - pd_series = ( - preprocess.astype("str") - .to_pandas() - .astype( - dtype=pd.CategoricalDtype( - categories=preprocess.dtype.categories.astype( - "str" - ).to_pandas(), - ordered=preprocess.dtype.ordered, - ) - ) - ) - else: - pd_series = preprocess.to_pandas() - output = pd_series.to_string( - name=self.name, - dtype=self.dtype, - min_rows=min_rows, - max_rows=max_rows, - length=show_dimensions, - na_rep=str(cudf.NA), - ) - else: - output = repr(preprocess.to_pandas()) - - lines = output.split("\n") - if isinstance(preprocess.dtype, cudf.CategoricalDtype): - category_memory = lines[-1] - if preprocess.dtype.categories.dtype.kind == "f": - category_memory = category_memory.replace("'", "").split(": ") - category_memory = ( - category_memory[0].replace( - "object", preprocess.dtype.categories.dtype.name - ) - + ": " - + category_memory[1] - ) - lines = lines[:-1] - if len(lines) > 1: - if lines[-1].startswith("Name: "): - lines = lines[:-1] - lines.append("Name: %s" % str(self.name)) - if len(self) > len(preprocess): - lines[-1] = lines[-1] + ", Length: %d" % len(self) - lines[-1] = lines[-1] + ", " - elif lines[-1].startswith("Length: "): - lines = lines[:-1] - lines.append("Length: %d" % len(self)) - lines[-1] = lines[-1] + ", " - else: - lines = lines[:-1] - lines[-1] = lines[-1] + "\n" - lines[-1] = lines[-1] + "dtype: %s" % self.dtype - else: - lines = output.split(",") - 
lines[-1] = " dtype: %s)" % self.dtype - return ",".join(lines) - if isinstance(preprocess._column, cudf.core.column.CategoricalColumn): - lines.append(category_memory) - return "\n".join(lines) - - def _make_operands_and_index_for_binop( - self, - other: Any, - fn: str, - fill_value: Any = None, - reflect: bool = False, - can_reindex: bool = False, - ) -> tuple[ - dict[str | None, tuple[ColumnBase, Any, bool, Any]] - | NotImplementedType, - BaseIndex | None, - bool, - ]: - # Specialize binops to align indices. - if isinstance(other, Series): - if ( - not can_reindex - and fn in cudf.utils.utils._EQUALITY_OPS - and not self.index.equals(other.index) - ): - raise ValueError( - "Can only compare identically-labeled Series objects" - ) - lhs, other = _align_indices([self, other], allow_non_unique=True) - else: - lhs = self - - try: - can_use_self_column_name = cudf.utils.utils._is_same_name( - self.name, other.name - ) - except AttributeError: - can_use_self_column_name = False - - operands = lhs._make_operands_for_binop(other, fill_value, reflect) - return operands, lhs.index, can_use_self_column_name - - @copy_docstring(CategoricalAccessor) # type: ignore - @property - @_performance_tracking - def cat(self): - return CategoricalAccessor(parent=self) - - @copy_docstring(StringMethods) # type: ignore - @property - @_performance_tracking - def str(self): - return StringMethods(parent=self) - - @copy_docstring(ListMethods) # type: ignore - @property - @_performance_tracking - def list(self): - return ListMethods(parent=self) - - @copy_docstring(StructMethods) # type: ignore - @property - @_performance_tracking - def struct(self): - return StructMethods(parent=self) - - @property # type: ignore - @_performance_tracking - def dtype(self): - """The dtype of the Series.""" - return self._column.dtype - - @classmethod - @_performance_tracking - def _concat(cls, objs, axis=0, index: bool = True): - # Concatenate index if not provided - if index is True: - if isinstance(objs[0].index, cudf.MultiIndex): - result_index = cudf.MultiIndex._concat([o.index for o in objs]) - else: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - result_index = cudf.core.index.Index._concat( - [o.index for o in objs] - ) - elif index is False: - result_index = None - else: - raise ValueError(f"{index=} must be a bool") - - names = {obj.name for obj in objs} - if len(names) == 1: - [name] = names - else: - name = None - - if len(objs) > 1: - dtype_mismatch = False - for obj in objs[1:]: - if ( - obj.null_count == len(obj) - or len(obj) == 0 - or isinstance( - obj._column, cudf.core.column.CategoricalColumn - ) - or isinstance( - objs[0]._column, cudf.core.column.CategoricalColumn - ) - ): - continue - - if ( - not dtype_mismatch - and ( - not isinstance( - objs[0]._column, cudf.core.column.CategoricalColumn - ) - and not isinstance( - obj._column, cudf.core.column.CategoricalColumn - ) - ) - and objs[0].dtype != obj.dtype - ): - dtype_mismatch = True - - if is_mixed_with_object_dtype(objs[0], obj): - raise TypeError( - "cudf does not support mixed types, please type-cast " - "both series to same dtypes." 
- ) - - if dtype_mismatch: - common_dtype = find_common_type([obj.dtype for obj in objs]) - objs = [obj.astype(common_dtype) for obj in objs] - - col = concat_columns([o._column for o in objs]) - - if len(objs): - col = col._with_type_metadata(objs[0].dtype) - - return cls._from_column(col, name=name, index=result_index) - - @property # type: ignore - @_performance_tracking - def valid_count(self): - """Number of non-null values""" - return len(self) - self._column.null_count - - @property # type: ignore - @_performance_tracking - def null_count(self): - """Number of null values""" - return self._column.null_count - - @property # type: ignore - @_performance_tracking - def nullable(self): - """A boolean indicating whether a null-mask is needed""" - return self._column.nullable - - @property # type: ignore - @_performance_tracking - def has_nulls(self): - """ - Indicator whether Series contains null values. - - Returns - ------- - out : bool - If Series has at least one null value, return True, if not - return False. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2, None, 3, 4]) - >>> series - 0 1 - 1 2 - 2 - 3 3 - 4 4 - dtype: int64 - >>> series.has_nulls - True - >>> series.dropna().has_nulls - False - """ - return self._column.has_nulls() - - @_performance_tracking - def dropna( - self, axis=0, inplace=False, how=None, ignore_index: bool = False - ): - """ - Return a Series with null values removed. - - Parameters - ---------- - axis : {0 or 'index'}, default 0 - There is only one axis to drop values from. - inplace : bool, default False - If True, do operation inplace and return None. - how : str, optional - Not in use. Kept for compatibility. - ignore_index : bool, default ``False`` - If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. - - Returns - ------- - Series - Series with null entries dropped from it. - - See Also - -------- - Series.isna : Indicate null values. - - Series.notna : Indicate non-null values. - - Series.fillna : Replace null values. - - cudf.DataFrame.dropna : Drop rows or columns which - contain null values. - - cudf.Index.dropna : Drop null indices. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 2, None]) - >>> ser - 0 1 - 1 2 - 2 - dtype: int64 - - Drop null values from a Series. - - >>> ser.dropna() - 0 1 - 1 2 - dtype: int64 - - Keep the Series with valid entries in the same variable. - - >>> ser.dropna(inplace=True) - >>> ser - 0 1 - 1 2 - dtype: int64 - - Empty strings are not considered null values. - `None` is considered a null value. - - >>> ser = cudf.Series(['', None, 'abc']) - >>> ser - 0 - 1 - 2 abc - dtype: object - >>> ser.dropna() - 0 - 2 abc - dtype: object - """ - if axis not in (0, "index"): - raise ValueError( - "Series.dropna supports only one axis to drop values from" - ) - - result = super().dropna(axis=axis) - - if ignore_index: - result.index = RangeIndex(len(result)) - - return self._mimic_inplace(result, inplace=inplace) - - @_performance_tracking - def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): - """ - Return Series with duplicate values removed. - - Parameters - ---------- - keep : {'first', 'last', ``False``}, default 'first' - Method to handle dropping duplicates: - - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. - - inplace : bool, default ``False`` - If ``True``, performs operation inplace and returns None. 
- - Returns - ------- - Series or None - Series with duplicates dropped or None if ``inplace=True``. - - Examples - -------- - >>> s = cudf.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], - ... name='animal') - >>> s - 0 lama - 1 cow - 2 lama - 3 beetle - 4 lama - 5 hippo - Name: animal, dtype: object - - With the `keep` parameter, the selection behavior of duplicated - values can be changed. The value 'first' keeps the first - occurrence for each set of duplicated entries. - The default value of keep is 'first'. Note that order of - the rows being returned is not guaranteed - to be sorted. - - >>> s.drop_duplicates() - 0 lama - 1 cow - 3 beetle - 5 hippo - Name: animal, dtype: object - - The value 'last' for parameter `keep` keeps the last occurrence - for each set of duplicated entries. - - >>> s.drop_duplicates(keep='last') - 1 cow - 3 beetle - 4 lama - 5 hippo - Name: animal, dtype: object - - The value `False` for parameter `keep` discards all sets - of duplicated entries. Setting the value of 'inplace' to - `True` performs the operation inplace and returns `None`. - - >>> s.drop_duplicates(keep=False, inplace=True) - >>> s - 1 cow - 3 beetle - 5 hippo - Name: animal, dtype: object - """ - result = super().drop_duplicates(keep=keep, ignore_index=ignore_index) - - return self._mimic_inplace(result, inplace=inplace) - - @_performance_tracking - def fillna( - self, value=None, method=None, axis=None, inplace=False, limit=None - ): - if isinstance(value, pd.Series): - value = Series.from_pandas(value) - elif isinstance(value, abc.Mapping): - value = Series(value) - if isinstance(value, cudf.Series): - if not self.index.equals(value.index): - value = value.reindex(self.index) - value = {self.name: value._column} - return super().fillna( - value=value, method=method, axis=axis, inplace=inplace, limit=limit - ) - - def between(self, left, right, inclusive="both") -> Series: - """ - Return boolean Series equivalent to left <= series <= right. - - This function returns a boolean vector containing `True` wherever the - corresponding Series element is between the boundary values `left` and - `right`. NA values are treated as `False`. - - Parameters - ---------- - left : scalar or list-like - Left boundary. - right : scalar or list-like - Right boundary. - inclusive : {"both", "neither", "left", "right"} - Include boundaries. Whether to set each bound as closed or open. - - Returns - ------- - Series - Series representing whether each element is between left and - right (inclusive). - - See Also - -------- - Series.gt : Greater than of series and other. - Series.lt : Less than of series and other. 
- - Notes - ----- - This function is equivalent to ``(left <= ser) & (ser <= right)`` - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([2, 0, 4, 8, None]) - - Boundary values are included by default: - - >>> s.between(1, 4) - 0 True - 1 False - 2 True - 3 False - 4 - dtype: bool - - With `inclusive` set to ``"neither"`` boundary values are excluded: - - >>> s.between(1, 4, inclusive="neither") - 0 True - 1 False - 2 False - 3 False - 4 - dtype: bool - - `left` and `right` can be any scalar value: - - >>> s = cudf.Series(['Alice', 'Bob', 'Carol', 'Eve']) - >>> s.between('Anna', 'Daniel') - 0 False - 1 True - 2 True - 3 False - dtype: bool - """ - left_operand = left if is_scalar(left) else as_column(left) - right_operand = right if is_scalar(right) else as_column(right) - - if inclusive == "both": - lmask = self._column >= left_operand - rmask = self._column <= right_operand - elif inclusive == "left": - lmask = self._column >= left_operand - rmask = self._column < right_operand - elif inclusive == "right": - lmask = self._column > left_operand - rmask = self._column <= right_operand - elif inclusive == "neither": - lmask = self._column > left_operand - rmask = self._column < right_operand - else: - raise ValueError( - "Inclusive has to be either string of 'both', " - "'left', 'right', or 'neither'." - ) - return self._from_column( - lmask & rmask, name=self.name, index=self.index - ) - - @_performance_tracking - def all(self, axis=0, bool_only=None, skipna=True, **kwargs): - if bool_only not in (None, True): - raise NotImplementedError( - "The bool_only parameter is not supported for Series." - ) - return super().all(axis, skipna, **kwargs) - - @_performance_tracking - def any(self, axis=0, bool_only=None, skipna=True, **kwargs): - if bool_only not in (None, True): - raise NotImplementedError( - "The bool_only parameter is not supported for Series." - ) - return super().any(axis, skipna, **kwargs) - - @_performance_tracking - def to_pandas( - self, - *, - index: bool = True, - nullable: bool = False, - arrow_type: bool = False, - ) -> pd.Series: - """ - Convert to a pandas Series. - - Parameters - ---------- - index : Boolean, Default True - If ``index`` is ``True``, converts the index of cudf.Series - and sets it to the pandas.Series. If ``index`` is ``False``, - no index conversion is performed and pandas.Series will assign - a default index. - nullable : Boolean, Default False - If ``nullable`` is ``True``, the resulting series will be - having a corresponding nullable Pandas dtype. - If there is no corresponding nullable Pandas dtype present, - the resulting dtype will be a regular pandas dtype. - If ``nullable`` is ``False``, the resulting series will - either convert null values to ``np.nan`` or ``None`` - depending on the dtype. 
- arrow_type : bool, Default False
- Return the Series with a ``pandas.ArrowDtype``
-
- Returns
- -------
- out : pandas Series
-
- Notes
- -----
- nullable and arrow_type cannot both be set to ``True``
-
- Examples
- --------
- >>> import cudf
- >>> ser = cudf.Series([-3, 2, 0])
- >>> pds = ser.to_pandas()
- >>> pds
- 0 -3
- 1 2
- 2 0
- dtype: int64
- >>> type(pds)
- <class 'pandas.core.series.Series'>
-
- ``nullable=True`` converts the result to pandas nullable types:
-
- >>> ser = cudf.Series([10, 20, None, 30])
- >>> ser
- 0 10
- 1 20
- 2 <NA>
- 3 30
- dtype: int64
- >>> ser.to_pandas(nullable=True)
- 0 10
- 1 20
- 2 <NA>
- 3 30
- dtype: Int64
- >>> ser.to_pandas(nullable=False)
- 0 10.0
- 1 20.0
- 2 NaN
- 3 30.0
- dtype: float64
-
- ``arrow_type=True`` converts the result to ``pandas.ArrowDtype``:
-
- >>> ser.to_pandas(arrow_type=True)
- 0 10
- 1 20
- 2 <NA>
- 3 30
- dtype: int64[pyarrow]
- """
- if index is True:
- index = self.index.to_pandas()
- else:
- index = None # type: ignore[assignment]
- return pd.Series(
- self._column.to_pandas(nullable=nullable, arrow_type=arrow_type),
- index=index,
- name=self.name,
- )
-
- @property # type: ignore
- @_performance_tracking
- def data(self):
- """The gpu buffer for the data
-
- Returns
- -------
- out : The GPU buffer of the Series.
-
- Examples
- --------
- >>> import cudf
- >>> series = cudf.Series([1, 2, 3, 4])
- >>> series
- 0 1
- 1 2
- 2 3
- 3 4
- dtype: int64
- >>> np.array(series.data.memoryview())
- array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0,
- 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)
- """ # noqa: E501
- return self._column.data
-
- @property # type: ignore
- @_performance_tracking
- def nullmask(self):
- """The gpu buffer for the null-mask"""
- return cudf.Series(self._column.nullmask)
-
- @_performance_tracking
- def astype(
- self,
- dtype,
- copy: bool = False,
- errors: Literal["raise", "ignore"] = "raise",
- ):
- if is_dict_like(dtype):
- if len(dtype) > 1 or self.name not in dtype:
- raise KeyError(
- "Only the Series name can be used for the key in Series "
- "dtype mappings."
- )
- else:
- dtype = {self.name: dtype}
- return super().astype(dtype, copy, errors)
-
- @_performance_tracking
- def sort_index(
- self,
- axis=0,
- level=None,
- ascending=True,
- inplace=False,
- kind=None,
- na_position="last",
- sort_remaining=True,
- ignore_index=False,
- key=None,
- ):
- if axis not in (0, "index"):
- raise ValueError("Only axis=0 is valid for Series.")
- return super().sort_index(
- axis=axis,
- level=level,
- ascending=ascending,
- inplace=inplace,
- kind=kind,
- na_position=na_position,
- sort_remaining=sort_remaining,
- ignore_index=ignore_index,
- key=key,
- )
-
- @_performance_tracking
- def sort_values(
- self,
- axis=0,
- ascending=True,
- inplace=False,
- kind="quicksort",
- na_position="last",
- ignore_index=False,
- key=None,
- ):
- """Sort by the values along either axis.
-
- Parameters
- ----------
- ascending : bool or list of bool, default True
- Sort ascending vs. descending. Specify list for multiple sort
- orders. If this is a list of bools, must match the length of the
- by.
- na_position : {'first', 'last'}, default 'last'
- 'first' puts nulls at the beginning, 'last' puts nulls at the end
- ignore_index : bool, default False
- If True, index will not be sorted.
- key : callable, optional
- Apply the key function to the values
- before sorting. This is similar to the ``key`` argument in the
- builtin ``sorted`` function, with the notable difference that
- this ``key`` function should be *vectorized*.
It should expect a - ``Series`` and return a Series with the same shape as the input. - It will be applied to each column in `by` independently. - Currently not supported. - - Returns - ------- - Series : Series with sorted values. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 5, 2, 4, 3]) - >>> s.sort_values() - 0 1 - 2 2 - 4 3 - 3 4 - 1 5 - dtype: int64 - - .. pandas-compat:: - :meth:`pandas.Series.sort_values` - - * Support axis='index' only. - * The inplace and kind argument is currently unsupported - """ - return super().sort_values( - by=self.name, - axis=axis, - ascending=ascending, - inplace=inplace, - kind=kind, - na_position=na_position, - ignore_index=ignore_index, - key=key, - ) - - @_performance_tracking - def nlargest(self, n=5, keep="first"): - """Returns a new Series of the *n* largest element. - - Parameters - ---------- - n : int, default 5 - Return this many descending sorted values. - keep : {'first', 'last'}, default 'first' - When there are duplicate values that cannot all fit in a - Series of `n` elements: - - - ``first`` : return the first `n` occurrences in order - of appearance. - - ``last`` : return the last `n` occurrences in reverse - order of appearance. - - Returns - ------- - Series - The `n` largest values in the Series, sorted in decreasing order. - - Examples - -------- - >>> import cudf - >>> countries_population = {"Italy": 59000000, "France": 65000000, - ... "Malta": 434000, "Maldives": 434000, - ... "Brunei": 434000, "Iceland": 337000, - ... "Nauru": 11300, "Tuvalu": 11300, - ... "Anguilla": 11300, "Montserrat": 5200} - >>> series = cudf.Series(countries_population) - >>> series - Italy 59000000 - France 65000000 - Malta 434000 - Maldives 434000 - Brunei 434000 - Iceland 337000 - Nauru 11300 - Tuvalu 11300 - Anguilla 11300 - Montserrat 5200 - dtype: int64 - >>> series.nlargest() - France 65000000 - Italy 59000000 - Malta 434000 - Maldives 434000 - Brunei 434000 - dtype: int64 - >>> series.nlargest(3) - France 65000000 - Italy 59000000 - Malta 434000 - dtype: int64 - >>> series.nlargest(3, keep='last') - France 65000000 - Italy 59000000 - Brunei 434000 - dtype: int64 - """ - return self._n_largest_or_smallest(True, n, [self.name], keep) - - @_performance_tracking - def nsmallest(self, n=5, keep="first"): - """ - Returns a new Series of the *n* smallest element. - - Parameters - ---------- - n : int, default 5 - Return this many ascending sorted values. - keep : {'first', 'last'}, default 'first' - When there are duplicate values that cannot all fit in a - Series of `n` elements: - - - ``first`` : return the first `n` occurrences in order - of appearance. - - ``last`` : return the last `n` occurrences in reverse - order of appearance. - - Returns - ------- - Series - The `n` smallest values in the Series, sorted in increasing order. - - Examples - -------- - >>> import cudf - >>> countries_population = {"Italy": 59000000, "France": 65000000, - ... "Brunei": 434000, "Malta": 434000, - ... "Maldives": 434000, "Iceland": 337000, - ... "Nauru": 11300, "Tuvalu": 11300, - ... "Anguilla": 11300, "Montserrat": 5200} - >>> s = cudf.Series(countries_population) - >>> s - Italy 59000000 - France 65000000 - Brunei 434000 - Malta 434000 - Maldives 434000 - Iceland 337000 - Nauru 11300 - Tuvalu 11300 - Anguilla 11300 - Montserrat 5200 - dtype: int64 - - The `n` smallest elements where ``n=5`` by default. 
- - >>> s.nsmallest() - Montserrat 5200 - Nauru 11300 - Tuvalu 11300 - Anguilla 11300 - Iceland 337000 - dtype: int64 - - The `n` smallest elements where ``n=3``. Default `keep` value is - 'first' so Nauru and Tuvalu will be kept. - - >>> s.nsmallest(3) - Montserrat 5200 - Nauru 11300 - Tuvalu 11300 - dtype: int64 - - The `n` smallest elements where ``n=3`` and keeping the last - duplicates. Anguilla and Tuvalu will be kept since they are the last - with value 11300 based on the index order. - - >>> s.nsmallest(3, keep='last') - Montserrat 5200 - Anguilla 11300 - Tuvalu 11300 - dtype: int64 - """ - return self._n_largest_or_smallest(False, n, [self.name], keep) - - @_performance_tracking - def argsort( - self, - axis=0, - kind="quicksort", - order=None, - ascending=True, - na_position="last", - ) -> Self: - col = as_column( - super().argsort( - axis=axis, - kind=kind, - order=order, - ascending=ascending, - na_position=na_position, - ) - ) - return self._from_data_like_self( - self._data._from_columns_like_self([col]) - ) - - @_performance_tracking - def replace( - self, - to_replace=None, - value=no_default, - inplace=False, - limit=None, - regex=False, - method=no_default, - ): - if is_dict_like(to_replace) and value not in {None, no_default}: - raise ValueError( - "Series.replace cannot use dict-like to_replace and non-None " - "value" - ) - - return super().replace( - to_replace, - value, - inplace=inplace, - limit=limit, - regex=regex, - method=method, - ) - - @_performance_tracking - def update(self, other): - """ - Modify Series in place using values from passed Series. - Uses non-NA values from passed Series to make updates. Aligns - on index. - - Parameters - ---------- - other : Series, or object coercible into Series - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 2, 3]) - >>> s - 0 1 - 1 2 - 2 3 - dtype: int64 - >>> s.update(cudf.Series([4, 5, 6])) - >>> s - 0 4 - 1 5 - 2 6 - dtype: int64 - >>> s = cudf.Series(['a', 'b', 'c']) - >>> s - 0 a - 1 b - 2 c - dtype: object - >>> s.update(cudf.Series(['d', 'e'], index=[0, 2])) - >>> s - 0 d - 1 b - 2 e - dtype: object - >>> s = cudf.Series([1, 2, 3]) - >>> s - 0 1 - 1 2 - 2 3 - dtype: int64 - >>> s.update(cudf.Series([4, 5, 6, 7, 8])) - >>> s - 0 4 - 1 5 - 2 6 - dtype: int64 - - If ``other`` contains NaNs the corresponding values are not updated - in the original Series. - - >>> s = cudf.Series([1.0, 2.0, 3.0]) - >>> s - 0 1.0 - 1 2.0 - 2 3.0 - dtype: float64 - >>> s.update(cudf.Series([4.0, np.nan, 6.0], nan_as_null=False)) - >>> s - 0 4.0 - 1 2.0 - 2 6.0 - dtype: float64 - - ``other`` can also be a non-Series object type - that is coercible into a Series - - >>> s = cudf.Series([1, 2, 3]) - >>> s - 0 1 - 1 2 - 2 3 - dtype: int64 - >>> s.update([4, np.nan, 6]) - >>> s - 0 4 - 1 2 - 2 6 - dtype: int64 - >>> s = cudf.Series([1, 2, 3]) - >>> s - 0 1 - 1 2 - 2 3 - dtype: int64 - >>> s.update({1: 9}) - >>> s - 0 1 - 1 9 - 2 3 - dtype: int64 - """ - - if not isinstance(other, cudf.Series): - other = cudf.Series(other) - - if not self.index.equals(other.index): - other = other.reindex(index=self.index) - mask = other.notna() - - self.mask(mask, other, inplace=True) - - # UDF related - @_performance_tracking - def apply( - self, - func, - convert_dtype=True, - args=(), - by_row: Literal[False, "compat"] = "compat", - **kwargs, - ): - """ - Apply a scalar function to the values of a Series. - Similar to ``pandas.Series.apply``. - - ``apply`` relies on Numba to JIT compile ``func``. 
- Thus the allowed operations within ``func`` are limited to `those - supported by the CUDA Python Numba target - `__. - For more information, see the `cuDF guide to user defined functions - `__. - - Some string functions and methods are supported. Refer to the guide - to UDFs for details. - - Parameters - ---------- - func : function - Scalar Python function to apply. - convert_dtype : bool, default True - In cuDF, this parameter is always True. Because - cuDF does not support arbitrary object dtypes, - the result will always be the common type as determined - by numba based on the function logic and argument types. - See examples for details. - args : tuple - Positional arguments passed to func after the series value. - by_row : False or "compat", default "compat" - If ``"compat"`` and func is a callable, func will be passed each element of - the Series, like ``Series.map``. If func is a list or dict of - callables, will first try to translate each func into pandas methods. If - that doesn't work, will try call to apply again with ``by_row="compat"`` - and if that fails, will call apply again with ``by_row=False`` - (backward compatible). - If False, the func will be passed the whole Series at once. - - ``by_row`` has no effect when ``func`` is a string. - - Currently not implemented. - **kwargs - Not supported - - Returns - ------- - result : Series - The mask and index are preserved. - - Notes - ----- - UDFs are cached in memory to avoid recompilation. The first - call to the UDF will incur compilation overhead. `func` may - call nested functions that are decorated with the decorator - `numba.cuda.jit(device=True)`, otherwise numba will raise a - typing error. - - Examples - -------- - Apply a basic function to a series: - - >>> sr = cudf.Series([1,2,3]) - >>> def f(x): - ... return x + 1 - >>> sr.apply(f) - 0 2 - 1 3 - 2 4 - dtype: int64 - - Apply a basic function to a series with nulls: - - >>> sr = cudf.Series([1,cudf.NA,3]) - >>> def f(x): - ... return x + 1 - >>> sr.apply(f) - 0 2 - 1 - 2 4 - dtype: int64 - - Use a function that does something conditionally, - based on if the value is or is not null: - - >>> sr = cudf.Series([1,cudf.NA,3]) - >>> def f(x): - ... if x is cudf.NA: - ... return 42 - ... else: - ... return x - 1 - >>> sr.apply(f) - 0 0 - 1 42 - 2 2 - dtype: int64 - - Results will be upcast to the common dtype required - as derived from the UDFs logic. Note that this means - the common type will be returned even if such data - is passed that would not result in any values of that - dtype: - - >>> sr = cudf.Series([1,cudf.NA,3]) - >>> def f(x): - ... return x + 1.5 - >>> sr.apply(f) - 0 2.5 - 1 - 2 4.5 - dtype: float64 - - UDFs manipulating string data are allowed, as long as - they neither modify strings in place nor create new strings. - For example, the following UDF is allowed: - - >>> def f(st): - ... if len(st) == 0: - ... return -1 - ... elif st.startswith('a'): - ... return 1 - ... elif 'example' in st: - ... return 2 - ... else: - ... return 3 - ... - >>> sr = cudf.Series(['', 'abc', 'some_example']) - >>> sr.apply(f) # doctest: +SKIP - 0 -1 - 1 1 - 2 2 - dtype: int64 - - However, the following UDF is not allowed since it includes an - operation that requires the creation of a new string: a call to the - ``upper`` method. Methods that are not supported in this manner - will raise an ``AttributeError``. - - >>> def f(st): - ... new = st.upper() - ... return 'ABC' in new - ... 
- >>> sr.apply(f) # doctest: +SKIP - - For a complete list of supported functions and methods that may be - used to manipulate string data, see the UDF guide, - - - """ - if convert_dtype is not True: - raise ValueError("Series.apply only supports convert_dtype=True") - elif by_row != "compat": - raise NotImplementedError("by_row is currently not supported.") - - result = self._apply(func, _get_scalar_kernel, *args, **kwargs) - result.name = self.name - return result - - # - # Stats - # - @_performance_tracking - def count(self): - """ - Return number of non-NA/null observations in the Series - - Returns - ------- - int - Number of non-null values in the Series. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.count() - 5 - - .. pandas-compat:: - :meth:`pandas.Series.count` - - Parameters currently not supported is `level`. - """ - return self.valid_count - - @_performance_tracking - def mode(self, dropna=True): - """ - Return the mode(s) of the dataset. - - Always returns Series even if only one value is returned. - - Parameters - ---------- - dropna : bool, default True - Don't consider counts of NA/NaN/NaT. - - Returns - ------- - Series - Modes of the Series in sorted order. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([7, 6, 5, 4, 3, 2, 1]) - >>> series - 0 7 - 1 6 - 2 5 - 3 4 - 4 3 - 5 2 - 6 1 - dtype: int64 - >>> series.mode() - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - 6 7 - dtype: int64 - - We can include ```` values in mode by - passing ``dropna=False``. - - >>> series = cudf.Series([7, 4, 3, 3, 7, None, None]) - >>> series - 0 7 - 1 4 - 2 3 - 3 3 - 4 7 - 5 - 6 - dtype: int64 - >>> series.mode() - 0 3 - 1 7 - dtype: int64 - >>> series.mode(dropna=False) - 0 3 - 1 7 - 2 - dtype: int64 - """ - val_counts = self.value_counts(ascending=False, dropna=dropna) - if len(val_counts) > 0: - val_counts = val_counts[val_counts == val_counts.iloc[0]] - - return Series._from_column( - val_counts.index.sort_values()._column, name=self.name - ) - - @_performance_tracking - def round(self, decimals=0, how="half_even"): - if not is_integer(decimals): - raise ValueError( - f"decimals must be an int, got {type(decimals).__name__}" - ) - decimals = int(decimals) - return super().round(decimals, how) - - @_performance_tracking - def cov(self, other, min_periods=None, ddof: int | None = None): - """ - Compute covariance with Series, excluding missing values. - - Parameters - ---------- - other : Series - Series with which to compute the covariance. - - Returns - ------- - float - Covariance between Series and other normalized by N-1 - (unbiased estimator). - - Examples - -------- - >>> import cudf - >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) - >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) - >>> ser1.cov(ser2) - -0.015750000000000004 - - .. pandas-compat:: - :meth:`pandas.Series.cov` - - `min_periods` parameter is not yet supported. 
- """ - - if min_periods is not None: - raise NotImplementedError( - "min_periods parameter is not implemented yet" - ) - if ddof is not None: - raise NotImplementedError("ddof parameter is not implemented yet") - - if self.empty or other.empty: - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - lhs = self.nans_to_nulls().dropna() - rhs = other.nans_to_nulls().dropna() - - lhs, rhs = _align_indices([lhs, rhs], how="inner") - - try: - return lhs._column.cov(rhs._column) - except AttributeError: - raise TypeError( - f"cannot perform covariance with types {self.dtype}, " - f"{other.dtype}" - ) - - @_performance_tracking - def duplicated(self, keep="first"): - """ - Indicate duplicate Series values. - - Duplicated values are indicated as ``True`` values in the resulting - Series. Either all duplicates, all except the first or all except the - last occurrence of duplicates can be indicated. - - Parameters - ---------- - keep : {'first', 'last', False}, default 'first' - Method to handle dropping duplicates: - - - ``'first'`` : Mark duplicates as ``True`` except for the first - occurrence. - - ``'last'`` : Mark duplicates as ``True`` except for the last - occurrence. - - ``False`` : Mark all duplicates as ``True``. - - Returns - ------- - Series[bool] - Series indicating whether each value has occurred in the - preceding values. - - See Also - -------- - Index.duplicated : Equivalent method on cudf.Index. - DataFrame.duplicated : Equivalent method on cudf.DataFrame. - Series.drop_duplicates : Remove duplicate values from Series. - - Examples - -------- - By default, for each set of duplicated values, the first occurrence is - set on False and all others on True: - - >>> import cudf - >>> animals = cudf.Series(['lama', 'cow', 'lama', 'beetle', 'lama']) - >>> animals.duplicated() - 0 False - 1 False - 2 True - 3 False - 4 True - dtype: bool - - which is equivalent to - - >>> animals.duplicated(keep='first') - 0 False - 1 False - 2 True - 3 False - 4 True - dtype: bool - - By using 'last', the last occurrence of each set of duplicated values - is set on False and all others on True: - - >>> animals.duplicated(keep='last') - 0 True - 1 False - 2 True - 3 False - 4 False - dtype: bool - - By setting keep on ``False``, all duplicates are True: - - >>> animals.duplicated(keep=False) - 0 True - 1 False - 2 True - 3 False - 4 True - dtype: bool - """ - return super().duplicated(keep=keep) - - @_performance_tracking - def corr(self, other, method="pearson", min_periods=None): - """Calculates the sample correlation between two Series, - excluding missing values. - - Parameters - ---------- - other : Series - Series with which to compute the correlation. - method : {'pearson', 'spearman'}, default 'pearson' - Method used to compute correlation: - - - pearson : Standard correlation coefficient - - spearman : Spearman rank correlation - - min_periods : int, optional - Minimum number of observations needed to have a valid result. 
- - Examples - -------- - >>> import cudf - >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) - >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) - >>> ser1.corr(ser2, method="pearson") - -0.20454263717316112 - >>> ser1.corr(ser2, method="spearman") - -0.5 - """ - - if method not in {"pearson", "spearman"}: - raise ValueError(f"Unknown method {method}") - - if min_periods is not None: - raise NotImplementedError("Unsupported argument 'min_periods'") - - if self.empty or other.empty: - return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) - - lhs = self.nans_to_nulls().dropna() - rhs = other.nans_to_nulls().dropna() - lhs, rhs = _align_indices([lhs, rhs], how="inner") - if method == "spearman": - lhs = lhs.rank() - rhs = rhs.rank() - - try: - return lhs._column.corr(rhs._column) - except AttributeError: - raise TypeError( - f"cannot perform corr with types {self.dtype}, {other.dtype}" - ) - - @_performance_tracking - def autocorr(self, lag=1): - """Compute the lag-N autocorrelation. This method computes the Pearson - correlation between the Series and its shifted self. - - Parameters - ---------- - lag : int, default 1 - Number of lags to apply before performing autocorrelation. - - Returns - ------- - result : float - The Pearson correlation between self and self.shift(lag). - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([0.25, 0.5, 0.2, -0.05, 0.17]) - >>> s.autocorr() - 0.1438853844... - >>> s.autocorr(lag=2) - -0.9647548490... - """ - return self.corr(self.shift(lag)) - - @_performance_tracking - def isin(self, values): - """Check whether values are contained in Series. - - Parameters - ---------- - values : set or list-like - The sequence of values to test. Passing in a single string will - raise a TypeError. Instead, turn a single string into a list - of one element. - - Returns - ------- - result : Series - Series of booleans indicating if each element is in values. - - Raises - ------ - TypeError - If values is a string - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['lama', 'cow', 'lama', 'beetle', 'lama', - ... 'hippo'], name='animal') - >>> s.isin(['cow', 'lama']) - 0 True - 1 True - 2 True - 3 False - 4 True - 5 False - Name: animal, dtype: bool - - Passing a single string as ``s.isin('lama')`` will raise an error. Use - a list of one element instead: - - >>> s.isin(['lama']) - 0 True - 1 False - 2 True - 3 False - 4 True - 5 False - Name: animal, dtype: bool - - Strings and integers are distinct and are therefore not comparable: - - >>> cudf.Series([1]).isin(['1']) - 0 False - dtype: bool - >>> cudf.Series([1.1]).isin(['1.1']) - 0 False - dtype: bool - """ - - # Even though only list-like objects are supposed to be passed, only - # scalars throw errors. Other types (like dicts) just transparently - # return False (see the implementation of ColumnBase.isin). - if is_scalar(values): - raise TypeError( - "only list-like objects are allowed to be passed " - f"to isin(), you passed a [{type(values).__name__}]" - ) - - return Series._from_column( - self._column.isin(values), name=self.name, index=self.index - ) - - @_performance_tracking - def unique(self): - """ - Returns unique values of this Series. - - Returns - ------- - Series - A series with only the unique values. 
- - Examples - -------- - >>> import cudf - >>> series = cudf.Series(['a', 'a', 'b', None, 'b', None, 'c']) - >>> series - 0 a - 1 a - 2 b - 3 - 4 b - 5 - 6 c - dtype: object - >>> series.unique() - 0 a - 1 b - 2 - 3 c - dtype: object - """ - res = self._column.unique() - if cudf.get_option("mode.pandas_compatible"): - return res.values - return Series._from_column(res, name=self.name) - - @_performance_tracking - def value_counts( - self, - normalize=False, - sort=True, - ascending=False, - bins=None, - dropna=True, - ): - """Return a Series containing counts of unique values. - - The resulting object will be in descending order so that - the first element is the most frequently-occurring element. - Excludes NA values by default. - - Parameters - ---------- - normalize : bool, default False - If True then the object returned will contain - the relative frequencies of the unique values. - - sort : bool, default True - Sort by frequencies. - - ascending : bool, default False - Sort in ascending order. - - bins : int, optional - Rather than count values, group them into half-open bins, - only works with numeric data. - - dropna : bool, default True - Don't include counts of NaN and None. - - Returns - ------- - result : Series containing counts of unique values. - - See Also - -------- - Series.count - Number of non-NA elements in a Series. - - cudf.DataFrame.count - Number of non-NA elements in a DataFrame. - - Examples - -------- - >>> import cudf - >>> sr = cudf.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0, None]) - >>> sr - 0 1.0 - 1 2.0 - 2 2.0 - 3 3.0 - 4 3.0 - 5 3.0 - 6 - dtype: float64 - >>> sr.value_counts() - 3.0 3 - 2.0 2 - 1.0 1 - Name: count, dtype: int64 - - The order of the counts can be changed by passing ``ascending=True``: - - >>> sr.value_counts(ascending=True) - 1.0 1 - 2.0 2 - 3.0 3 - Name: count, dtype: int64 - - With ``normalize`` set to True, returns the relative frequency - by dividing all values by the sum of values. - - >>> sr.value_counts(normalize=True) - 3.0 0.500000 - 2.0 0.333333 - 1.0 0.166667 - Name: proportion, dtype: float64 - - To include ``NA`` value counts, pass ``dropna=False``: - - >>> sr = cudf.Series([1.0, 2.0, 2.0, 3.0, None, 3.0, 3.0, None]) - >>> sr - 0 1.0 - 1 2.0 - 2 2.0 - 3 3.0 - 4 - 5 3.0 - 6 3.0 - 7 - dtype: float64 - >>> sr.value_counts(dropna=False) - 3.0 3 - 2.0 2 - 2 - 1.0 1 - Name: count, dtype: int64 - - >>> s = cudf.Series([3, 1, 2, 3, 4, np.nan]) - >>> s.value_counts(bins=3) - (2.0, 3.0] 2 - (0.996, 2.0] 2 - (3.0, 4.0] 1 - Name: count, dtype: int64 - """ - if bins is not None: - series_bins = cudf.cut(self, bins, include_lowest=True) - result_name = "proportion" if normalize else "count" - if dropna and self.null_count == len(self): - return Series( - [], - dtype=np.int64, - name=result_name, - index=cudf.Index([], dtype=self.dtype, name=self.name), - ) - - if bins is not None: - res = self.groupby(series_bins, dropna=dropna).count(dropna=dropna) - res = res[res.index.notna()] - else: - res = self.groupby(self, dropna=dropna).count(dropna=dropna) - if isinstance(self.dtype, cudf.CategoricalDtype) and len( - res - ) != len(self.dtype.categories): - # For categorical dtypes: When there exists - # categories in dtypes and they are missing in the - # column, `value_counts` will have to return - # their occurrences as 0. 
- # TODO: Remove this workaround once `observed` - # parameter support is added to `groupby` - res = res.reindex(self.dtype.categories).fillna(0) - res.index = res.index.astype(self.dtype) - - res.index.name = self.name - - if sort: - res = res.sort_values(ascending=ascending) - - if normalize: - res = res / float(res._column.sum()) - - # Pandas returns an IntervalIndex as the index of res - # this condition makes sure we do too if bins is given - if bins is not None and len(res) == len(res.index.categories): - interval_col = IntervalColumn.from_struct_column( - res.index._column._get_decategorized_column() - ) - res.index = cudf.IntervalIndex._from_column( - interval_col, name=res.index.name - ) - res.name = result_name - return res - - @_performance_tracking - def quantile( - self, q=0.5, interpolation="linear", exact=True, quant_index=True - ): - """ - Return values at the given quantile. - - Parameters - ---------- - q : float or array-like, default 0.5 (50% quantile) - 0 <= q <= 1, the quantile(s) to compute - interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} - This optional parameter specifies the interpolation method to use, - when the desired quantile lies between two data points i and j: - - * linear: `i + (j - i) * fraction`, where `fraction` is the - fractional part of the index surrounded by `i` and `j`. - * lower: `i`. - * higher: `j`. - * nearest: `i` or `j` whichever is nearest. - * midpoint: (`i` + `j`) / 2. - exact : boolean - Whether to use approximate or exact quantile algorithm. - quant_index : boolean - Whether to use the list of quantiles as index. - - Returns - ------- - float or Series - If ``q`` is an array, a Series will be returned where the - index is ``q`` and the values are the quantiles, otherwise - a float will be returned. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 2, 3, 4]) - >>> series - 0 1 - 1 2 - 2 3 - 3 4 - dtype: int64 - >>> series.quantile(0.5) - 2.5 - >>> series.quantile([0.25, 0.5, 0.75]) - 0.25 1.75 - 0.50 2.50 - 0.75 3.25 - dtype: float64 - """ - - return_scalar = is_scalar(q) - if return_scalar: - np_array_q = np.asarray([float(q)]) - else: - try: - np_array_q = np.asarray(q) - except TypeError: - try: - np_array_q = cudf.core.column.as_column(q).values_host - except TypeError: - raise TypeError( - f"q must be a scalar or array-like, got {type(q)}" - ) - - result = self._column.quantile( - np_array_q, interpolation, exact, return_scalar=return_scalar - ) - - if return_scalar: - return result - - return Series._from_column( - result, - name=self.name, - index=cudf.Index(np_array_q) if quant_index else None, - ) - - @docutils.doc_describe() - @_performance_tracking - def describe( - self, - percentiles=None, - include=None, - exclude=None, - ): - """{docstring}""" - - if percentiles is not None: - if not all(0 <= x <= 1 for x in percentiles): - raise ValueError( - "All percentiles must be between 0 and 1, " "inclusive." 
- ) - - # describe always includes 50th percentile - percentiles = list(percentiles) - if 0.5 not in percentiles: - percentiles.append(0.5) - - percentiles = np.sort(percentiles) - else: - # pandas defaults - percentiles = np.array([0.25, 0.5, 0.75]) - - dtype = "str" - if self.dtype.kind == "b": - data = _describe_categorical(self, percentiles) - elif isinstance(self._column, cudf.core.column.NumericalColumn): - data = _describe_numeric(self, percentiles) - dtype = None - elif isinstance(self._column, TimeDeltaColumn): - data = _describe_timedelta(self, percentiles) - elif isinstance(self._column, DatetimeColumn): - data = _describe_timestamp(self, percentiles) - else: - data = _describe_categorical(self, percentiles) - - return Series( - data=data.values(), - index=data.keys(), - dtype=dtype, - name=self.name, - ) - - @_performance_tracking - def digitize(self, bins, right=False): - """Return the indices of the bins to which each value belongs. - - Notes - ----- - Monotonicity of bins is assumed and not checked. - - Parameters - ---------- - bins : np.array - 1-D monotonically, increasing array with same type as this series. - right : bool - Indicates whether interval contains the right or left bin edge. - - Returns - ------- - A new Series containing the indices. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([0.2, 6.4, 3.0, 1.6]) - >>> bins = cudf.Series([0.0, 1.0, 2.5, 4.0, 10.0]) - >>> inds = s.digitize(bins) - >>> inds - 0 1 - 1 4 - 2 3 - 3 2 - dtype: int32 - """ - return Series._from_column( - cudf.core.column.numerical.digitize(self._column, bins, right), - name=self.name, - ) - - @_performance_tracking - def diff(self, periods=1): - """First discrete difference of element. - - Calculates the difference of a Series element compared with another - element in the Series (default is element in previous row). - - Parameters - ---------- - periods : int, default 1 - Periods to shift for calculating difference, - accepts negative values. - - Returns - ------- - Series - First differences of the Series. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1, 1, 2, 3, 5, 8]) - >>> series - 0 1 - 1 1 - 2 2 - 3 3 - 4 5 - 5 8 - dtype: int64 - - Difference with previous row - - >>> series.diff() - 0 - 1 0 - 2 1 - 3 1 - 4 2 - 5 3 - dtype: int64 - - Difference with 3rd previous row - - >>> series.diff(periods=3) - 0 - 1 - 2 - 3 2 - 4 4 - 5 6 - dtype: int64 - - Difference with following row - - >>> series.diff(periods=-1) - 0 0 - 1 -1 - 2 -1 - 3 -2 - 4 -3 - 5 - dtype: int64 - """ - if not is_integer(periods): - if not (isinstance(periods, float) and periods.is_integer()): - raise ValueError("periods must be an integer") - periods = int(periods) - - return self - self.shift(periods=periods) - - @_performance_tracking - @docutils.doc_apply( - groupby_doc_template.format( - ret=textwrap.dedent( - """ - Returns - ------- - SeriesGroupBy - Returns a SeriesGroupBy object that contains - information about the groups. 
- """ - ) - ) - ) - def groupby( - self, - by=None, - axis=0, - level=None, - as_index=True, - sort=no_default, - group_keys=False, - observed=True, - dropna=True, - ): - return super().groupby( - by, - axis, - level, - as_index, - sort, - group_keys, - observed, - dropna, - ) - - @_performance_tracking - def rename( - self, - index=None, - axis=None, - copy: bool = True, - inplace: bool = False, - level=None, - errors: Literal["ignore", "raise"] = "ignore", - ): - """ - Alter Series name - - Change Series.name with a scalar value - - Parameters - ---------- - index : Scalar, optional - Scalar to alter the Series.name attribute - axis : {0 or 'index'} - Unused. Parameter needed for compatibility with DataFrame. - copy : boolean, default True - Also copy underlying data - inplace : bool, default False - Whether to return a new Series. If True the value of copy is ignored. - Currently not supported. - level : int or level name, default None - In case of MultiIndex, only rename labels in the specified level. - Currently not supported. - errors : {'ignore', 'raise'}, default 'ignore' - If 'raise', raise `KeyError` when a `dict-like mapper` or - `index` contains labels that are not present in the index being transformed. - If 'ignore', existing keys will be renamed and extra keys will be ignored. - Currently not supported. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 20, 30]) - >>> series - 0 10 - 1 20 - 2 30 - dtype: int64 - >>> series.name - >>> renamed_series = series.rename('numeric_series') - >>> renamed_series - 0 10 - 1 20 - 2 30 - Name: numeric_series, dtype: int64 - >>> renamed_series.name - 'numeric_series' - - .. pandas-compat:: - :meth:`pandas.Series.rename` - - - Supports scalar values only for changing name attribute - """ - if inplace is not False: - raise NotImplementedError("inplace is currently not supported.") - if level is not None: - raise NotImplementedError("level is currently not supported.") - if errors != "ignore": - raise NotImplementedError("errors is currently not supported.") - if not is_scalar(index): - raise NotImplementedError( - ".rename does not currently support relabeling the index." - ) - out_data = self._data.copy(deep=copy) - return Series._from_data(out_data, self.index, name=index) - - @_performance_tracking - def add_prefix(self, prefix, axis=None): - if axis is not None: - raise NotImplementedError("axis is currently not implemented.") - return Series._from_data( - # TODO: Change to deep=False when copy-on-write is default - data=self._data.copy(deep=True), - index=prefix + self.index.astype(str), - ) - - @_performance_tracking - def add_suffix(self, suffix, axis=None): - if axis is not None: - raise NotImplementedError("axis is currently not implemented.") - return Series._from_data( - # TODO: Change to deep=False when copy-on-write is default - data=self._data.copy(deep=True), - index=self.index.astype(str) + suffix, - ) - - @_performance_tracking - def keys(self): - """ - Return alias for index. - - Returns - ------- - Index - Index of the Series. 
- - Examples - -------- - >>> import cudf - >>> sr = cudf.Series([10, 11, 12, 13, 14, 15]) - >>> sr - 0 10 - 1 11 - 2 12 - 3 13 - 4 14 - 5 15 - dtype: int64 - - >>> sr.keys() - RangeIndex(start=0, stop=6, step=1) - >>> sr = cudf.Series(['a', 'b', 'c']) - >>> sr - 0 a - 1 b - 2 c - dtype: object - >>> sr.keys() - RangeIndex(start=0, stop=3, step=1) - >>> sr = cudf.Series([1, 2, 3], index=['a', 'b', 'c']) - >>> sr - a 1 - b 2 - c 3 - dtype: int64 - >>> sr.keys() - Index(['a', 'b', 'c'], dtype='object') - """ - return self.index - - @_performance_tracking - def explode(self, ignore_index=False): - """ - Transform each element of a list-like to a row, replicating index - values. - - Parameters - ---------- - ignore_index : bool, default False - If True, the resulting index will be labeled 0, 1, …, n - 1. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([[1, 2, 3], [], None, [4, 5]]) - >>> s - 0 [1, 2, 3] - 1 [] - 2 None - 3 [4, 5] - dtype: list - >>> s.explode() - 0 1 - 0 2 - 0 3 - 1 - 2 - 3 4 - 3 5 - dtype: int64 - """ - return super()._explode(self.name, ignore_index) - - @_performance_tracking - def pct_change( - self, - periods=1, - fill_method=no_default, - limit=no_default, - freq=None, - **kwargs, - ): - """ - Calculates the percent change between sequential elements - in the Series. - - Parameters - ---------- - periods : int, default 1 - Periods to shift for forming percent change. - fill_method : str, default 'ffill' - How to handle NAs before computing percent changes. - - .. deprecated:: 24.04 - All options of `fill_method` are deprecated - except `fill_method=None`. - limit : int, optional - The number of consecutive NAs to fill before stopping. - Not yet implemented. - - .. deprecated:: 24.04 - `limit` is deprecated. - freq : str, optional - Increment to use from time series API. - Not yet implemented. - **kwargs - Additional keyword arguments are passed into - `Series.shift`. - - Returns - ------- - Series - """ - if limit is not no_default: - raise NotImplementedError("limit parameter not supported yet.") - if freq is not None: - raise NotImplementedError("freq parameter not supported yet.") - elif fill_method not in { - no_default, - None, - "ffill", - "pad", - "bfill", - "backfill", - }: - raise ValueError( - "fill_method must be one of None, 'ffill', 'pad', " - "'bfill', or 'backfill'." - ) - if fill_method not in (no_default, None) or limit is not no_default: - # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.warn( - "The 'fill_method' and 'limit' keywords in " - f"{type(self).__name__}.pct_change are deprecated and will be " - "removed in a future version. 
Either fill in any non-leading " - "NA values prior to calling pct_change or specify " - "'fill_method=None' to not fill NA values.", - FutureWarning, - ) - - if fill_method is no_default: - fill_method = "ffill" - if limit is no_default: - limit = None - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - data = self.fillna(method=fill_method, limit=limit) - diff = data.diff(periods=periods) - change = diff / data.shift(periods=periods, freq=freq, **kwargs) - return change - - @_performance_tracking - def where(self, cond, other=None, inplace=False, axis=None, level=None): - if axis is not None: - raise NotImplementedError("axis is not supported.") - elif level is not None: - raise NotImplementedError("level is not supported.") - result_col = super().where(cond, other, inplace) - return self._mimic_inplace( - self._from_data_like_self( - self._data._from_columns_like_self([result_col]) - ), - inplace=inplace, - ) - - -def make_binop_func(op): - # This function is used to wrap binary operations in Frame with an - # appropriate API for Series as required for pandas compatibility. The - # main effect is reordering and error-checking parameters in - # Series-specific ways. - wrapped_func = getattr(IndexedFrame, op) - - @functools.wraps(wrapped_func) - def wrapper(self, other, level=None, fill_value=None, axis=0): - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return wrapped_func(self, other, axis, level, fill_value) - - # functools.wraps copies module level attributes to `wrapper` and sets - # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature - # string of a function by recursively delving into __wrapped__ until - # it hits the first function that has __signature__ attribute set. To make - # the signature string of `wrapper` matches with its actual parameter list, - # we directly set the __signature__ attribute of `wrapper` below. - - new_sig = inspect.signature( - lambda self, other, level=None, fill_value=None, axis=0: None - ) - wrapper.__signature__ = new_sig - return wrapper - - -# Wrap all Frame binop functions with the expected API for Series. -for binop in ( - "add", - "radd", - "subtract", - "sub", - "rsub", - "multiply", - "mul", - "rmul", - "mod", - "rmod", - "pow", - "rpow", - "floordiv", - "rfloordiv", - "truediv", - "div", - "divide", - "rtruediv", - "rdiv", - "eq", - "ne", - "lt", - "le", - "gt", - "ge", -): - setattr(Series, binop, make_binop_func(binop)) - - -class BaseDatelikeProperties: - """ - Base accessor class for Series values. - """ - - def __init__(self, series: Series): - self.series = series - - def _return_result_like_self(self, column: ColumnBase) -> Series: - """Return the method result like self.series""" - data = ColumnAccessor({self.series.name: column}, verify=False) - return self.series._from_data_like_self(data) - - -class DatetimeProperties(BaseDatelikeProperties): - """ - Accessor object for datetimelike properties of the Series values. - - Returns - ------- - Returns a Series indexed like the original Series. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> seconds_series = cudf.Series(pd.date_range("2000-01-01", periods=3, - ... freq="s")) - >>> seconds_series - 0 2000-01-01 00:00:00 - 1 2000-01-01 00:00:01 - 2 2000-01-01 00:00:02 - dtype: datetime64[ns] - >>> seconds_series.dt.second - 0 0 - 1 1 - 2 2 - dtype: int16 - >>> hours_series = cudf.Series(pd.date_range("2000-01-01", periods=3, - ... 
freq="h")) - >>> hours_series - 0 2000-01-01 00:00:00 - 1 2000-01-01 01:00:00 - 2 2000-01-01 02:00:00 - dtype: datetime64[ns] - >>> hours_series.dt.hour - 0 0 - 1 1 - 2 2 - dtype: int16 - >>> weekday_series = cudf.Series(pd.date_range("2000-01-01", periods=3, - ... freq="q")) - >>> weekday_series - 0 2000-03-31 - 1 2000-06-30 - 2 2000-09-30 - dtype: datetime64[ns] - >>> weekday_series.dt.weekday - 0 4 - 1 4 - 2 5 - dtype: int16 - """ - - @property # type: ignore - @_performance_tracking - def year(self) -> Series: - """ - The year of the datetime. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", - ... periods=3, freq="Y")) - >>> datetime_series - 0 2000-12-31 - 1 2001-12-31 - 2 2002-12-31 - dtype: datetime64[ns] - >>> datetime_series.dt.year - 0 2000 - 1 2001 - 2 2002 - dtype: int16 - """ - return self._get_dt_field("year") - - @property # type: ignore - @_performance_tracking - def month(self) -> Series: - """ - The month as January=1, December=12. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", - ... periods=3, freq="M")) - >>> datetime_series - 0 2000-01-31 - 1 2000-02-29 - 2 2000-03-31 - dtype: datetime64[ns] - >>> datetime_series.dt.month - 0 1 - 1 2 - 2 3 - dtype: int16 - """ - return self._get_dt_field("month") - - @property # type: ignore - @_performance_tracking - def day(self) -> Series: - """ - The day of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", - ... periods=3, freq="D")) - >>> datetime_series - 0 2000-01-01 - 1 2000-01-02 - 2 2000-01-03 - dtype: datetime64[ns] - >>> datetime_series.dt.day - 0 1 - 1 2 - 2 3 - dtype: int16 - """ - return self._get_dt_field("day") - - @property # type: ignore - @_performance_tracking - def hour(self) -> Series: - """ - The hours of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", - ... periods=3, freq="h")) - >>> datetime_series - 0 2000-01-01 00:00:00 - 1 2000-01-01 01:00:00 - 2 2000-01-01 02:00:00 - dtype: datetime64[ns] - >>> datetime_series.dt.hour - 0 0 - 1 1 - 2 2 - dtype: int16 - """ - return self._get_dt_field("hour") - - @property # type: ignore - @_performance_tracking - def minute(self) -> Series: - """ - The minutes of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", - ... periods=3, freq="T")) - >>> datetime_series - 0 2000-01-01 00:00:00 - 1 2000-01-01 00:01:00 - 2 2000-01-01 00:02:00 - dtype: datetime64[ns] - >>> datetime_series.dt.minute - 0 0 - 1 1 - 2 2 - dtype: int16 - """ - return self._get_dt_field("minute") - - @property # type: ignore - @_performance_tracking - def second(self) -> Series: - """ - The seconds of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", - ... periods=3, freq="s")) - >>> datetime_series - 0 2000-01-01 00:00:00 - 1 2000-01-01 00:00:01 - 2 2000-01-01 00:00:02 - dtype: datetime64[ns] - >>> datetime_series.dt.second - 0 0 - 1 1 - 2 2 - dtype: int16 - """ - return self._get_dt_field("second") - - @property # type: ignore - @_performance_tracking - def microsecond(self) -> Series: - """ - The microseconds of the datetime. 
- - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", - ... periods=3, freq="us")) - >>> datetime_series - 0 2000-01-01 00:00:00.000000 - 1 2000-01-01 00:00:00.000001 - 2 2000-01-01 00:00:00.000002 - dtype: datetime64[ns] - >>> datetime_series.dt.microsecond - 0 0 - 1 1 - 2 2 - dtype: int32 - """ - micro = self.series._column.get_dt_field("microsecond") - # Need to manually promote column to int32 because - # pandas-matching binop behaviour requires that this - # __mul__ returns an int16 column. - extra = self.series._column.get_dt_field("millisecond").astype( - "int32" - ) * cudf.Scalar(1000, dtype="int32") - return self._return_result_like_self(micro + extra) - - @property # type: ignore - @_performance_tracking - def nanosecond(self) -> Series: - """ - The nanoseconds of the datetime. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range("2000-01-01", - ... periods=3, freq="ns")) - >>> datetime_series - 0 2000-01-01 00:00:00.000000000 - 1 2000-01-01 00:00:00.000000001 - 2 2000-01-01 00:00:00.000000002 - dtype: datetime64[ns] - >>> datetime_series.dt.nanosecond - 0 0 - 1 1 - 2 2 - dtype: int16 - """ - return self._get_dt_field("nanosecond") - - @property # type: ignore - @_performance_tracking - def weekday(self) -> Series: - """ - The day of the week with Monday=0, Sunday=6. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range('2016-12-31', - ... '2017-01-08', freq='D')) - >>> datetime_series - 0 2016-12-31 - 1 2017-01-01 - 2 2017-01-02 - 3 2017-01-03 - 4 2017-01-04 - 5 2017-01-05 - 6 2017-01-06 - 7 2017-01-07 - 8 2017-01-08 - dtype: datetime64[ns] - >>> datetime_series.dt.weekday - 0 5 - 1 6 - 2 0 - 3 1 - 4 2 - 5 3 - 6 4 - 7 5 - 8 6 - dtype: int16 - """ - return self._get_dt_field("weekday") - - @property # type: ignore - @_performance_tracking - def dayofweek(self) -> Series: - """ - The day of the week with Monday=0, Sunday=6. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range('2016-12-31', - ... '2017-01-08', freq='D')) - >>> datetime_series - 0 2016-12-31 - 1 2017-01-01 - 2 2017-01-02 - 3 2017-01-03 - 4 2017-01-04 - 5 2017-01-05 - 6 2017-01-06 - 7 2017-01-07 - 8 2017-01-08 - dtype: datetime64[ns] - >>> datetime_series.dt.dayofweek - 0 5 - 1 6 - 2 0 - 3 1 - 4 2 - 5 3 - 6 4 - 7 5 - 8 6 - dtype: int16 - """ - return self._get_dt_field("weekday") - - @property # type: ignore - @_performance_tracking - def dayofyear(self) -> Series: - """ - The day of the year, from 1-365 in non-leap years and - from 1-366 in leap years. - - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range('2016-12-31', - ... '2017-01-08', freq='D')) - >>> datetime_series - 0 2016-12-31 - 1 2017-01-01 - 2 2017-01-02 - 3 2017-01-03 - 4 2017-01-04 - 5 2017-01-05 - 6 2017-01-06 - 7 2017-01-07 - 8 2017-01-08 - dtype: datetime64[ns] - >>> datetime_series.dt.dayofyear - 0 366 - 1 1 - 2 2 - 3 3 - 4 4 - 5 5 - 6 6 - 7 7 - 8 8 - dtype: int16 - """ - return self._get_dt_field("day_of_year") - - @property # type: ignore - @_performance_tracking - def day_of_year(self) -> Series: - """ - The day of the year, from 1-365 in non-leap years and - from 1-366 in leap years. 
- - Examples - -------- - >>> import pandas as pd - >>> import cudf - >>> datetime_series = cudf.Series(pd.date_range('2016-12-31', - ... '2017-01-08', freq='D')) - >>> datetime_series - 0 2016-12-31 - 1 2017-01-01 - 2 2017-01-02 - 3 2017-01-03 - 4 2017-01-04 - 5 2017-01-05 - 6 2017-01-06 - 7 2017-01-07 - 8 2017-01-08 - dtype: datetime64[ns] - >>> datetime_series.dt.day_of_year - 0 366 - 1 1 - 2 2 - 3 3 - 4 4 - 5 5 - 6 6 - 7 7 - 8 8 - dtype: int16 - """ - return self._get_dt_field("day_of_year") - - @property # type: ignore - @_performance_tracking - def is_leap_year(self) -> Series: - """ - Boolean indicator if the date belongs to a leap year. - - A leap year is a year, which has 366 days (instead of 365) including - 29th of February as an intercalary day. Leap years are years which are - multiples of four with the exception of years divisible by 100 but not - by 400. - - Returns - ------- - Series - Booleans indicating if dates belong to a leap year. - - Examples - -------- - >>> import pandas as pd, cudf - >>> s = cudf.Series( - ... pd.date_range(start='2000-02-01', end='2013-02-01', freq='1Y')) - >>> s - 0 2000-12-31 - 1 2001-12-31 - 2 2002-12-31 - 3 2003-12-31 - 4 2004-12-31 - 5 2005-12-31 - 6 2006-12-31 - 7 2007-12-31 - 8 2008-12-31 - 9 2009-12-31 - 10 2010-12-31 - 11 2011-12-31 - 12 2012-12-31 - dtype: datetime64[ns] - >>> s.dt.is_leap_year - 0 True - 1 False - 2 False - 3 False - 4 True - 5 False - 6 False - 7 False - 8 True - 9 False - 10 False - 11 False - 12 True - dtype: bool - """ - res = libcudf.datetime.is_leap_year(self.series._column).fillna(False) - return self._return_result_like_self(res) - - @property # type: ignore - @_performance_tracking - def quarter(self) -> Series: - """ - Integer indicator for which quarter of the year the date belongs in. - - There are 4 quarters in a year. With the first quarter being from - January - March, second quarter being April - June, third quarter - being July - September and fourth quarter being October - December. - - Returns - ------- - Series - Integer indicating which quarter the date belongs to. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(["2020-05-31 08:00:00","1999-12-31 18:40:00"], - ... dtype="datetime64[ms]") - >>> s.dt.quarter - 0 2 - 1 4 - dtype: int8 - """ - res = libcudf.datetime.extract_quarter(self.series._column).astype( - np.int8 - ) - return self._return_result_like_self(res) - - @_performance_tracking - def day_name(self, locale: str | None = None) -> Series: - """ - Return the day names. Currently supports English locale only. - - Examples - -------- - >>> import cudf - >>> datetime_series = cudf.Series(cudf.date_range('2016-12-31', - ... '2017-01-08', freq='D')) - >>> datetime_series - 0 2016-12-31 - 1 2017-01-01 - 2 2017-01-02 - 3 2017-01-03 - 4 2017-01-04 - 5 2017-01-05 - 6 2017-01-06 - 7 2017-01-07 - 8 2017-01-08 - dtype: datetime64[ns] - >>> datetime_series.dt.day_name() - 0 Saturday - 1 Sunday - 2 Monday - 3 Tuesday - 4 Wednesday - 5 Thursday - 6 Friday - 7 Saturday - dtype: object - """ - return self._return_result_like_self( - self.series._column.get_day_names(locale) - ) - - @_performance_tracking - def month_name(self, locale: str | None = None) -> Series: - """ - Return the month names. Currently supports English locale only. 
- - Examples - -------- - >>> import cudf - >>> datetime_series = cudf.Series(cudf.date_range("2017-12-30", periods=6, freq='W')) - >>> datetime_series - 0 2017-12-30 - 1 2018-01-06 - 2 2018-01-13 - 3 2018-01-20 - 4 2018-01-27 - 5 2018-02-03 - dtype: datetime64[ns] - >>> datetime_series.dt.month_name() - 0 December - 1 January - 2 January - 3 January - 4 January - 5 February - dtype: object - """ - return self._return_result_like_self( - self.series._column.get_month_names(locale) - ) - - @_performance_tracking - def isocalendar(self) -> cudf.DataFrame: - """ - Returns a DataFrame with the year, week, and day - calculated according to the ISO 8601 standard. - - Returns - ------- - DataFrame - with columns year, week and day - - Examples - -------- - >>> ser = cudf.Series(pd.date_range(start="2021-07-25", - ... end="2021-07-30")) - >>> ser.dt.isocalendar() - year week day - 0 2021 29 7 - 1 2021 30 1 - 2 2021 30 2 - 3 2021 30 3 - 4 2021 30 4 - 5 2021 30 5 - >>> ser.dt.isocalendar().week - 0 29 - 1 30 - 2 30 - 3 30 - 4 30 - 5 30 - Name: week, dtype: object - - >>> serIndex = cudf.to_datetime(pd.Series(["2010-01-01", pd.NaT])) - >>> serIndex.dt.isocalendar() - year week day - 0 2009 53 5 - 1 - >>> serIndex.dt.isocalendar().year - 0 2009 - 1 - Name: year, dtype: object - """ - ca = ColumnAccessor(self.series._column.isocalendar(), verify=False) - return self.series._constructor_expanddim._from_data( - ca, index=self.series.index - ) - - @property # type: ignore - @_performance_tracking - def is_month_start(self) -> Series: - """ - Booleans indicating if dates are the first day of the month. - """ - return self._return_result_like_self( - self.series._column.is_month_start - ) - - @property # type: ignore - @_performance_tracking - def days_in_month(self) -> Series: - """ - Get the total number of days in the month that the date falls on. - - Returns - ------- - Series - Integers representing the number of days in month - - Examples - -------- - >>> import pandas as pd, cudf - >>> s = cudf.Series( - ... pd.date_range(start='2000-08-01', end='2001-08-01', freq='1M')) - >>> s - 0 2000-08-31 - 1 2000-09-30 - 2 2000-10-31 - 3 2000-11-30 - 4 2000-12-31 - 5 2001-01-31 - 6 2001-02-28 - 7 2001-03-31 - 8 2001-04-30 - 9 2001-05-31 - 10 2001-06-30 - 11 2001-07-31 - dtype: datetime64[ns] - >>> s.dt.days_in_month - 0 31 - 1 30 - 2 31 - 3 30 - 4 31 - 5 31 - 6 28 - 7 31 - 8 30 - 9 31 - 10 30 - 11 31 - dtype: int16 - """ - return self._return_result_like_self(self.series._column.days_in_month) - - @property # type: ignore - @_performance_tracking - def is_month_end(self) -> Series: - """ - Boolean indicator if the date is the last day of the month. - - Returns - ------- - Series - Booleans indicating if dates are the last day of the month. - - Examples - -------- - >>> import pandas as pd, cudf - >>> s = cudf.Series( - ... pd.date_range(start='2000-08-26', end='2000-09-03', freq='1D')) - >>> s - 0 2000-08-26 - 1 2000-08-27 - 2 2000-08-28 - 3 2000-08-29 - 4 2000-08-30 - 5 2000-08-31 - 6 2000-09-01 - 7 2000-09-02 - 8 2000-09-03 - dtype: datetime64[ns] - >>> s.dt.is_month_end - 0 False - 1 False - 2 False - 3 False - 4 False - 5 True - 6 False - 7 False - 8 False - dtype: bool - """ # noqa: E501 - return self._return_result_like_self(self.series._column.is_month_end) - - @property # type: ignore - @_performance_tracking - def is_quarter_start(self) -> Series: - """ - Boolean indicator if the date is the first day of a quarter. 
- - Returns - ------- - Series - Booleans indicating if dates are the beginning of a quarter - - Examples - -------- - >>> import pandas as pd, cudf - >>> s = cudf.Series( - ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) - >>> s - 0 2000-09-26 - 1 2000-09-27 - 2 2000-09-28 - 3 2000-09-29 - 4 2000-09-30 - 5 2000-10-01 - 6 2000-10-02 - 7 2000-10-03 - dtype: datetime64[ns] - >>> s.dt.is_quarter_start - 0 False - 1 False - 2 False - 3 False - 4 False - 5 True - 6 False - 7 False - dtype: bool - """ - return self._return_result_like_self( - self.series._column.is_quarter_start - ) - - @property # type: ignore - @_performance_tracking - def is_quarter_end(self) -> Series: - """ - Boolean indicator if the date is the last day of a quarter. - - Returns - ------- - Series - Booleans indicating if dates are the end of a quarter - - Examples - -------- - >>> import pandas as pd, cudf - >>> s = cudf.Series( - ... pd.date_range(start='2000-09-26', end='2000-10-03', freq='1D')) - >>> s - 0 2000-09-26 - 1 2000-09-27 - 2 2000-09-28 - 3 2000-09-29 - 4 2000-09-30 - 5 2000-10-01 - 6 2000-10-02 - 7 2000-10-03 - dtype: datetime64[ns] - >>> s.dt.is_quarter_end - 0 False - 1 False - 2 False - 3 False - 4 True - 5 False - 6 False - 7 False - dtype: bool - """ - return self._return_result_like_self( - self.series._column.is_quarter_end - ) - - @property # type: ignore - @_performance_tracking - def is_year_start(self) -> Series: - """ - Boolean indicator if the date is the first day of the year. - - Returns - ------- - Series - Booleans indicating if dates are the first day of the year. - - Examples - -------- - >>> import pandas as pd, cudf - >>> s = cudf.Series(pd.date_range("2017-12-30", periods=3)) - >>> dates - 0 2017-12-30 - 1 2017-12-31 - 2 2018-01-01 - dtype: datetime64[ns] - >>> dates.dt.is_year_start - 0 False - 1 False - 2 True - dtype: bool - """ - return self._return_result_like_self(self.series._column.is_year_start) - - @property # type: ignore - @_performance_tracking - def is_year_end(self) -> Series: - """ - Boolean indicator if the date is the last day of the year. - - Returns - ------- - Series - Booleans indicating if dates are the last day of the year. - - Examples - -------- - >>> import pandas as pd, cudf - >>> dates = cudf.Series(pd.date_range("2017-12-30", periods=3)) - >>> dates - 0 2017-12-30 - 1 2017-12-31 - 2 2018-01-01 - dtype: datetime64[ns] - >>> dates.dt.is_year_end - 0 False - 1 True - 2 False - dtype: bool - """ - return self._return_result_like_self(self.series._column.is_year_end) - - @_performance_tracking - def _get_dt_field(self, field: str) -> Series: - return self._return_result_like_self( - self.series._column.get_dt_field(field) - ) - - @_performance_tracking - def ceil(self, freq: str) -> Series: - """ - Perform ceil operation on the data to the specified freq. - - Parameters - ---------- - freq : str - One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. - Must be a fixed frequency like 'S' (second) not 'ME' (month end). - See `frequency aliases `__ - for more details on these aliases. - - Returns - ------- - Series - Series with all timestamps rounded up to the specified frequency. - The index is preserved. - - Examples - -------- - >>> import cudf - >>> t = cudf.Series(["2001-01-01 00:04:45", "2001-01-01 00:04:58", - ... 
"2001-01-01 00:05:04"], dtype="datetime64[ns]") - >>> t.dt.ceil("T") - 0 2001-01-01 00:05:00 - 1 2001-01-01 00:05:00 - 2 2001-01-01 00:06:00 - dtype: datetime64[ns] - """ - return self._return_result_like_self(self.series._column.ceil(freq)) - - @_performance_tracking - def floor(self, freq: str) -> Series: - """ - Perform floor operation on the data to the specified freq. - - Parameters - ---------- - freq : str - One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. - Must be a fixed frequency like 'S' (second) not 'ME' (month end). - See `frequency aliases `__ - for more details on these aliases. - - Returns - ------- - Series - Series with all timestamps rounded up to the specified frequency. - The index is preserved. - - Examples - -------- - >>> import cudf - >>> t = cudf.Series(["2001-01-01 00:04:45", "2001-01-01 00:04:58", - ... "2001-01-01 00:05:04"], dtype="datetime64[ns]") - >>> t.dt.floor("T") - 0 2001-01-01 00:04:00 - 1 2001-01-01 00:04:00 - 2 2001-01-01 00:05:00 - dtype: datetime64[ns] - """ - return self._return_result_like_self(self.series._column.floor(freq)) - - @_performance_tracking - def round(self, freq: str) -> Series: - """ - Perform round operation on the data to the specified freq. - - Parameters - ---------- - freq : str - One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"]. - Must be a fixed frequency like 'S' (second) not 'ME' (month end). - See `frequency aliases `__ - for more details on these aliases. - - Returns - ------- - Series - Series with all timestamps rounded to the specified frequency. - The index is preserved. - - Examples - -------- - >>> import cudf - >>> dt_sr = cudf.Series([ - ... "2001-01-01 00:04:45", - ... "2001-01-01 00:04:58", - ... "2001-01-01 00:05:04", - ... ], dtype="datetime64[ns]") - >>> dt_sr.dt.round("T") - 0 2001-01-01 00:05:00 - 1 2001-01-01 00:05:00 - 2 2001-01-01 00:05:00 - dtype: datetime64[ns] - """ - return self._return_result_like_self(self.series._column.round(freq)) - - @_performance_tracking - def strftime(self, date_format: str, *args, **kwargs) -> Series: - """ - Convert to Series using specified ``date_format``. - - Return a Series of formatted strings specified by ``date_format``, - which supports the same string format as the python standard library. - Details of the string format can be found in `python string format doc - `_. - - Parameters - ---------- - date_format : str - Date format string (e.g. "%Y-%m-%d"). - - Returns - ------- - Series - Series of formatted strings. - - Examples - -------- - >>> import cudf - >>> import pandas as pd - >>> weekday_series = cudf.Series(pd.date_range("2000-01-01", periods=3, - ... freq="q")) - >>> weekday_series.dt.strftime("%Y-%m-%d") - >>> weekday_series - 0 2000-03-31 - 1 2000-06-30 - 2 2000-09-30 - dtype: datetime64[ns] - 0 2000-03-31 - 1 2000-06-30 - 2 2000-09-30 - dtype: object - >>> weekday_series.dt.strftime("%Y %d %m") - 0 2000 31 03 - 1 2000 30 06 - 2 2000 30 09 - dtype: object - >>> weekday_series.dt.strftime("%Y / %d / %m") - 0 2000 / 31 / 03 - 1 2000 / 30 / 06 - 2 2000 / 30 / 09 - dtype: object - - .. 
pandas-compat:: - :meth:`pandas.DatetimeIndex.strftime` - - The following date format identifiers are not yet - supported: ``%c``, ``%x``,``%X`` - """ - - if not isinstance(date_format, str): - raise TypeError( - f"'date_format' must be str, not {type(date_format)}" - ) - - # TODO: Remove following validations - # once https://github.com/rapidsai/cudf/issues/5991 - # is implemented - not_implemented_formats = { - "%c", - "%x", - "%X", - } - for d_format in not_implemented_formats: - if d_format in date_format: - raise NotImplementedError( - f"{d_format} date-time format is not " - f"supported yet, Please follow this issue " - f"https://github.com/rapidsai/cudf/issues/5991 " - f"for tracking purposes." - ) - return self._return_result_like_self( - self.series._column.strftime(format=date_format) - ) - - @copy_docstring(DatetimeIndex.tz_localize) - def tz_localize( - self, - tz: str | None, - ambiguous: Literal["NaT"] = "NaT", - nonexistent: Literal["NaT"] = "NaT", - ) -> Series: - return self._return_result_like_self( - self.series._column.tz_localize(tz, ambiguous, nonexistent) - ) - - @copy_docstring(DatetimeIndex.tz_convert) - def tz_convert(self, tz: str | None) -> Series: - """ - Parameters - ---------- - tz : str - Time zone for time. Corresponding timestamps would be converted - to this time zone of the Datetime Array/Index. - A `tz` of None will convert to UTC and remove the - timezone information. - """ - return self._return_result_like_self( - self.series._column.tz_convert(tz) - ) - - -class TimedeltaProperties(BaseDatelikeProperties): - """ - Accessor object for timedelta-like properties of the Series values. - - Returns - ------- - Returns a Series indexed like the original Series. - - Examples - -------- - >>> import cudf - >>> seconds_series = cudf.Series([1, 2, 3], dtype='timedelta64[s]') - >>> seconds_series - 0 00:00:01 - 1 00:00:02 - 2 00:00:03 - dtype: timedelta64[s] - >>> seconds_series.dt.seconds - 0 1 - 1 2 - 2 3 - dtype: int64 - >>> series = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, - ... 3244334234], dtype='timedelta64[ms]') - >>> series - 0 141 days 13:35:12.123 - 1 14 days 06:00:31.231 - 2 13000 days 10:12:48.712 - 3 0 days 00:35:35.656 - 4 37 days 13:12:14.234 - dtype: timedelta64[ms] - >>> series.dt.components - days hours minutes seconds milliseconds microseconds nanoseconds - 0 141 13 35 12 123 0 0 - 1 14 6 0 31 231 0 0 - 2 13000 10 12 48 712 0 0 - 3 0 0 35 35 656 0 0 - 4 37 13 12 14 234 0 0 - >>> series.dt.days - 0 141 - 1 14 - 2 13000 - 3 0 - 4 37 - dtype: int64 - >>> series.dt.seconds - 0 48912 - 1 21631 - 2 36768 - 3 2135 - 4 47534 - dtype: int64 - >>> series.dt.microseconds - 0 123000 - 1 231000 - 2 712000 - 3 656000 - 4 234000 - dtype: int64 - >>> s.dt.nanoseconds - 0 0 - 1 0 - 2 0 - 3 0 - 4 0 - dtype: int64 - """ - - @property # type: ignore - @_performance_tracking - def days(self) -> Series: - """ - Number of days. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, - ... 3244334234], dtype='timedelta64[ms]') - >>> s - 0 141 days 13:35:12.123 - 1 14 days 06:00:31.231 - 2 13000 days 10:12:48.712 - 3 0 days 00:35:35.656 - 4 37 days 13:12:14.234 - dtype: timedelta64[ms] - >>> s.dt.days - 0 141 - 1 14 - 2 13000 - 3 0 - 4 37 - dtype: int64 - """ - return self._get_td_field("days") - - @property # type: ignore - @_performance_tracking - def seconds(self) -> Series: - """ - Number of seconds (>= 0 and less than 1 day). 
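The ``days``, ``seconds`` and ``microseconds`` accessors decompose each duration
into non-overlapping components; a host-side arithmetic check of the first value
from the example above (12231312123 ms):

>>> total_ms = 12231312123
>>> days, rem_ms = divmod(total_ms, 86_400_000)   # whole days
>>> seconds, rem_ms = divmod(rem_ms, 1_000)       # whole seconds within the day
>>> microseconds = rem_ms * 1_000                 # leftover milliseconds as microseconds
>>> days, seconds, microseconds
(141, 48912, 123000)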
- - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, - ... 3244334234], dtype='timedelta64[ms]') - >>> s - 0 141 days 13:35:12.123 - 1 14 days 06:00:31.231 - 2 13000 days 10:12:48.712 - 3 0 days 00:35:35.656 - 4 37 days 13:12:14.234 - dtype: timedelta64[ms] - >>> s.dt.seconds - 0 48912 - 1 21631 - 2 36768 - 3 2135 - 4 47534 - dtype: int64 - >>> s.dt.microseconds - 0 123000 - 1 231000 - 2 712000 - 3 656000 - 4 234000 - dtype: int64 - """ - return self._get_td_field("seconds") - - @property # type: ignore - @_performance_tracking - def microseconds(self) -> Series: - """ - Number of microseconds (>= 0 and less than 1 second). - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, - ... 3244334234], dtype='timedelta64[ms]') - >>> s - 0 141 days 13:35:12.123 - 1 14 days 06:00:31.231 - 2 13000 days 10:12:48.712 - 3 0 days 00:35:35.656 - 4 37 days 13:12:14.234 - dtype: timedelta64[ms] - >>> s.dt.microseconds - 0 123000 - 1 231000 - 2 712000 - 3 656000 - 4 234000 - dtype: int64 - """ - return self._get_td_field("microseconds") - - @property # type: ignore - @_performance_tracking - def nanoseconds(self) -> Series: - """ - Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, - ... 3244334234], dtype='timedelta64[ns]') - >>> s - 0 00:00:12.231312123 - 1 00:00:01.231231231 - 2 00:18:43.236768712 - 3 00:00:00.002135656 - 4 00:00:03.244334234 - dtype: timedelta64[ns] - >>> s.dt.nanoseconds - 0 123 - 1 231 - 2 712 - 3 656 - 4 234 - dtype: int64 - """ - return self._get_td_field("nanoseconds") - - @property # type: ignore - @_performance_tracking - def components(self) -> cudf.DataFrame: - """ - Return a Dataframe of the components of the Timedeltas. - - Returns - ------- - DataFrame - - Examples - -------- - >>> s = cudf.Series([12231312123, 1231231231, 1123236768712, 2135656, 3244334234], dtype='timedelta64[ms]') - >>> s - 0 141 days 13:35:12.123 - 1 14 days 06:00:31.231 - 2 13000 days 10:12:48.712 - 3 0 days 00:35:35.656 - 4 37 days 13:12:14.234 - dtype: timedelta64[ms] - >>> s.dt.components - days hours minutes seconds milliseconds microseconds nanoseconds - 0 141 13 35 12 123 0 0 - 1 14 6 0 31 231 0 0 - 2 13000 10 12 48 712 0 0 - 3 0 0 35 35 656 0 0 - 4 37 13 12 14 234 0 0 - """ # noqa: E501 - ca = ColumnAccessor(self.series._column.components(), verify=False) - return self.series._constructor_expanddim._from_data( - ca, index=self.series.index - ) - - @_performance_tracking - def _get_td_field(self, field: str) -> Series: - return self._return_result_like_self( - getattr(self.series._column, field) - ) - - -@_performance_tracking -def _align_indices(series_list, how="outer", allow_non_unique=False): - """ - Internal util to align the indices of a list of Series objects - - series_list : list of Series objects - how : {"outer", "inner"} - If "outer", the values of the resulting index are the - unique values of the index obtained by concatenating - the indices of all the series. - If "inner", the values of the resulting index are - the values common to the indices of all series. - allow_non_unique : bool - Whether or not to allow non-unique valued indices in the input - series. 
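Illustration of the intended behaviour (a sketch; the index contents shown are
the expected results under the semantics described above):

>>> a = cudf.Series([1, 2], index=[0, 1])
>>> b = cudf.Series([3, 4], index=[1, 2])
>>> [sr.index.to_arrow().to_pylist() for sr in _align_indices([a, b], how="outer")]
[[0, 1, 2], [0, 1, 2]]
>>> [sr.index.to_arrow().to_pylist() for sr in _align_indices([a, b], how="inner")]
[[1], [1]]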
- """ - if len(series_list) <= 1: - return series_list - - # check if all indices are the same - head = series_list[0].index - - all_index_equal = True - for sr in series_list[1:]: - if not sr.index.equals(head): - all_index_equal = False - break - - # check if all names are the same - all_names_equal = True - for sr in series_list[1:]: - if not sr.index.names == head.names: - all_names_equal = False - new_index_names = [None] * head.nlevels - if all_names_equal: - new_index_names = head.names - - if all_index_equal: - return series_list - - combined_index = series_list[0].index - for sr in series_list[1:]: - combined_index = ( - cudf.DataFrame(index=sr.index).join( - cudf.DataFrame(index=combined_index), - sort=True, - how=how, - ) - ).index - combined_index.names = new_index_names - - # align all Series to the combined index - result = [ - sr._align_to_index( - combined_index, how=how, allow_non_unique=allow_non_unique - ) - for sr in series_list - ] - - return result - - -@acquire_spill_lock() -@_performance_tracking -def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): - r"""Returns a boolean array where two arrays are equal within a tolerance. - - Two values in ``a`` and ``b`` are considered equal when the following - equation is satisfied. - - .. math:: - |a - b| \le \mathrm{atol} + \mathrm{rtol} |b| - - Parameters - ---------- - a : list-like, array-like or cudf.Series - Input sequence to compare. - b : list-like, array-like or cudf.Series - Input sequence to compare. - rtol : float - The relative tolerance. - atol : float - The absolute tolerance. - equal_nan : bool - If ``True``, null's in ``a`` will be considered equal - to null's in ``b``. - - Returns - ------- - Series - - See Also - -------- - np.isclose : Returns a boolean array where two arrays are element-wise - equal within a tolerance. - - Examples - -------- - >>> import cudf - >>> s1 = cudf.Series([1.9876543, 2.9876654, 3.9876543, None, 9.9, 1.0]) - >>> s2 = cudf.Series([1.987654321, 2.987654321, 3.987654321, None, 19.9, - ... 
None]) - >>> s1 - 0 1.9876543 - 1 2.9876654 - 2 3.9876543 - 3 - 4 9.9 - 5 1.0 - dtype: float64 - >>> s2 - 0 1.987654321 - 1 2.987654321 - 2 3.987654321 - 3 - 4 19.9 - 5 - dtype: float64 - >>> cudf.isclose(s1, s2) - 0 True - 1 True - 2 True - 3 False - 4 False - 5 False - dtype: bool - >>> cudf.isclose(s1, s2, equal_nan=True) - 0 True - 1 True - 2 True - 3 True - 4 False - 5 False - dtype: bool - >>> cudf.isclose(s1, s2, equal_nan=False) - 0 True - 1 True - 2 True - 3 False - 4 False - 5 False - dtype: bool - """ - - if not can_convert_to_column(a): - raise TypeError( - f"Parameter `a` is expected to be a " - f"list-like or Series object, found:{type(a)}" - ) - if not can_convert_to_column(b): - raise TypeError( - f"Parameter `b` is expected to be a " - f"list-like or Series object, found:{type(a)}" - ) - - if isinstance(a, pd.Series): - a = Series.from_pandas(a) - if isinstance(b, pd.Series): - b = Series.from_pandas(b) - - index = None - - if isinstance(a, cudf.Series) and isinstance(b, cudf.Series): - b = b.reindex(a.index) - index = cudf.Index(a.index) - - a_col = as_column(a) - a_array = cupy.asarray(a_col.data_array_view(mode="read")) - - b_col = as_column(b) - b_array = cupy.asarray(b_col.data_array_view(mode="read")) - - result = cupy.isclose( - a=a_array, b=b_array, rtol=rtol, atol=atol, equal_nan=equal_nan - ) - result_col = as_column(result) - - if a_col.null_count and b_col.null_count: - a_nulls = a_col.isnull() - b_nulls = b_col.isnull() - null_values = a_nulls | b_nulls - - if equal_nan is True: - equal_nulls = a_nulls & b_nulls - - del a_nulls, b_nulls - elif a_col.null_count: - null_values = a_col.isnull() - elif b_col.null_count: - null_values = b_col.isnull() - else: - return Series._from_column(result_col, index=index) - - result_col[null_values] = False - if equal_nan is True and a_col.null_count and b_col.null_count: - result_col[equal_nulls] = True - - return Series._from_column(result_col, index=index) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py deleted file mode 100644 index 0e66f383ca0..00000000000 --- a/python/cudf/cudf/core/single_column_frame.py +++ /dev/null @@ -1,391 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -"""Base class for Frame types that only have a single column.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from typing_extensions import Self - -import cudf -from cudf.api.extensions import no_default -from cudf.api.types import ( - _is_scalar_or_zero_d_array, - is_integer, - is_numeric_dtype, -) -from cudf.core.column import ColumnBase, as_column -from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame -from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import NotIterable - -if TYPE_CHECKING: - from collections.abc import Hashable - - import cupy - import numpy - import pyarrow as pa - - from cudf._typing import NotImplementedType, ScalarLike - - -class SingleColumnFrame(Frame, NotIterable): - """A one-dimensional frame. - - Frames with only a single column (Index or Series) - share certain logic that is encoded in this class. 
- """ - - _SUPPORT_AXIS_LOOKUP = { - 0: 0, - "index": 0, - } - - @_performance_tracking - def _reduce( - self, - op, - axis=no_default, - numeric_only=False, - **kwargs, - ): - if axis not in (None, 0, no_default): - raise NotImplementedError("axis parameter is not implemented yet") - - if numeric_only and not is_numeric_dtype(self.dtype): - raise TypeError( - f"Series.{op} does not allow numeric_only={numeric_only} " - "with non-numeric dtypes." - ) - try: - return getattr(self._column, op)(**kwargs) - except AttributeError: - raise TypeError(f"cannot perform {op} with type {self.dtype}") - - @_performance_tracking - def _scan(self, op, axis=None, *args, **kwargs): - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - return super()._scan(op, axis=axis, *args, **kwargs) - - @property # type: ignore - @_performance_tracking - def name(self): - """Get the name of this object.""" - return next(iter(self._column_names)) - - @name.setter # type: ignore - @_performance_tracking - def name(self, value): - self._data[value] = self._data.pop(self.name) - - @property # type: ignore - @_performance_tracking - def ndim(self) -> int: # noqa: D401 - """Number of dimensions of the underlying data, by definition 1.""" - return 1 - - @property # type: ignore - @_performance_tracking - def shape(self) -> tuple[int]: - """Get a tuple representing the dimensionality of the Index.""" - return (len(self),) - - @property # type: ignore - @_performance_tracking - def _num_columns(self) -> int: - return 1 - - @property # type: ignore - @_performance_tracking - def _column(self) -> ColumnBase: - return next(iter(self._columns)) - - @property # type: ignore - @_performance_tracking - def values(self) -> cupy.ndarray: # noqa: D102 - return self._column.values - - @property # type: ignore - @_performance_tracking - def values_host(self) -> numpy.ndarray: # noqa: D102 - return self._column.values_host - - @classmethod - @_performance_tracking - def _from_column( - cls, column: ColumnBase, *, name: Hashable = None - ) -> Self: - """Constructor for a single Column.""" - ca = ColumnAccessor({name: column}, verify=False) - return cls._from_data(ca) - - @classmethod - @_performance_tracking - def from_arrow(cls, array) -> Self: - raise NotImplementedError - - @_performance_tracking - def to_arrow(self) -> pa.Array: - """ - Convert to a PyArrow Array. - - Returns - ------- - PyArrow Array - - Examples - -------- - >>> import cudf - >>> sr = cudf.Series(["a", "b", None]) - >>> sr.to_arrow() - - [ - "a", - "b", - null - ] - >>> ind = cudf.Index(["a", "b", None]) - >>> ind.to_arrow() - - [ - "a", - "b", - null - ] - """ - return self._column.to_arrow() - - def _to_frame( - self, name: Hashable, index: cudf.Index | None - ) -> cudf.DataFrame: - """Helper function for Series.to_frame, Index.to_frame""" - if name is no_default: - col_name = 0 if self.name is None else self.name - else: - col_name = name - ca = ColumnAccessor({col_name: self._column}, verify=False) - return cudf.DataFrame._from_data(ca, index=index) - - @property # type: ignore - @_performance_tracking - def is_unique(self) -> bool: - """Return boolean if values in the object are unique. - - Returns - ------- - bool - """ - return self._column.is_unique - - @property # type: ignore - @_performance_tracking - def is_monotonic_increasing(self) -> bool: - """Return boolean if values in the object are monotonically increasing. 
- - Returns - ------- - bool - """ - return self._column.is_monotonic_increasing - - @property # type: ignore - @_performance_tracking - def is_monotonic_decreasing(self) -> bool: - """Return boolean if values in the object are monotonically decreasing. - - Returns - ------- - bool - """ - return self._column.is_monotonic_decreasing - - @property # type: ignore - @_performance_tracking - def __cuda_array_interface__(self): - # While the parent column class has a `__cuda_array_interface__` method - # defined, it is not implemented for all column types. When it is not - # implemented, though, at the Frame level we really want to throw an - # AttributeError. - try: - return self._column.__cuda_array_interface__ - except NotImplementedError: - raise AttributeError( - f"'{type(self).__name__}' object has no attribute " - "'__cuda_array_interface__'" - ) - - @_performance_tracking - def factorize( - self, sort: bool = False, use_na_sentinel: bool = True - ) -> tuple[cupy.ndarray, cudf.Index]: - """Encode the input values as integer labels. - - Parameters - ---------- - sort : bool, default True - Sort uniques and shuffle codes to maintain the relationship. - use_na_sentinel : bool, default True - If True, the sentinel -1 will be used for NA values. - If False, NA values will be encoded as non-negative - integers and will not drop the NA from the uniques - of the values. - - Returns - ------- - (labels, cats) : (cupy.ndarray, cupy.ndarray or Index) - - *labels* contains the encoded values - - *cats* contains the categories in order that the N-th - item corresponds to the (N-1) code. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series(['a', 'a', 'c']) - >>> codes, uniques = s.factorize() - >>> codes - array([0, 0, 1], dtype=int8) - >>> uniques - Index(['a', 'c'], dtype='object') - """ - return cudf.core.algorithms.factorize( - self, - sort=sort, - use_na_sentinel=use_na_sentinel, - ) - - @_performance_tracking - def _make_operands_for_binop( - self, - other: Any, - fill_value: Any = None, - reflect: bool = False, - ) -> ( - dict[str | None, tuple[ColumnBase, Any, bool, Any]] - | NotImplementedType - ): - """Generate the dictionary of operands used for a binary operation. - - Parameters - ---------- - other : SingleColumnFrame - The second operand. - fill_value : Any, default None - The value to replace null values with. If ``None``, nulls are not - filled before the operation. - reflect : bool, default False - If ``True``, swap the order of the operands. See - https://docs.python.org/3/reference/datamodel.html#object.__ror__ - for more information on when this is necessary. - - Returns - ------- - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]] - The operands to be passed to _colwise_binop. - """ - # Get the appropriate name for output operations involving two objects - # that are Series-like objects. The output shares the lhs's name unless - # the rhs is a _differently_ named Series-like object. - if isinstance( - other, SingleColumnFrame - ) and not cudf.utils.utils._is_same_name(self.name, other.name): - result_name = None - else: - result_name = self.name - - if isinstance(other, SingleColumnFrame): - other = other._column - elif not _is_scalar_or_zero_d_array(other): - if not hasattr( - other, "__cuda_array_interface__" - ) and not isinstance(other, cudf.RangeIndex): - return NotImplemented - - # Non-scalar right operands are valid iff they convert to columns. 
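            # For example, a plain list such as [1, 2, 3] or a NumPy array converts
            # cleanly through as_column below, whereas an object that cannot be
            # interpreted as a column falls through to returning NotImplemented.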
- try: - other = as_column(other) - except Exception: - return NotImplemented - - return {result_name: (self._column, other, reflect, fill_value)} - - @_performance_tracking - def nunique(self, dropna: bool = True) -> int: - """ - Return count of unique values for the column. - - Parameters - ---------- - dropna : bool, default True - Don't include NaN in the counts. - - Returns - ------- - int - Number of unique values in the column. - """ - return self._column.distinct_count(dropna=dropna) - - def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: - # A generic method for getting elements from a column that supports a - # wide range of different inputs. This method should only used where - # _absolutely_ necessary, since in almost all cases a more specific - # method can be used e.g. element_indexing or slice. - if _is_scalar_or_zero_d_array(arg): - if not is_integer(arg): - raise ValueError( - "Can only select elements with an integer, " - f"not a {type(arg).__name__}" - ) - return self._column.element_indexing(int(arg)) - elif isinstance(arg, slice): - start, stop, stride = arg.indices(len(self)) - return self._column.slice(start, stop, stride) - else: - arg = as_column(arg) - if len(arg) == 0: - arg = cudf.core.column.column_empty(0, dtype="int32") - if arg.dtype.kind in "iu": - return self._column.take(arg) - if arg.dtype.kind == "b": - if (bn := len(arg)) != (n := len(self)): - raise IndexError( - f"Boolean mask has wrong length: {bn} not {n}" - ) - return self._column.apply_boolean_mask(arg) - raise NotImplementedError(f"Unknown indexer {type(arg)}") - - @_performance_tracking - def where(self, cond, other=None, inplace=False): - from cudf.core._internals.where import ( - _check_and_cast_columns_with_other, - ) - - if isinstance(other, cudf.DataFrame): - raise NotImplementedError( - "cannot align with a higher dimensional Frame" - ) - cond = as_column(cond) - if len(cond) != len(self): - raise ValueError( - """Array conditional must be same shape as self""" - ) - - if not cudf.api.types.is_scalar(other): - other = cudf.core.column.as_column(other) - - input_col, other = _check_and_cast_columns_with_other( - source_col=self._column, other=other, inplace=inplace - ) - - result = cudf._lib.copying.copy_if_else(input_col, other, cond) - return result._with_type_metadata(self.dtype) - - @_performance_tracking - def transpose(self): - """Return the transpose, which is by definition self.""" - return self - - T = property(transpose, doc=transpose.__doc__) diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py deleted file mode 100644 index 9e59b134b73..00000000000 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import warnings - -import cupy as cp - -from cudf._lib.nvtext.subword_tokenize import ( - Hashed_Vocabulary as cpp_hashed_vocabulary, - subword_tokenize_inmem_hash as cpp_subword_tokenize, -) - - -def _cast_to_appropriate_type(ar, cast_type): - if cast_type == "cp": - return ar - - if cast_type == "pt": - from torch.utils.dlpack import from_dlpack - - elif cast_type == "tf": - from tensorflow.experimental.dlpack import from_dlpack - - return from_dlpack(ar.astype("int32").toDlpack()) - - -class SubwordTokenizer: - """ - Run CUDA BERT subword tokenizer on cuDF strings column. - Encodes words to token ids using vocabulary from a pretrained - tokenizer. 
- This function requires about 21x the number of character bytes - in the input strings column as working memory. - - Parameters - ---------- - hash_file : str - Path to hash file containing vocabulary of words with token-ids. - This can be created from the raw vocabulary - using the ``cudf.utils.hash_vocab_utils.hash_vocab`` function - - do_lower : bool, Default is True - If set to True, original text will be lowercased before encoding. - - Returns - ------- - SubwordTokenizer - """ - - def __init__(self, hash_file: str, do_lower_case: bool = True): - self.do_lower_case = do_lower_case - self.vocab_file = cpp_hashed_vocabulary(hash_file) - - def __call__( - self, - text, - max_length: int, - max_num_rows: int, - add_special_tokens: bool = True, - padding: str = "max_length", - truncation: bool | str = False, - stride: int = 0, - return_tensors: str = "cp", - return_token_type_ids: bool = False, - ): - """ - Run CUDA BERT subword tokenizer on cuDF strings column. - Encodes words to token ids using vocabulary from a - pretrained tokenizer. - - Parameters - ---------- - text : cudf string series - The batch of sequences to be encoded. - - max_length : int - Controls the maximum length to use or pad to. - - max_num_rows : int - Maximum number of rows for the output token-ids expected to - be generated by the tokenizer. - Used for allocating temporary working memory on the GPU device. - If the output generates a larger number of rows, - behavior is undefined. - This will vary based on stride, truncation, and max_length. - For example, for non-overlapping sequences output rows will be - the same as input rows. - A good default can be twice the max_length - - add_special_tokens : bool, optional, defaults to True - Whether or not to encode the sequences with the special tokens - of the BERT classification model - - padding : "max_length" - Pad to a maximum length specified with the argument max_length - - truncation : bool, defaults to False - True: - Truncate to a maximum length specified with the argument max_length - False or 'do_not_truncate': default - No truncation (Output differs from HuggingFace) - - stride : int, optional, defaults to 0 - The value of this argument defines the number of - overlapping tokens. - The information about the overlapping tokens is - present in the metadata outputted. - - return_tensors : str, {"cp", "pt", "tf"} defaults to "cp" - "cp" : Return cupy cp.ndarray objects - "tf" : Return TensorFlow tf.constant objects - "pt" : Return PyTorch torch.Tensor objects - - - return_token_type_ids : bool, optional - Only False currently supported - - Returns - ------- - An encoding with the following fields: - input_ids:(type defined by return_tensors) - A tensor of token ids to be fed to the model. - attention_mask: (type defined by return_tensors) - A tensor of indices specifying which tokens - should be attended to by the model - metadata: (type defined by return_tensors) - Each row contains the index id of the original string and the - first and last index of the token-ids that are non-padded and - non-overlapping - - Examples - -------- - >>> import cudf - >>> from cudf.utils.hash_vocab_utils import hash_vocab - >>> hash_vocab('bert-base-cased-vocab.txt', 'voc_hash.txt') - - - >>> from cudf.core.subword_tokenizer import SubwordTokenizer - >>> cudf_tokenizer = SubwordTokenizer('voc_hash.txt', - ... do_lower_case=True) - >>> str_series = cudf.Series(['This is the', 'best book']) - >>> tokenizer_output = cudf_tokenizer(str_series, - ... max_length=8, - ... 
max_num_rows=len(str_series), - ... padding='max_length', - ... return_tensors='pt', - ... truncation=True) - >>> tokenizer_output['input_ids'] - tensor([[ 101, 1142, 1110, 1103, 102, 0, 0, 0], - [ 101, 1436, 1520, 102, 0, 0, 0, 0]], - device='cuda:0', - dtype=torch.int32) - >>> tokenizer_output['attention_mask'] - tensor([[1, 1, 1, 1, 1, 0, 0, 0], - [1, 1, 1, 1, 0, 0, 0, 0]], - device='cuda:0', dtype=torch.int32) - >>> tokenizer_output['metadata'] - tensor([[0, 1, 3], - [1, 1, 2]], device='cuda:0', dtype=torch.int32) - """ - - if return_token_type_ids: - # raise not currently supported - # Can also return zeros - error_msg = "Returning token_type_ids is currently supported" - raise NotImplementedError(error_msg) - - if truncation in (False, "do_not_truncate"): - if add_special_tokens: - error_msg = ( - "Adding special tokens is not supported " - f"with truncation = {truncation}. " - ) - recommendation = ( - "Custom Cupy kernel can potentially " - "be used to add it. For reference " - "see: _bert_add_special_tokens" - ) - raise NotImplementedError(error_msg + recommendation) - - truncation = False - warning_msg = ( - "When truncation is not True, the behavior currently differs " - "from HuggingFace as cudf always returns overflowing tokens" - ) - warnings.warn(warning_msg) - - if padding != "max_length": - error_msg = ( - "Only padding to the provided max_length" - "is currently supported" - ) - raise NotImplementedError(error_msg) - - if max_length <= stride: - error_msg = "Stride should be less than max_length" - raise ValueError(error_msg) - - if return_tensors not in {"cp", "pt", "tf"}: - error_msg = ( - "Only cupy(cp), pytorch(pt) and tensorflow(tf) " - "tensors are supported" - ) - raise NotImplementedError(error_msg) - - stride = max_length - stride - # behavior varies from subword_tokenize but maps with huggingface - - input_ids, attention_mask, metadata = cpp_subword_tokenize( - text._column, - self.vocab_file, - max_sequence_length=max_length, - stride=stride, - do_lower=self.do_lower_case, - do_truncate=truncation, - ) - - tokenizer_output = { - "input_ids": cp.asarray(input_ids).reshape(-1, max_length), - "attention_mask": cp.asarray(attention_mask).reshape( - -1, max_length - ), - "metadata": cp.asarray(metadata).reshape(-1, 3), - } - - if add_special_tokens: - tokenizer_output = _bert_add_special_tokens(tokenizer_output) - - tokenizer_output = { - k: _cast_to_appropriate_type(v, return_tensors) - for k, v in tokenizer_output.items() - } - - return tokenizer_output - - -def _bert_add_special_tokens(token_o): - """ - Adds special tokens (CLS,SEP) which are often used by pre-trained BERT - models to input_ids and adjusts attention_mask and metadata to account - for them. 
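A simplified end-to-end sketch of the same transformation on a toy batch
(made-up token ids; 101 and 102 are the conventional BERT [CLS]/[SEP] ids; the
clipping applied to overflowing rows is omitted here):

>>> import cupy as cp
>>> ids = cp.array([[1142, 1110, 1103, 0, 0, 0]], dtype=cp.int32)
>>> shifted = ids.copy()
>>> shifted[:, 1:-1] = ids[:, 0:-2]                   # shift right to make room at the front
>>> shifted[:, 0] = 101                               # [CLS] at the start
>>> seq_end = (shifted != 0).sum(axis=1)              # first padding position per row
>>> shifted[cp.arange(len(shifted)), seq_end] = 102   # [SEP] after the last real token
>>> shifted.tolist()
[[101, 1142, 1110, 1103, 102, 0]]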
- """ - max_length = token_o["input_ids"].shape[1] - seq_end_col = max_length - (token_o["input_ids"][:, ::-1] != 0).argmax(1) - # clipping to take overflow into account - seq_end_col = cp.clip(seq_end_col + 1, a_min=None, a_max=max_length - 1) - - _bert_add_special_tokens_input_ids(token_o["input_ids"], seq_end_col) - _bert_add_special_tokens_attention_mask( - token_o["attention_mask"], seq_end_col - ) - _bert_add_special_tokens_metadata(token_o["metadata"], max_length) - - return token_o - - -def _bert_add_special_tokens_input_ids(input_ids, seq_end_col): - """ - Add token ids for special tokens ([CLS] and [SEP]) to - the start and end of each sequence - """ - # Mark sequence start with [CLS] token mapping to the start of sequence - input_ids[:, 1:-1] = input_ids[:, 0:-2] - input_ids[:, 0] = 101 - # Mark end of sequence [SEP] - - input_ids[ - cp.arange(0, input_ids.shape[0], dtype=cp.uint32), seq_end_col - ] = 102 - - -def _bert_add_special_tokens_attention_mask(attention_mask, seq_end_col): - """ - Mark attention mask for special tokens ([CLS] and [SEP]) with 1 - """ - # Copy attention masks for all but last two - attention_mask[:, 1:-1] = attention_mask[:, 0:-2] - # Mark [CLS] token with 1 - attention_mask[:, 0] = 1 - # Mark [SEP] token with 1 - attention_mask[ - cp.arange(0, attention_mask.shape[0], dtype=cp.uint32), seq_end_col - ] = 1 - - -def _bert_add_special_tokens_metadata(metadata, max_length): - """ - Edit metadata to account for the added special tokens ([CLS] and [SEP]) - """ - # metadata seq starts from plus 1 - metadata[:, 1] = metadata[:, 1] + 1 - # clip done to take overflow into account - metadata[:, 2] = cp.clip( - metadata[:, 2] + 1, a_min=None, a_max=max_length - 2 - ) diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py deleted file mode 100644 index 99d85c0c5c0..00000000000 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import cudf -from cudf._lib.nvtext.tokenize import ( - TokenizeVocabulary as cpp_tokenize_vocabulary, - tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, -) - - -class TokenizeVocabulary: - """ - A vocabulary object used to tokenize input text. - - Parameters - ---------- - vocabulary : str - Strings column of vocabulary terms - """ - - def __init__(self, vocabulary: "cudf.Series"): - self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column) - - def tokenize( - self, text, delimiter: str = "", default_id: int = -1 - ) -> cudf.Series: - """ - Parameters - ---------- - text : cudf string series - The strings to be tokenized. - delimiter : str - Delimiter to identify tokens. Default is whitespace. - default_id : int - Value to use for tokens not found in the vocabulary. - Default is -1. 
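A minimal usage sketch (illustrative strings; the ids are expected to be the row
position of each term in the vocabulary column, with ``default_id`` used for
tokens that are not in the vocabulary):

>>> import cudf
>>> vocab = cudf.Series(["brown", "the", "fox"])
>>> tokenizer = TokenizeVocabulary(vocab)
>>> tokenizer.tokenize(cudf.Series(["the brown fox", "the lazy dog"]))
0      [1, 0, 2]
1    [1, -1, -1]
dtype: list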
- - Returns - ------- - Tokenized strings - """ - if delimiter is None: - delimiter = "" - delim = cudf.Scalar(delimiter, dtype="str") - result = cpp_tokenize_with_vocabulary( - text._column, self.vocabulary, delim, default_id - ) - - return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/tools/__init__.py b/python/cudf/cudf/core/tools/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py deleted file mode 100644 index 68f34fa28ff..00000000000 --- a/python/cudf/cudf/core/tools/datetimes.py +++ /dev/null @@ -1,1057 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import math -import re -import warnings -from typing import Literal, Sequence - -import numpy as np -import pandas as pd -import pandas.tseries.offsets as pd_offset -from typing_extensions import Self - -import cudf -from cudf import _lib as libcudf -from cudf._lib.strings.convert.convert_integers import ( - is_integer as cpp_is_integer, -) -from cudf.api.types import is_integer, is_scalar -from cudf.core import column -from cudf.core.index import ensure_index - -# https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 -_unit_map = { - "year": "year", - "years": "year", - "month": "month", - "months": "month", - "day": "day", - "days": "day", - "hour": "h", - "hours": "h", - "minute": "m", - "minutes": "m", - "second": "s", - "seconds": "s", - "ms": "ms", - "millisecond": "ms", - "milliseconds": "ms", - "us": "us", - "microsecond": "us", - "microseconds": "us", - "ns": "ns", - "nanosecond": "ns", - "nanoseconds": "ns", -} - -_unit_dtype_map = { - "ns": "datetime64[ns]", - "us": "datetime64[us]", - "ms": "datetime64[ms]", - "m": "datetime64[s]", - "h": "datetime64[s]", - "s": "datetime64[s]", - "D": "datetime64[s]", -} - - -def to_datetime( - arg, - errors: Literal["raise", "coerce", "warn", "ignore"] = "raise", - dayfirst: bool = False, - yearfirst: bool = False, - utc: bool = False, - format: str | None = None, - exact: bool = True, - unit: str = "ns", - infer_datetime_format: bool = True, - origin="unix", - cache: bool = True, -): - """ - Convert argument to datetime. - - Parameters - ---------- - arg : int, float, str, datetime, list, tuple, 1-d array, - Series DataFrame/dict-like - The object to convert to a datetime. - errors : {'ignore', 'raise', 'coerce', 'warn'}, default 'raise' - - If 'raise', then invalid parsing will raise an exception. - - If 'coerce', then invalid parsing will be set as NaT. - - If 'warn' : prints last exceptions as warnings and - return the input. - - If 'ignore', then invalid parsing will return the input. - dayfirst : bool, default False - Specify a date parse order if `arg` is str or its list-likes. - If True, parses dates with the day first, eg 10/11/12 is parsed as - 2012-11-10. - Warning: dayfirst=True is not strict, but will prefer to parse - with day first (this is a known bug, based on dateutil behavior). - utc : bool, default False - Whether the result should be have a UTC timezone. - format : str, default None - The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse - all the way up to nanoseconds. - See strftime documentation for more information on choices: - https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. - unit : str, default 'ns' - The unit of the arg (D,s,ms,us,ns) denote the unit, which is an - integer or float number. 
This will be based off the - origin(unix epoch start). - Example, with unit='ms' and origin='unix' (the default), this - would calculate the number of milliseconds to the unix epoch start. - infer_datetime_format : bool, default True - If True and no `format` is given, attempt to infer the format of the - datetime strings, and if it can be inferred, switch to a faster - method of parsing them. In some cases this can increase the parsing - speed by ~5-10x. - - Returns - ------- - datetime - If parsing succeeded. - Return type depends on input: - - list-like: DatetimeIndex - - Series: Series of datetime64 dtype - - scalar: Timestamp - - Examples - -------- - Assembling a datetime from multiple columns of a DataFrame. The keys can be - common abbreviations like ['year', 'month', 'day', 'minute', 'second', - 'ms', 'us', 'ns']) or plurals of the same - - >>> import cudf - >>> df = cudf.DataFrame({'year': [2015, 2016], - ... 'month': [2, 3], - ... 'day': [4, 5]}) - >>> cudf.to_datetime(df) - 0 2015-02-04 - 1 2016-03-05 - dtype: datetime64[ns] - >>> cudf.to_datetime(1490195805, unit='s') - numpy.datetime64('2017-03-22T15:16:45.000000000') - >>> cudf.to_datetime(1490195805433502912, unit='ns') - numpy.datetime64('1780-11-20T01:02:30.494253056') - """ - if errors not in {"ignore", "raise", "coerce", "warn"}: - raise ValueError( - f"errors parameter has to be either one of: " - f"{['ignore', 'raise', 'coerce', 'warn']}, found: " - f"{errors}" - ) - elif errors in {"ignore", "coerce"} and not is_scalar(arg): - raise NotImplementedError( - f"{errors=} is not implemented when arg is not scalar-like" - ) - - if errors == "ignore": - warnings.warn( - "errors='ignore' is deprecated and will raise in a " - "future version. Use to_datetime without passing `errors` " - "and catch exceptions explicitly instead", - FutureWarning, - ) - - if infer_datetime_format in {None, False}: - warnings.warn( - "`infer_datetime_format` is deprecated and will " - "be removed in a future version of cudf.", - FutureWarning, - ) - - if arg is None: - return None - - if exact is False: - raise NotImplementedError("exact support is not yet implemented") - - if origin != "unix": - raise NotImplementedError("origin support is not yet implemented") - - if yearfirst: - raise NotImplementedError("yearfirst support is not yet implemented") - - if format is not None: - if "%Z" in format or "%z" in format: - raise NotImplementedError( - "cuDF does not yet support timezone-aware datetimes" - ) - elif "%f" in format: - format = format.replace("%f", "%9f") - - try: - if isinstance(arg, cudf.DataFrame): - # we require at least Ymd - required = ["year", "month", "day"] - req = list(set(required) - set(arg._column_names)) - if len(req): - err_req = ",".join(req) - raise ValueError( - f"to assemble mappings requires at least that " - f"[year, month, day] be specified: [{err_req}] " - f"is missing" - ) - - # replace passed column name with values in _unit_map - got_units = {k: get_units(k) for k in arg._column_names} - unit_rev = {v: k for k, v in got_units.items()} - - # keys we don't recognize - excess = set(unit_rev.keys()) - set(_unit_map.values()) - if len(excess): - err_excess = ",".join(excess) - raise ValueError( - f"extra keys have been passed to the " - f"datetime assemblage: [{err_excess}]" - ) - - new_series = ( - arg[unit_rev["year"]].astype("str") - + "-" - + arg[unit_rev["month"]].astype("str").str.zfill(2) - + "-" - + arg[unit_rev["day"]].astype("str").str.zfill(2) - ) - format = "%Y-%m-%d" - for u in ["h", "m", "s", "ms", "us", 
"ns"]: - value = unit_rev.get(u) - if value is not None and value in arg: - arg_col = arg._data[value] - if arg_col.dtype.kind == "f": - col = new_series._column.strptime( - cudf.dtype("datetime64[ns]"), format=format - ) - break - elif arg_col.dtype.kind == "O": - if not cpp_is_integer(arg_col).all(): - col = new_series._column.strptime( - cudf.dtype("datetime64[ns]"), format=format - ) - break - else: - col = new_series._column.strptime( - cudf.dtype("datetime64[s]"), format=format - ) - - times_column = None - for u in ["h", "m", "s", "ms", "us", "ns"]: - value = unit_rev.get(u) - if value is not None and value in arg: - current_col = arg._data[value] - # If the arg[value] is of int or - # float dtype we don't want to type-cast - if current_col.dtype.kind in ("O"): - try: - current_col = current_col.astype(dtype="int64") - except ValueError: - current_col = current_col.astype(dtype="float64") - - factor = cudf.Scalar( - column.datetime._unit_to_nanoseconds_conversion[u] - / ( - column.datetime._unit_to_nanoseconds_conversion[ - "s" - ] - if np.datetime_data(col.dtype)[0] == "s" - else 1 - ) - ) - - if times_column is None: - times_column = current_col * factor - else: - times_column = times_column + (current_col * factor) - if times_column is not None: - col = (col.astype(dtype="int64") + times_column).astype( - dtype=col.dtype - ) - col = _process_col( - col=col, - unit=unit, - dayfirst=dayfirst, - infer_datetime_format=infer_datetime_format, - format=format, - utc=utc, - ) - return cudf.Series._from_column(col, index=arg.index) - else: - col = _process_col( - col=column.as_column(arg), - unit=unit, - dayfirst=dayfirst, - infer_datetime_format=infer_datetime_format, - format=format, - utc=utc, - ) - if isinstance(arg, (cudf.BaseIndex, pd.Index)): - return cudf.DatetimeIndex._from_column(col, name=arg.name) - elif isinstance(arg, (cudf.Series, pd.Series)): - return cudf.Series._from_column( - col, name=arg.name, index=ensure_index(arg.index) - ) - elif is_scalar(arg): - return col.element_indexing(0) - else: - return cudf.Index._from_column(col) - except Exception as e: - if errors == "raise": - raise e - elif errors == "warn": - import traceback - - tb = traceback.format_exc() - warnings.warn(tb) - elif errors == "ignore": - pass - elif errors == "coerce": - return np.datetime64("nat", "ns" if unit is None else unit) - return arg - - -def _process_col( - col, - unit: str, - dayfirst: bool, - infer_datetime_format: bool, - format: str | None, - utc: bool, -): - if col.dtype.kind == "f": - if unit not in (None, "ns"): - factor = cudf.Scalar( - column.datetime._unit_to_nanoseconds_conversion[unit] - ) - col = col * factor - - if format is not None: - # Converting to int because, - # pandas actually creates a datetime column - # out of float values and then creates an - # int column out of it to parse against `format`. - # Instead we directly cast to int and perform - # parsing against `format`. 
- col = ( - col.astype("int") - .astype("str") - .strptime( - dtype=cudf.dtype("datetime64[us]") - if "%f" in format - else cudf.dtype("datetime64[s]"), - format=format, - ) - ) - else: - col = col.astype(dtype="datetime64[ns]") - - elif col.dtype.kind in "iu": - if unit in ("D", "h", "m"): - factor = cudf.Scalar( - column.datetime._unit_to_nanoseconds_conversion[unit] - / column.datetime._unit_to_nanoseconds_conversion["s"] - ) - col = col * factor - - if format is not None: - col = col.astype("str").strptime( - dtype=cudf.dtype(_unit_dtype_map[unit]), format=format - ) - else: - col = col.astype(dtype=cudf.dtype(_unit_dtype_map[unit])) - - elif col.dtype.kind == "O": - if unit not in (None, "ns") or col.null_count == len(col): - try: - col = col.astype(dtype="int64") - except ValueError: - col = col.astype(dtype="float64") - return _process_col( - col=col, - unit=unit, - dayfirst=dayfirst, - infer_datetime_format=infer_datetime_format, - format=format, - utc=utc, - ) - else: - if format is None: - if not infer_datetime_format and dayfirst: - raise NotImplementedError( - f"{dayfirst=} not implemented " - f"when {format=} and {infer_datetime_format=}." - ) - format = column.datetime.infer_format( - element=col.element_indexing(0), - dayfirst=dayfirst, - ) - col = col.strptime( - dtype=cudf.dtype(_unit_dtype_map[unit]), - format=format, - ) - elif col.dtype.kind != "M": - raise TypeError( - f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}" - ) - if utc and not isinstance(col.dtype, pd.DatetimeTZDtype): - return col.tz_localize("UTC") - return col - - -def get_units(value): - if value in _unit_map: - return _unit_map[value] - - # m is case significant - if value.lower() in _unit_map: - return _unit_map[value.lower()] - - return value - - -class DateOffset: - """ - An object used for binary ops where calendrical arithmetic - is desired rather than absolute time arithmetic. Used to - add or subtract a whole number of periods, such as several - months or years, to a series or index of datetime dtype. - Works similarly to pd.DateOffset, but stores the offset - on the device (GPU). - - Parameters - ---------- - n : int, default 1 - The number of time periods the offset represents. - **kwds - Temporal parameter that add to or replace the offset value. - Parameters that **add** to the offset (like Timedelta): - - months - - See Also - -------- - pandas.DateOffset : The equivalent Pandas object that this - object replicates - - Examples - -------- - >>> from cudf import DateOffset - >>> ts = cudf.Series([ - ... "2000-01-01 00:00:00.012345678", - ... "2000-01-31 00:00:00.012345678", - ... "2000-02-29 00:00:00.012345678", - ... ], dtype='datetime64[ns]') - >>> ts + DateOffset(months=3) - 0 2000-04-01 00:00:00.012345678 - 1 2000-04-30 00:00:00.012345678 - 2 2000-05-29 00:00:00.012345678 - dtype: datetime64[ns] - >>> ts - DateOffset(months=12) - 0 1999-01-01 00:00:00.012345678 - 1 1999-01-31 00:00:00.012345678 - 2 1999-02-28 00:00:00.012345678 - dtype: datetime64[ns] - - Notes - ----- - Note that cuDF does not yet support DateOffset arguments - that 'replace' units in the datetime data being operated on - such as - - year - - month - - week - - day - - hour - - minute - - second - - microsecond - - millisecond - - nanosecond - - cuDF does not yet support rounding via a `normalize` - keyword argument. 
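Internally a ``years`` argument is folded into ``months`` (one year equals
twelve months), so the following two offsets are expected to behave identically
(illustrative check):

>>> cudf.Series(["2000-01-31"], dtype="datetime64[ns]") + DateOffset(years=1, months=2)
0   2001-03-31
dtype: datetime64[ns]
>>> cudf.Series(["2000-01-31"], dtype="datetime64[ns]") + DateOffset(months=14)
0   2001-03-31
dtype: datetime64[ns]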
- """ - - _UNITS_TO_CODES = { - "nanoseconds": "ns", - "microseconds": "us", - "milliseconds": "ms", - "seconds": "s", - "minutes": "m", - "hours": "h", - "days": "D", - "weeks": "W", - "months": "M", - "years": "Y", - } - - _CODES_TO_UNITS = { - "N": "nanoseconds", - "ns": "nanoseconds", - "U": "microseconds", - "us": "microseconds", - "ms": "milliseconds", - "L": "milliseconds", - "s": "seconds", - "S": "seconds", - "m": "minutes", - "min": "minutes", - "T": "minutes", - "h": "hours", - "H": "hours", - "D": "days", - "W": "weeks", - "M": "months", - "Y": "years", - } - - _TICK_OR_WEEK_TO_UNITS = { - pd_offset.Week: "weeks", - pd_offset.Day: "days", - pd_offset.Hour: "hours", - pd_offset.Minute: "minutes", - pd_offset.Second: "seconds", - pd_offset.Milli: "milliseconds", - pd_offset.Micro: "microseconds", - pd_offset.Nano: "nanoseconds", - } - - _FREQSTR_REGEX = re.compile("([-+]?[0-9]*)([a-zA-Z]+)") - - def __init__(self, n=1, normalize=False, **kwds): - if normalize: - raise NotImplementedError( - "normalize not yet supported for DateOffset" - ) - - all_possible_units = { - "years", - "months", - "weeks", - "days", - "hours", - "minutes", - "seconds", - "milliseconds", - "microseconds", - "nanoseconds", - "year", - "month", - "week", - "day", - "hour", - "minute", - "second", - "microsecond", - "millisecond", - "nanosecond", - } - - supported_units = { - "years", - "months", - "weeks", - "days", - "hours", - "minutes", - "seconds", - "milliseconds", - "microseconds", - "nanoseconds", - } - - unsupported_units = all_possible_units - supported_units - - invalid_kwds = set(kwds) - supported_units - unsupported_units - if invalid_kwds: - raise TypeError( - f"Keyword arguments '{','.join(list(invalid_kwds))}'" - " are not recognized" - ) - - unsupported_kwds = set(kwds) & unsupported_units - if unsupported_kwds: - raise NotImplementedError( - f"Keyword arguments '{','.join(list(unsupported_kwds))}'" - " are not yet supported." 
- ) - - if any(not is_integer(val) for val in kwds.values()): - raise ValueError("Non-integer periods not supported") - - self._kwds = kwds - kwds = self._combine_months_and_years(**kwds) - kwds = self._combine_kwargs_to_seconds(**kwds) - - scalars = {} - for k, v in kwds.items(): - if k in all_possible_units: - # Months must be int16 - if k == "months": - # TODO: throw for out-of-bounds int16 values - dtype = "int16" - else: - unit = self._UNITS_TO_CODES[k] - dtype = cudf.dtype(f"timedelta64[{unit}]") - scalars[k] = cudf.Scalar(v, dtype=dtype) - - self._scalars = scalars - - @property - def kwds(self): - return self._kwds - - def _combine_months_and_years(self, **kwargs): - # TODO: if months is zero, don't do a binop - kwargs["months"] = kwargs.pop("years", 0) * 12 + kwargs.pop( - "months", 0 - ) - return kwargs - - def _combine_kwargs_to_seconds(self, **kwargs): - """ - Combine days, weeks, hours and minutes to a single - scalar representing the total seconds - """ - seconds = 0 - seconds += kwargs.pop("weeks", 0) * 604800 - seconds += kwargs.pop("days", 0) * 86400 - seconds += kwargs.pop("hours", 0) * 3600 - seconds += kwargs.pop("minutes", 0) * 60 - seconds += kwargs.pop("seconds", 0) - - if seconds > np.iinfo("int64").max: - raise NotImplementedError( - "Total days + weeks + hours + minutes + seconds can not exceed" - f" {np.iinfo('int64').max} seconds" - ) - - if seconds != 0: - kwargs["seconds"] = seconds - return kwargs - - def _datetime_binop( - self, datetime_col, op, reflect=False - ) -> column.DatetimeColumn: - if reflect and op == "__sub__": - raise TypeError( - f"Can not subtract a {type(datetime_col).__name__}" - f" from a {type(self).__name__}" - ) - if op not in {"__add__", "__sub__"}: - raise TypeError( - f"{op} not supported between {type(self).__name__}" - f" and {type(datetime_col).__name__}" - ) - if not self._is_no_op: - if "months" in self._scalars: - rhs = self._generate_months_column(len(datetime_col), op) - datetime_col = libcudf.datetime.add_months(datetime_col, rhs) - - for unit, value in self._scalars.items(): - if unit != "months": - value = -value if op == "__sub__" else value - datetime_col += cudf.core.column.as_column( - value, length=len(datetime_col) - ) - - return datetime_col - - def _generate_months_column(self, size, op): - months = self._scalars["months"] - months = -months if op == "__sub__" else months - # TODO: pass a scalar instead of constructing a column - # https://github.com/rapidsai/cudf/issues/6990 - col = cudf.core.column.as_column(months, length=size) - return col - - @property - def _is_no_op(self) -> bool: - # some logic could be implemented here for more complex cases - # such as +1 year, -12 months - return all(i == 0 for i in self._kwds.values()) - - def __neg__(self): - new_scalars = {k: -v for k, v in self._kwds.items()} - return DateOffset(**new_scalars) - - def __repr__(self): - includes = [] - for unit in sorted(self._UNITS_TO_CODES): - val = self._kwds.get(unit, None) - if val is not None: - includes.append(f"{unit}={val}") - unit_data = ", ".join(includes) - repr_str = f"<{self.__class__.__name__}: {unit_data}>" - - return repr_str - - @classmethod - def _from_freqstr(cls, freqstr: str) -> Self: - """ - Parse a string and return a DateOffset object - expects strings of the form 3D, 25W, 10ms, 42ns, etc. 
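The regex splits the string into a count and a unit code, and the code is then
mapped through ``_CODES_TO_UNITS``; expected behaviour, shown as a sketch:

>>> DateOffset._from_freqstr("3D")
<DateOffset: days=3>
>>> DateOffset._from_freqstr("25W")
<DateOffset: weeks=25>
>>> DateOffset._from_freqstr("ms")   # a missing count defaults to 1
<DateOffset: milliseconds=1>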
- """ - match = cls._FREQSTR_REGEX.match(freqstr) - - if match is None: - raise ValueError(f"Invalid frequency string: {freqstr}") - - numeric_part = match.group(1) - if numeric_part == "": - numeric_part = "1" - freq_part = match.group(2) - - if freq_part not in cls._CODES_TO_UNITS: - raise ValueError(f"Cannot interpret frequency str: {freqstr}") - - return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) - - @classmethod - def _from_pandas_ticks_or_weeks( - cls, - tick: pd.tseries.offsets.Tick | pd.tseries.offsets.Week, - ) -> Self: - return cls(**{cls._TICK_OR_WEEK_TO_UNITS[type(tick)]: tick.n}) - - def _maybe_as_fast_pandas_offset(self): - if ( - len(self.kwds) == 1 - and _has_fixed_frequency(self) - and not _has_non_fixed_frequency(self) - ): - # Pandas computation between `n*offsets.Minute()` is faster than - # `n*DateOffset`. If only single offset unit is in use, we return - # the base offset for faster binary ops. - return pd.tseries.frequencies.to_offset(pd.Timedelta(**self.kwds)) - return pd.DateOffset(**self.kwds, n=1) - - -def _isin_datetimelike( - lhs: column.TimeDeltaColumn | column.DatetimeColumn, values: Sequence -) -> column.ColumnBase: - """ - Check whether values are contained in the - DateTimeColumn or TimeDeltaColumn. - - Parameters - ---------- - lhs : TimeDeltaColumn or DatetimeColumn - Column to check whether the `values` exist in. - values : set or list-like - The sequence of values to test. Passing in a single string will - raise a TypeError. Instead, turn a single string into a list - of one element. - - Returns - ------- - result: Column - Column of booleans indicating if each element is in values. - """ - rhs = None - try: - rhs = cudf.core.column.as_column(values) - was_string = len(rhs) and rhs.dtype.kind == "O" - - if rhs.dtype.kind in {"f", "i", "u"}: - return column.as_column(False, length=len(lhs), dtype="bool") - rhs = rhs.astype(lhs.dtype) - if was_string: - warnings.warn( - f"The behavior of 'isin' with dtype={lhs.dtype} and " - "castable values (e.g. strings) is deprecated. In a " - "future version, these will not be considered matching " - "by isin. Explicitly cast to the appropriate dtype before " - "calling isin instead.", - FutureWarning, - ) - res = lhs._isin_earlystop(rhs) - if res is not None: - return res - except ValueError: - # pandas functionally returns all False when cleansing via - # typecasting fails - return column.as_column(False, length=len(lhs), dtype="bool") - - res = lhs._obtain_isin_result(rhs) - return res - - -def date_range( - start=None, - end=None, - periods=None, - freq=None, - tz=None, - normalize: bool = False, - name=None, - inclusive: Literal["left", "right", "both", "neither"] = "both", - *, - unit: str | None = None, -): - """Return a fixed frequency DatetimeIndex. - - Returns the range of equally spaced time points (where the difference - between any two adjacent points is specified by the given frequency) - such that they all satisfy `start` <[=] x <[=] `end`, where the first one - and the last one are, resp., the first and last time points in that range - that are valid for `freq`. - - Parameters - ---------- - start : str or datetime-like, optional - Left bound for generating dates. - - end : str or datetime-like, optional - Right bound for generating dates. - - periods : int, optional - Number of periods to generate. - - freq : str or DateOffset - Frequencies to generate the datetime series. Mixed fixed-frequency and - non-fixed frequency offset is unsupported. See notes for detail. 
- Supported offset alias: ``D``, ``h``, ``H``, ``T``, ``min``, ``S``, - ``U``, ``us``, ``N``, ``ns``. - - tz : str or tzinfo, optional - Not Supported - - normalize : bool, default False - Not Supported - - name : str, default None - Name of the resulting DatetimeIndex - - inclusive : {"left", "right", "both", "neither"}, default "both" - Whether to set each bound as closed or open. - Currently only "both" is supported - - unit : str, default None - Specify the desired resolution of the result. Currently - not supported. - - Returns - ------- - DatetimeIndex - - Notes - ----- - Of the four parameters `start`, `end`, `periods`, and `freq`, exactly three - must be specified. If `freq` is omitted, the resulting DatetimeIndex will - have periods linearly spaced elements between start and end (inclusive on both - sides). - - cudf supports `freq` specified with either fixed-frequency offset - (such as weeks, days, hours, minutes...) or non-fixed frequency offset - (such as years and months). Specifying `freq` with a mixed fixed and - non-fixed frequency is currently unsupported. For example: - - >>> cudf.date_range( - ... start='2021-08-23 08:00:00', - ... freq=cudf.DateOffset(months=2, days=5), - ... periods=5) - ... - NotImplementedError: Mixing fixed and non-fixed frequency offset is - unsupported. - - Examples - -------- - >>> cudf.date_range( - ... start='2021-08-23 08:00:00', - ... freq=cudf.DateOffset(years=1, months=2), - ... periods=5) - DatetimeIndex(['2021-08-23 08:00:00', '2022-10-23 08:00:00', - '2023-12-23 08:00:00', '2025-02-23 08:00:00', - '2026-04-23 08:00:00'], - dtype='datetime64[ns]') - """ - if inclusive != "both": - raise NotImplementedError(f"{inclusive=} is currently unsupported.") - if unit is not None: - raise NotImplementedError(f"{unit=} is currently unsupported.") - if normalize is not False: - raise NotImplementedError(f"{normalize=} is currently unsupported.") - - if freq is None and any(arg is None for arg in (start, end, periods)): - freq = "D" - - if (start, end, periods, freq).count(None) > 1: - raise ValueError( - "Of the four parameters: start, end, periods, and freq, exactly " - "three must be specified" - ) - - if periods is not None and not cudf.api.types.is_integer(periods): - warnings.warn( - "Non-integer 'periods' in cudf.date_range, and cudf.interval_range" - " are deprecated and will raise in a future version.", - FutureWarning, - ) - - dtype = np.dtype("datetime64[ns]") - - if freq is None: - # `start`, `end`, `periods` is specified, we treat the timestamps as - # integers and divide the number range evenly with `periods` elements. - start = cudf.Scalar(start, dtype=dtype).value.astype("int64") - end = cudf.Scalar(end, dtype=dtype).value.astype("int64") - arr = np.linspace(start=start, stop=end, num=periods) - result = cudf.core.column.as_column(arr).astype("datetime64[ns]") - return cudf.DatetimeIndex._from_column(result, name=name).tz_localize( - tz - ) - - # The code logic below assumes `freq` is defined. It is first normalized - # into `DateOffset` for further computation with timestamps. - - if isinstance(freq, DateOffset): - offset = freq - elif isinstance(freq, str): - offset = pd.tseries.frequencies.to_offset(freq) - if not isinstance( - offset, (pd.tseries.offsets.Tick, pd.tseries.offsets.Week) - ): - raise ValueError( - f"Unrecognized frequency string {freq}. cuDF does " - "not yet support month, quarter, year-anchored frequency." 
- ) - offset = DateOffset._from_pandas_ticks_or_weeks(offset) - else: - raise TypeError("`freq` must be a `str` or cudf.DateOffset object.") - - if _has_fixed_frequency(offset) and _has_non_fixed_frequency(offset): - raise NotImplementedError( - "Mixing fixed and non-fixed frequency offset is unsupported." - ) - - # Depending on different combinations of `start`, `end`, `offset`, - # `periods`, the following logic makes sure before computing the sequence, - # `start`, `periods`, `offset` is defined - - _periods_not_specified = False - - if start is None: - end = cudf.Scalar(end, dtype=dtype) - start = cudf.Scalar( - pd.Timestamp(end.value) - - (periods - 1) * offset._maybe_as_fast_pandas_offset(), - dtype=dtype, - ) - elif end is None: - start = cudf.Scalar(start, dtype=dtype) - elif periods is None: - # When `periods` is unspecified, its upper bound estimated by - # dividing the number of nanoseconds between two timestamps with - # the lower bound of `freq` in nanoseconds. While the final result - # may contain extra elements that exceeds `end`, they are trimmed - # as a post processing step. [1] - _periods_not_specified = True - start = cudf.Scalar(start, dtype=dtype) - end = cudf.Scalar(end, dtype=dtype) - _is_increment_sequence = end >= start - - periods = math.floor( - int(end - start) / _offset_to_nanoseconds_lower_bound(offset) - ) - - if periods < 0: - # Mismatched sign between (end-start) and offset, return empty - # column - periods = 0 - else: - # If end == start, periods == 0 and we return exactly 1 timestamp (start). - # Otherwise, since inclusive="both", we ensure the end point is included. - periods += 1 - - # We compute `end_estim` (the estimated upper bound of the date - # range) below, but don't always use it. We do this to ensure - # that the appropriate OverflowError is raised by Pandas in case - # of overflow. - # FIXME: when `end_estim` is out of bound, but the actual `end` is not, - # we shouldn't raise but compute the sequence as is. The trailing overflow - # part should get trimmed at the end. - with warnings.catch_warnings(): - # Need to ignore userwarnings where nonzero nanoseconds - # are dropped in conversion during the binops - warnings.simplefilter("ignore", UserWarning) - end_estim = ( - pd.Timestamp(start.value) - + periods * offset._maybe_as_fast_pandas_offset() - ).to_datetime64() - - if "months" in offset.kwds or "years" in offset.kwds: - # If `offset` is non-fixed frequency, resort to libcudf. - res = libcudf.datetime.date_range(start.device_value, periods, offset) - if _periods_not_specified: - # As mentioned in [1], this is a post processing step to trim extra - # elements when `periods` is an estimated value. Only offset - # specified with non fixed frequencies requires trimming. 
- res = res.apply_boolean_mask( - (res <= end) if _is_increment_sequence else (res <= start) - ) - else: - # If `offset` is fixed frequency, we generate a range of - # treating `start`, `stop` and `step` as ints: - stop = end_estim.astype("int64") - start = start.value.astype("int64") - step = _offset_to_nanoseconds_lower_bound(offset) - arr = range(int(start), int(stop), step) - res = cudf.core.column.as_column(arr, dtype="int64").astype( - "datetime64[ns]" - ) - - return cudf.DatetimeIndex._from_column( - res, name=name, freq=freq - ).tz_localize(tz) - - -def _has_fixed_frequency(freq: DateOffset) -> bool: - """Utility to determine if `freq` contains fixed frequency offset""" - fixed_frequencies = { - "weeks", - "days", - "hours", - "minutes", - "seconds", - "milliseconds", - "microseconds", - "nanoseconds", - } - - return len(freq.kwds.keys() & fixed_frequencies) > 0 - - -def _has_non_fixed_frequency(freq: DateOffset) -> bool: - """Utility to determine if `freq` contains non-fixed frequency offset""" - non_fixed_frequencies = {"years", "months"} - return len(freq.kwds.keys() & non_fixed_frequencies) > 0 - - -def _offset_to_nanoseconds_lower_bound(offset: DateOffset) -> int: - """Given a DateOffset, which can consist of either fixed frequency or - non-fixed frequency offset, convert to the smallest possible fixed - frequency offset based in nanoseconds. - - Specifically, the smallest fixed frequency conversion for {months=1} - is 28 * nano_seconds_per_day, because 1 month contains at least 28 days. - Similarly, the smallest fixed frequency conversion for {year=1} is - 365 * nano_seconds_per_day. - - This utility is used to compute the upper bound of the count of timestamps - given a range of datetime and an offset. - """ - nanoseconds_per_day = 24 * 60 * 60 * 10**9 - kwds = offset.kwds - return ( - kwds.get("years", 0) * (365 * nanoseconds_per_day) - + kwds.get("months", 0) * (28 * nanoseconds_per_day) - + kwds.get("weeks", 0) * (7 * nanoseconds_per_day) - + kwds.get("days", 0) * nanoseconds_per_day - + kwds.get("hours", 0) * 3600 * 10**9 - + kwds.get("minutes", 0) * 60 * 10**9 - + kwds.get("seconds", 0) * 10**9 - + kwds.get("milliseconds", 0) * 10**6 - + kwds.get("microseconds", 0) * 10**3 - + kwds.get("nanoseconds", 0) - ) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py deleted file mode 100644 index 6cecf3fa170..00000000000 --- a/python/cudf/cudf/core/tools/numeric.py +++ /dev/null @@ -1,253 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING - -import numpy as np -import pandas as pd - -import cudf -from cudf import _lib as libcudf -from cudf._lib import strings as libstrings -from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype -from cudf.core.column import as_column -from cudf.core.dtypes import CategoricalDtype -from cudf.core.index import ensure_index -from cudf.utils.dtypes import can_convert_to_column - -if TYPE_CHECKING: - from cudf.core.column import ColumnBase - - -def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None): - """ - Convert argument into numerical types. - - Parameters - ---------- - arg : column-convertible - The object to convert to numeric types - errors : {'raise', 'ignore', 'coerce'}, defaults 'raise' - Policy to handle errors during parsing. - - * 'raise' will notify user all errors encountered. - * 'ignore' will skip error and returns ``arg``. 
- * 'coerce' will leave invalid values as nulls. - downcast : {'integer', 'signed', 'unsigned', 'float'}, defaults None - If set, will try to down-convert the datatype of the - parsed results to smallest possible type. For each `downcast` - type, this method will determine the smallest possible - dtype from the following sets: - - * {'integer', 'signed'}: all integer types greater or equal to - `np.int8` - * {'unsigned'}: all unsigned types greater or equal to `np.uint8` - * {'float'}: all floating types greater or equal to `np.float32` - - Note that downcast behavior is decoupled from parsing. Errors - encountered during downcast is raised regardless of ``errors`` - parameter. - dtype_backend : None - Not implemented. - - Returns - ------- - Series or ndarray - Depending on the input, if series is passed in, series is returned, - otherwise ndarray - - Examples - -------- - >>> s = cudf.Series(['1', '2.0', '3e3']) - >>> cudf.to_numeric(s) - 0 1.0 - 1 2.0 - 2 3000.0 - dtype: float64 - >>> cudf.to_numeric(s, downcast='float') - 0 1.0 - 1 2.0 - 2 3000.0 - dtype: float32 - >>> cudf.to_numeric(s, downcast='signed') - 0 1 - 1 2 - 2 3000 - dtype: int16 - >>> s = cudf.Series(['apple', '1.0', '3e3']) - >>> cudf.to_numeric(s, errors='ignore') - 0 apple - 1 1.0 - 2 3e3 - dtype: object - >>> cudf.to_numeric(s, errors='coerce') - 0 - 1 1.0 - 2 3000.0 - dtype: float64 - - .. pandas-compat:: - :func:`pandas.to_numeric` - - An important difference from pandas is that this function does not - accept mixed numeric/non-numeric type sequences. - For example ``[1, 'a']``. A ``TypeError`` will be raised when such - input is received, regardless of ``errors`` parameter. - """ - if dtype_backend is not None: - raise NotImplementedError( - "dtype_backend is not currently implemented." - ) - if errors not in {"raise", "ignore", "coerce"}: - raise ValueError("invalid error value specified") - elif errors == "ignore": - warnings.warn( - "errors='ignore' is deprecated and will raise in " - "a future version. 
Use to_numeric without passing `errors` " - "and catch exceptions explicitly instead", - FutureWarning, - ) - - if downcast not in {None, "integer", "signed", "unsigned", "float"}: - raise ValueError("invalid downcasting method provided") - - if not can_convert_to_column(arg) or ( - hasattr(arg, "ndim") and arg.ndim > 1 - ): - raise ValueError("arg must be column convertible") - - col = as_column(arg) - dtype = col.dtype - - if dtype.kind in "mM": - col = col.astype(cudf.dtype("int64")) - elif isinstance(dtype, CategoricalDtype): - cat_dtype = col.dtype.type - if _is_non_decimal_numeric_dtype(cat_dtype): - col = col.astype(cat_dtype) - else: - try: - col = _convert_str_col( - col._get_decategorized_column(), errors, downcast - ) - except ValueError as e: - if errors == "ignore": - return arg - else: - raise e - elif is_string_dtype(dtype): - try: - col = _convert_str_col(col, errors, downcast) - except ValueError as e: - if errors == "ignore": - return arg - else: - raise e - elif isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)): - raise ValueError("Input does not support nested datatypes") - elif _is_non_decimal_numeric_dtype(dtype): - pass - else: - raise ValueError("Unrecognized datatype") - - # str->float conversion may require lower precision - if col.dtype == cudf.dtype("float32"): - col = col.astype("float64") - - if downcast: - if downcast == "float": - # we support only float32 & float64 - type_set = [ - cudf.dtype(np.float32).char, - cudf.dtype(np.float64).char, - ] - elif downcast in ("integer", "signed"): - type_set = list(np.typecodes["Integer"]) - elif downcast == "unsigned": - type_set = list(np.typecodes["UnsignedInteger"]) - - for t in type_set: - downcast_dtype = cudf.dtype(t) - if downcast_dtype.itemsize <= col.dtype.itemsize: - if col.can_cast_safely(downcast_dtype): - col = libcudf.unary.cast(col, downcast_dtype) - break - - if isinstance(arg, (cudf.Series, pd.Series)): - return cudf.Series._from_column( - col, name=arg.name, index=ensure_index(arg.index) - ) - else: - if col.has_nulls(): - # To match pandas, always return a floating type filled with nan. - col = col.astype(float).fillna(np.nan) - return col.values - - -def _convert_str_col(col, errors, _downcast=None): - """ - Converts a string column to numeric column - - Converts to integer column if all strings are integer-like (isinteger.all) - Otherwise, converts to float column if all strings are float-like ( - isfloat.all) - - If error == 'coerce', fill non-numerics strings with null - - Looks ahead to ``downcast`` parameter, if the float may be casted to - integer, then only process in float32 pipeline. - - Parameters - ---------- - col : The string column to convert, must be string dtype - errors : {'raise', 'ignore', 'coerce'}, same as ``to_numeric`` - _downcast : Same as ``to_numeric``, see description for use - - Returns - ------- - Converted numeric column - """ - if not is_string_dtype(col): - raise TypeError("col must be string dtype.") - - is_integer = libstrings.is_integer(col) - if is_integer.all(): - return col.astype(dtype=cudf.dtype("i8")) - - col = _proc_inf_empty_strings(col) - - is_float = libstrings.is_float(col) - if is_float.all(): - if _downcast in {"unsigned", "signed", "integer"}: - warnings.warn( - UserWarning( - "Downcasting from float to int will be " - "limited by float32 precision." 
- ) - ) - return col.astype(dtype=cudf.dtype("float32")) - else: - return col.astype(dtype=cudf.dtype("float64")) - else: - if errors == "coerce": - col = libcudf.string_casting.stod(col) - non_numerics = is_float.unary_operator("not") - col[non_numerics] = None - return col - else: - raise ValueError("Unable to convert some strings to numerics.") - - -def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase: - """Handles empty and infinity strings""" - col = libstrings.to_lower(col) - col = col.find_and_replace(as_column([""]), as_column(["NaN"])) - # TODO: This can be handled by libcudf in - # future see StringColumn.as_numerical_column - col = libstrings.replace_multi( - col, - as_column(["+", "inf", "inity"]), - as_column(["", "Inf", ""]), - ) - return col diff --git a/python/cudf/cudf/core/udf/__init__.py b/python/cudf/cudf/core/udf/__init__.py deleted file mode 100644 index 85d454652b7..00000000000 --- a/python/cudf/cudf/core/udf/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. -from . import ( - groupby_lowering, - groupby_typing, - masked_lowering, - masked_typing, - strings_lowering, - strings_typing, -) diff --git a/python/cudf/cudf/core/udf/_ops.py b/python/cudf/cudf/core/udf/_ops.py deleted file mode 100644 index 6b0640b09ed..00000000000 --- a/python/cudf/cudf/core/udf/_ops.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. - -import math -import operator - -arith_ops = [ - operator.add, - operator.sub, - operator.mul, - operator.truediv, - operator.floordiv, - operator.mod, - operator.pow, - operator.iadd, - operator.isub, - operator.imul, - operator.itruediv, - operator.floordiv, - operator.ipow, - operator.imod, -] - -bitwise_ops = [operator.and_, operator.or_, operator.xor] - -unary_ops = [ - math.acos, - math.acosh, - math.asin, - math.asinh, - math.atan, - math.atanh, - math.ceil, - math.cos, - math.degrees, - math.erf, - math.erfc, - math.exp, - math.expm1, - math.fabs, - math.floor, - math.gamma, - math.lgamma, - math.log, - math.log10, - math.log1p, - math.log2, - math.radians, - math.sin, - math.sinh, - math.sqrt, - math.tan, - math.tanh, - operator.pos, - operator.neg, - operator.not_, - operator.invert, -] - -comparison_ops = [ - operator.eq, - operator.ne, - operator.lt, - operator.le, - operator.gt, - operator.ge, -] diff --git a/python/cudf/cudf/core/udf/api.py b/python/cudf/cudf/core/udf/api.py deleted file mode 100644 index 7a68cffeed2..00000000000 --- a/python/cudf/cudf/core/udf/api.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. - - -class Masked: - """ - Most of the time, MaskedType as defined in typing.py - combined with the ops defined to operate on them are - enough to fulfill the obligations of DataFrame.apply - However sometimes we need to refer to an instance of - a masked scalar outside the context of a UDF like as - a global variable. To get numba to identify that var - a of type MaskedType and treat it as such we need to - have an actual python class we can tie to MaskedType - This is that class - """ - - def __init__(self, value, valid): - self.value = value - self.valid = valid - - -def pack_return(masked_or_scalar): - # Blank function to give us something for the typing and - # lowering to grab onto. 
Just a dummy function for us to - # call within kernels that will get replaced later by the - # lowered implementation - pass diff --git a/python/cudf/cudf/core/udf/groupby_lowering.py b/python/cudf/cudf/core/udf/groupby_lowering.py deleted file mode 100644 index fe0637cfaef..00000000000 --- a/python/cudf/cudf/core/udf/groupby_lowering.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. - -from functools import partial - -from numba import types -from numba.core import cgutils -from numba.core.extending import lower_builtin -from numba.core.typing import signature as nb_signature -from numba.cuda.cudaimpl import lower as cuda_lower - -from cudf.core.udf.groupby_typing import ( - SUPPORTED_GROUPBY_NUMBA_TYPES, - Group, - GroupType, - call_cuda_functions, - group_size_type, - index_default_type, -) - - -def group_reduction_impl_basic(context, builder, sig, args, function): - """ - Instruction boilerplate used for calling a groupby reduction - __device__ function. Centers around a forward declaration of - this function and adds the pre/post processing instructions - necessary for calling it. - """ - # return type - retty = sig.return_type - - # a variable logically corresponding to the calling `Group` - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - - # what specific (numba) GroupType - grp_type = sig.args[0] - group_dataty = grp_type.group_data_type - - # obtain the correct forward declaration from registry - type_key = (sig.return_type, grp_type.group_scalar_type) - func = call_cuda_functions[function][type_key] - - # insert the forward declaration and return its result - # pass it the data pointer and the group's size - return context.compile_internal( - builder, - func, - nb_signature(retty, group_dataty, grp_type.group_size_type), - (grp.group_data, grp.size), - ) - - -def group_corr(context, builder, sig, args): - """ - Instruction boilerplate used for calling a groupby correlation - """ - lhs_grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - rhs_grp = cgutils.create_struct_proxy(sig.args[1])( - context, builder, value=args[1] - ) - - device_func = call_cuda_functions["corr"][ - ( - sig.return_type, - sig.args[0].group_scalar_type, - sig.args[1].group_scalar_type, - ) - ] - result = context.compile_internal( - builder, - device_func, - nb_signature( - types.float64, - types.CPointer( - sig.args[0].group_scalar_type - ), # this group calls corr - types.CPointer( - sig.args[1].group_scalar_type - ), # this group is passed - group_size_type, - ), - ( - lhs_grp.group_data, - rhs_grp.group_data, - lhs_grp.size, - ), - ) - return result - - -@lower_builtin(Group, types.Array, group_size_type, types.Array) -def group_constructor(context, builder, sig, args): - """ - Instruction boilerplate used for instantiating a Group - struct from a data pointer, an index pointer, and a size - """ - # a variable logically corresponding to the calling `Group` - grp = cgutils.create_struct_proxy(sig.return_type)(context, builder) - grp.group_data = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ).data - grp.index = cgutils.create_struct_proxy(sig.args[2])( - context, builder, value=args[2] - ).data - grp.size = args[1] - return grp._getvalue() - - -def group_reduction_impl_idx_max_or_min(context, builder, sig, args, function): - """ - Instruction boilerplate used for calling a groupby reduction - __device__ function in the case where the function is either - `idxmax` or 
`idxmin`. See `group_reduction_impl_basic` for - details. This lowering differs from other reductions due to - the presence of the index. This results in the forward - declaration expecting an extra arg. - """ - retty = sig.return_type - - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - grp_type = sig.args[0] - - if grp_type.index_type != index_default_type: - raise TypeError( - f"Only inputs with default index dtype {index_default_type} " - "are supported." - ) - - type_key = (index_default_type, grp_type.group_scalar_type) - func = call_cuda_functions[function][type_key] - - return context.compile_internal( - builder, - func, - nb_signature( - retty, - grp_type.group_data_type, - grp_type.group_index_type, - grp_type.group_size_type, - ), - (grp.group_data, grp.index, grp.size), - ) - - -cuda_Group_max = partial(group_reduction_impl_basic, function="max") -cuda_Group_min = partial(group_reduction_impl_basic, function="min") -cuda_Group_sum = partial(group_reduction_impl_basic, function="sum") -cuda_Group_mean = partial(group_reduction_impl_basic, function="mean") -cuda_Group_std = partial(group_reduction_impl_basic, function="std") -cuda_Group_var = partial(group_reduction_impl_basic, function="var") - -cuda_Group_idxmax = partial( - group_reduction_impl_idx_max_or_min, function="idxmax" -) -cuda_Group_idxmin = partial( - group_reduction_impl_idx_max_or_min, function="idxmin" -) - - -def cuda_Group_size(context, builder, sig, args): - grp = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - return grp.size - - -cuda_Group_count = cuda_Group_size - - -for ty in SUPPORTED_GROUPBY_NUMBA_TYPES: - cuda_lower("GroupType.max", GroupType(ty))(cuda_Group_max) - cuda_lower("GroupType.min", GroupType(ty))(cuda_Group_min) - cuda_lower("GroupType.sum", GroupType(ty))(cuda_Group_sum) - cuda_lower("GroupType.count", GroupType(ty))(cuda_Group_count) - cuda_lower("GroupType.size", GroupType(ty))(cuda_Group_size) - cuda_lower("GroupType.mean", GroupType(ty))(cuda_Group_mean) - cuda_lower("GroupType.std", GroupType(ty))(cuda_Group_std) - cuda_lower("GroupType.var", GroupType(ty))(cuda_Group_var) - cuda_lower("GroupType.idxmax", GroupType(ty, types.int64))( - cuda_Group_idxmax - ) - cuda_lower("GroupType.idxmin", GroupType(ty, types.int64))( - cuda_Group_idxmin - ) - cuda_lower("GroupType.corr", GroupType(ty), GroupType(ty))(group_corr) diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py deleted file mode 100644 index dffd7db2f71..00000000000 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ /dev/null @@ -1,394 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
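Two small patterns carry most of the lowering code above: `functools.partial` specializes one generic implementation per reduction (`cuda_Group_max`, `cuda_Group_min`, ...), and the concrete device function is looked up from a nested registry keyed first by operation name and then by a type signature. A minimal CPU-only sketch of that dispatch pattern; the names `call_registry`, `register`, and `generic_reduction` are invented for this sketch, not cudf APIs:

    from functools import partial

    call_registry = {}  # {op_name: {(ret_type, arg_type): implementation}}

    def register(op_name, ret_type, arg_type, func):
        call_registry.setdefault(op_name, {})[(ret_type, arg_type)] = func

    def generic_reduction(values, function):
        # One shared body; the concrete implementation comes from the registry,
        # mirroring the call_cuda_functions[function][type_key] lookup above.
        impl = call_registry[function][(float, float)]
        return impl(values)

    register("max", float, float, max)
    register("min", float, float, min)

    group_max = partial(generic_reduction, function="max")
    group_min = partial(generic_reduction, function="min")

    print(group_max([1.0, 3.0, 2.0]))  # 3.0
    print(group_min([1.0, 3.0, 2.0]))  # 1.0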
-from __future__ import annotations - -from typing import Any - -import numba -from numba import cuda, types -from numba.core.extending import ( - make_attribute_wrapper, - models, - register_model, - type_callable, - typeof_impl, -) -from numba.core.typing import signature as nb_signature -from numba.core.typing.templates import AbstractTemplate, AttributeTemplate -from numba.cuda.cudadecl import registry as cuda_registry -from numba.np import numpy_support - -from cudf.core.udf._ops import arith_ops, comparison_ops, unary_ops -from cudf.core.udf.utils import Row, UDFError - -index_default_type = types.int64 -group_size_type = types.int64 -SUPPORTED_GROUPBY_NUMBA_TYPES = [ - types.int32, - types.int64, - types.float32, - types.float64, -] -SUPPORTED_GROUPBY_NUMPY_TYPES = [ - numpy_support.as_dtype(dt) for dt in SUPPORTED_GROUPBY_NUMBA_TYPES -] - -_UDF_DOC_URL = ( - "https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs/" -) - - -class Group: - """ - A piece of python code whose purpose is to be replaced - during compilation. After being registered to GroupType, - serves as a handle for instantiating GroupType objects - in python code and accessing their attributes - """ - - pass - - -class GroupType(numba.types.Type): - """ - Numba extension type carrying metadata associated with a single - GroupBy group. This metadata ultimately is passed to the CUDA - __device__ function which actually performs the work. - """ - - def __init__(self, group_scalar_type, index_type=index_default_type): - if ( - group_scalar_type not in SUPPORTED_GROUPBY_NUMBA_TYPES - and not isinstance(group_scalar_type, types.Poison) - ): - # A frame containing an column with an unsupported dtype - # is calling groupby apply. Construct a GroupType with - # a poisoned type so we can later error if this group is - # used in the UDF body - group_scalar_type = types.Poison(group_scalar_type) - self.group_scalar_type = group_scalar_type - self.index_type = index_type - self.group_data_type = types.CPointer(group_scalar_type) - self.group_size_type = group_size_type - self.group_index_type = types.CPointer(index_type) - super().__init__( - name=f"Group({self.group_scalar_type}, {self.index_type})" - ) - - -class GroupByJITDataFrame(Row): - pass - - -register_model(GroupByJITDataFrame)(models.RecordModel) - - -@typeof_impl.register(Group) -def typeof_group(val, c): - """ - Tie Group and GroupType together such that when Numba - sees usage of Group in raw python code, it knows to - treat those usages as uses of GroupType - """ - return GroupType( - numba.np.numpy_support.from_dtype(val.dtype), - numba.np.numpy_support.from_dtype(val.index_dtype), - ) - - -# The typing of the python "function" Group.__init__ -# as it appears in python code -@type_callable(Group) -def type_group(context): - def typer(group_data, size, index): - if ( - isinstance(group_data, types.Array) - and isinstance(size, types.Integer) - and isinstance(index, types.Array) - ): - return GroupType(group_data.dtype, index.dtype) - - return typer - - -@register_model(GroupType) -class GroupModel(models.StructModel): - """ - Model backing GroupType instances. See the link below for details. 
- https://github.com/numba/numba/blob/main/numba/core/datamodel/models.py - """ - - def __init__(self, dmm, fe_type): - members = [ - ("group_data", types.CPointer(fe_type.group_scalar_type)), - ("size", group_size_type), - ("index", types.CPointer(fe_type.index_type)), - ] - super().__init__(dmm, fe_type, members) - - -call_cuda_functions: dict[Any, Any] = {} - - -def _register_cuda_binary_reduction_caller(funcname, lty, rty, retty): - cuda_func = cuda.declare_device( - f"Block{funcname}_{lty}_{rty}", - retty(types.CPointer(lty), types.CPointer(rty), group_size_type), - ) - - def caller(lhs, rhs, size): - return cuda_func(lhs, rhs, size) - - call_cuda_functions.setdefault(funcname.lower(), {}) - - type_key = retty, lty, rty - call_cuda_functions[funcname.lower()][type_key] = caller - - -def _register_cuda_unary_reduction_caller(funcname, inputty, retty): - cuda_func = cuda.declare_device( - f"Block{funcname}_{inputty}", - retty(types.CPointer(inputty), group_size_type), - ) - - def caller(data, size): - return cuda_func(data, size) - - call_cuda_functions.setdefault(funcname.lower(), {}) - - type_key = (retty, inputty) - call_cuda_functions[funcname.lower()][type_key] = caller - - -def _register_cuda_idx_reduction_caller(funcname, inputty): - cuda_func = cuda.declare_device( - f"Block{funcname}_{inputty}", - types.int64( - types.CPointer(inputty), - types.CPointer(index_default_type), - group_size_type, - ), - ) - - def caller(data, index, size): - return cuda_func(data, index, size) - - # only support default index type right now - type_key = (index_default_type, inputty) - call_cuda_functions.setdefault(funcname.lower(), {}) - call_cuda_functions[funcname.lower()][type_key] = caller - - -class GroupOpBase(AbstractTemplate): - def make_error_string(self, args): - fname = self.key.__name__ - sr_err = ", ".join(["Series" for _ in range(len(args))]) - return ( - f"{fname}({sr_err}) is not supported by JIT GroupBy " - f"apply. Supported features are listed at: {_UDF_DOC_URL}" - ) - - def generic(self, args, kws): - # early exit to make sure typing doesn't fail for normal - # non-group ops - if not all(isinstance(arg, GroupType) for arg in args): - return None - # check if any groups are poisoned for this op - for arg in args: - if isinstance(arg.group_scalar_type, types.Poison): - raise UDFError( - f"Use of a column of {arg.group_scalar_type.ty} detected " - "within UDF body. Only columns of the following dtypes " - "may be used through the GroupBy.apply() JIT engine: " - f"{[str(x) for x in SUPPORTED_GROUPBY_NUMPY_TYPES]}" - ) - if funcs := call_cuda_functions.get(self.key.__name__): - for sig in funcs.keys(): - if all( - arg.group_scalar_type == ty for arg, ty in zip(args, sig) - ): - return nb_signature(sig[0], *args) - raise UDFError(self.make_error_string(args)) - - -class GroupAttrBase(AbstractTemplate): - def make_error_string(self, args): - fname = self.key.split(".")[-1] - args = (self.this, *args) - dtype_err = ", ".join([str(g.group_scalar_type) for g in args]) - sr_err = ", ".join(["Series" for _ in range(len(args) - 1)]) - return ( - f"Series.{fname}({sr_err}) is not supported for " - f"({dtype_err}) within JIT GroupBy apply. 
To see " - f"what's available, visit {_UDF_DOC_URL}" - ) - - def generic(self, args, kws): - # earlystop to make sure typing doesn't fail for normal - # non-group ops - if not all(isinstance(arg, GroupType) for arg in args): - return None - # check if any groups are poisioned for this op - for arg in (self.this, *args): - if isinstance(arg.group_scalar_type, types.Poison): - raise UDFError( - f"Use of a column of {arg.group_scalar_type.ty} detected " - "within UDAF body. Only columns of the following dtypes " - "may be used through the GroupBy.apply() JIT engine: " - f"{[str(x) for x in SUPPORTED_GROUPBY_NUMPY_TYPES]}" - ) - fname = self.key.split(".")[-1] - if funcs := call_cuda_functions.get(fname): - for sig in funcs.keys(): - retty, selfty, *argtys = sig - if self.this.group_scalar_type == selfty and all( - arg.group_scalar_type == ty - for arg, ty in zip(args, argtys) - ): - return nb_signature(retty, *args, recvr=self.this) - raise UDFError(self.make_error_string(args)) - - -class GroupUnaryAttrBase(GroupAttrBase): - pass - - -class GroupBinaryAttrBase(GroupAttrBase): - pass - - -def _make_unary_attr(funcname): - class GroupUnaryReductionAttrTyping(GroupUnaryAttrBase): - key = f"GroupType.{funcname}" - - def _attr(self, mod): - return types.BoundFunction( - GroupUnaryReductionAttrTyping, - GroupType(mod.group_scalar_type, mod.index_type), - ) - - return _attr - - -def _create_reduction_attr(name, retty=None): - class Attr(AbstractTemplate): - key = name - - def generic(self, args, kws): - return nb_signature( - self.this.group_scalar_type if not retty else retty, - recvr=self.this, - ) - - Attr.generic = generic - - def _attr(self, mod): - return types.BoundFunction( - Attr, GroupType(mod.group_scalar_type, mod.index_type) - ) - - return _attr - - -class GroupIdxMax(AbstractTemplate): - key = "GroupType.idxmax" - - def generic(self, args, kws): - return nb_signature(self.this.index_type, recvr=self.this) - - -class GroupIdxMin(AbstractTemplate): - key = "GroupType.idxmin" - - def generic(self, args, kws): - return nb_signature(self.this.index_type, recvr=self.this) - - -class GroupCorr(GroupBinaryAttrBase): - key = "GroupType.corr" - - -class DataFrameAttributeTemplate(AttributeTemplate): - def resolve(self, value, attr): - raise UDFError( - f"JIT GroupBy.apply() does not support DataFrame.{attr}(). 
" - ) - - -@cuda_registry.register_attr -class DataFrameAttr(DataFrameAttributeTemplate): - key = GroupByJITDataFrame - - -@cuda_registry.register_attr -class GroupAttr(AttributeTemplate): - key = GroupType - - resolve_max = _make_unary_attr("max") - resolve_min = _make_unary_attr("min") - resolve_sum = _make_unary_attr("sum") - - resolve_mean = _make_unary_attr("mean") - resolve_var = _make_unary_attr("var") - resolve_std = _make_unary_attr("std") - - resolve_size = _create_reduction_attr( - "GroupType.size", retty=group_size_type - ) - resolve_count = _create_reduction_attr( - "GroupType.count", retty=types.int64 - ) - - def resolve_idxmax(self, mod): - return types.BoundFunction( - GroupIdxMax, GroupType(mod.group_scalar_type, mod.index_type) - ) - - def resolve_idxmin(self, mod): - return types.BoundFunction( - GroupIdxMin, GroupType(mod.group_scalar_type, mod.index_type) - ) - - def resolve_corr(self, mod): - return types.BoundFunction( - GroupCorr, GroupType(mod.group_scalar_type, mod.index_type) - ) - - -for ty in SUPPORTED_GROUPBY_NUMBA_TYPES: - _register_cuda_unary_reduction_caller("Max", ty, ty) - _register_cuda_unary_reduction_caller("Min", ty, ty) - _register_cuda_idx_reduction_caller("IdxMax", ty) - _register_cuda_idx_reduction_caller("IdxMin", ty) - - if ty in types.integer_domain: - _register_cuda_binary_reduction_caller("Corr", ty, ty, types.float64) - - -_register_cuda_unary_reduction_caller("Sum", types.int32, types.int64) -_register_cuda_unary_reduction_caller("Sum", types.int64, types.int64) -_register_cuda_unary_reduction_caller("Sum", types.float32, types.float32) -_register_cuda_unary_reduction_caller("Sum", types.float64, types.float64) - - -_register_cuda_unary_reduction_caller("Mean", types.int32, types.float64) -_register_cuda_unary_reduction_caller("Mean", types.int64, types.float64) -_register_cuda_unary_reduction_caller("Mean", types.float32, types.float32) -_register_cuda_unary_reduction_caller("Mean", types.float64, types.float64) - -_register_cuda_unary_reduction_caller("Std", types.int32, types.float64) -_register_cuda_unary_reduction_caller("Std", types.int64, types.float64) -_register_cuda_unary_reduction_caller("Std", types.float32, types.float32) -_register_cuda_unary_reduction_caller("Std", types.float64, types.float64) - -_register_cuda_unary_reduction_caller("Var", types.int32, types.float64) -_register_cuda_unary_reduction_caller("Var", types.int64, types.float64) -_register_cuda_unary_reduction_caller("Var", types.float32, types.float32) -_register_cuda_unary_reduction_caller("Var", types.float64, types.float64) - - -for attr in ("group_data", "index", "size"): - make_attribute_wrapper(GroupType, attr, attr) - - -for op in arith_ops + comparison_ops + unary_ops: - cuda_registry.register_global(op)(GroupOpBase) diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py deleted file mode 100644 index 3af662b62ea..00000000000 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
- - -import cupy as cp -import numpy as np -from numba import cuda, types -from numba.core.errors import TypingError -from numba.cuda.cudadrv.devices import get_context -from numba.np import numpy_support - -import cudf.core.udf.utils -from cudf.core.udf.groupby_typing import ( - SUPPORTED_GROUPBY_NUMPY_TYPES, - Group, - GroupByJITDataFrame, - GroupType, -) -from cudf.core.udf.templates import ( - group_initializer_template, - groupby_apply_kernel_template, -) -from cudf.core.udf.utils import ( - UDFError, - _all_dtypes_from_frame, - _compile_or_get, - _get_extensionty_size, - _get_kernel, - _get_udf_return_type, - _supported_cols_from_frame, - _supported_dtypes_from_frame, -) -from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.performance_tracking import _performance_tracking - - -def _get_frame_groupby_type(dtype, index_dtype): - """ - Get the Numba type corresponding to a row of grouped data. Models the - column as a Record-like data structure containing GroupTypes. See - numba.np.numpy_support.from_struct_dtype for details. - - Parameters - ---------- - level : np.dtype - A numpy structured array dtype associating field names - to scalar dtypes - index_dtype : np.dtype - A numpy scalar dtype associated with the index of the - incoming grouped data - """ - # Create the numpy structured type corresponding to the numpy dtype. - fields = [] - offset = 0 - - sizes = [val[0].itemsize for val in dtype.fields.values()] - for i, (name, info) in enumerate(dtype.fields.items()): - elemdtype = info[0] - title = info[2] if len(info) == 3 else None - ty = numpy_support.from_dtype(elemdtype) - indexty = numpy_support.from_dtype(index_dtype) - groupty = GroupType(ty, indexty) - infos = { - "type": groupty, - "offset": offset, - "title": title, - } - fields.append((name, infos)) - offset += _get_extensionty_size(groupty) - - # Align the next member of the struct to be a multiple of the - # memory access size, per PTX ISA 7.4/5.4.5 - if i < len(sizes) - 1: - alignment = offset % 8 - if alignment != 0: - offset += 8 - alignment - - # Numba requires that structures are aligned for the CUDA target - _is_aligned_struct = True - return GroupByJITDataFrame(fields, offset, _is_aligned_struct) - - -def _groupby_apply_kernel_string_from_template(frame, args): - """ - Function to write numba kernels for `Groupby.apply` as a string. 
- Workaround until numba supports functions that use `*args` - """ - # Create argument list for kernel - frame = _supported_cols_from_frame( - frame, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES - ) - input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) - extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) - - # Generate the initializers for each device function argument - initializers = [] - for i, colname in enumerate(frame.keys()): - initializers.append( - group_initializer_template.format(idx=i, name=colname) - ) - - return groupby_apply_kernel_template.format( - input_columns=input_columns, - extra_args=extra_args, - group_initializers="\n".join(initializers), - ) - - -def _get_groupby_apply_kernel(frame, func, args): - np_field_types = np.dtype(list(_all_dtypes_from_frame(frame).items())) - dataframe_group_type = _get_frame_groupby_type( - np_field_types, frame.index.dtype - ) - - return_type = _get_udf_return_type(dataframe_group_type, func, args) - - # Dict of 'local' variables into which `_kernel` is defined - global_exec_context = { - "cuda": cuda, - "Group": Group, - "dataframe_group_type": dataframe_group_type, - "types": types, - } - kernel_string = _groupby_apply_kernel_string_from_template(frame, args) - kernel = _get_kernel(kernel_string, global_exec_context, None, func) - - return kernel, return_type - - -@_performance_tracking -def jit_groupby_apply(offsets, grouped_values, function, *args): - """ - Main entrypoint for JIT Groupby.apply via Numba. - - Parameters - ---------- - offsets : list - A list of integers denoting the indices of the group - boundaries in grouped_values - grouped_values : DataFrame - A DataFrame representing the source data - sorted by group keys - function : callable - The user-defined function to execute - """ - - kernel, return_type = _compile_or_get( - grouped_values, - function, - args, - kernel_getter=_get_groupby_apply_kernel, - suffix="__GROUPBY_APPLY_UDF", - ) - - offsets = cp.asarray(offsets) - ngroups = len(offsets) - 1 - - output = cudf.core.column.column_empty(ngroups, dtype=return_type) - - launch_args = [ - offsets, - output, - grouped_values.index, - ] - launch_args += list( - _supported_cols_from_frame( - grouped_values, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES - ).values() - ) - launch_args += list(args) - - max_group_size = cp.diff(offsets).max() - - if max_group_size >= 256: - blocklim = 256 - else: - blocklim = ((max_group_size + 32 - 1) // 32) * 32 - - if kernel.specialized: - specialized = kernel - else: - specialized = kernel.specialize(*launch_args) - - # Ask the driver to give a good config - ctx = get_context() - # Dispatcher is specialized, so there's only one definition - get - # it so we can get the cufunc from the code library - (kern_def,) = specialized.overloads.values() - grid, tpb = ctx.get_max_potential_block_size( - func=kern_def._codelibrary.get_cufunc(), - b2d_func=0, - memsize=0, - blocksizelimit=int(blocklim), - ) - - # Launch kernel - with _CUDFNumbaConfig(): - specialized[ngroups, tpb](*launch_args) - - return output - - -def _can_be_jitted(frame, func, args): - """ - Determine if this UDF is supported through the JIT engine - by attempting to compile just the function to PTX using the - target set of types - """ - if not hasattr(func, "__code__"): - # Numba requires bytecode to be present to proceed. 
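The kernel source above is produced by formatting a string template and exec-ing it, as a workaround for Numba's lack of `*args` support. A toy sketch of that technique with an invented template and helper name (the real templates are `groupby_apply_kernel_template` and `group_initializer_template` in `cudf.core.udf.templates`):

    def build_kernel(ncols):
        # Generate one positional parameter per column, then exec the source
        # and pull the resulting function back out of the namespace.
        input_columns = ", ".join(f"input_col_{i}" for i in range(ncols))
        body = "\n".join(f"    total += input_col_{i}" for i in range(ncols))
        source = (
            f"def _kernel({input_columns}):\n"
            f"    total = 0\n"
            f"{body}\n"
            f"    return total\n"
        )
        namespace = {}
        exec(source, namespace)
        return namespace["_kernel"]

    k = build_kernel(3)
    print(k(1, 2, 3))  # 6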
- # See https://github.com/numba/numba/issues/4587 - return False - - if any(col.has_nulls() for col in frame._columns): - return False - np_field_types = np.dtype( - list( - _supported_dtypes_from_frame( - frame, supported_types=SUPPORTED_GROUPBY_NUMPY_TYPES - ).items() - ) - ) - dataframe_group_type = _get_frame_groupby_type( - np_field_types, frame.index.dtype - ) - try: - _get_udf_return_type(dataframe_group_type, func, args) - return True - except (UDFError, TypingError): - return False diff --git a/python/cudf/cudf/core/udf/masked_lowering.py b/python/cudf/cudf/core/udf/masked_lowering.py deleted file mode 100644 index ae09294e3f9..00000000000 --- a/python/cudf/cudf/core/udf/masked_lowering.py +++ /dev/null @@ -1,400 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. - -import operator - -from llvmlite import ir -from numba.core import cgutils -from numba.core.typing import signature as nb_signature -from numba.cuda.cudaimpl import ( - lower as cuda_lower, - registry as cuda_lowering_registry, -) -from numba.extending import lower_builtin, types - -from cudf.core.udf import api -from cudf.core.udf._ops import ( - arith_ops, - bitwise_ops, - comparison_ops, - unary_ops, -) -from cudf.core.udf.masked_typing import ( - MaskedType, - NAType, - _supported_masked_types, -) - - -@cuda_lowering_registry.lower_constant(NAType) -def constant_na(context, builder, ty, pyval): - # This handles None, etc. - return context.get_dummy_value() - - -# In the typing phase, we declared that a `MaskedType` can be -# added to another `MaskedType` and specified what kind of a -# `MaskedType` would result. Now we have to actually fill in -# the implementation details of how to do that. This is where -# we can involve both validities in constructing the answer - - -def make_arithmetic_op(op): - """ - Make closures that implement arithmetic operations. See - register_arithmetic_op for details. - """ - - def masked_scalar_op_impl(context, builder, sig, args): - """ - Implement `MaskedType` `MaskedType` - """ - # MaskedType(...), MaskedType(...) - masked_type_1, masked_type_2 = sig.args - # MaskedType(...) - masked_return_type = sig.return_type - - # Let there be two actual LLVM structs backing the two inputs - # https://mapping-high-level-constructs-to-llvm-ir.readthedocs.io/en/latest/basic-constructs/structures.html - m1 = cgutils.create_struct_proxy(masked_type_1)( - context, builder, value=args[0] - ) - m2 = cgutils.create_struct_proxy(masked_type_2)( - context, builder, value=args[1] - ) - - # we will return an output struct - result = cgutils.create_struct_proxy(masked_return_type)( - context, builder - ) - # compute output validity - valid = builder.and_(m1.valid, m2.valid) - result.valid = valid - with builder.if_then(valid): - # Let numba handle generating the extra IR needed to perform - # operations on mixed types, by compiling the final core op between - # the two primitive values as a separate function and calling it - result.value = context.compile_internal( - builder, - lambda x, y: op(x, y), - nb_signature( - masked_return_type.value_type, - masked_type_1.value_type, - masked_type_2.value_type, - ), - (m1.value, m2.value), - ) - return result._getvalue() - - return masked_scalar_op_impl - - -def make_unary_op(op): - """ - Make closures that implement unary operations. See register_unary_op for - details. - """ - - def masked_scalar_unary_op_impl(context, builder, sig, args): - """ - Implement `MaskedType` - """ - # MaskedType(...) - masked_type_1 = sig.args[0] - # MaskedType(...) 
- masked_return_type = sig.return_type - - m1 = cgutils.create_struct_proxy(masked_type_1)( - context, builder, value=args[0] - ) - - # we will return an output struct - result = cgutils.create_struct_proxy(masked_return_type)( - context, builder - ) - - # compute output validity - result.valid = m1.valid - with builder.if_then(m1.valid): - # Let numba handle generating the extra IR needed to perform - # operations on mixed types, by compiling the final core op between - # the two primitive values as a separate function and calling it - result.value = context.compile_internal( - builder, - lambda x: op(x), - nb_signature( - masked_return_type.value_type, - masked_type_1.value_type, - ), - (m1.value,), - ) - return result._getvalue() - - return masked_scalar_unary_op_impl - - -def register_arithmetic_op(op): - """ - Register a lowering implementation for the - arithmetic op `op`. - - Because the lowering implementations compile the final - op separately using a lambda and compile_internal, `op` - needs to be tied to each lowering implementation using - a closure. - - This function makes and lowers a closure for one op. - - """ - to_lower_op = make_arithmetic_op(op) - cuda_lower(op, MaskedType, MaskedType)(to_lower_op) - - -def register_unary_op(op): - """ - Register a lowering implementation for the - unary op `op`. - - Because the lowering implementations compile the final - op separately using a lambda and compile_internal, `op` - needs to be tied to each lowering implementation using - a closure. - - This function makes and lowers a closure for one op. - - """ - to_lower_op = make_unary_op(op) - cuda_lower(op, MaskedType)(to_lower_op) - - -def masked_scalar_null_op_impl(context, builder, sig, args): - """ - Implement `MaskedType` `NAType` - or `NAType` `MaskedType` - The answer to this is known up front so no actual operation - needs to take place - """ - - return_type = sig.return_type # MaskedType(...) 
- result = cgutils.create_struct_proxy(MaskedType(return_type.value_type))( - context, builder - ) - - # Invalidate the struct and leave `value` uninitialized - result.valid = context.get_constant(types.boolean, 0) - return result._getvalue() - - -def make_const_op(op): - def masked_scalar_const_op_impl(context, builder, sig, args): - return_type = sig.return_type - result = cgutils.create_struct_proxy(return_type)(context, builder) - result.valid = context.get_constant(types.boolean, 0) - if isinstance(sig.args[0], MaskedType): - masked_type, const_type = sig.args - masked_value, const_value = args - - indata = cgutils.create_struct_proxy(masked_type)( - context, builder, value=masked_value - ) - nb_sig = nb_signature( - return_type.value_type, masked_type.value_type, const_type - ) - compile_args = (indata.value, const_value) - else: - const_type, masked_type = sig.args - const_value, masked_value = args - indata = cgutils.create_struct_proxy(masked_type)( - context, builder, value=masked_value - ) - nb_sig = nb_signature( - return_type.value_type, const_type, masked_type.value_type - ) - compile_args = (const_value, indata.value) - with builder.if_then(indata.valid): - result.value = context.compile_internal( - builder, lambda x, y: op(x, y), nb_sig, compile_args - ) - result.valid = context.get_constant(types.boolean, 1) - return result._getvalue() - - return masked_scalar_const_op_impl - - -def register_const_op(op): - to_lower_op = make_const_op(op) - cuda_lower(op, MaskedType, types.Number)(to_lower_op) - cuda_lower(op, types.Number, MaskedType)(to_lower_op) - cuda_lower(op, MaskedType, types.Boolean)(to_lower_op) - cuda_lower(op, types.Boolean, MaskedType)(to_lower_op) - cuda_lower(op, MaskedType, types.NPDatetime)(to_lower_op) - cuda_lower(op, types.NPDatetime, MaskedType)(to_lower_op) - cuda_lower(op, MaskedType, types.NPTimedelta)(to_lower_op) - cuda_lower(op, types.NPTimedelta, MaskedType)(to_lower_op) - - -# register all lowering at init -for binary_op in arith_ops + bitwise_ops + comparison_ops: - register_arithmetic_op(binary_op) - register_const_op(binary_op) - # null op impl can be shared between all ops - cuda_lower(binary_op, MaskedType, NAType)(masked_scalar_null_op_impl) - cuda_lower(binary_op, NAType, MaskedType)(masked_scalar_null_op_impl) - -# register all lowering at init -for unary_op in unary_ops: - register_unary_op(unary_op) -register_unary_op(abs) - - -@cuda_lower(operator.is_, MaskedType, NAType) -@cuda_lower(operator.is_, NAType, MaskedType) -def masked_scalar_is_null_impl(context, builder, sig, args): - """ - Implement `MaskedType` is `NA` - """ - if isinstance(sig.args[1], NAType): - masked_type, na = sig.args - value = args[0] - else: - na, masked_type = sig.args - value = args[1] - - indata = cgutils.create_struct_proxy(masked_type)( - context, builder, value=value - ) - result = cgutils.alloca_once(builder, ir.IntType(1)) - with builder.if_else(indata.valid) as (then, otherwise): - with then: - builder.store(context.get_constant(types.boolean, 0), result) - with otherwise: - builder.store(context.get_constant(types.boolean, 1), result) - - return builder.load(result) - - -# Main kernel always calls `pack_return` on whatever the user defined -# function returned. 
This returns the same data if its already a `Masked` -# else packs it up into a new one that is valid from the get go -@cuda_lower(api.pack_return, MaskedType) -def pack_return_masked_impl(context, builder, sig, args): - return args[0] - - -@cuda_lower(api.pack_return, types.Boolean) -@cuda_lower(api.pack_return, types.Number) -@cuda_lower(api.pack_return, types.NPDatetime) -@cuda_lower(api.pack_return, types.NPTimedelta) -def pack_return_scalar_impl(context, builder, sig, args): - outdata = cgutils.create_struct_proxy(sig.return_type)(context, builder) - outdata.value = args[0] - outdata.valid = context.get_constant(types.boolean, 1) - - return outdata._getvalue() - - -@cuda_lower(operator.truth, MaskedType) -@cuda_lower(bool, MaskedType) -def masked_scalar_bool_impl(context, builder, sig, args): - indata = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - result = cgutils.alloca_once(builder, ir.IntType(1)) - with builder.if_else(indata.valid) as (then, otherwise): - with then: - builder.store( - context.cast( - builder, - indata.value, - sig.args[0].value_type, - types.boolean, - ), - result, - ) - with otherwise: - builder.store(context.get_constant(types.boolean, 0), result) - return builder.load(result) - - -@cuda_lower(float, MaskedType) -@cuda_lower(int, MaskedType) -def masked_scalar_cast_impl(context, builder, sig, args): - input = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - result = cgutils.create_struct_proxy(sig.return_type)(context, builder) - - casted = context.cast( - builder, - input.value, - sig.args[0].value_type, - sig.return_type.value_type, - ) - result.value = casted - result.valid = input.valid - return result._getvalue() - - -# To handle the unification, we need to support casting from any type to a -# masked type. The cast implementation takes the value passed in and returns -# a masked type struct wrapping that value. -@cuda_lowering_registry.lower_cast(types.Any, MaskedType) -def cast_primitive_to_masked(context, builder, fromty, toty, val): - casted = context.cast(builder, val, fromty, toty.value_type) - ext = cgutils.create_struct_proxy(toty)(context, builder) - ext.value = casted - ext.valid = context.get_constant(types.boolean, 1) - return ext._getvalue() - - -@cuda_lowering_registry.lower_cast(NAType, MaskedType) -def cast_na_to_masked(context, builder, fromty, toty, val): - result = cgutils.create_struct_proxy(toty)(context, builder) - result.valid = context.get_constant(types.boolean, 0) - - return result._getvalue() - - -@cuda_lowering_registry.lower_cast(MaskedType, MaskedType) -def cast_masked_to_masked(context, builder, fromty, toty, val): - """ - When numba encounters an op that expects a certain type and - the input to the op is not of the expected type it will try - to cast the input to the appropriate type. But, in our case - the input may be a MaskedType, which numba doesn't natively - know how to cast to a different MaskedType with a different - `value_type`. This implements and registers that cast. 
- """ - - # We will - operand = cgutils.create_struct_proxy(fromty)(context, builder, value=val) - casted = context.cast( - builder, operand.value, fromty.value_type, toty.value_type - ) - ext = cgutils.create_struct_proxy(toty)(context, builder) - ext.value = casted - ext.valid = operand.valid - return ext._getvalue() - - -# Masked constructor for use in a kernel for testing -def masked_constructor(context, builder, sig, args): - ty = sig.return_type - value, valid = args - masked = cgutils.create_struct_proxy(ty)(context, builder) - masked.value = value - masked.valid = valid - return masked._getvalue() - - -for ty in _supported_masked_types: - lower_builtin(api.Masked, ty, types.boolean)(masked_constructor) - - -# Allows us to make an instance of MaskedType a global variable -# and properly use it inside functions we will later compile -@cuda_lowering_registry.lower_constant(MaskedType) -def lower_constant_masked(context, builder, ty, val): - masked = cgutils.create_struct_proxy(ty)(context, builder) - masked.value = context.get_constant(ty.value_type, val.value) - masked.valid = context.get_constant(types.boolean, val.valid) - return masked._getvalue() diff --git a/python/cudf/cudf/core/udf/masked_typing.py b/python/cudf/cudf/core/udf/masked_typing.py deleted file mode 100644 index 4c90c5bbba0..00000000000 --- a/python/cudf/cudf/core/udf/masked_typing.py +++ /dev/null @@ -1,677 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -import operator - -import numpy as np -from numba import types -from numba.core.extending import ( - make_attribute_wrapper, - models, - register_model, - typeof_impl, -) -from numba.core.typing import signature as nb_signature -from numba.core.typing.templates import ( - AbstractTemplate, - AttributeTemplate, - ConcreteTemplate, -) -from numba.core.typing.typeof import typeof -from numba.cuda.cudadecl import registry as cuda_decl_registry -from numba.np.numpy_support import from_dtype - -from cudf.core.missing import NA -from cudf.core.udf import api -from cudf.core.udf._ops import ( - arith_ops, - bitwise_ops, - comparison_ops, - unary_ops, -) -from cudf.core.udf.strings_typing import ( - StringView, - UDFString, - bool_binary_funcs, - id_unary_funcs, - int_binary_funcs, - size_type, - string_return_attrs, - string_unary_funcs, - string_view, - udf_string, -) -from cudf.utils.dtypes import ( - DATETIME_TYPES, - NUMERIC_TYPES, - STRING_TYPES, - TIMEDELTA_TYPES, -) - -SUPPORTED_NUMPY_TYPES = ( - NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | STRING_TYPES -) -supported_type_str = "\n".join(sorted(list(SUPPORTED_NUMPY_TYPES) + ["bool"])) - -_units = ["ns", "ms", "us", "s"] -_datetime_cases = {types.NPDatetime(u) for u in _units} -_timedelta_cases = {types.NPTimedelta(u) for u in _units} -_supported_masked_types = ( - types.integer_domain - | types.real_domain - | _datetime_cases - | _timedelta_cases - | {types.boolean} - | {string_view, udf_string} -) - - -SUPPORTED_NUMBA_TYPES = ( - types.Number, - types.Boolean, - types.NPDatetime, - types.NPTimedelta, - StringView, - UDFString, -) - - -def _format_error_string(err): - """ - Wrap an error message in newlines and color it red. - """ - return "\033[91m" + "\n" + err + "\n" + "\033[0m" - - -def _type_to_masked_type(t): - if isinstance(t, SUPPORTED_NUMBA_TYPES): - return t - else: - # Unsupported Dtype. Numba tends to print out the type info - # for whatever operands and operation failed to type and then - # output its own error message. 
Putting the message in the repr - # then is one way of getting the true cause to the user - err = _format_error_string( - "Unsupported MaskedType. This is usually caused by " - "attempting to use a column of unsupported dtype in a UDF. " - f"Supported dtypes are:\n{supported_type_str}" - ) - return types.Poison(err) - - -# Masked scalars of all types -class MaskedType(types.Type): - """ - A Numba type consisting of a value of some primitive type - and a validity boolean, over which we can define math ops - """ - - def __init__(self, value): - # MaskedType in Numba shall be parameterized - # with a value type - self.value_type = _type_to_masked_type(value) - super().__init__(name=f"Masked({self.value_type})") - - def __hash__(self): - """ - Needed so that numba caches type instances with different - `value_type` separately. - """ - return hash(repr(self)) - - def unify(self, context, other): - """ - Often within a UDF an instance arises where a variable could - be a `MaskedType`, an `NAType`, or a literal based off - the data at runtime, for example the variable `ret` here: - - def f(x): - if x == 1: - ret = x - elif x > 2: - ret = 1 - else: - ret = cudf.NA - return ret - - When numba analyzes this function it will eventually figure - out that the variable `ret` could be any of the three types - from above. This scenario will only work if numba knows how - to find some kind of common type between the possibilities, - and this function implements that - the goal is to return a - common type when comparing `self` to other. - - """ - - # If we have Masked and NA, the output should be a - # MaskedType with the original type as its value_type - if isinstance(other, NAType): - return self - - # two MaskedType unify to a new MaskedType whose value_type - # is the result of unifying `self` and `other` `value_type` - elif isinstance(other, MaskedType): - return MaskedType( - context.unify_pairs(self.value_type, other.value_type) - ) - - # if we have MaskedType and something that results in a - # scalar, unify between the MaskedType's value_type - # and that other thing - unified = context.unify_pairs(self.value_type, other) - if unified is None: - # The value types don't unify, so there is no unified masked type - return None - - return MaskedType(unified) - - def __eq__(self, other): - # Equality is required for determining whether a cast is required - # between two different types. - if not isinstance(other, MaskedType): - # Require a cast when the other type is not masked - return False - - # Require a cast for another masked with a different value type - return self.value_type == other.value_type - - -# For typing a Masked constant value defined outside a kernel (e.g. captured in -# a closure). -@typeof_impl.register(api.Masked) -def typeof_masked(val, c): - return MaskedType(typeof(val.value)) - - -# Implemented typing for Masked(value, valid) - the construction of a Masked -# type in a kernel. 
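[Editor's note] For orientation, the "construction of a Masked in a kernel" that the template registered next makes typeable can be exercised directly from a hand-written test kernel. This is a minimal sketch, not code from the repository: it assumes the typing/lowering registrations in these modules have been loaded (for example via `import cudf`), and `demo_kernel` is a hypothetical name.

    # Illustrative only: build a Masked value inside a numba CUDA kernel,
    # relying on the MaskedConstructor typing and the attribute wrappers
    # registered in this module.
    import numpy as np
    from numba import cuda
    from cudf.core.udf.api import Masked

    @cuda.jit
    def demo_kernel(out_value, out_valid):
        i = cuda.grid(1)
        if i < out_value.size:
            m = Masked(42, True)       # typed by MaskedConstructor
            out_value[i] = m.value     # attribute access via make_attribute_wrapper
            out_valid[i] = m.valid

    out_value = cuda.device_array(4, dtype=np.int64)
    out_valid = cuda.device_array(4, dtype=np.bool_)
    demo_kernel.forall(4)(out_value, out_valid)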
-@cuda_decl_registry.register -class MaskedConstructor(ConcreteTemplate): - key = api.Masked - cases = [ - nb_signature(MaskedType(t), t, types.boolean) - for t in _supported_masked_types - ] - - -# Typing for `api.Masked` -@cuda_decl_registry.register_attr -class ClassesTemplate(AttributeTemplate): - key = types.Module(api) - - def resolve_Masked(self, mod): - return types.Function(MaskedConstructor) - - -# Registration of the global is also needed for Numba to type api.Masked -cuda_decl_registry.register_global(api, types.Module(api)) -# For typing bare Masked (as in `from .api import Masked` -cuda_decl_registry.register_global( - api.Masked, types.Function(MaskedConstructor) -) - - -# Provide access to `m.value` and `m.valid` in a kernel for a Masked `m`. -make_attribute_wrapper(MaskedType, "value", "value") -make_attribute_wrapper(MaskedType, "valid", "valid") - - -# Tell numba how `MaskedType` is constructed on the backend in terms -# of primitive things that exist at the LLVM level -@register_model(MaskedType) -class MaskedModel(models.StructModel): - def __init__(self, dmm, fe_type): - # This struct has two members, a value and a validity - # let the type of the `value` field be the same as the - # `value_type` and let `valid` be a boolean - members = [("value", fe_type.value_type), ("valid", types.bool_)] - models.StructModel.__init__(self, dmm, fe_type, members) - - -class NAType(types.Type): - """ - A type for handling ops against nulls - Exists so we can: - 1. Teach numba that all occurrences of `cudf.NA` are - to be read as instances of this type instead - 2. Define ops like `if x is cudf.NA` where `x` is of - type `Masked` to mean `if x.valid is False` - """ - - def __init__(self): - super().__init__(name="NA") - - def unify(self, context, other): - """ - Masked <-> NA is deferred to MaskedType.unify() - Literal <-> NA -> Masked - """ - if isinstance(other, MaskedType): - # bounce to MaskedType.unify - return None - elif isinstance(other, NAType): - # unify {NA, NA} -> NA - return self - else: - return MaskedType(other) - - -na_type = NAType() - - -@typeof_impl.register(type(NA)) -def typeof_na(val, c): - """ - Tie instances of _NAType (cudf.NA) to our NAType. - Effectively make it so numba sees `cudf.NA` as an - instance of this NAType -> handle it accordingly. - """ - return na_type - - -register_model(NAType)(models.OpaqueModel) - - -# Ultimately, we want numba to produce PTX code that specifies how to implement -# an operation on two singular `Masked` structs together, which is defined -# as producing a new `Masked` with the right validity and if valid, -# the correct value. This happens in two phases: -# 1. Specify that `Masked` `Masked` exists and what it should return -# 2. Implement how to actually do (1) at the LLVM level -# The following code accomplishes (1) - it is really just a way of specifying -# that the has a CUDA overload that accepts two `Masked` that -# are parameterized with `value_type` and what flavor of `Masked` to return. 
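[Editor's note] The user-facing behaviour that the typing class defined next (together with the lowering registered earlier) supports is nullable arithmetic inside `DataFrame.apply`. A small illustrative example; the commented output is approximate and shown only for orientation:

    import cudf

    df = cudf.DataFrame({"a": [1, None, 3], "b": [10, 20, None]})

    # Each field of `row` reaches the UDF as a Masked value; `a + b` types
    # through MaskedScalarArithOp and nulls propagate into the result.
    out = df.apply(lambda row: row["a"] + row["b"], axis=1)
    # 0      11
    # 1    <NA>
    # 2    <NA>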
-class MaskedScalarArithOp(AbstractTemplate): - def generic(self, args, kws): - """ - Typing for `Masked` `Masked` - Numba expects a valid numba type to be returned if typing is successful - else `None` signifies the error state (this pattern is commonly used - in Numba) - """ - if isinstance(args[0], MaskedType) and isinstance(args[1], MaskedType): - # In the case of op(Masked, Masked), the return type is a Masked - # such that Masked.value is the primitive type that would have - # been resolved if we were just operating on the - # `value_type`s. - return_type = self.context.resolve_function_type( - self.key, (args[0].value_type, args[1].value_type), kws - ).return_type - return nb_signature(MaskedType(return_type), args[0], args[1]) - - -class MaskedScalarUnaryOp(AbstractTemplate): - def generic(self, args, kws): - if len(args) == 1 and isinstance(args[0], MaskedType): - return_type = self.context.resolve_function_type( - self.key, (args[0].value_type,), kws - ).return_type - return nb_signature(MaskedType(return_type), args[0]) - - -class MaskedScalarNullOp(AbstractTemplate): - def generic(self, args, kws): - """ - Typing for `Masked` + `NA` - Handles situations like `x + cudf.NA` - """ - if isinstance(args[0], MaskedType) and isinstance(args[1], NAType): - # In the case of op(Masked, NA), the result has the same - # dtype as the original regardless of what it is - return nb_signature( - args[0], - args[0], - na_type, - ) - elif isinstance(args[0], NAType) and isinstance(args[1], MaskedType): - return nb_signature(args[1], na_type, args[1]) - - -class MaskedScalarScalarOp(AbstractTemplate): - def generic(self, args, kws): - """ - Typing for `Masked` a scalar (and vice-versa). - handles situations like `x + 1` - """ - # In the case of op(Masked, scalar), we resolve the type between - # the Masked value_type and the scalar's type directly - to_resolve_types = None - if isinstance(args[0], MaskedType) and isinstance( - args[1], SUPPORTED_NUMBA_TYPES - ): - to_resolve_types = (args[0].value_type, args[1]) - elif isinstance(args[0], SUPPORTED_NUMBA_TYPES) and isinstance( - args[1], MaskedType - ): - to_resolve_types = (args[1].value_type, args[0]) - else: - # fail typing - return None - return_type = self.context.resolve_function_type( - self.key, to_resolve_types, kws - ).return_type - return nb_signature( - MaskedType(return_type), - args[0], - args[1], - ) - - -@cuda_decl_registry.register_global(operator.is_) -class MaskedScalarIsNull(AbstractTemplate): - """ - Typing for `Masked is cudf.NA` - """ - - def generic(self, args, kws): - if isinstance(args[0], MaskedType) and isinstance(args[1], NAType): - return nb_signature(types.boolean, args[0], na_type) - elif isinstance(args[1], MaskedType) and isinstance(args[0], NAType): - return nb_signature(types.boolean, na_type, args[1]) - - -@cuda_decl_registry.register_global(operator.truth) -class MaskedScalarTruth(AbstractTemplate): - """ - Typing for `if Masked` - Used for `if x > y` - The truthiness of a MaskedType shall be the truthiness - of the `value` stored therein - """ - - def generic(self, args, kws): - if isinstance(args[0], MaskedType): - return nb_signature(types.boolean, MaskedType(types.boolean)) - - -@cuda_decl_registry.register_global(float) -class MaskedScalarFloatCast(AbstractTemplate): - """ - Typing for float(Masked) - returns the result of calling "float" on the input - TODO: retains the validity of the input rather than - raising as in float(pd.NA) - """ - - def generic(self, args, kws): - if isinstance(args[0], 
MaskedType): - # following numpy convention np.dtype(float) -> dtype('float64') - return nb_signature(MaskedType(types.float64), args[0]) - - -@cuda_decl_registry.register_global(int) -class MaskedScalarIntCast(AbstractTemplate): - """ - Typing for int(Masked) - returns the result of calling "int" on the input - TODO: retains the validity of the input rather than - raising as in int(pd.NA) - """ - - def generic(self, args, kws): - if isinstance(args[0], MaskedType): - # following numpy convention np.dtype(int) -> dtype('int64') - return nb_signature(MaskedType(types.int64), args[0]) - - -@cuda_decl_registry.register_global(abs) -class MaskedScalarAbsoluteValue(AbstractTemplate): - """ - Typing for the builtin function abs. Returns the same - type as input except for boolean values which are converted - to integer. - - This follows the expected result from the builtin abs function - which differs from numpy - np.abs returns a bool whereas abs - itself performs the cast. - """ - - def generic(self, args, kws): - if isinstance(args[0], MaskedType): - if isinstance(args[0].value_type, (StringView, UDFString)): - # reject string types - return - else: - return_type = self.context.resolve_function_type( - self.key, (args[0].value_type,), kws - ).return_type - if return_type in types.signed_domain: - # promote to unsigned to avoid overflow - return_type = from_dtype(np.dtype("u" + return_type.name)) - return nb_signature(MaskedType(return_type), args[0]) - - -@cuda_decl_registry.register_global(api.pack_return) -class UnpackReturnToMasked(AbstractTemplate): - """ - Turn a returned MaskedType into its value and validity - or turn a scalar into the tuple (scalar, True). - """ - - def generic(self, args, kws): - if isinstance(args[0], MaskedType): - # MaskedType(dtype, valid) -> MaskedType(dtype, valid) - return nb_signature(args[0], args[0]) - elif isinstance(args[0], SUPPORTED_NUMBA_TYPES): - # scalar_type -> MaskedType(scalar_type, True) - return_type = MaskedType(args[0]) - return nb_signature(return_type, args[0]) - - -for binary_op in arith_ops + bitwise_ops + comparison_ops: - # Every op shares the same typing class - cuda_decl_registry.register_global(binary_op)(MaskedScalarArithOp) - cuda_decl_registry.register_global(binary_op)(MaskedScalarNullOp) - cuda_decl_registry.register_global(binary_op)(MaskedScalarScalarOp) - -for unary_op in unary_ops: - cuda_decl_registry.register_global(unary_op)(MaskedScalarUnaryOp) - - -# Strings functions and utilities -def _is_valid_string_arg(ty): - return ( - isinstance(ty, MaskedType) - and isinstance(ty.value_type, (StringView, UDFString)) - ) or isinstance(ty, types.StringLiteral) - - -def register_masked_string_function(func): - """ - Helper function wrapping numba's low level extension API. Provides - the boilerplate needed to associate a signature with a function or - operator to be overloaded. 
- """ - - def deco(generic): - class MaskedStringFunction(AbstractTemplate): - pass - - MaskedStringFunction.generic = generic - cuda_decl_registry.register_global(func)(MaskedStringFunction) - - return deco - - -@register_masked_string_function(len) -def len_typing(self, args, kws): - if isinstance(args[0], MaskedType) and isinstance( - args[0].value_type, (StringView, UDFString) - ): - return nb_signature(MaskedType(size_type), MaskedType(string_view)) - elif isinstance(args[0], types.StringLiteral) and len(args) == 1: - return nb_signature(size_type, args[0]) - - -@register_masked_string_function(operator.add) -def concat_typing(self, args, kws): - if _is_valid_string_arg(args[0]) and _is_valid_string_arg(args[1]): - return nb_signature( - MaskedType(udf_string), - MaskedType(string_view), - MaskedType(string_view), - ) - - -@register_masked_string_function(operator.contains) -def contains_typing(self, args, kws): - if _is_valid_string_arg(args[0]) and _is_valid_string_arg(args[1]): - return nb_signature( - MaskedType(types.boolean), - MaskedType(string_view), - MaskedType(string_view), - ) - - -class MaskedStringViewCmpOp(AbstractTemplate): - """ - return the boolean result of `cmpop` between to strings - since the typing is the same for every comparison operator, - we can reuse this class for all of them. - """ - - def generic(self, args, kws): - if _is_valid_string_arg(args[0]) and _is_valid_string_arg(args[1]): - return nb_signature( - MaskedType(types.boolean), - MaskedType(string_view), - MaskedType(string_view), - ) - - -for op in comparison_ops: - cuda_decl_registry.register_global(op)(MaskedStringViewCmpOp) - - -def create_masked_binary_attr(attrname, retty): - """ - Helper function wrapping numba's low level extension API. Provides - the boilerplate needed to register a binary function of two masked - string objects as an attribute of one, e.g. `string.func(other)`. - """ - - class MaskedStringViewBinaryAttr(AbstractTemplate): - key = attrname - - def generic(self, args, kws): - return nb_signature( - MaskedType(retty), MaskedType(string_view), recvr=self.this - ) - - def attr(self, mod): - return types.BoundFunction( - MaskedStringViewBinaryAttr, - MaskedType(string_view), - ) - - return attr - - -def create_masked_unary_attr(attrname, retty): - """ - Helper function wrapping numba's low level extension API. Provides - the boilerplate needed to register a unary function of a masked - string object as an attribute, e.g. `string.func()`. 
- """ - - class MaskedStringViewIdentifierAttr(AbstractTemplate): - key = attrname - - def generic(self, args, kws): - return nb_signature(MaskedType(retty), recvr=self.this) - - def attr(self, mod): - return types.BoundFunction( - MaskedStringViewIdentifierAttr, - MaskedType(string_view), - ) - - return attr - - -class MaskedStringViewCount(AbstractTemplate): - key = "MaskedType.count" - - def generic(self, args, kws): - return nb_signature( - MaskedType(size_type), MaskedType(string_view), recvr=self.this - ) - - -class MaskedStringViewReplace(AbstractTemplate): - key = "MaskedType.replace" - - def generic(self, args, kws): - return nb_signature( - MaskedType(udf_string), - MaskedType(string_view), - MaskedType(string_view), - recvr=self.this, - ) - - -class MaskedStringViewAttrs(AttributeTemplate): - key = MaskedType(string_view) - - def resolve_replace(self, mod): - return types.BoundFunction( - MaskedStringViewReplace, MaskedType(string_view) - ) - - def resolve_count(self, mod): - return types.BoundFunction( - MaskedStringViewCount, MaskedType(string_view) - ) - - def resolve_value(self, mod): - return string_view - - def resolve_valid(self, mod): - return types.boolean - - -# Build attributes for `MaskedType(string_view)` -for func in bool_binary_funcs: - setattr( - MaskedStringViewAttrs, - f"resolve_{func}", - create_masked_binary_attr(f"MaskedType.{func}", types.boolean), - ) - -for func in int_binary_funcs: - setattr( - MaskedStringViewAttrs, - f"resolve_{func}", - create_masked_binary_attr(f"MaskedType.{func}", size_type), - ) - -for func in string_return_attrs: - setattr( - MaskedStringViewAttrs, - f"resolve_{func}", - create_masked_binary_attr(f"MaskedType.{func}", udf_string), - ) - -for func in id_unary_funcs: - setattr( - MaskedStringViewAttrs, - f"resolve_{func}", - create_masked_unary_attr(f"MaskedType.{func}", types.boolean), - ) - -for func in string_unary_funcs: - setattr( - MaskedStringViewAttrs, - f"resolve_{func}", - create_masked_unary_attr(f"MaskedType.{func}", udf_string), - ) - - -class MaskedUDFStringAttrs(MaskedStringViewAttrs): - key = MaskedType(udf_string) - - def resolve_value(self, mod): - return udf_string - - -cuda_decl_registry.register_attr(MaskedStringViewAttrs) -cuda_decl_registry.register_attr(MaskedUDFStringAttrs) diff --git a/python/cudf/cudf/core/udf/row_function.py b/python/cudf/cudf/core/udf/row_function.py deleted file mode 100644 index e040836f97d..00000000000 --- a/python/cudf/cudf/core/udf/row_function.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -import math - -import numpy as np -from numba import cuda -from numba.np import numpy_support - -from cudf.core.udf.api import Masked, pack_return -from cudf.core.udf.masked_typing import MaskedType -from cudf.core.udf.strings_typing import string_view -from cudf.core.udf.templates import ( - masked_input_initializer_template, - row_initializer_template, - row_kernel_template, - unmasked_input_initializer_template, -) -from cudf.core.udf.utils import ( - Row, - _all_dtypes_from_frame, - _construct_signature, - _get_extensionty_size, - _get_kernel, - _get_udf_return_type, - _mask_get, - _supported_cols_from_frame, - _supported_dtypes_from_frame, -) - - -def _get_frame_row_type(dtype): - """ - Get the Numba type of a row in a frame. Models each column and its mask as - a MaskedType and models the row as a dictionary like data structure - containing these MaskedTypes. 
Large parts of this function are copied with - comments from the Numba internals and slightly modified to account for - validity bools to be present in the final struct. See - numba.np.numpy_support.from_struct_dtype for details. - """ - - # Create the numpy structured type corresponding to the numpy dtype. - - fields = [] - offset = 0 - - sizes = [ - _get_extensionty_size(string_view) - if val[0] == np.dtype("O") - else val[0].itemsize - for val in dtype.fields.values() - ] - - for i, (name, info) in enumerate(dtype.fields.items()): - # *info* consists of the element dtype, its offset from the beginning - # of the record, and an optional "title" containing metadata. - # We ignore the offset in info because its value assumes no masking; - # instead, we compute the correct offset based on the masked type. - elemdtype = info[0] - title = info[2] if len(info) == 3 else None - - ty = ( - # columns of dtype string start life as string_view - string_view - if elemdtype == np.dtype("O") - else numpy_support.from_dtype(elemdtype) - ) - infos = { - "type": MaskedType(ty), - "offset": offset, - "title": title, - } - fields.append((name, infos)) - - # increment offset by itemsize plus one byte for validity - itemsize = ( - _get_extensionty_size(string_view) - if elemdtype == np.dtype("O") - else elemdtype.itemsize - ) - offset += itemsize + 1 - - # Align the next member of the struct to be a multiple of the - # memory access size, per PTX ISA 7.4/5.4.5 - if i < len(sizes) - 1: - next_itemsize = sizes[i + 1] - offset = int(math.ceil(offset / next_itemsize) * next_itemsize) - - # Numba requires that structures are aligned for the CUDA target - _is_aligned_struct = True - return Row(fields, offset, _is_aligned_struct) - - -def _row_kernel_string_from_template(frame, row_type, args): - """ - Function to write numba kernels for `DataFrame.apply` as a string. - Workaround until numba supports functions that use `*args` - - `DataFrame.apply` expects functions of a dict like row as well as - possibly one or more scalar arguments - - def f(row, c, k): - return (row['x'] + c) / k - - Both the number of input columns as well as their nullability and any - scalar arguments may vary, so the kernels vary significantly. See - templates.py for the full row kernel template and more details. 
- """ - # Create argument list for kernel - frame = _supported_cols_from_frame(frame) - - input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) - input_offsets = ", ".join([f"offset_{i}" for i in range(len(frame))]) - extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) - - # Generate the initializers for each device function argument - initializers = [] - row_initializers = [] - for i, (colname, col) in enumerate(frame.items()): - idx = str(i) - template = ( - masked_input_initializer_template - if col.mask is not None - else unmasked_input_initializer_template - ) - initializers.append(template.format(idx=idx)) - row_initializers.append( - row_initializer_template.format(idx=idx, name=colname) - ) - - return row_kernel_template.format( - input_columns=input_columns, - input_offsets=input_offsets, - extra_args=extra_args, - masked_input_initializers="\n".join(initializers), - row_initializers="\n".join(row_initializers), - numba_rectype=row_type, - ) - - -def _get_row_kernel(frame, func, args): - row_type = _get_frame_row_type( - np.dtype(list(_all_dtypes_from_frame(frame).items())) - ) - scalar_return_type = _get_udf_return_type(row_type, func, args) - # this is the signature for the final full kernel compilation - sig = _construct_signature(frame, scalar_return_type, args) - # this row type is used within the kernel to pack up the column and - # mask data into the dict like data structure the user udf expects - np_field_types = np.dtype( - list(_supported_dtypes_from_frame(frame).items()) - ) - row_type = _get_frame_row_type(np_field_types) - - # Dict of 'local' variables into which `_kernel` is defined - global_exec_context = { - "cuda": cuda, - "Masked": Masked, - "_mask_get": _mask_get, - "pack_return": pack_return, - "row_type": row_type, - } - kernel_string = _row_kernel_string_from_template(frame, row_type, args) - kernel = _get_kernel(kernel_string, global_exec_context, sig, func) - - return kernel, scalar_return_type diff --git a/python/cudf/cudf/core/udf/scalar_function.py b/python/cudf/cudf/core/udf/scalar_function.py deleted file mode 100644 index ff7fad3fb82..00000000000 --- a/python/cudf/cudf/core/udf/scalar_function.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -from numba import cuda -from numba.np import numpy_support - -from cudf.core.udf.api import Masked, pack_return -from cudf.core.udf.masked_typing import MaskedType -from cudf.core.udf.strings_typing import string_view -from cudf.core.udf.templates import ( - masked_input_initializer_template, - scalar_kernel_template, - unmasked_input_initializer_template, -) -from cudf.core.udf.utils import ( - _construct_signature, - _get_kernel, - _get_udf_return_type, - _mask_get, -) - - -def _scalar_kernel_string_from_template(sr, args): - """ - Function to write numba kernels for `Series.apply` as a string. - Workaround until numba supports functions that use `*args` - - `Series.apply` expects functions of a single variable and possibly - one or more constants, such as: - - def f(x, c, k): - return (x + c) / k - - where the `x` are meant to be the values of the series. Since there - can be only one column, the only thing that varies in the kinds of - kernels that we want is the number of extra_args. See templates.py - for the full kernel template. 
- """ - extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) - - masked_initializer = ( - masked_input_initializer_template - if sr._column.mask - else unmasked_input_initializer_template - ).format(idx=0) - - return scalar_kernel_template.format( - extra_args=extra_args, masked_initializer=masked_initializer - ) - - -def _get_scalar_kernel(sr, func, args): - sr_type = MaskedType( - string_view if sr.dtype == "O" else numpy_support.from_dtype(sr.dtype) - ) - scalar_return_type = _get_udf_return_type(sr_type, func, args) - - sig = _construct_signature(sr, scalar_return_type, args=args) - f_ = cuda.jit(device=True)(func) - global_exec_context = { - "f_": f_, - "cuda": cuda, - "Masked": Masked, - "_mask_get": _mask_get, - "pack_return": pack_return, - } - kernel_string = _scalar_kernel_string_from_template(sr, args=args) - kernel = _get_kernel(kernel_string, global_exec_context, sig, func) - - return kernel, scalar_return_type diff --git a/python/cudf/cudf/core/udf/strings_lowering.py b/python/cudf/cudf/core/udf/strings_lowering.py deleted file mode 100644 index 3c02ee52b25..00000000000 --- a/python/cudf/cudf/core/udf/strings_lowering.py +++ /dev/null @@ -1,723 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -import operator -from functools import partial - -from numba import cuda, types -from numba.core import cgutils -from numba.core.datamodel import default_manager -from numba.core.typing import signature as nb_signature -from numba.cuda.cudaimpl import ( - lower as cuda_lower, - registry as cuda_lowering_registry, -) - -from cudf._lib.strings_udf import ( - get_character_cases_table_ptr, - get_character_flags_table_ptr, - get_special_case_mapping_table_ptr, -) -from cudf.core.udf.masked_typing import MaskedType -from cudf.core.udf.strings_typing import size_type, string_view, udf_string - -_STR_VIEW_PTR = types.CPointer(string_view) -_UDF_STRING_PTR = types.CPointer(udf_string) - - -# CUDA function declarations -# read-only (input is a string_view, output is a fixed with type) -_string_view_len = cuda.declare_device("len", size_type(_STR_VIEW_PTR)) -_concat_string_view = cuda.declare_device( - "concat", types.void(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) -) - -_string_view_replace = cuda.declare_device( - "replace", - types.void(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), -) - - -def _declare_binary_func(lhs, rhs, out, name): - # Declare a binary function - return cuda.declare_device( - name, - out(lhs, rhs), - ) - - -def _declare_strip_func(name): - return cuda.declare_device( - name, size_type(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR) - ) - - -# A binary function of the form f(string, string) -> bool -_declare_bool_str_str_func = partial( - _declare_binary_func, _STR_VIEW_PTR, _STR_VIEW_PTR, types.boolean -) - -_declare_size_type_str_str_func = partial( - _declare_binary_func, _STR_VIEW_PTR, _STR_VIEW_PTR, size_type -) - -_string_view_contains = _declare_bool_str_str_func("contains") -_string_view_eq = _declare_bool_str_str_func("eq") -_string_view_ne = _declare_bool_str_str_func("ne") -_string_view_ge = _declare_bool_str_str_func("ge") -_string_view_le = _declare_bool_str_str_func("le") -_string_view_gt = _declare_bool_str_str_func("gt") -_string_view_lt = _declare_bool_str_str_func("lt") -_string_view_startswith = _declare_bool_str_str_func("startswith") -_string_view_endswith = _declare_bool_str_str_func("endswith") -_string_view_find = _declare_size_type_str_str_func("find") -_string_view_rfind = 
_declare_size_type_str_str_func("rfind") -_string_view_contains = _declare_bool_str_str_func("contains") -_string_view_strip = _declare_strip_func("strip") -_string_view_lstrip = _declare_strip_func("lstrip") -_string_view_rstrip = _declare_strip_func("rstrip") - - -# A binary function of the form f(string, int) -> bool -_declare_bool_str_int_func = partial( - _declare_binary_func, _STR_VIEW_PTR, types.int64, types.boolean -) - - -def _declare_upper_or_lower(func): - return cuda.declare_device( - func, - types.void( - _UDF_STRING_PTR, - _STR_VIEW_PTR, - types.uintp, - types.uintp, - types.uintp, - ), - ) - - -_string_view_isdigit = _declare_bool_str_int_func("pyisdigit") -_string_view_isalnum = _declare_bool_str_int_func("pyisalnum") -_string_view_isalpha = _declare_bool_str_int_func("pyisalpha") -_string_view_isdecimal = _declare_bool_str_int_func("pyisdecimal") -_string_view_isnumeric = _declare_bool_str_int_func("pyisnumeric") -_string_view_isspace = _declare_bool_str_int_func("pyisspace") -_string_view_isupper = _declare_bool_str_int_func("pyisupper") -_string_view_islower = _declare_bool_str_int_func("pyislower") -_string_view_istitle = _declare_bool_str_int_func("pyistitle") -_string_view_upper = _declare_upper_or_lower("upper") -_string_view_lower = _declare_upper_or_lower("lower") - - -_string_view_count = cuda.declare_device( - "pycount", - size_type(_STR_VIEW_PTR, _STR_VIEW_PTR), -) - - -# casts -@cuda_lowering_registry.lower_cast(types.StringLiteral, string_view) -def cast_string_literal_to_string_view(context, builder, fromty, toty, val): - """ - Cast a literal to a string_view - """ - # create an empty string_view - sv = cgutils.create_struct_proxy(string_view)(context, builder) - - # set the empty strview data pointer to point to the literal value - sv.data = context.insert_string_const_addrspace( - builder, fromty.literal_value - ) - sv.length = context.get_constant(size_type, len(fromty.literal_value)) - sv.bytes = context.get_constant( - size_type, len(fromty.literal_value.encode("UTF-8")) - ) - - return sv._getvalue() - - -@cuda_lowering_registry.lower_cast(string_view, udf_string) -def cast_string_view_to_udf_string(context, builder, fromty, toty, val): - sv_ptr = builder.alloca(default_manager[fromty].get_value_type()) - udf_str_ptr = builder.alloca(default_manager[toty].get_value_type()) - builder.store(val, sv_ptr) - _ = context.compile_internal( - builder, - call_create_udf_string_from_string_view, - nb_signature(types.void, _STR_VIEW_PTR, types.CPointer(udf_string)), - (sv_ptr, udf_str_ptr), - ) - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) - ) - - return result._getvalue() - - -@cuda_lowering_registry.lower_cast(udf_string, string_view) -def cast_udf_string_to_string_view(context, builder, fromty, toty, val): - udf_str_ptr = builder.alloca(default_manager[fromty].get_value_type()) - sv_ptr = builder.alloca(default_manager[toty].get_value_type()) - builder.store(val, udf_str_ptr) - - context.compile_internal( - builder, - call_create_string_view_from_udf_string, - nb_signature(types.void, _UDF_STRING_PTR, _STR_VIEW_PTR), - (udf_str_ptr, sv_ptr), - ) - - result = cgutils.create_struct_proxy(string_view)( - context, builder, value=builder.load(sv_ptr) - ) - - return result._getvalue() - - -# utilities -_create_udf_string_from_string_view = cuda.declare_device( - "udf_string_from_string_view", - types.void(_STR_VIEW_PTR, _UDF_STRING_PTR), -) -_create_string_view_from_udf_string = cuda.declare_device( - 
"string_view_from_udf_string", - types.void(_UDF_STRING_PTR, _STR_VIEW_PTR), -) - - -def call_create_udf_string_from_string_view(sv, udf_str): - _create_udf_string_from_string_view(sv, udf_str) - - -def call_create_string_view_from_udf_string(udf_str, sv): - _create_string_view_from_udf_string(udf_str, sv) - - -# String function implementations -def call_len_string_view(st): - return _string_view_len(st) - - -@cuda_lower(len, string_view) -def len_impl(context, builder, sig, args): - sv_ptr = builder.alloca(args[0].type) - builder.store(args[0], sv_ptr) - result = context.compile_internal( - builder, - call_len_string_view, - nb_signature(size_type, _STR_VIEW_PTR), - (sv_ptr,), - ) - - return result - - -def call_concat_string_view(result, lhs, rhs): - return _concat_string_view(result, lhs, rhs) - - -@cuda_lower(operator.add, string_view, string_view) -def concat_impl(context, builder, sig, args): - lhs_ptr = builder.alloca(args[0].type) - rhs_ptr = builder.alloca(args[1].type) - builder.store(args[0], lhs_ptr) - builder.store(args[1], rhs_ptr) - - udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) - _ = context.compile_internal( - builder, - call_concat_string_view, - types.void(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), - (udf_str_ptr, lhs_ptr, rhs_ptr), - ) - - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) - ) - return result._getvalue() - - -def call_string_view_replace(result, src, to_replace, replacement): - return _string_view_replace(result, src, to_replace, replacement) - - -@cuda_lower("StringView.replace", string_view, string_view, string_view) -@cuda_lower("UDFString.replace", string_view, string_view, string_view) -def replace_impl(context, builder, sig, args): - src_ptr = builder.alloca(args[0].type) - to_replace_ptr = builder.alloca(args[1].type) - replacement_ptr = builder.alloca(args[2].type) - - builder.store(args[0], src_ptr) - builder.store(args[1], to_replace_ptr) - builder.store(args[2], replacement_ptr) - - udf_str_ptr = builder.alloca(default_manager[udf_string].get_value_type()) - - _ = context.compile_internal( - builder, - call_string_view_replace, - types.void( - _UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR - ), - (udf_str_ptr, src_ptr, to_replace_ptr, replacement_ptr), - ) - - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) - ) - return result._getvalue() - - -def create_binary_string_func(binary_func, retty): - """ - Provide a wrapper around numba's low-level extension API which - produces the boilerplate needed to implement a binary function - of two strings. 
- """ - - def deco(cuda_func): - @cuda_lower(binary_func, string_view, string_view) - def binary_func_impl(context, builder, sig, args): - lhs_ptr = builder.alloca(args[0].type) - rhs_ptr = builder.alloca(args[1].type) - builder.store(args[0], lhs_ptr) - builder.store(args[1], rhs_ptr) - - # these conditional statements should compile out - if retty != udf_string: - # binary function of two strings yielding a fixed-width type - # example: str.startswith(other) -> bool - # shim functions can return the value through nb_retval - result = context.compile_internal( - builder, - cuda_func, - nb_signature(retty, _STR_VIEW_PTR, _STR_VIEW_PTR), - (lhs_ptr, rhs_ptr), - ) - return result - else: - # binary function of two strings yielding a new string - # example: str.strip(other) -> str - # shim functions can not return a struct due to C linkage - # so we create a new udf_string and pass a pointer to it - # for the shim function to write the output to. The return - # value of compile_internal is therefore discarded (although - # this may change in the future if we need to return error - # codes, for instance). - udf_str_ptr = builder.alloca( - default_manager[udf_string].get_value_type() - ) - _ = context.compile_internal( - builder, - cuda_func, - size_type(_UDF_STRING_PTR, _STR_VIEW_PTR, _STR_VIEW_PTR), - (udf_str_ptr, lhs_ptr, rhs_ptr), - ) - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) - ) - return result._getvalue() - - # binary_func can be attribute-like: str.binary_func - # or operator-like: binary_func(str, other) - if isinstance(binary_func, str): - binary_func_impl = cuda_lower( - f"StringView.{binary_func}", string_view, string_view - )(binary_func_impl) - binary_func_impl = cuda_lower( - f"UDFString.{binary_func}", string_view, string_view - )(binary_func_impl) - else: - binary_func_impl = cuda_lower( - binary_func, string_view, string_view - )(binary_func_impl) - - return binary_func_impl - - return deco - - -@create_binary_string_func(operator.contains, types.boolean) -def contains_impl(st, substr): - return _string_view_contains(st, substr) - - -@create_binary_string_func(operator.eq, types.boolean) -def eq_impl(st, rhs): - return _string_view_eq(st, rhs) - - -@create_binary_string_func(operator.ne, types.boolean) -def ne_impl(st, rhs): - return _string_view_ne(st, rhs) - - -@create_binary_string_func(operator.ge, types.boolean) -def ge_impl(st, rhs): - return _string_view_ge(st, rhs) - - -@create_binary_string_func(operator.le, types.boolean) -def le_impl(st, rhs): - return _string_view_le(st, rhs) - - -@create_binary_string_func(operator.gt, types.boolean) -def gt_impl(st, rhs): - return _string_view_gt(st, rhs) - - -@create_binary_string_func(operator.lt, types.boolean) -def lt_impl(st, rhs): - return _string_view_lt(st, rhs) - - -@create_binary_string_func("strip", udf_string) -def strip_impl(result, to_strip, strip_char): - return _string_view_strip(result, to_strip, strip_char) - - -@create_binary_string_func("lstrip", udf_string) -def lstrip_impl(result, to_strip, strip_char): - return _string_view_lstrip(result, to_strip, strip_char) - - -@create_binary_string_func("rstrip", udf_string) -def rstrip_impl(result, to_strip, strip_char): - return _string_view_rstrip(result, to_strip, strip_char) - - -@create_binary_string_func("startswith", types.boolean) -def startswith_impl(sv, substr): - return _string_view_startswith(sv, substr) - - -@create_binary_string_func("endswith", types.boolean) -def endswith_impl(sv, substr): - 
return _string_view_endswith(sv, substr) - - -@create_binary_string_func("count", size_type) -def count_impl(st, substr): - return _string_view_count(st, substr) - - -@create_binary_string_func("find", size_type) -def find_impl(sv, substr): - return _string_view_find(sv, substr) - - -@create_binary_string_func("rfind", size_type) -def rfind_impl(sv, substr): - return _string_view_rfind(sv, substr) - - -def create_unary_identifier_func(id_func): - """ - Provide a wrapper around numba's low-level extension API which - produces the boilerplate needed to implement a unary function - of a string. - """ - - def deco(cuda_func): - @cuda_lower(f"StringView.{id_func}", string_view) - @cuda_lower(f"UDFString.{id_func}", string_view) - def id_func_impl(context, builder, sig, args): - str_ptr = builder.alloca(args[0].type) - builder.store(args[0], str_ptr) - - # Lookup table required for conversion functions - # must be resolved at runtime after context initialization, - # therefore cannot be a global variable - tbl_ptr = context.get_constant( - types.uintp, get_character_flags_table_ptr() - ) - result = context.compile_internal( - builder, - cuda_func, - nb_signature(types.boolean, _STR_VIEW_PTR, types.uintp), - (str_ptr, tbl_ptr), - ) - - return result - - return id_func_impl - - return deco - - -def create_upper_or_lower(id_func): - """ - Provide a wrapper around numba's low-level extension API which - produces the boilerplate needed to implement either the upper - or lower attrs of a string view. - """ - - def deco(cuda_func): - @cuda_lower(f"StringView.{id_func}", string_view) - @cuda_lower(f"UDFString.{id_func}", string_view) - def id_func_impl(context, builder, sig, args): - str_ptr = builder.alloca(args[0].type) - builder.store(args[0], str_ptr) - - # Lookup table required for conversion functions - # must be resolved at runtime after context initialization, - # therefore cannot be a global variable - flags_tbl_ptr = context.get_constant( - types.uintp, get_character_flags_table_ptr() - ) - cases_tbl_ptr = context.get_constant( - types.uintp, get_character_cases_table_ptr() - ) - special_tbl_ptr = context.get_constant( - types.uintp, get_special_case_mapping_table_ptr() - ) - udf_str_ptr = builder.alloca( - default_manager[udf_string].get_value_type() - ) - - _ = context.compile_internal( - builder, - cuda_func, - types.void( - _UDF_STRING_PTR, - _STR_VIEW_PTR, - types.uintp, - types.uintp, - types.uintp, - ), - ( - udf_str_ptr, - str_ptr, - flags_tbl_ptr, - cases_tbl_ptr, - special_tbl_ptr, - ), - ) - - result = cgutils.create_struct_proxy(udf_string)( - context, builder, value=builder.load(udf_str_ptr) - ) - return result._getvalue() - - return id_func_impl - - return deco - - -@create_upper_or_lower("upper") -def upper_impl(result, st, flags, cases, special): - return _string_view_upper(result, st, flags, cases, special) - - -@create_upper_or_lower("lower") -def lower_impl(result, st, flags, cases, special): - return _string_view_lower(result, st, flags, cases, special) - - -@create_unary_identifier_func("isdigit") -def isdigit_impl(st, tbl): - return _string_view_isdigit(st, tbl) - - -@create_unary_identifier_func("isalnum") -def isalnum_impl(st, tbl): - return _string_view_isalnum(st, tbl) - - -@create_unary_identifier_func("isalpha") -def isalpha_impl(st, tbl): - return _string_view_isalpha(st, tbl) - - -@create_unary_identifier_func("isnumeric") -def isnumeric_impl(st, tbl): - return _string_view_isnumeric(st, tbl) - - -@create_unary_identifier_func("isdecimal") -def isdecimal_impl(st, 
tbl): - return _string_view_isdecimal(st, tbl) - - -@create_unary_identifier_func("isspace") -def isspace_impl(st, tbl): - return _string_view_isspace(st, tbl) - - -@create_unary_identifier_func("isupper") -def isupper_impl(st, tbl): - return _string_view_isupper(st, tbl) - - -@create_unary_identifier_func("islower") -def islower_impl(st, tbl): - return _string_view_islower(st, tbl) - - -@create_unary_identifier_func("istitle") -def istitle_impl(st, tbl): - return _string_view_istitle(st, tbl) - - -@cuda_lower(len, MaskedType(string_view)) -@cuda_lower(len, MaskedType(udf_string)) -def masked_len_impl(context, builder, sig, args): - ret = cgutils.create_struct_proxy(sig.return_type)(context, builder) - masked_sv_ty = sig.args[0] - masked_sv = cgutils.create_struct_proxy(masked_sv_ty)( - context, builder, value=args[0] - ) - result = len_impl( - context, builder, size_type(string_view), (masked_sv.value,) - ) - ret.value = result - ret.valid = masked_sv.valid - - return ret._getvalue() - - -def _masked_proxies(context, builder, maskedty, *args): - return tuple( - cgutils.create_struct_proxy(maskedty)(context, builder, value=arg) - for arg in args - ) - - -@cuda_lower( - "MaskedType.replace", - MaskedType(string_view), - MaskedType(string_view), - MaskedType(string_view), -) -def masked_string_view_replace_impl(context, builder, sig, args): - ret = cgutils.create_struct_proxy(sig.return_type)(context, builder) - src_masked, to_replace_masked, replacement_masked = _masked_proxies( - context, builder, MaskedType(string_view), *args - ) - result = replace_impl( - context, - builder, - nb_signature(udf_string, string_view, string_view, string_view), - (src_masked.value, to_replace_masked.value, replacement_masked.value), - ) - - ret.value = result - ret.valid = builder.and_( - builder.and_(src_masked.valid, to_replace_masked.valid), - replacement_masked.valid, - ) - - return ret._getvalue() - - -def create_masked_binary_string_func(op, cuda_func, retty): - """ - Provide a wrapper around numba's low-level extension API which - produces the boilerplate needed to implement a binary function - of two masked strings. - """ - - def masked_binary_func_impl(context, builder, sig, args): - ret = cgutils.create_struct_proxy(sig.return_type)(context, builder) - - lhs_masked = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - rhs_masked = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[1] - ) - - result = cuda_func( - context, - builder, - nb_signature(retty, string_view, string_view), - (lhs_masked.value, rhs_masked.value), - ) - - ret.value = result - ret.valid = builder.and_(lhs_masked.valid, rhs_masked.valid) - - return ret._getvalue() - - cuda_lower(op, MaskedType(string_view), MaskedType(string_view))( - masked_binary_func_impl - ) - - -def create_masked_unary_identifier_func(op, cuda_func): - """ - Provide a wrapper around numba's low-level extension API which - produces the boilerplate needed to implement a unary function - of a masked string. 
- """ - - def masked_unary_func_impl(context, builder, sig, args): - ret = cgutils.create_struct_proxy(sig.return_type)(context, builder) - masked_str = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - - result = cuda_func( - context, - builder, - types.boolean(string_view, string_view), - (masked_str.value,), - ) - ret.value = result - ret.valid = masked_str.valid - return ret._getvalue() - - cuda_lower(op, MaskedType(string_view))(masked_unary_func_impl) - - -def create_masked_upper_or_lower(op, cuda_func): - def upper_or_lower_impl(context, builder, sig, args): - ret = cgutils.create_struct_proxy(sig.return_type)(context, builder) - masked_str = cgutils.create_struct_proxy(sig.args[0])( - context, builder, value=args[0] - ) - - result = cuda_func( - context, - builder, - udf_string(string_view), - (masked_str.value,), - ) - ret.value = result - ret.valid = masked_str.valid - return ret._getvalue() - - cuda_lower(op, MaskedType(string_view))(upper_or_lower_impl) - - -create_masked_binary_string_func("MaskedType.strip", strip_impl, udf_string) -create_masked_binary_string_func("MaskedType.lstrip", lstrip_impl, udf_string) -create_masked_binary_string_func("MaskedType.rstrip", rstrip_impl, udf_string) -create_masked_binary_string_func( - "MaskedType.startswith", - startswith_impl, - types.boolean, -) -create_masked_binary_string_func( - "MaskedType.endswith", endswith_impl, types.boolean -) -create_masked_binary_string_func("MaskedType.find", find_impl, size_type) -create_masked_binary_string_func("MaskedType.rfind", rfind_impl, size_type) -create_masked_binary_string_func("MaskedType.count", count_impl, size_type) -create_masked_binary_string_func( - operator.contains, contains_impl, types.boolean -) - - -create_masked_unary_identifier_func("MaskedType.isalnum", isalnum_impl) -create_masked_unary_identifier_func("MaskedType.isalpha", isalpha_impl) -create_masked_unary_identifier_func("MaskedType.isdigit", isdigit_impl) -create_masked_unary_identifier_func("MaskedType.isupper", isupper_impl) -create_masked_unary_identifier_func("MaskedType.islower", islower_impl) -create_masked_unary_identifier_func("MaskedType.isspace", isspace_impl) -create_masked_unary_identifier_func("MaskedType.isdecimal", isdecimal_impl) -create_masked_unary_identifier_func("MaskedType.istitle", istitle_impl) -create_masked_upper_or_lower("MaskedType.upper", upper_impl) -create_masked_upper_or_lower("MaskedType.lower", lower_impl) diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py deleted file mode 100644 index 43604ab21a7..00000000000 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
- -import operator - -import numpy as np -from numba import types -from numba.core.extending import models, register_model -from numba.core.typing import signature as nb_signature -from numba.core.typing.templates import AbstractTemplate, AttributeTemplate -from numba.cuda.cudadecl import registry as cuda_decl_registry - -import rmm - -# libcudf size_type -size_type = types.int32 - - -# String object definitions -class UDFString(types.Type): - np_dtype = np.dtype("object") - - def __init__(self): - super().__init__(name="udf_string") - - @property - def return_type(self): - return self - - -class StringView(types.Type): - np_dtype = np.dtype("object") - - def __init__(self): - super().__init__(name="string_view") - - @property - def return_type(self): - return UDFString() - - -@register_model(StringView) -class stringview_model(models.StructModel): - # from string_view.hpp: - _members = ( - # const char* _data{} - # Pointer to device memory contain char array for this string - ("data", types.CPointer(types.char)), - # size_type _bytes{}; - # Number of bytes in _data for this string - ("bytes", size_type), - # mutable size_type _length{}; - # Number of characters in this string (computed) - ("length", size_type), - ) - - def __init__(self, dmm, fe_type): - super().__init__(dmm, fe_type, self._members) - - -@register_model(UDFString) -class udf_string_model(models.StructModel): - # from udf_string.hpp: - # private: - # char* m_data{}; - # cudf::size_type m_bytes{}; - # cudf::size_type m_size{}; - - _members = ( - ("m_data", types.CPointer(types.char)), - ("m_bytes", size_type), - ("m_size", size_type), - ) - - def __init__(self, dmm, fe_type): - super().__init__(dmm, fe_type, self._members) - - -any_string_ty = (StringView, UDFString, types.StringLiteral) -string_view = StringView() -udf_string = UDFString() - - -class StrViewArgHandler: - """ - As part of Numba's preprocessing step, incoming function arguments are - modified based on the associated type for that argument that was used - to JIT the kernel. However it only knows how to handle built in array - types natively. With string UDFs, the jitted type is string_view*, - which numba does not know how to handle. - - This class converts string_view* to raw pointer arguments, which Numba - knows how to use. - - See numba.cuda.compiler._prepare_args for details. - """ - - def prepare_args(self, ty, val, **kwargs): - if isinstance(ty, types.CPointer) and isinstance( - ty.dtype, (StringView, UDFString) - ): - return types.uint64, val.ptr if isinstance( - val, rmm._lib.device_buffer.DeviceBuffer - ) else val.get_ptr(mode="read") - else: - return ty, val - - -str_view_arg_handler = StrViewArgHandler() - - -# String functions -@cuda_decl_registry.register_global(len) -class StringLength(AbstractTemplate): - """ - provide the length of a cudf::string_view like struct - """ - - def generic(self, args, kws): - if isinstance(args[0], any_string_ty) and len(args) == 1: - # length: - # string_view -> int32 - # udf_string -> int32 - # literal -> int32 - return nb_signature(size_type, string_view) - - -def register_stringview_binaryop(op, retty): - """ - Helper function wrapping numba's low level extension API. Provides - the boilerplate needed to associate a signature with a function or - operator expecting a string. 
- """ - - class StringViewBinaryOp(AbstractTemplate): - def generic(self, args, kws): - if isinstance(args[0], any_string_ty) and isinstance( - args[1], any_string_ty - ): - return nb_signature(retty, string_view, string_view) - - cuda_decl_registry.register_global(op)(StringViewBinaryOp) - - -def create_binary_attr(attrname, retty): - """ - Helper function wrapping numba's low level extension API. Provides - the boilerplate needed to register a binary function of two string - objects as an attribute of one, e.g. `string.func(other)`. - """ - - class StringViewBinaryAttr(AbstractTemplate): - key = f"StringView.{attrname}" - - def generic(self, args, kws): - return nb_signature(retty, string_view, recvr=self.this) - - def attr(self, mod): - return types.BoundFunction(StringViewBinaryAttr, string_view) - - return attr - - -def create_identifier_attr(attrname, retty): - """ - Helper function wrapping numba's low level extension API. Provides - the boilerplate needed to register a unary function of a string - object as an attribute, e.g. `string.func()`. - """ - - class StringViewIdentifierAttr(AbstractTemplate): - key = f"StringView.{attrname}" - - def generic(self, args, kws): - return nb_signature(retty, recvr=self.this) - - def attr(self, mod): - return types.BoundFunction(StringViewIdentifierAttr, string_view) - - return attr - - -class StringViewCount(AbstractTemplate): - key = "StringView.count" - - def generic(self, args, kws): - return nb_signature(size_type, string_view, recvr=self.this) - - -class StringViewReplace(AbstractTemplate): - key = "StringView.replace" - - def generic(self, args, kws): - return nb_signature( - udf_string, string_view, string_view, recvr=self.this - ) - - -class StringViewAttrs(AttributeTemplate): - key = string_view - - def resolve_count(self, mod): - return types.BoundFunction(StringViewCount, string_view) - - def resolve_replace(self, mod): - return types.BoundFunction(StringViewReplace, string_view) - - -bool_binary_funcs = ["startswith", "endswith"] -int_binary_funcs = ["find", "rfind"] -id_unary_funcs = [ - "isalpha", - "isalnum", - "isdecimal", - "isdigit", - "isupper", - "islower", - "isspace", - "isnumeric", - "istitle", -] -string_unary_funcs = ["upper", "lower"] -string_return_attrs = ["strip", "lstrip", "rstrip"] - -for func in bool_binary_funcs: - setattr( - StringViewAttrs, - f"resolve_{func}", - create_binary_attr(func, types.boolean), - ) - -for func in string_return_attrs: - setattr( - StringViewAttrs, - f"resolve_{func}", - create_binary_attr(func, udf_string), - ) - - -for func in int_binary_funcs: - setattr( - StringViewAttrs, f"resolve_{func}", create_binary_attr(func, size_type) - ) - -for func in id_unary_funcs: - setattr( - StringViewAttrs, - f"resolve_{func}", - create_identifier_attr(func, types.boolean), - ) - -for func in string_unary_funcs: - setattr( - StringViewAttrs, - f"resolve_{func}", - create_identifier_attr(func, udf_string), - ) - - -@cuda_decl_registry.register_attr -class UDFStringAttrs(StringViewAttrs): - key = udf_string - - -cuda_decl_registry.register_attr(StringViewAttrs) -cuda_decl_registry.register_attr(UDFStringAttrs) - -register_stringview_binaryop(operator.eq, types.boolean) -register_stringview_binaryop(operator.ne, types.boolean) -register_stringview_binaryop(operator.lt, types.boolean) -register_stringview_binaryop(operator.gt, types.boolean) -register_stringview_binaryop(operator.le, types.boolean) -register_stringview_binaryop(operator.ge, types.boolean) - -# st in other 
-register_stringview_binaryop(operator.contains, types.boolean) - -# st + other -register_stringview_binaryop(operator.add, udf_string) diff --git a/python/cudf/cudf/core/udf/strings_utils.py b/python/cudf/cudf/core/udf/strings_utils.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/core/udf/templates.py b/python/cudf/cudf/core/udf/templates.py deleted file mode 100644 index 9a032146992..00000000000 --- a/python/cudf/cudf/core/udf/templates.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -unmasked_input_initializer_template = """\ - d_{idx} = input_col_{idx} - masked_{idx} = Masked(d_{idx}[i], True) -""" - -masked_input_initializer_template = """\ - d_{idx}, m_{idx} = input_col_{idx} - masked_{idx} = Masked(d_{idx}[i], _mask_get(m_{idx}, i + offset_{idx})) -""" - -row_initializer_template = """\ - row["{name}"] = masked_{idx} -""" - -group_initializer_template = """\ - arr_{idx} = input_col_{idx}[offset[block_id]:offset[block_id+1]] - dataframe_group["{name}"] = Group(arr_{idx}, size, arr_index) -""" - -row_kernel_template = """\ -def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}): - i = cuda.grid(1) - ret_data_arr, ret_mask_arr = retval - if i < size: - # Create a structured array with the desired fields - rows = cuda.local.array(1, dtype=row_type) - - # one element of that array - row = rows[0] - -{masked_input_initializers} -{row_initializers} - - # pass the assembled row into the udf - ret = f_(row, {extra_args}) - - # pack up the return values and set them - ret_masked = pack_return(ret) - ret_data_arr[i] = ret_masked.value - ret_mask_arr[i] = ret_masked.valid -""" - -scalar_kernel_template = """ -def _kernel(retval, size, input_col_0, offset_0, {extra_args}): - i = cuda.grid(1) - ret_data_arr, ret_mask_arr = retval - - if i < size: - -{masked_initializer} - - ret = f_(masked_0, {extra_args}) - - ret_masked = pack_return(ret) - ret_data_arr[i] = ret_masked.value - ret_mask_arr[i] = ret_masked.valid -""" - -groupby_apply_kernel_template = """ -def _kernel(offset, out, index, {input_columns}, {extra_args}): - tid = cuda.threadIdx.x - block_id = cuda.blockIdx.x - tb_size = cuda.blockDim.x - - recarray = cuda.local.array(1, dtype=dataframe_group_type) - dataframe_group = recarray[0] - - if block_id < (len(offset) - 1): - - size = offset[block_id+1] - offset[block_id] - arr_index = index[offset[block_id]:offset[block_id+1]] - -{group_initializers} - - result = f_(dataframe_group, {extra_args}) - if cuda.threadIdx.x == 0: - out[block_id] = result -""" diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py deleted file mode 100644 index bfe716f0afc..00000000000 --- a/python/cudf/cudf/core/udf/utils.py +++ /dev/null @@ -1,374 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
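The kernel templates removed from templates.py above are plain `str.format` strings; cudf stitches them into a complete kernel source string that is later exec'd and JIT-compiled. A minimal sketch of that assembly step, assuming the deleted template strings are still in scope; the two-column layout and the `extra_arg_0` name are illustrative, not the exact cudf wiring:

# Illustrative: build a row-kernel source for a frame with columns "a" and "b".
masked_input_initializers = "\n".join(
    masked_input_initializer_template.format(idx=i) for i in range(2)
)
row_initializers = "\n".join(
    row_initializer_template.format(name=name, idx=i)
    for i, name in enumerate(["a", "b"])
)
kernel_source = row_kernel_template.format(
    input_columns="input_col_0, input_col_1",
    input_offsets="offset_0, offset_1",
    extra_args="extra_arg_0",
    masked_input_initializers=masked_input_initializers,
    row_initializers=row_initializers,
)
# The resulting source is exec'd into a namespace providing `cuda`, `Masked`,
# `pack_return`, `row_type` and the user function `f_`, then handed to cuda.jit.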
-from __future__ import annotations - -import functools -import os -from typing import TYPE_CHECKING, Any - -import cachetools -import cupy as cp -import llvmlite.binding as ll -import numpy as np -from cuda import cudart -from numba import cuda, typeof -from numba.core.datamodel import default_manager, models -from numba.core.errors import TypingError -from numba.core.extending import register_model -from numba.np import numpy_support -from numba.types import CPointer, Poison, Record, Tuple, boolean, int64, void - -import rmm - -from cudf._lib import strings_udf -from cudf.api.types import is_scalar -from cudf.core.column.column import as_column -from cudf.core.dtypes import dtype -from cudf.core.udf.masked_typing import MaskedType -from cudf.core.udf.strings_typing import ( - str_view_arg_handler, - string_view, - udf_string, -) -from cudf.utils import cudautils -from cudf.utils._numba import _CUDFNumbaConfig, _get_ptx_file -from cudf.utils.dtypes import ( - BOOL_TYPES, - DATETIME_TYPES, - NUMERIC_TYPES, - STRING_TYPES, - TIMEDELTA_TYPES, -) -from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import initfunc - -if TYPE_CHECKING: - from collections.abc import Callable - -# Maximum size of a string column is 2 GiB -_STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get("STRINGS_UDF_HEAP_SIZE", 2**31) -_heap_size = 0 -_cudf_str_dtype = dtype(str) - - -JIT_SUPPORTED_TYPES = ( - NUMERIC_TYPES - | BOOL_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | STRING_TYPES -) -libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) -MASK_BITSIZE = np.dtype("int32").itemsize * 8 - -precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) -launch_arg_getters: dict[Any, Any] = {} - - -@functools.cache -def _ptx_file(): - return _get_ptx_file( - os.path.join( - os.path.dirname(strings_udf.__file__), "..", "core", "udf" - ), - "shim_", - ) - - -@_performance_tracking -def _get_udf_return_type(argty, func: Callable, args=()): - """ - Get the return type of a masked UDF for a given set of argument dtypes. It - is assumed that the function consumes a dictionary whose keys are strings - and whose values are of MaskedType. Initially assume that the UDF may be - written to utilize any field in the row - including those containing an - unsupported dtype. If an unsupported dtype is actually used in the function - the compilation should fail at `compile_udf`. If compilation succeeds, one - can infer that the function does not use any of the columns of unsupported - dtype - meaning we can drop them going forward and the UDF will still end - up getting fed rows containing all the fields it actually needs to use to - compute the answer for that row. - """ - - # present a row containing all fields to the UDF and try and compile - compile_sig = (argty, *(typeof(arg) for arg in args)) - - # Get the return type. The PTX is also returned by compile_udf, but is not - # needed here. 
- with _CUDFNumbaConfig(): - ptx, output_type = cudautils.compile_udf(func, compile_sig) - - if not isinstance(output_type, MaskedType): - numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) - else: - numba_output_type = output_type - - result = ( - numba_output_type - if not isinstance(numba_output_type, MaskedType) - else numba_output_type.value_type - ) - result = result if result.is_internal else result.return_type - - # _get_udf_return_type will throw a TypingError if the user tries to use - # a field in the row containing an unsupported dtype, except in the - # edge case where all the function does is return that element: - - # def f(row): - # return row[] - # In this case numba is happy to return MaskedType() - # because it relies on not finding overloaded operators for types to raise - # the exception, so we have to explicitly check for that case. - if isinstance(result, Poison): - raise TypingError(str(result)) - - return result - - -def _all_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): - return { - colname: dtype if str(dtype) in supported_types else np.dtype("O") - for colname, dtype in frame._dtypes - } - - -def _supported_dtypes_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): - return { - colname: dtype - for colname, dtype in frame._dtypes - if str(dtype) in supported_types - } - - -def _supported_cols_from_frame(frame, supported_types=JIT_SUPPORTED_TYPES): - return { - colname: col - for colname, col in frame._column_labels_and_values - if str(col.dtype) in supported_types - } - - -def _masked_array_type_from_col(col): - """ - Return a type representing a tuple of arrays, - the first element an array of the numba type - corresponding to `dtype`, and the second an - array of bools representing a mask. - """ - - if col.dtype == _cudf_str_dtype: - col_type = CPointer(string_view) - else: - nb_scalar_ty = numpy_support.from_dtype(col.dtype) - col_type = nb_scalar_ty[::1] - - if col.mask is None: - return col_type - else: - return Tuple((col_type, libcudf_bitmask_type[::1])) - - -def _construct_signature(frame, return_type, args): - """ - Build the signature of numba types that will be used to - actually JIT the kernel itself later, accounting for types - and offsets. Skips columns with unsupported dtypes. - """ - if not return_type.is_internal: - return_type = CPointer(return_type) - else: - return_type = return_type[::1] - # Tuple of arrays, first the output data array, then the mask - return_type = Tuple((return_type, boolean[::1])) - offsets = [] - sig = [return_type, int64] - for col in _supported_cols_from_frame(frame).values(): - sig.append(_masked_array_type_from_col(col)) - offsets.append(int64) - - # return_type, size, data, masks, offsets, extra args - sig = void(*(sig + offsets + [typeof(arg) for arg in args])) - - return sig - - -class Row(Record): - # Numba's Record type provides a convenient abstraction for representing a - # row, in that it provides a mapping from strings (column / field names) to - # types. However, it cannot be used directly since it assumes that all its - # fields can be converted to NumPy types by Numba's internal conversion - # mechanism (`numba.np_support.as_dtype). This is not the case for cuDF - # extension types that might be the column types (e.g. masked types, string - # types or group types). - # - # We use this type for type inference and type checking, but not in code - # generation. For this use case, it is sufficient to provide a dtype for a - # row that corresponds to any Python object. 
- @property - def dtype(self): - return np.dtype("object") - - -register_model(Row)(models.RecordModel) - - -@cuda.jit(device=True) -def _mask_get(mask, pos): - """Return the validity of mask[pos] as a word.""" - return (mask[pos // MASK_BITSIZE] >> (pos % MASK_BITSIZE)) & 1 - - -def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"): - """Create a cache key that uniquely identifies a compilation. - - A new compilation is needed any time any of the following things change: - - The UDF itself as defined in python by the user - - The types of the columns utilized by the UDF - - The existence of the input columns masks - """ - scalar_argtypes = tuple(typeof(arg) for arg in args) - return ( - *cudautils.make_cache_key( - func, tuple(_all_dtypes_from_frame(frame).values()) - ), - *(col.mask is None for col in frame._columns), - *frame._column_names, - scalar_argtypes, - suffix, - ) - - -@_performance_tracking -def _compile_or_get( - frame, func, args, kernel_getter=None, suffix="__APPLY_UDF" -): - """ - Return a compiled kernel in terms of MaskedTypes that launches a - kernel equivalent of `f` for the dtypes of `df`. The kernel uses - a thread for each row and calls `f` using that rows data / mask - to produce an output value and output validity for each row. - - If the UDF has already been compiled for this requested dtypes, - a cached version will be returned instead of running compilation. - - CUDA kernels are void and do not return values. Thus, we need to - preallocate a column of the correct dtype and pass it in as one of - the kernel arguments. This creates a chicken-and-egg problem where - we need the column type to compile the kernel, but normally we would - be getting that type FROM compiling the kernel (and letting numba - determine it as a return value). As a workaround, we compile the UDF - itself outside the final kernel to invoke a full typing pass, which - unfortunately is difficult to do without running full compilation. - we then obtain the return type from that separate compilation and - use it to allocate an output column of the right dtype. - """ - if not all(is_scalar(arg) for arg in args): - raise TypeError("only scalar valued args are supported by apply") - - # check to see if we already compiled this function - cache_key = _generate_cache_key(frame, func, args, suffix=suffix) - if precompiled.get(cache_key) is not None: - kernel, masked_or_scalar = precompiled[cache_key] - return kernel, masked_or_scalar - - # precompile the user udf to get the right return type. - # could be a MaskedType or a scalar type. 
- - kernel, scalar_return_type = kernel_getter(frame, func, args) - np_return_type = ( - numpy_support.as_dtype(scalar_return_type) - if scalar_return_type.is_internal - else scalar_return_type.np_dtype - ) - - precompiled[cache_key] = (kernel, np_return_type) - - return kernel, np_return_type - - -def _get_kernel(kernel_string, globals_, sig, func): - """Template kernel compilation helper function.""" - f_ = cuda.jit(device=True)(func) - globals_["f_"] = f_ - exec(kernel_string, globals_) - _kernel = globals_["_kernel"] - kernel = cuda.jit( - sig, link=[_ptx_file()], extensions=[str_view_arg_handler] - )(_kernel) - - return kernel - - -def _get_input_args_from_frame(fr): - args = [] - offsets = [] - for col in _supported_cols_from_frame(fr).values(): - if col.dtype == _cudf_str_dtype: - data = column_to_string_view_array_init_heap(col) - else: - data = col.data - if col.mask is not None: - # argument is a tuple of data, mask - args.append((data, col.mask)) - else: - # argument is just the data pointer - args.append(data) - offsets.append(col.offset) - - return args + offsets - - -def _return_arr_from_dtype(dtype, size): - if dtype == _cudf_str_dtype: - return rmm.DeviceBuffer(size=size * _get_extensionty_size(udf_string)) - return cp.empty(size, dtype=dtype) - - -def _post_process_output_col(col, retty): - if retty == _cudf_str_dtype: - return strings_udf.column_from_udf_string_array(col) - return as_column(col, retty) - - -# The only supported data layout in NVVM. -# See: https://docs.nvidia.com/cuda/nvvm-ir-spec/index.html?#data-layout -_nvvm_data_layout = ( - "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-" - "i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-" - "v64:64:64-v128:128:128-n16:32:64" -) - - -def _get_extensionty_size(ty): - """ - Return the size of an extension type in bytes - """ - target_data = ll.create_target_data(_nvvm_data_layout) - llty = default_manager[ty].get_value_type() - return llty.get_abi_size(target_data) - - -@initfunc -def set_malloc_heap_size(size=None): - """ - Heap size control for strings_udf, size in bytes. - """ - global _heap_size - if size is None: - size = _STRINGS_UDF_DEFAULT_HEAP_SIZE - if size != _heap_size: - (ret,) = cudart.cudaDeviceSetLimit( - cudart.cudaLimit.cudaLimitMallocHeapSize, size - ) - if ret.value != 0: - raise RuntimeError("Unable to set cudaMalloc heap size") - - _heap_size = size - - -def column_to_string_view_array_init_heap(col): - # lazily allocate heap only when a string needs to be returned - return strings_udf.column_to_string_view_array(col) - - -class UDFError(RuntimeError): - pass diff --git a/python/cudf/cudf/core/window/__init__.py b/python/cudf/cudf/core/window/__init__.py deleted file mode 100644 index 23522588d33..00000000000 --- a/python/cudf/cudf/core/window/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION -from cudf.core.window.ewm import ExponentialMovingWindow -from cudf.core.window.rolling import Rolling diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py deleted file mode 100644 index 094df955273..00000000000 --- a/python/cudf/cudf/core/window/ewm.py +++ /dev/null @@ -1,246 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
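For reference, the validity bitmask read by `_mask_get` in the deleted utils.py above packs one bit per row into 32-bit words (`MASK_BITSIZE` = 32). A host-side NumPy sketch of the same lookup, purely illustrative:

import numpy as np

MASK_BITSIZE = np.dtype("int32").itemsize * 8  # 32 bits per mask word

def mask_get_host(mask, pos):
    # Same bit arithmetic as the device-side _mask_get above.
    return (int(mask[pos // MASK_BITSIZE]) >> (pos % MASK_BITSIZE)) & 1

mask = np.array([0b1011], dtype=np.int32)  # rows 0, 1 and 3 valid; row 2 null
assert [mask_get_host(mask, i) for i in range(4)] == [1, 1, 0, 1]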
-from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING, Literal - -import numpy as np - -from cudf._lib.reduce import scan -from cudf.api.types import is_numeric_dtype -from cudf.core.window.rolling import _RollingBase - -if TYPE_CHECKING: - from cudf.core.column.column import ColumnBase - - -class ExponentialMovingWindow(_RollingBase): - r""" - Provide exponential weighted (EW) functions. - Available EW functions: ``mean()`` - Exactly one parameter: ``com``, ``span``, ``halflife``, or ``alpha`` - must be provided. - - Parameters - ---------- - com : float, optional - Specify decay in terms of center of mass, - :math:`\alpha = 1 / (1 + com)`, for :math:`com \geq 0`. - span : float, optional - Specify decay in terms of span, - :math:`\alpha = 2 / (span + 1)`, for :math:`span \geq 1`. - halflife : float, str, timedelta, optional - Specify decay in terms of half-life, - :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for - :math:`halflife > 0`. - alpha : float, optional - Specify smoothing factor :math:`\alpha` directly, - :math:`0 < \alpha \leq 1`. - min_periods : int, default 0 - Not Supported - adjust : bool, default True - Controls assumptions about the first value in the sequence. - https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ewm.html - for details. - ignore_na : bool, default False - Not Supported - axis : {0, 1}, default 0 - Not Supported - times : str, np.ndarray, Series, default None - Not Supported - - Returns - ------- - ``ExponentialMovingWindow`` object - - Notes - ----- - cuDF input data may contain both nulls and nan values. For the purposes - of this method, they are taken to have the same meaning, meaning nulls - in cuDF will affect the result the same way that nan values would using - the equivalent pandas method. - - .. pandas-compat:: - :meth:`pandas.DataFrame.ewm` - - The parameters ``min_periods``, ``ignore_na``, ``axis``, and ``times`` - are not yet supported. Behavior is defined only for data that begins - with a valid (non-null) element. - - Currently, only ``mean`` is a supported method. - - Examples - -------- - >>> df = cudf.DataFrame({'B': [0, 1, 2, cudf.NA, 4]}) - >>> df - B - 0 0 - 1 1 - 2 2 - 3 - 4 4 - >>> df.ewm(com=0.5).mean() - B - 0 0.000000 - 1 0.750000 - 2 1.615385 - 3 1.615385 - 4 3.670213 - - >>> df.ewm(com=0.5, adjust=False).mean() - B - 0 0.000000 - 1 0.666667 - 2 1.555556 - 3 1.555556 - 4 3.650794 - """ - - def __init__( - self, - obj, - com: float | None = None, - span: float | None = None, - halflife: float | None = None, - alpha: float | None = None, - min_periods: int | None = 0, - adjust: bool = True, - ignore_na: bool = False, - axis: int = 0, - times: str | np.ndarray | None = None, - method: Literal["single", "table"] = "single", - ): - if min_periods != 0: - raise NotImplementedError( - "min_periods is currently not supported." - ) - if ignore_na is not False: - raise NotImplementedError("ignore_na is currently not supported.") - if axis != 0: - warnings.warn( - "axis is deprecated with will be removed in a future version. " - "Transpose the DataFrame first instead." 
- ) - raise NotImplementedError("axis is currently not supported.") - if times is not None: - raise NotImplementedError("times is currently not supported.") - if method != "single": - raise NotImplementedError("method is currently not supported.") - self.obj = obj - self.adjust = adjust - self.com = get_center_of_mass(com, span, halflife, alpha) - - def online(self, engine: str = "numba", engine_kwargs=None): - """ - Return an ``OnlineExponentialMovingWindow`` object to calculate - exponentially moving window aggregations in an online method. - - Currently not supported. - """ - raise NotImplementedError("online is currently not supported.") - - def mean( - self, numeric_only: bool = False, engine=None, engine_kwargs=None - ): - """ - Calculate the ewm (exponential weighted moment) mean. - """ - if numeric_only is not False: - raise NotImplementedError( - "numeric_only is currently not supported." - ) - if engine is not None: - raise NotImplementedError( - "engine is non-functional and added for compatibility with pandas." - ) - if engine_kwargs is not None: - raise NotImplementedError( - "engine_kwargs is non-functional and added for compatibility with pandas." - ) - return self._apply_agg("ewma") - - def sum(self, numeric_only: bool = False, engine=None, engine_kwargs=None): - raise NotImplementedError("sum not yet supported.") - - def var(self, bias: bool = False, numeric_only: bool = False): - raise NotImplementedError("var not yet supported.") - - def std(self, bias: bool = False, numeric_only: bool = False): - raise NotImplementedError("std not yet supported.") - - def corr( - self, other, pairwise: bool | None = None, numeric_only: bool = False - ): - raise NotImplementedError("corr not yet supported.") - - def cov( - self, - other, - pairwise: bool | None = None, - bias: bool = False, - numeric_only: bool = False, - ): - raise NotImplementedError("cov not yet supported.") - - def _apply_agg_column( - self, source_column: ColumnBase, agg_name: str - ) -> ColumnBase: - if not is_numeric_dtype(source_column.dtype): - raise TypeError("No numeric types to aggregate") - - # libcudf ewm has special casing for nulls only - # and come what may with nans. It treats those nulls like - # pandas does nans in the same positions mathematically. - # as such we need to convert the nans to nulls before - # passing them in. 
- to_libcudf_column = source_column.astype("float64").nans_to_nulls() - - return scan( - agg_name, - to_libcudf_column, - True, - com=self.com, - adjust=self.adjust, - ) - - -def get_center_of_mass( - comass: float | None, - span: float | None, - halflife: float | None, - alpha: float | None, -) -> float: - valid_count = count_not_none(comass, span, halflife, alpha) - if valid_count > 1: - raise ValueError( - "comass, span, halflife, and alpha are mutually exclusive" - ) - - # Convert to center of mass; domain checks ensure 0 < alpha <= 1 - if comass is not None: - if comass < 0: - raise ValueError("comass must satisfy: comass >= 0") - elif span is not None: - if span < 1: - raise ValueError("span must satisfy: span >= 1") - comass = (span - 1) / 2 - elif halflife is not None: - if halflife <= 0: - raise ValueError("halflife must satisfy: halflife > 0") - decay = 1 - np.exp(np.log(0.5) / halflife) - comass = 1 / decay - 1 - elif alpha is not None: - if alpha <= 0 or alpha > 1: - raise ValueError("alpha must satisfy: 0 < alpha <= 1") - comass = (1 - alpha) / alpha - else: - raise ValueError("Must pass one of comass, span, halflife, or alpha") - - return float(comass) - - -def count_not_none(*args) -> int: - """ - Returns the count of arguments that are not None. - """ - return sum(x is not None for x in args) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py deleted file mode 100644 index 967edc2ab15..00000000000 --- a/python/cudf/cudf/core/window/rolling.py +++ /dev/null @@ -1,563 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION -from __future__ import annotations - -import warnings -from typing import TYPE_CHECKING - -import numba -import pandas as pd -from pandas.api.indexers import BaseIndexer - -import cudf -from cudf import _lib as libcudf -from cudf.api.types import is_integer, is_number -from cudf.core.buffer import acquire_spill_lock -from cudf.core.column.column import as_column -from cudf.core.mixins import Reducible -from cudf.utils import cudautils -from cudf.utils.utils import GetAttrGetItemMixin - -if TYPE_CHECKING: - from cudf.core.column.column import ColumnBase - - -class _RollingBase: - """ - Contains routines to apply a window aggregation to a column. - """ - - obj: cudf.DataFrame | cudf.Series - - def _apply_agg_column( - self, source_column: ColumnBase, agg_name: str - ) -> ColumnBase: - raise NotImplementedError - - def _apply_agg(self, agg_name: str) -> cudf.DataFrame | cudf.Series: - applied = ( - self._apply_agg_column(col, agg_name) for col in self.obj._columns - ) - return self.obj._from_data_like_self( - self.obj._data._from_columns_like_self(applied) - ) - - -class Rolling(GetAttrGetItemMixin, _RollingBase, Reducible): - """ - Rolling window calculations. - - Parameters - ---------- - window : int, offset or a BaseIndexer subclass - Size of the window, i.e., the number of observations used - to calculate the statistic. - For datetime indexes, an offset can be provided instead - of an int. The offset must be convertible to a timedelta. - As opposed to a fixed window size, each window will be - sized to accommodate observations within the time period - specified by the offset. - If a BaseIndexer subclass is passed, calculates the window - boundaries based on the defined ``get_window_bounds`` method. - min_periods : int, optional - The minimum number of observations in the window that are - required to be non-null, so that the result is non-null. 
- If not provided or ``None``, ``min_periods`` is equal to - the window size. - center : bool, optional - If ``True``, the result is set at the center of the window. - If ``False`` (default), the result is set at the right edge - of the window. - - Returns - ------- - ``Rolling`` object. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 4]) - - Rolling sum with window size 2. - - >>> print(a.rolling(2).sum()) - 0 - 1 3 - 2 5 - 3 - 4 - dtype: int64 - - Rolling sum with window size 2 and min_periods 1. - - >>> print(a.rolling(2, min_periods=1).sum()) - 0 1 - 1 3 - 2 5 - 3 3 - 4 4 - dtype: int64 - - Rolling count with window size 3. - - >>> print(a.rolling(3).count()) - 0 1 - 1 2 - 2 3 - 3 2 - 4 2 - dtype: int64 - - Rolling count with window size 3, but with the result set at the - center of the window. - - >>> print(a.rolling(3, center=True).count()) - 0 2 - 1 3 - 2 2 - 3 2 - 4 1 dtype: int64 - - Rolling max with variable window size specified by an offset; - only valid for datetime index. - - >>> a = cudf.Series( - ... [1, 9, 5, 4, np.nan, 1], - ... index=[ - ... pd.Timestamp('20190101 09:00:00'), - ... pd.Timestamp('20190101 09:00:01'), - ... pd.Timestamp('20190101 09:00:02'), - ... pd.Timestamp('20190101 09:00:04'), - ... pd.Timestamp('20190101 09:00:07'), - ... pd.Timestamp('20190101 09:00:08') - ... ] - ... ) - - >>> print(a.rolling('2s').max()) - 2019-01-01T09:00:00.000 1 - 2019-01-01T09:00:01.000 9 - 2019-01-01T09:00:02.000 9 - 2019-01-01T09:00:04.000 4 - 2019-01-01T09:00:07.000 - 2019-01-01T09:00:08.000 1 - dtype: int64 - - Apply custom function on the window with the *apply* method - - >>> import numpy as np - >>> import math - >>> b = cudf.Series([16, 25, 36, 49, 64, 81], dtype=np.float64) - >>> def some_func(A): - ... b = 0 - ... for a in A: - ... b = b + math.sqrt(a) - ... return b - ... - >>> print(b.rolling(3, min_periods=1).apply(some_func)) - 0 4.0 - 1 9.0 - 2 15.0 - 3 18.0 - 4 21.0 - 5 24.0 - dtype: float64 - - And this also works for window rolling set by an offset - - >>> import pandas as pd - >>> c = cudf.Series( - ... [16, 25, 36, 49, 64, 81], - ... index=[ - ... pd.Timestamp('20190101 09:00:00'), - ... pd.Timestamp('20190101 09:00:01'), - ... pd.Timestamp('20190101 09:00:02'), - ... pd.Timestamp('20190101 09:00:04'), - ... pd.Timestamp('20190101 09:00:07'), - ... pd.Timestamp('20190101 09:00:08') - ... ], - ... dtype=np.float64 - ... ) - >>> print(c.rolling('2s').apply(some_func)) - 2019-01-01T09:00:00.000 4.0 - 2019-01-01T09:00:01.000 9.0 - 2019-01-01T09:00:02.000 11.0 - 2019-01-01T09:00:04.000 7.0 - 2019-01-01T09:00:07.000 8.0 - 2019-01-01T09:00:08.000 17.0 - dtype: float64 - """ - - _PROTECTED_KEYS = frozenset(("obj",)) - - _time_window = False - - _VALID_REDUCTIONS = { - "sum", - "min", - "max", - "mean", - "var", - "std", - } - - def __init__( - self, - obj, - window, - min_periods=None, - center: bool = False, - win_type: str | None = None, - on=None, - axis=0, - closed: str | None = None, - step: int | None = None, - method: str = "single", - ): - self.obj = obj - self.window = window - self.min_periods = min_periods - self.center = center - self._normalize() - # for var & std only? - self.agg_params: dict[str, int] = {} - if axis != 0: - warnings.warn( - "axis is deprecated with will be removed in a future version. " - "Transpose the DataFrame first instead." 
- ) - raise NotImplementedError("axis != 0 is not supported yet.") - self.axis = axis - - if win_type is not None: - if win_type != "boxcar": - raise NotImplementedError( - "Only the default win_type 'boxcar' is currently supported" - ) - self.win_type = win_type - - if on is not None: - raise NotImplementedError("on is currently not supported") - if closed not in (None, "right"): - raise NotImplementedError("closed is currently not supported") - if step is not None: - raise NotImplementedError("step is currently not supported") - if method != "single": - raise NotImplementedError("method is currently not supported") - - def __getitem__(self, arg): - if isinstance(arg, tuple): - arg = list(arg) - return self.obj[arg].rolling( - window=self.window, - min_periods=self.min_periods, - center=self.center, - ) - - def _apply_agg_column(self, source_column, agg_name): - min_periods = self.min_periods or 1 - if isinstance(self.window, int): - preceding_window = None - following_window = None - window = self.window - elif isinstance(self.window, BaseIndexer): - start, end = self.window.get_window_bounds( - num_values=len(self.obj), - min_periods=self.min_periods, - center=self.center, - closed=None, - step=None, - ) - start = as_column(start, dtype="int32") - end = as_column(end, dtype="int32") - - idx = as_column(range(len(start))) - preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( - "int32" - ) - following_window = (end - idx - cudf.Scalar(1, "int32")).astype( - "int32" - ) - window = None - else: - preceding_window = as_column(self.window) - following_window = as_column( - 0, length=self.window.size, dtype=self.window.dtype - ) - window = None - - return libcudf.rolling.rolling( - source_column=source_column, - pre_column_window=preceding_window, - fwd_column_window=following_window, - window=window, - min_periods=min_periods, - center=self.center, - op=agg_name, - agg_params=self.agg_params, - ) - - def _reduce( - self, - op: str, - *args, - **kwargs, - ): - """Calculate the rolling {op}. - - Returns - ------- - Series or DataFrame - Return type is the same as the original object. - """ - return self._apply_agg(op) - - def var(self, ddof=1): - """Calculate the rolling variance. - - Parameters - ---------- - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of - elements. - - Returns - ------- - Series or DataFrame - Return type is the same as the original object. - """ - self.agg_params["ddof"] = ddof - return self._apply_agg("var") - - def std(self, ddof=1): - """Calculate the rolling standard deviation. - - Parameters - ---------- - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of - elements. - - Returns - ------- - Series or DataFrame - Return type is the same as the original object. - """ - self.agg_params["ddof"] = ddof - return self._apply_agg("std") - - def count(self): - """Calculate the rolling count of non NaN observations. - - Returns - ------- - Series or DataFrame - Return type is the same as the original object. - """ - return self._apply_agg("count") - - def apply(self, func, *args, **kwargs): - """ - Calculate the rolling custom aggregation function. - - Parameters - ---------- - func : function - A user defined function that takes an 1D array as input - args : tuple - unsupported. 
- kwargs - unsupported - - See Also - -------- - cudf.Series.apply: Apply an elementwise function to - transform the values in the Column. - - Notes - ----- - The supported Python features are listed in - - https://numba.readthedocs.io/en/stable/cuda/cudapysupported.html - - with these exceptions: - - * Math functions in `cmath` are not supported since `libcudf` does not - have complex number support and output of `cmath` functions are most - likely complex numbers. - - * These five functions in `math` are not supported since numba - generates multiple PTX functions from them: - - * math.sin() - * math.cos() - * math.tan() - * math.gamma() - * math.lgamma() - - * Series with string dtypes are not supported. - - * Global variables need to be re-defined explicitly inside - the udf, as numba considers them to be compile-time constants - and there is no known way to obtain value of the global variable. - - Examples - -------- - >>> import cudf - >>> def count_if_gt_3(window): - ... count = 0 - ... for i in window: - ... if i > 3: - ... count += 1 - ... return count - ... - >>> s = cudf.Series([0, 1.1, 5.8, 3.1, 6.2, 2.0, 1.5]) - >>> s.rolling(3, min_periods=1).apply(count_if_gt_3) - 0 0 - 1 0 - 2 1 - 3 2 - 4 3 - 5 2 - 6 1 - dtype: int64 - """ - has_nulls = False - if isinstance(self.obj, cudf.Series): - if self.obj._column.has_nulls(): - has_nulls = True - else: - for col in self.obj._data: - if self.obj[col].has_nulls: - has_nulls = True - if has_nulls: - raise NotImplementedError( - "Handling UDF with null values is not yet supported" - ) - return self._apply_agg(func) - - def _normalize(self): - """ - Normalize the *window* and *min_periods* args - - *window* can be: - - * An integer, in which case it is the window size. - If *min_periods* is unspecified, it is set to be equal to - the window size. - - * A timedelta offset, in which case it is used to generate - a column of window sizes to use for each element. - If *min_periods* is unspecified, it is set to 1. - Only valid for datetime index. - """ - window, min_periods = self.window, self.min_periods - if is_number(window): - # only allow integers - if not is_integer(window): - raise ValueError("window must be an integer") - if window <= 0: - raise ValueError("window cannot be zero or negative") - if self.min_periods is None: - min_periods = window - else: - if isinstance( - window, (numba.cuda.devicearray.DeviceNDArray, BaseIndexer) - ): - # window is a device_array of window sizes or BaseIndexer - self.window = window - self.min_periods = min_periods - return - - if not isinstance(self.obj.index, cudf.core.index.DatetimeIndex): - raise ValueError( - "window must be an integer for non datetime index" - ) - - self._time_window = True - - try: - window = pd.to_timedelta(window) - # to_timedelta will also convert np.arrays etc., - if not isinstance(window, pd.Timedelta): - raise ValueError - window = window.to_timedelta64() - except ValueError as e: - raise ValueError( - "window must be integer or convertible to a timedelta" - ) from e - if self.min_periods is None: - min_periods = 1 - - self.window = self._window_to_window_sizes(window) - self.min_periods = min_periods - - def _window_to_window_sizes(self, window): - """ - For non-fixed width windows, - convert the window argument into window sizes. 
- """ - if is_integer(window): - return window - else: - with acquire_spill_lock(): - return cudautils.window_sizes_from_offset( - self.obj.index._values.data_array_view(mode="write"), - window, - ) - - def __repr__(self): - return "{} [window={},min_periods={},center={}]".format( - self.__class__.__name__, self.window, self.min_periods, self.center - ) - - -class RollingGroupby(Rolling): - """ - Grouped rolling window calculation. - - See Also - -------- - cudf.core.window.Rolling - """ - - def __init__(self, groupby, window, min_periods=None, center=False): - sort_order = groupby.grouping.keys.argsort() - - # TODO: there may be overlap between the columns - # of `groupby.grouping.keys` and `groupby.obj`. - # As an optimization, avoid gathering those twice. - self._group_keys = groupby.grouping.keys.take(sort_order) - obj = groupby.obj.drop(columns=groupby.grouping._named_columns).take( - sort_order - ) - - gb_size = groupby.size().sort_index() - self._group_starts = ( - gb_size.cumsum().shift(1).fillna(0).repeat(gb_size) - ) - - super().__init__(obj, window, min_periods=min_periods, center=center) - - @acquire_spill_lock() - def _window_to_window_sizes(self, window): - if is_integer(window): - return cudautils.grouped_window_sizes_from_offset( - as_column(range(len(self.obj))).data_array_view(mode="read"), - self._group_starts, - window, - ) - else: - return cudautils.grouped_window_sizes_from_offset( - self.obj.index._values.data_array_view(mode="read"), - self._group_starts, - window, - ) - - def _apply_agg(self, agg_name): - index = cudf.MultiIndex._from_data( - {**self._group_keys._data, **self.obj.index._data} - ) - result = super()._apply_agg(agg_name) - result.index = index - return result diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py deleted file mode 100644 index dbabaacf6b5..00000000000 --- a/python/cudf/cudf/datasets.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd - -import cudf -from cudf._lib.transform import bools_to_mask - -__all__ = ["timeseries", "randomdata"] - - -# TODO: -# change default of name from category to str type when nvstring are merged -def timeseries( - start="2000-01-01", - end="2000-01-31", - freq="1s", - dtypes=None, - nulls_frequency=0, - seed=None, -): - """Create timeseries dataframe with random data - - Parameters - ---------- - start : datetime (or datetime-like string) - Start of time series - end : datetime (or datetime-like string) - End of time series - dtypes : dict - Mapping of column names to types. - Valid types include {float, int, str, 'category'}. - If none is provided, this defaults to - ``{"name": "category", "id": int, "x": float, "y": float}`` - freq : string - String like '2s' or '1H' or '12W' for the time series frequency - nulls_frequency : float - Fill the series with the specified proportion of nulls. Default is 0. 
- seed : int (optional) - Randomstate seed - - Examples - -------- - >>> import cudf - >>> gdf = cudf.datasets.timeseries() - >>> gdf.head() # doctest: +SKIP - timestamp id name x y - 2000-01-01 00:00:00 967 Jerry -0.031348 -0.040633 - 2000-01-01 00:00:01 1066 Michael -0.262136 0.307107 - 2000-01-01 00:00:02 988 Wendy -0.526331 0.128641 - 2000-01-01 00:00:03 1016 Yvonne 0.620456 0.767270 - 2000-01-01 00:00:04 998 Ursula 0.684902 -0.463278 - """ - if dtypes is None: - dtypes = {"name": "category", "id": int, "x": float, "y": float} - - index = pd.DatetimeIndex( - pd.date_range(start, end, freq=freq, name="timestamp") - ) - state = np.random.RandomState(seed) - columns = {k: make[dt](len(index), state) for k, dt in dtypes.items()} - df = pd.DataFrame(columns, index=index, columns=sorted(columns)) - if df.index[-1] == end: - df = df.iloc[:-1] - - gdf = cudf.from_pandas(df) - for col in gdf: - mask = state.choice( - [True, False], - size=len(index), - p=[1 - nulls_frequency, nulls_frequency], - ) - mask_buf = bools_to_mask(cudf.core.column.as_column(mask)) - masked_col = gdf[col]._column.set_mask(mask_buf) - gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index) - - return gdf - - -def randomdata(nrows=10, dtypes=None, seed=None): - """Create a dataframe with random data - - Parameters - ---------- - nrows : int - number of rows in the dataframe - dtypes : dict - Mapping of column names to types. - Valid types include {float, int, str, 'category'} - If none is provided, this defaults to - ``{"id": int, "x": float, "y": float}`` - seed : int (optional) - Randomstate seed - - Examples - -------- - >>> import cudf - >>> gdf = cudf.datasets.randomdata() - >>> cdf.head() # doctest: +SKIP - id x y - 0 1014 0.28361267466770146 -0.44274170661264334 - 1 1026 -0.9937981936047235 -0.09433464773262323 - 2 1038 -0.1266722796765325 0.20971126368240123 - 3 1002 0.9280495300010041 0.5137701393017848 - 4 976 0.9089527839187654 0.9881063385586304 - """ - if dtypes is None: - dtypes = {"id": int, "x": float, "y": float} - state = np.random.RandomState(seed) - columns = {k: make[dt](nrows, state) for k, dt in dtypes.items()} - df = pd.DataFrame(columns, columns=sorted(columns)) - return cudf.from_pandas(df) - - -def make_float(n, rstate): - return rstate.rand(n) * 2 - 1 - - -def make_int(n, rstate): - return rstate.poisson(1000, size=n) - - -names = [ - "Alice", - "Bob", - "Charlie", - "Dan", - "Edith", - "Frank", - "George", - "Hannah", - "Ingrid", - "Jerry", - "Kevin", - "Laura", - "Michael", - "Norbert", - "Oliver", - "Patricia", - "Quinn", - "Ray", - "Sarah", - "Tim", - "Ursula", - "Victor", - "Wendy", - "Xavier", - "Yvonne", - "Zelda", -] - - -def make_string(n, rstate): - return rstate.choice(names, size=n) - - -def make_categorical(n, rstate): - return pd.Categorical.from_codes( - rstate.randint(0, len(names), size=n), names - ) - - -def make_bool(n, rstate): - return rstate.choice([True, False], size=n) - - -make = { - float: make_float, - int: make_int, - str: make_string, - object: make_string, - "category": make_categorical, - bool: make_bool, -} diff --git a/python/cudf/cudf/errors.py b/python/cudf/cudf/errors.py deleted file mode 100644 index 973e5b990b2..00000000000 --- a/python/cudf/cudf/errors.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
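Both generators in the deleted datasets.py above route each requested dtype through the `make` dispatch table. A small usage sketch of the removed helper (illustrative; requires a GPU):

import cudf

# Exercise several entries of the `make` table: int, float, category and bool.
gdf = cudf.datasets.randomdata(
    nrows=5,
    dtypes={"id": int, "x": float, "name": "category", "flag": bool},
    seed=12,
)
print(gdf.dtypes)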
- - -class UnsupportedCUDAError(Exception): - pass - - -class MixedTypeError(TypeError): - pass diff --git a/python/cudf/cudf/io/__init__.py b/python/cudf/cudf/io/__init__.py deleted file mode 100644 index 6d4b44d5ecc..00000000000 --- a/python/cudf/cudf/io/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. -from cudf.io.avro import read_avro -from cudf.io.csv import read_csv, to_csv -from cudf.io.dlpack import from_dlpack -from cudf.io.feather import read_feather -from cudf.io.hdf import read_hdf -from cudf.io.json import read_json -from cudf.io.orc import read_orc, read_orc_metadata, to_orc -from cudf.io.parquet import ( - ParquetDatasetWriter, - merge_parquet_filemetadata, - read_parquet, - read_parquet_metadata, - write_to_dataset, -) -from cudf.io.text import read_text diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py deleted file mode 100644 index 964bd02b03e..00000000000 --- a/python/cudf/cudf/io/avro.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import cudf -from cudf import _lib as libcudf -from cudf.utils import ioutils - - -@ioutils.doc_read_avro() -def read_avro( - filepath_or_buffer, - columns=None, - skiprows=None, - num_rows=None, - storage_options=None, -): - """{docstring}""" - - filepath_or_buffer = ioutils.get_reader_filepath_or_buffer( - path_or_data=filepath_or_buffer, - storage_options=storage_options, - ) - filepath_or_buffer = ioutils._select_single_source( - filepath_or_buffer, "read_avro" - ) - - return cudf.DataFrame._from_data( - *libcudf.avro.read_avro( - filepath_or_buffer, columns, skiprows, num_rows - ) - ) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py deleted file mode 100644 index 3dc8915bfd1..00000000000 --- a/python/cudf/cudf/io/csv.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import warnings -from collections import abc -from io import BytesIO, StringIO - -import numpy as np - -import cudf -from cudf import _lib as libcudf -from cudf.api.types import is_scalar -from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type -from cudf.utils.performance_tracking import _performance_tracking - - -@_performance_tracking -@ioutils.doc_read_csv() -def read_csv( - filepath_or_buffer, - sep=",", - delimiter=None, - header="infer", - names=None, - index_col=None, - usecols=None, - prefix=None, - mangle_dupe_cols=True, - dtype=None, - true_values=None, - false_values=None, - skipinitialspace=False, - skiprows=0, - skipfooter=0, - nrows=None, - na_values=None, - keep_default_na=True, - na_filter=True, - skip_blank_lines=True, - parse_dates=None, - dayfirst=False, - compression="infer", - thousands=None, - decimal=".", - lineterminator="\n", - quotechar='"', - quoting=0, - doublequote=True, - comment=None, - delim_whitespace=False, - byte_range=None, - storage_options=None, - bytes_per_thread=None, -): - """{docstring}""" - - if delim_whitespace is not False: - warnings.warn( - "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. 
Use ``sep='\\s+'`` instead", - FutureWarning, - ) - - if bytes_per_thread is None: - bytes_per_thread = ioutils._BYTES_PER_THREAD_DEFAULT - - filepath_or_buffer = ioutils.get_reader_filepath_or_buffer( - path_or_data=filepath_or_buffer, - iotypes=(BytesIO, StringIO), - storage_options=storage_options, - bytes_per_thread=bytes_per_thread, - ) - filepath_or_buffer = ioutils._select_single_source( - filepath_or_buffer, "read_csv" - ) - - if na_values is not None and is_scalar(na_values): - na_values = [na_values] - - df = libcudf.csv.read_csv( - filepath_or_buffer, - lineterminator=lineterminator, - quotechar=quotechar, - quoting=quoting, - doublequote=doublequote, - header=header, - mangle_dupe_cols=mangle_dupe_cols, - usecols=usecols, - sep=sep, - delimiter=delimiter, - delim_whitespace=delim_whitespace, - skipinitialspace=skipinitialspace, - names=names, - dtype=dtype, - skipfooter=skipfooter, - skiprows=skiprows, - dayfirst=dayfirst, - compression=compression, - thousands=thousands, - decimal=decimal, - true_values=true_values, - false_values=false_values, - nrows=nrows, - byte_range=byte_range, - skip_blank_lines=skip_blank_lines, - parse_dates=parse_dates, - comment=comment, - na_values=na_values, - keep_default_na=keep_default_na, - na_filter=na_filter, - prefix=prefix, - index_col=index_col, - ) - - if dtype is None or isinstance(dtype, abc.Mapping): - # There exists some dtypes in the result columns that is inferred. - # Find them and map them to the default dtypes. - specified_dtypes = {} if dtype is None else dtype - unspecified_dtypes = { - name: dtype - for name, dtype in df._dtypes - if name not in specified_dtypes - } - default_dtypes = {} - - for name, dt in unspecified_dtypes.items(): - if dt == np.dtype("i1"): - # csv reader reads all null column as int8. - # The dtype should remain int8. - default_dtypes[name] = dt - else: - default_dtypes[name] = _maybe_convert_to_default_type(dt) - df = df.astype(default_dtypes) - - return df - - -@_performance_tracking -@ioutils.doc_to_csv() -def to_csv( - df, - path_or_buf=None, - sep=",", - na_rep="", - columns=None, - header=True, - index=True, - encoding=None, - compression=None, - lineterminator="\n", - chunksize=None, - storage_options=None, -): - """{docstring}""" - - if not isinstance(sep, str): - raise TypeError(f'"sep" must be string, not {type(sep).__name__}') - elif len(sep) > 1: - raise TypeError('"sep" must be a 1-character string') - - if encoding and encoding != "utf-8": - error_msg = ( - f"Encoding {encoding} is not supported. " - + "Currently, only utf-8 encoding is supported." - ) - raise NotImplementedError(error_msg) - - if compression: - error_msg = "Writing compressed csv is not currently supported in cudf" - raise NotImplementedError(error_msg) - - return_as_string = False - if path_or_buf is None: - path_or_buf = StringIO() - return_as_string = True - - path_or_buf = ioutils.get_writer_filepath_or_buffer( - path_or_data=path_or_buf, mode="w", storage_options=storage_options - ) - - if columns is not None: - try: - df = df[columns] - except KeyError: - raise NameError( - "Dataframe doesn't have the labels provided in columns" - ) - - for _, dtype in df._dtypes: - if isinstance(dtype, cudf.ListDtype): - raise NotImplementedError( - "Writing to csv format is not yet supported with " - "list columns." - ) - elif isinstance(dtype, cudf.StructDtype): - raise NotImplementedError( - "Writing to csv format is not yet supported with " - "Struct columns." 
- ) - - # TODO: Need to typecast categorical columns to the underlying - # categories dtype to write the actual data to csv. Remove this - # workaround once following issue is fixed: - # https://github.com/rapidsai/cudf/issues/6661 - if any( - isinstance(dtype, cudf.CategoricalDtype) for _, dtype in df._dtypes - ) or isinstance(df.index, cudf.CategoricalIndex): - df = df.copy(deep=False) - for col_name, col in df._column_labels_and_values: - if isinstance(col.dtype, cudf.CategoricalDtype): - df._data[col_name] = col.astype(col.categories.dtype) - - if isinstance(df.index, cudf.CategoricalIndex): - df.index = df.index.astype(df.index.categories.dtype) - - rows_per_chunk = chunksize if chunksize else len(df) - - if ioutils.is_fsspec_open_file(path_or_buf): - with path_or_buf as file_obj: - file_obj = ioutils.get_IOBase_writer(file_obj) - libcudf.csv.write_csv( - df, - path_or_buf=file_obj, - sep=sep, - na_rep=na_rep, - header=header, - lineterminator=lineterminator, - rows_per_chunk=rows_per_chunk, - index=index, - ) - else: - libcudf.csv.write_csv( - df, - path_or_buf=path_or_buf, - sep=sep, - na_rep=na_rep, - header=header, - lineterminator=lineterminator, - rows_per_chunk=rows_per_chunk, - index=index, - ) - - if return_as_string: - path_or_buf.seek(0) - return path_or_buf.read() diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py deleted file mode 100644 index fe8e446f9c0..00000000000 --- a/python/cudf/cudf/io/dlpack.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - - -import cudf -from cudf._lib import interop as libdlpack -from cudf.core.column import ColumnBase -from cudf.utils import ioutils - - -def from_dlpack(pycapsule_obj): - """Converts from a DLPack tensor to a cuDF object. - - DLPack is an open-source memory tensor structure: - `dmlc/dlpack `_. - - This function takes a PyCapsule object which contains a pointer to - a DLPack tensor as input, and returns a cuDF object. This function deep - copies the data in the DLPack tensor into a cuDF object. - - Parameters - ---------- - pycapsule_obj : PyCapsule - Input DLPack tensor pointer which is encapsulated in a PyCapsule - object. - - Returns - ------- - A cuDF DataFrame or Series depending on if the input DLPack tensor is 1D - or 2D. - - Notes - ----- - cuDF from_dlpack() assumes column-major (Fortran order) input. If the input - tensor is row-major, transpose it before passing it to this function. - """ - - columns = libdlpack.from_dlpack(pycapsule_obj) - data = dict(enumerate(columns)) - - if len(columns) == 1: - return cudf.Series._from_data(data) - else: - return cudf.DataFrame._from_data(data) - - -@ioutils.doc_to_dlpack() -def to_dlpack(cudf_obj): - """Converts a cuDF object to a DLPack tensor. - - DLPack is an open-source memory tensor structure: - `dmlc/dlpack `_. - - This function takes a cuDF object as input, and returns a PyCapsule object - which contains a pointer to DLPack tensor. This function deep copies - the data in the cuDF object into the DLPack tensor. - - Parameters - ---------- - cudf_obj : cuDF Object - Input cuDF object. - - Returns - ------- - A DLPack tensor pointer which is encapsulated in a PyCapsule object. - - Notes - ----- - cuDF to_dlpack() produces column-major (Fortran order) output. If the - output tensor needs to be row major, transpose the output of this function. 
- """ - if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)): - gdf = cudf_obj - elif isinstance(cudf_obj, ColumnBase): - gdf = cudf.Series._from_column(cudf_obj) - else: - raise TypeError( - f"Input of type {type(cudf_obj)} cannot be converted " - "to DLPack tensor" - ) - - if any( - not cudf.api.types._is_non_decimal_numeric_dtype(dtype) - for _, dtype in gdf._dtypes - ): - raise TypeError("non-numeric data not yet supported") - - dtype = cudf.utils.dtypes.find_common_type( - [dtype for _, dtype in gdf._dtypes] - ) - gdf = gdf.astype(dtype) - - return libdlpack.to_dlpack([*gdf._columns]) diff --git a/python/cudf/cudf/io/feather.py b/python/cudf/cudf/io/feather.py deleted file mode 100644 index 3ba16c3261f..00000000000 --- a/python/cudf/cudf/io/feather.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. - -import warnings - -from pyarrow import feather - -from cudf.core.dataframe import DataFrame -from cudf.utils import ioutils - - -@ioutils.doc_read_feather() -def read_feather(path, *args, **kwargs): - """{docstring}""" - - warnings.warn( - "Using CPU via PyArrow to read feather dataset, this may " - "be GPU accelerated in the future" - ) - pa_table = feather.read_table(path, *args, **kwargs) - return DataFrame.from_arrow(pa_table) - - -@ioutils.doc_to_feather() -def to_feather(df, path, *args, **kwargs): - """{docstring}""" - warnings.warn( - "Using CPU via PyArrow to write Feather dataset, this may " - "be GPU accelerated in the future" - ) - # Feather doesn't support using an index - pa_table = df.to_arrow(preserve_index=False) - feather.write_feather(pa_table, path, *args, **kwargs) diff --git a/python/cudf/cudf/io/hdf.py b/python/cudf/cudf/io/hdf.py deleted file mode 100644 index 39f62a19f90..00000000000 --- a/python/cudf/cudf/io/hdf.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import warnings - -import pandas as pd - -import cudf -from cudf.utils import ioutils - - -@ioutils.doc_read_hdf() -def read_hdf(path_or_buf, *args, **kwargs): - """{docstring}""" - warnings.warn( - "Using CPU via Pandas to read HDF dataset, this may " - "be GPU accelerated in the future" - ) - pd_value = pd.read_hdf(path_or_buf, *args, **kwargs) - return cudf.from_pandas(pd_value) - - -@ioutils.doc_to_hdf() -def to_hdf(path_or_buf, key, value, *args, **kwargs): - """{docstring}""" - warnings.warn( - "Using CPU via Pandas to write HDF dataset, this may " - "be GPU accelerated in the future" - ) - pd_value = value.to_pandas() - pd_value.to_hdf(path_or_buf, key=key, *args, **kwargs) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py deleted file mode 100644 index d86db656fd0..00000000000 --- a/python/cudf/cudf/io/json.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
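A round-trip sketch through the DLPack helpers defined in the deleted io/dlpack.py above (illustrative; requires a GPU, and note that a DLPack capsule may only be consumed once):

import cudf
from cudf.io.dlpack import from_dlpack, to_dlpack

s = cudf.Series([1.0, 2.0, 3.0])
capsule = to_dlpack(s)      # deep-copies the column into a DLPack tensor
s2 = from_dlpack(capsule)   # deep-copies it back; a 1D tensor yields a Series
assert (s == s2).all()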
- -import warnings -from collections import abc -from io import BytesIO, StringIO - -import numpy as np -import pandas as pd - -import cudf -from cudf._lib import json as libjson -from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type - - -@ioutils.doc_read_json() -def read_json( - path_or_buf, - engine="auto", - orient=None, - dtype=None, - lines=False, - compression="infer", - byte_range=None, - keep_quotes=False, - storage_options=None, - mixed_types_as_string=False, - prune_columns=False, - on_bad_lines="error", - *args, - **kwargs, -): - """{docstring}""" - - if dtype is not None and not isinstance(dtype, (abc.Mapping, bool)): - raise TypeError( - "'dtype' parameter only supports " - "a dict of column names and types as key-value pairs, " - f"or a bool, or None. Got {type(dtype)}" - ) - - if engine == "auto": - engine = "cudf" if lines else "pandas" - if engine != "cudf" and keep_quotes: - raise ValueError( - "keep_quotes='True' is supported only with engine='cudf'" - ) - - if engine == "cudf": - if dtype is None: - dtype = True - - if kwargs: - raise ValueError( - "cudf engine doesn't support the " - f"following keyword arguments: {list(kwargs.keys())}" - ) - if args: - raise ValueError( - "cudf engine doesn't support the " - f"following positional arguments: {list(args)}" - ) - - filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer( - path_or_buf, - iotypes=(BytesIO, StringIO), - allow_raw_text_input=True, - storage_options=storage_options, - warn_on_raw_text_input=True, - warn_meta=("json", "read_json"), - expand_dir_pattern="*.json", - ) - - df = libjson.read_json( - filepaths_or_buffers=filepaths_or_buffers, - dtype=dtype, - lines=lines, - compression=compression, - byte_range=byte_range, - keep_quotes=keep_quotes, - mixed_types_as_string=mixed_types_as_string, - prune_columns=prune_columns, - on_bad_lines=on_bad_lines, - ) - else: - warnings.warn( - "Using CPU via Pandas to read JSON dataset, this may " - "be GPU accelerated in the future" - ) - - filepath_or_buffer = ioutils.get_reader_filepath_or_buffer( - path_or_data=path_or_buf, - iotypes=(BytesIO, StringIO), - allow_raw_text_input=True, - storage_options=storage_options, - ) - filepath_or_buffer = ioutils._select_single_source( - filepath_or_buffer, "read_json (via pandas)" - ) - - pd_value = pd.read_json( - filepath_or_buffer, - lines=lines, - dtype=dtype, - compression=compression, - storage_options=storage_options, - orient=orient, - *args, - **kwargs, - ) - df = cudf.from_pandas(pd_value) - - if dtype is None: - dtype = True - - if dtype is True or isinstance(dtype, abc.Mapping): - # There exists some dtypes in the result columns that is inferred. - # Find them and map them to the default dtypes. - specified_dtypes = {} if dtype is True else dtype - unspecified_dtypes = { - name: dtype - for name, dtype in df._dtypes - if name not in specified_dtypes - } - default_dtypes = {} - - for name, dt in unspecified_dtypes.items(): - if dt == np.dtype("i1"): - # csv reader reads all null column as int8. - # The dtype should remain int8. 
- default_dtypes[name] = dt - else: - default_dtypes[name] = _maybe_convert_to_default_type(dt) - df = df.astype(default_dtypes) - - return df - - -def maybe_return_nullable_pd_obj(cudf_obj): - try: - return cudf_obj.to_pandas(nullable=True) - except NotImplementedError: - return cudf_obj.to_pandas(nullable=False) - - -@ioutils.doc_to_json() -def to_json( - cudf_val, - path_or_buf=None, - engine="auto", - orient=None, - storage_options=None, - *args, - **kwargs, -): - """{docstring}""" - - if engine == "auto": - engine = "pandas" - - if engine == "cudf": - if orient not in {"records", None}: - raise ValueError( - f"Only the `orient='records'` is supported for JSON writer" - f" with `engine='cudf'`, got {orient}" - ) - - if path_or_buf is None: - path_or_buf = StringIO() - return_as_string = True - else: - path_or_buf = ioutils.get_writer_filepath_or_buffer( - path_or_data=path_or_buf, - mode="w", - storage_options=storage_options, - ) - return_as_string = False - - if ioutils.is_fsspec_open_file(path_or_buf): - with path_or_buf as file_obj: - file_obj = ioutils.get_IOBase_writer(file_obj) - libjson.write_json( - cudf_val, path_or_buf=file_obj, *args, **kwargs - ) - else: - libjson.write_json( - cudf_val, path_or_buf=path_or_buf, *args, **kwargs - ) - - if return_as_string: - path_or_buf.seek(0) - return path_or_buf.read() - elif engine == "pandas": - warnings.warn("Using CPU via Pandas to write JSON dataset") - if isinstance(cudf_val, cudf.DataFrame): - pd_data = { - col: maybe_return_nullable_pd_obj(series) - for col, series in cudf_val.items() - } - pd_value = pd.DataFrame(pd_data) - else: - pd_value = maybe_return_nullable_pd_obj(cudf_val) - return pd_value.to_json( - path_or_buf, - orient=orient, - storage_options=storage_options, - *args, - **kwargs, - ) - else: - raise ValueError( - f"`engine` only support {{'auto', 'cudf', 'pandas'}}, " - f"got: {engine}" - ) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py deleted file mode 100644 index 68b60809bb9..00000000000 --- a/python/cudf/cudf/io/orc.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
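As the dispatch at the top of the deleted read_json above shows, `engine="auto"` resolves to the cudf reader only when `lines=True` and otherwise falls back to pandas. A minimal usage sketch (illustrative; passing raw JSON text is accepted but emits a warning per `warn_on_raw_text_input`):

import cudf

# JSON-lines input: engine="auto" resolves to the GPU reader.
data = '{"a": 1, "b": "x"}\n{"a": 2, "b": "y"}\n'
df = cudf.read_json(data, lines=True)

# The same call with lines=False would route through pandas instead.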
- -import datetime -import warnings - -import pyarrow as pa - -import cudf -from cudf._lib import orc as liborc -from cudf.api.types import is_list_like -from cudf.utils import ioutils - - -def _make_empty_df(filepath_or_buffer, columns): - from pyarrow import orc - - orc_file = orc.ORCFile(filepath_or_buffer) - schema = orc_file.schema - col_names = schema.names if columns is None else columns - return cudf.DataFrame._from_data( - data={ - col_name: cudf.core.column.column_empty( - row_count=0, - dtype=schema.field(col_name).type.to_pandas_dtype(), - ) - for col_name in col_names - } - ) - - -def _parse_column_statistics(cs, column_statistics_blob): - # Initialize stats to return and parse stats blob - column_statistics = {} - cs.ParseFromString(column_statistics_blob) - - # Load from parsed stats blob into stats to return - if cs.HasField("numberOfValues"): - column_statistics["number_of_values"] = cs.numberOfValues - if cs.HasField("hasNull"): - column_statistics["has_null"] = cs.hasNull - - if cs.HasField("intStatistics"): - column_statistics["minimum"] = ( - cs.intStatistics.minimum - if cs.intStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.intStatistics.maximum - if cs.intStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = ( - cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None - ) - - elif cs.HasField("doubleStatistics"): - column_statistics["minimum"] = ( - cs.doubleStatistics.minimum - if cs.doubleStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.doubleStatistics.maximum - if cs.doubleStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = ( - cs.doubleStatistics.sum - if cs.doubleStatistics.HasField("sum") - else None - ) - - elif cs.HasField("stringStatistics"): - column_statistics["minimum"] = ( - cs.stringStatistics.minimum - if cs.stringStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.stringStatistics.maximum - if cs.stringStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = cs.stringStatistics.sum - - elif cs.HasField("bucketStatistics"): - column_statistics["true_count"] = cs.bucketStatistics.count[0] - column_statistics["false_count"] = ( - column_statistics["number_of_values"] - - column_statistics["true_count"] - ) - - elif cs.HasField("decimalStatistics"): - column_statistics["minimum"] = ( - cs.decimalStatistics.minimum - if cs.decimalStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - cs.decimalStatistics.maximum - if cs.decimalStatistics.HasField("maximum") - else None - ) - column_statistics["sum"] = cs.decimalStatistics.sum - - elif cs.HasField("dateStatistics"): - column_statistics["minimum"] = ( - datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.minimum).total_seconds(), - datetime.timezone.utc, - ) - if cs.dateStatistics.HasField("minimum") - else None - ) - column_statistics["maximum"] = ( - datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.maximum).total_seconds(), - datetime.timezone.utc, - ) - if cs.dateStatistics.HasField("maximum") - else None - ) - - elif cs.HasField("timestampStatistics"): - # Before ORC-135, the local timezone offset was included and they were - # stored as minimum and maximum. After ORC-135, the timestamp is - # adjusted to UTC before being converted to milliseconds and stored - # in minimumUtc and maximumUtc. 
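# Tiny worked example of the timestampStatistics conversion above:
# minimumUtc/maximumUtc are stored as milliseconds since the epoch, so the
# parser divides by 1000 and attaches a UTC timezone. The value is arbitrary.
import datetime

minimum_utc_ms = 1_700_000_000_123
parsed = datetime.datetime.fromtimestamp(minimum_utc_ms / 1000, datetime.timezone.utc)
print(parsed)  # 2023-11-14 22:13:20.123000+00:00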
- # TODO: Support minimum and maximum by reading writer's local timezone - if cs.timestampStatistics.HasField( - "minimumUtc" - ) and cs.timestampStatistics.HasField("maximumUtc"): - column_statistics["minimum"] = datetime.datetime.fromtimestamp( - cs.timestampStatistics.minimumUtc / 1000, datetime.timezone.utc - ) - column_statistics["maximum"] = datetime.datetime.fromtimestamp( - cs.timestampStatistics.maximumUtc / 1000, datetime.timezone.utc - ) - - elif cs.HasField("binaryStatistics"): - column_statistics["sum"] = cs.binaryStatistics.sum - - return column_statistics - - -@ioutils.doc_read_orc_metadata() -def read_orc_metadata(path): - """{docstring}""" - from pyarrow import orc - - orc_file = orc.ORCFile(path) - - num_rows = orc_file.nrows - num_stripes = orc_file.nstripes - col_names = orc_file.schema.names - - return num_rows, num_stripes, col_names - - -@ioutils.doc_read_orc_statistics() -def read_orc_statistics( - filepaths_or_buffers, - columns=None, - **kwargs, -): - """{docstring}""" - - files_statistics = [] - stripes_statistics = [] - for source in filepaths_or_buffers: - path_or_buf = ioutils.get_reader_filepath_or_buffer( - path_or_data=source, **kwargs - ) - path_or_buf = ioutils._select_single_source( - path_or_buf, "read_orc_statistics" - ) - ( - column_names, - parsed_file_statistics, - parsed_stripes_statistics, - ) = liborc.read_parsed_orc_statistics(path_or_buf) - - # Parse file statistics - file_statistics = { - column_name: column_stats - for column_name, column_stats in zip( - column_names, parsed_file_statistics - ) - if columns is None or column_name in columns - } - files_statistics.append(file_statistics) - - # Parse stripe statistics - for parsed_stripe_statistics in parsed_stripes_statistics: - stripe_statistics = { - column_name: column_stats - for column_name, column_stats in zip( - column_names, parsed_stripe_statistics - ) - if columns is None or column_name in columns - } - if any( - not parsed_statistics - for parsed_statistics in stripe_statistics.values() - ): - continue - else: - stripes_statistics.append(stripe_statistics) - - return files_statistics, stripes_statistics - - -def _filter_stripes( - filters, filepath_or_buffer, stripes=None, skip_rows=None, num_rows=None -): - # Multiple sources are passed as a list. If a single source is passed, - # wrap it in a list for unified processing downstream. 
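# Illustrative sketch of the two helpers above. "example.orc" is a
# placeholder path, not a file shipped with the repository.
from cudf.io.orc import read_orc_metadata, read_orc_statistics

num_rows, num_stripes, col_names = read_orc_metadata("example.orc")
file_stats, stripe_stats = read_orc_statistics(["example.orc"])
print(num_rows, num_stripes, col_names)
print(list(file_stats[0]))  # column names that carry file-level statistics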
- if not is_list_like(filepath_or_buffer): - filepath_or_buffer = [filepath_or_buffer] - - # Prepare filters - filters = ioutils._prepare_filters(filters) - - # Get columns relevant to filtering - columns_in_predicate = [ - col for conjunction in filters for (col, op, val) in conjunction - ] - - # Read and parse file-level and stripe-level statistics - file_statistics, stripes_statistics = read_orc_statistics( - filepath_or_buffer, columns_in_predicate - ) - - file_stripe_map = [] - for file_stat in file_statistics: - # Filter using file-level statistics - if not ioutils._apply_filters(filters, file_stat): - continue - - # Filter using stripe-level statistics - selected_stripes = [] - num_rows_scanned = 0 - for i, stripe_statistics in enumerate(stripes_statistics): - num_rows_before_stripe = num_rows_scanned - num_rows_scanned += next( - iter(stripe_statistics.values()) - ).number_of_values - if stripes is not None and i not in stripes: - continue - if skip_rows is not None and num_rows_scanned <= skip_rows: - continue - else: - skip_rows = 0 - if ( - skip_rows is not None - and num_rows is not None - and num_rows_before_stripe >= skip_rows + num_rows - ): - continue - if ioutils._apply_filters(filters, stripe_statistics): - selected_stripes.append(i) - - file_stripe_map.append(selected_stripes) - - return file_stripe_map - - -@ioutils.doc_read_orc() -def read_orc( - filepath_or_buffer, - engine="cudf", - columns=None, - filters=None, - stripes=None, - skiprows=None, - num_rows=None, - use_index=True, - timestamp_type=None, - storage_options=None, - bytes_per_thread=None, -): - """{docstring}""" - from cudf import DataFrame - - if skiprows is not None: - # Do not remove until cuIO team approves its removal. - warnings.warn( - "skiprows is deprecated and will be removed.", - FutureWarning, - ) - - if num_rows is not None: - # Do not remove until cuIO team approves its removal. - warnings.warn( - "num_rows is deprecated and will be removed.", - FutureWarning, - ) - - # Multiple sources are passed as a list. If a single source is passed, - # wrap it in a list for unified processing downstream. - if not is_list_like(filepath_or_buffer): - filepath_or_buffer = [filepath_or_buffer] - - # Each source must have a correlating stripe list. 
If a single stripe list - # is provided rather than a list of list of stripes then extrapolate that - # stripe list across all input sources - if stripes is not None: - if any(not isinstance(stripe, list) for stripe in stripes): - stripes = [stripes] - - # Must ensure a stripe for each source is specified, unless None - if not len(stripes) == len(filepath_or_buffer): - raise ValueError( - "A list of stripes must be provided for each input source" - ) - - filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer( - path_or_data=filepath_or_buffer, - storage_options=storage_options, - bytes_per_thread=bytes_per_thread, - expand_dir_pattern="*.orc", - ) - - if filters is not None: - selected_stripes = _filter_stripes( - filters, filepaths_or_buffers, stripes, skiprows, num_rows - ) - - # Return empty if everything was filtered - if len(selected_stripes) == 0: - return _make_empty_df(filepaths_or_buffers[0], columns) - else: - stripes = selected_stripes - - if engine == "cudf": - return DataFrame._from_data( - *liborc.read_orc( - filepaths_or_buffers, - columns, - stripes, - skiprows, - num_rows, - use_index, - timestamp_type, - ) - ) - else: - from pyarrow import orc - - def read_orc_stripe(orc_file, stripe, columns): - pa_table = orc_file.read_stripe(stripe, columns) - if isinstance(pa_table, pa.RecordBatch): - pa_table = pa.Table.from_batches([pa_table]) - return pa_table - - warnings.warn("Using CPU via PyArrow to read ORC dataset.") - if len(filepath_or_buffer) > 1: - raise NotImplementedError( - "Using CPU via PyArrow only supports a single a " - "single input source" - ) - - orc_file = orc.ORCFile(filepath_or_buffer[0]) - if stripes is not None and len(stripes) > 0: - for stripe_source_file in stripes: - pa_tables = [ - read_orc_stripe(orc_file, i, columns) - for i in stripe_source_file - ] - pa_table = pa.concat_tables(pa_tables) - else: - pa_table = orc_file.read(columns=columns) - df = cudf.DataFrame.from_arrow(pa_table) - - return df - - -@ioutils.doc_to_orc() -def to_orc( - df, - fname, - compression="snappy", - statistics="ROWGROUP", - stripe_size_bytes=None, - stripe_size_rows=None, - row_index_stride=None, - cols_as_map_type=None, - storage_options=None, - index=None, -): - """{docstring}""" - - for _, dtype in df._dtypes: - if isinstance(dtype, cudf.CategoricalDtype): - raise NotImplementedError( - "Writing to ORC format is not yet supported with " - "Categorical columns." - ) - - if isinstance(df.index, cudf.CategoricalIndex): - raise NotImplementedError( - "Writing to ORC format is not yet supported with " - "Categorical columns." 
- ) - - if cols_as_map_type is not None and not isinstance(cols_as_map_type, list): - raise TypeError("cols_as_map_type must be a list of column names.") - - path_or_buf = ioutils.get_writer_filepath_or_buffer( - path_or_data=fname, mode="wb", storage_options=storage_options - ) - if ioutils.is_fsspec_open_file(path_or_buf): - with path_or_buf as file_obj: - file_obj = ioutils.get_IOBase_writer(file_obj) - liborc.write_orc( - df, - file_obj, - compression, - statistics, - stripe_size_bytes, - stripe_size_rows, - row_index_stride, - cols_as_map_type, - index, - ) - else: - liborc.write_orc( - df, - path_or_buf, - compression, - statistics, - stripe_size_bytes, - stripe_size_rows, - row_index_stride, - cols_as_map_type, - index, - ) - - -ORCWriter = liborc.ORCWriter diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py deleted file mode 100644 index ce99f98b559..00000000000 --- a/python/cudf/cudf/io/parquet.py +++ /dev/null @@ -1,1560 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import itertools -import math -import operator -import shutil -import tempfile -import warnings -from collections import defaultdict -from contextlib import ExitStack -from functools import partial, reduce -from typing import TYPE_CHECKING -from uuid import uuid4 - -import numpy as np -import pandas as pd -from pyarrow import dataset as ds - -import cudf -from cudf._lib import parquet as libparquet -from cudf.api.types import is_list_like -from cudf.core.column import as_column, column_empty -from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes -from cudf.utils import ioutils -from cudf.utils.performance_tracking import _performance_tracking - -if TYPE_CHECKING: - from collections.abc import Callable - - -BYTE_SIZES = { - "kb": 1000, - "mb": 1000000, - "gb": 1000000000, - "tb": 1000000000000, - "pb": 1000000000000000, - "kib": 1024, - "mib": 1048576, - "gib": 1073741824, - "tib": 1099511627776, - "pib": 1125899906842624, - "b": 1, - "": 1, - "k": 1000, - "m": 1000000, - "g": 1000000000, - "t": 1000000000000, - "p": 1000000000000000, - "ki": 1024, - "mi": 1048576, - "gi": 1073741824, - "ti": 1099511627776, - "pi": 1125899906842624, -} - - -@_performance_tracking -def _write_parquet( - df, - paths, - compression="snappy", - index=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - max_dictionary_size=None, - partitions_info=None, - storage_options=None, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, - write_arrow_schema=True, -): - if is_list_like(paths) and len(paths) > 1: - if partitions_info is None: - ValueError("partition info is required for multiple paths") - elif not is_list_like(partitions_info): - ValueError("partition info must be list-like for multiple paths") - elif not len(paths) == len(partitions_info): - ValueError("partitions_info and paths must be of same size") - if is_list_like(partitions_info) and len(partitions_info) > 1: - if not is_list_like(paths): - ValueError("paths must be list-like when partitions_info provided") - - paths_or_bufs = [ - ioutils.get_writer_filepath_or_buffer( - path_or_data=path, mode="wb", storage_options=storage_options - ) - for path in paths - ] - common_args = { - "index": index, - "compression": 
compression, - "statistics": statistics, - "metadata_file_path": metadata_file_path, - "int96_timestamps": int96_timestamps, - "row_group_size_bytes": row_group_size_bytes, - "row_group_size_rows": row_group_size_rows, - "max_page_size_bytes": max_page_size_bytes, - "max_page_size_rows": max_page_size_rows, - "max_dictionary_size": max_dictionary_size, - "partitions_info": partitions_info, - "force_nullable_schema": force_nullable_schema, - "header_version": header_version, - "use_dictionary": use_dictionary, - "skip_compression": skip_compression, - "column_encoding": column_encoding, - "column_type_length": column_type_length, - "output_as_binary": output_as_binary, - "write_arrow_schema": write_arrow_schema, - } - if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): - with ExitStack() as stack: - fsspec_objs = [stack.enter_context(file) for file in paths_or_bufs] - file_objs = [ - ioutils.get_IOBase_writer(file_obj) for file_obj in fsspec_objs - ] - write_parquet_res = libparquet.write_parquet( - df, filepaths_or_buffers=file_objs, **common_args - ) - else: - write_parquet_res = libparquet.write_parquet( - df, filepaths_or_buffers=paths_or_bufs, **common_args - ) - - return write_parquet_res - - -# Logic chosen to match: https://arrow.apache.org/ -# docs/_modules/pyarrow/parquet.html#write_to_dataset -@_performance_tracking -def write_to_dataset( - df, - root_path, - compression="snappy", - filename=None, - partition_cols=None, - fs=None, - preserve_index=False, - return_metadata=False, - statistics="ROWGROUP", - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - storage_options=None, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, - store_schema=False, -): - """Wraps `to_parquet` to write partitioned Parquet datasets. - For each combination of partition group and value, - subdirectories are created as follows: - - .. code-block:: bash - - root_dir/ - group=value1 - .parquet - ... - group=valueN - .parquet - - Parameters - ---------- - df : cudf.DataFrame - root_path : string, - The root directory of the dataset - compression : {'snappy', 'ZSTD', None}, default 'snappy' - Name of the compression to use. Use ``None`` for no compression. - filename : string, default None - The file name to use (within each partition directory). If None, - a random uuid4 hex string will be used for each file name. - partition_cols : list, - Column names by which to partition the dataset. - Columns are partitioned in the order they are given. - fs : FileSystem, default None - If nothing passed, paths assumed to be found in the local on-disk - filesystem - preserve_index : bool, default False - Preserve index values in each parquet file. - return_metadata : bool, default False - Return parquet metadata for written data. Returned metadata will - include the file-path metadata (relative to `root_path`). - int96_timestamps : bool, default False - If ``True``, write timestamps in int96 format. This will convert - timestamps from timestamp[ns], timestamp[ms], timestamp[s], and - timestamp[us] to the int96 format, which is the number of Julian - days and the number of nanoseconds since midnight of 1970-01-01. - If ``False``, timestamps will not be altered. - row_group_size_bytes: integer or None, default None - Maximum size of each stripe of the output. 
- If None, no limit on row group stripe size will be used. - row_group_size_rows: integer or None, default None - Maximum number of rows of each stripe of the output. - If None, 1000000 will be used. - max_page_size_bytes: integer or None, default None - Maximum uncompressed size of each page of the output. - If None, 524288 (512KB) will be used. - max_page_size_rows: integer or None, default None - Maximum number of rows of each page of the output. - If None, 20000 will be used. - storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the - key-value pairs are forwarded to ``urllib.request.Request`` as - header options. For other URLs (e.g. starting with "s3://", and - "gcs://") the key-value pairs are forwarded to ``fsspec.open``. - Please see ``fsspec`` and ``urllib`` for more details. - force_nullable_schema : bool, default False. - If True, writes all columns as `null` in schema. - If False, columns are written as `null` if they contain null values, - otherwise as `not null`. - header_version : {{'1.0', '2.0'}}, default "1.0" - Controls whether to use version 1.0 or version 2.0 page headers when - encoding. Version 1.0 is more portable, but version 2.0 enables the - use of newer encoding schemes. - force_nullable_schema : bool, default False. - If True, writes all columns as `null` in schema. - If False, columns are written as `null` if they contain null values, - otherwise as `not null`. - skip_compression : set, optional, default None - If a column name is present in the set, that column will not be compressed, - regardless of the ``compression`` setting. - column_encoding : dict, optional, default None - Sets the page encoding to use on a per-column basis. The key is a column - name, and the value is one of: 'PLAIN', 'DICTIONARY', 'DELTA_BINARY_PACKED', - 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY', 'BYTE_STREAM_SPLIT', or - 'USE_DEFAULT'. - column_type_length : dict, optional, default None - Specifies the width in bytes of ``FIXED_LEN_BYTE_ARRAY`` column elements. - The key is a column name and the value is an integer. The named column - will be output as unannotated binary (i.e. the column will behave as if - ``output_as_binary`` was set). - output_as_binary : set, optional, default None - If a column name is present in the set, that column will be output as - unannotated binary, rather than the default 'UTF-8'. - store_schema : bool, default False - If ``True``, enable computing and writing arrow schema to Parquet - file footer's key-value metadata section for faithful round-tripping. 
- """ - - fs = ioutils._ensure_filesystem(fs, root_path, storage_options) - fs.mkdirs(root_path, exist_ok=True) - - if partition_cols is not None and len(partition_cols) > 0: - ( - full_paths, - metadata_file_paths, - grouped_df, - part_offsets, - _, - ) = _get_partitioned( - df=df, - root_path=root_path, - partition_cols=partition_cols, - filename=filename, - fs=fs, - preserve_index=preserve_index, - storage_options=storage_options, - ) - metadata_file_path = metadata_file_paths if return_metadata else None - metadata = to_parquet( - df=grouped_df, - path=full_paths, - compression=compression, - index=preserve_index, - partition_offsets=part_offsets, - storage_options=storage_options, - metadata_file_path=metadata_file_path, - statistics=statistics, - int96_timestamps=int96_timestamps, - row_group_size_bytes=row_group_size_bytes, - row_group_size_rows=row_group_size_rows, - max_page_size_bytes=max_page_size_bytes, - max_page_size_rows=max_page_size_rows, - force_nullable_schema=force_nullable_schema, - header_version=header_version, - use_dictionary=use_dictionary, - skip_compression=skip_compression, - column_encoding=column_encoding, - column_type_length=column_type_length, - output_as_binary=output_as_binary, - store_schema=store_schema, - ) - - else: - filename = filename or _generate_filename() - full_path = fs.sep.join([root_path, filename]) - - metadata_file_path = filename if return_metadata else None - - metadata = df.to_parquet( - path=full_path, - compression=compression, - index=preserve_index, - storage_options=storage_options, - metadata_file_path=metadata_file_path, - statistics=statistics, - int96_timestamps=int96_timestamps, - row_group_size_bytes=row_group_size_bytes, - row_group_size_rows=row_group_size_rows, - max_page_size_bytes=max_page_size_bytes, - max_page_size_rows=max_page_size_rows, - force_nullable_schema=force_nullable_schema, - header_version=header_version, - use_dictionary=use_dictionary, - skip_compression=skip_compression, - column_encoding=column_encoding, - column_type_length=column_type_length, - output_as_binary=output_as_binary, - store_schema=store_schema, - ) - - return metadata - - -@ioutils.doc_read_parquet_metadata() -@_performance_tracking -def read_parquet_metadata(filepath_or_buffer): - """{docstring}""" - - # List of filepaths or buffers - filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer( - path_or_data=filepath_or_buffer, - bytes_per_thread=None, - ) - - return libparquet.read_parquet_metadata(filepaths_or_buffers) - - -@_performance_tracking -def _process_dataset( - paths, - fs, - filters=None, - row_groups=None, - categorical_partitions=True, - dataset_kwargs=None, -): - # Returns: - # file_list - Expanded/filtered list of paths - # row_groups - Filtered list of row-group selections - # partition_keys - list of partition keys for each file - # partition_categories - Categories for each partition - - # The general purpose of this function is to (1) expand - # directory input into a list of paths (using the pyarrow - # dataset API), (2) to apply row-group filters, and (3) - # to discover directory-partitioning information - - # Deal with case that the user passed in a directory name - file_list = paths - if len(paths) == 1 and ioutils.is_directory(paths[0]): - paths = ioutils.stringify_pathlike(paths[0]) - - # Convert filters to ds.Expression - if filters is not None: - from pyarrow.parquet import filters_to_expression - - filters = filters_to_expression(filters) - - # Initialize ds.FilesystemDataset - # TODO: Remove the if 
len(paths) workaround after following bug is fixed: - # https://issues.apache.org/jira/browse/ARROW-16438 - dataset = ds.dataset( - source=paths[0] if len(paths) == 1 else paths, - filesystem=fs, - **( - dataset_kwargs - or { - "format": "parquet", - "partitioning": "hive", - } - ), - ) - - file_list = dataset.files - if len(file_list) == 0: - raise FileNotFoundError(f"{paths} could not be resolved to any files") - - # Deal with directory partitioning - # Get all partition keys (without filters) - partition_categories = defaultdict(list) - file_fragment = None - for file_fragment in dataset.get_fragments(): - keys = ds._get_partition_keys(file_fragment.partition_expression) - if not (keys or partition_categories): - # Bail - This is not a directory-partitioned dataset - break - for k, v in keys.items(): - if v not in partition_categories[k]: - partition_categories[k].append(v) - if not categorical_partitions: - # Bail - We don't need to discover all categories. - # We only need to save the partition keys from this - # first `file_fragment` - break - - if partition_categories and file_fragment is not None: - # Check/correct order of `categories` using last file_frag, - # because `_get_partition_keys` does NOT preserve the - # partition-hierarchy order of the keys. - cat_keys = [ - part.split("=")[0] - for part in file_fragment.path.split(fs.sep) - if "=" in part - ] - if set(partition_categories) == set(cat_keys): - partition_categories = { - k: partition_categories[k] - for k in cat_keys - if k in partition_categories - } - - # If we do not have partitioned data and - # are not filtering, we can return here - if filters is None and not partition_categories: - return file_list, row_groups, [], {} - - # Record initial row_groups input - row_groups_map = {} - if row_groups is not None: - # Make sure paths and row_groups map 1:1 - # and save the initial mapping - if len(paths) != len(file_list): - raise ValueError( - "Cannot specify a row_group selection for a directory path." 
- ) - row_groups_map = {path: rgs for path, rgs in zip(paths, row_groups)} - - # Apply filters and discover partition columns - partition_keys = [] - if partition_categories or filters is not None: - file_list = [] - if filters is not None: - row_groups = [] - for file_fragment in dataset.get_fragments(filter=filters): - path = file_fragment.path - - # Extract hive-partition keys, and make sure they - # are ordered the same as they are in `partition_categories` - if partition_categories: - raw_keys = ds._get_partition_keys( - file_fragment.partition_expression - ) - partition_keys.append( - [ - (name, raw_keys[name]) - for name in partition_categories.keys() - ] - ) - - # Apply row-group filtering - selection = row_groups_map.get(path, None) - if selection is not None or filters is not None: - filtered_row_groups = [ - rg_info.id - for rg_fragment in file_fragment.split_by_row_group( - filters, - schema=dataset.schema, - ) - for rg_info in rg_fragment.row_groups - ] - file_list.append(path) - if filters is not None: - if selection is None: - row_groups.append(filtered_row_groups) - else: - row_groups.append( - [ - rg_id - for rg_id in filtered_row_groups - if rg_id in selection - ] - ) - - return ( - file_list, - row_groups, - partition_keys, - partition_categories if categorical_partitions else {}, - ) - - -@ioutils.doc_read_parquet() -@_performance_tracking -def read_parquet( - filepath_or_buffer, - engine="cudf", - columns=None, - storage_options=None, - filesystem=None, - filters=None, - row_groups=None, - use_pandas_metadata=True, - categorical_partitions=True, - bytes_per_thread=None, - dataset_kwargs=None, - nrows=None, - skip_rows=None, - allow_mismatched_pq_schemas=False, - *args, - **kwargs, -): - """{docstring}""" - if engine not in {"cudf", "pyarrow"}: - raise ValueError( - f"Only supported engines are {{'cudf', 'pyarrow'}}, got {engine=}" - ) - if bytes_per_thread is None: - bytes_per_thread = ioutils._BYTES_PER_THREAD_DEFAULT - - # Multiple sources are passed as a list. If a single source is passed, - # wrap it in a list for unified processing downstream. - if not is_list_like(filepath_or_buffer): - filepath_or_buffer = [filepath_or_buffer] - - # a list of row groups per source should be passed. make the list of - # lists that is expected for multiple sources - if row_groups is not None: - if not is_list_like(row_groups): - row_groups = [[row_groups]] - elif not is_list_like(row_groups[0]): - row_groups = [row_groups] - - # Check columns input - if columns is not None: - if not is_list_like(columns): - raise ValueError("Expected list like for columns") - - # Start by trying construct a filesystem object, so we - # can apply filters on remote file-systems - fs, paths = ioutils._get_filesystem_and_paths( - path_or_data=filepath_or_buffer, - storage_options=storage_options, - filesystem=filesystem, - ) - - # Normalize and validate filters - filters = _normalize_filters(filters) - - # Use pyarrow dataset to detect/process directory-partitioned - # data and apply filters. Note that we can only support partitioned - # data and filtering if the input is a single directory or list of - # paths. 
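# Illustrative sketch of the directory-partitioned path described above:
# a hive-partitioned dataset is expanded through the pyarrow dataset API and
# row groups are pruned with a DNF filter before being read on the GPU.
# "dataset_root/" and the column names are placeholders.
import cudf

df = cudf.read_parquet(
    "dataset_root/",                    # directory of part files, e.g. year=2024/...
    columns=["id", "value"],
    filters=[[("year", "==", 2024)]],   # list of AND-conjunctions, OR'd together
)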
- partition_keys = [] - partition_categories = {} - if fs and paths: - ( - paths, - row_groups, - partition_keys, - partition_categories, - ) = _process_dataset( - paths=paths, - fs=fs, - filters=filters, - row_groups=row_groups, - categorical_partitions=categorical_partitions, - dataset_kwargs=dataset_kwargs, - ) - filepath_or_buffer = paths if paths else filepath_or_buffer - - # Prepare remote-IO options - prefetch_options = kwargs.pop("prefetch_options", {}) - if not ioutils._is_local_filesystem(fs): - # The default prefetch method depends on the - # `row_groups` argument. In most cases we will use - # method="all" by default, because it is fastest - # when we need to read most of the file(s). - # If a (simple) `row_groups` selection is made, we - # use method="parquet" to avoid transferring the - # entire file over the network - method = prefetch_options.get("method") - _row_groups = None - if method in (None, "parquet"): - if row_groups is None: - # If the user didn't specify a method, don't use - # 'parquet' prefetcher for column projection alone. - method = method or "all" - elif all(r == row_groups[0] for r in row_groups): - # Row group selection means we are probably - # reading half the file or less. We should - # avoid a full file transfer by default. - method = "parquet" - _row_groups = row_groups[0] - elif (method := method or "all") == "parquet": - raise ValueError( - "The 'parquet' prefetcher requires a uniform " - "row-group selection for all paths within the " - "same `read_parquet` call. " - "Got: {row_groups}" - ) - if method == "parquet": - prefetch_options = prefetch_options.update( - { - "method": method, - "columns": columns, - "row_groups": _row_groups, - } - ) - - filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer( - path_or_data=filepath_or_buffer, - fs=fs, - storage_options=storage_options, - bytes_per_thread=bytes_per_thread, - prefetch_options=prefetch_options, - ) - - # Warn user if they are not using cudf for IO - # (There is a good chance this was not the intention) - if engine != "cudf": - warnings.warn( - "Using CPU via PyArrow to read Parquet dataset. " - "This option is both inefficient and unstable!" - ) - if filters is not None: - warnings.warn( - "Parquet row-group filtering is only supported with " - "'engine=cudf'. Use pandas or pyarrow API directly " - "for full CPU-based filtering functionality." - ) - - # Make sure we read in the columns needed for row-wise - # filtering after IO. This means that one or more columns - # will be dropped almost immediately after IO. However, - # we do NEED these columns for accurate filtering. - projected_columns = None - if columns and filters: - projected_columns = columns - columns = sorted( - set(v[0] for v in itertools.chain.from_iterable(filters)) - | set(columns) - ) - - # Convert parquet data to a cudf.DataFrame - df = _parquet_to_frame( - filepaths_or_buffers, - engine, - *args, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - partition_keys=partition_keys, - partition_categories=partition_categories, - dataset_kwargs=dataset_kwargs, - nrows=nrows, - skip_rows=skip_rows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, - **kwargs, - ) - # Apply filters row-wise (if any are defined), and return - df = _apply_post_filters(df, filters) - if projected_columns: - # Elements of `projected_columns` may now be in the index. 
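# Illustrative sketch of the remote-read heuristic above, assuming an
# s3fs-backed path with credentials available (the URL is a placeholder):
# a uniform row-group selection steers the prefetcher toward byte ranges
# ("parquet") rather than transferring whole objects ("all").
import cudf

df = cudf.read_parquet(
    "s3://bucket/data.parquet",  # placeholder remote object
    row_groups=[0, 1],           # normalized to [[0, 1]] for the single source
    columns=["id", "value"],
)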
- # We must filter these names from our projection - projected_columns = [ - col for col in projected_columns if col in df._column_names - ] - return df[projected_columns] - return df - - -def _normalize_filters(filters: list | None) -> list[list[tuple]] | None: - # Utility to normalize and validate the `filters` - # argument to `read_parquet` - if not filters: - return None - - msg = ( - f"filters must be None, or non-empty List[Tuple] " - f"or List[List[Tuple]]. Got {filters}" - ) - if not isinstance(filters, list): - raise TypeError(msg) - - def _validate_predicate(item): - if not isinstance(item, tuple) or len(item) != 3: - raise TypeError( - f"Predicate must be Tuple[str, str, Any], " f"got {predicate}." - ) - - filters = filters if isinstance(filters[0], list) else [filters] - for conjunction in filters: - if not conjunction or not isinstance(conjunction, list): - raise TypeError(msg) - for predicate in conjunction: - _validate_predicate(predicate) - - return filters - - -def _apply_post_filters( - df: cudf.DataFrame, filters: list[list[tuple]] | None -) -> cudf.DataFrame: - """Apply DNF filters to an in-memory DataFrame - - Disjunctive normal form (DNF) means that the inner-most - tuple describes a single column predicate. These inner - predicates are combined with an AND conjunction into a - larger predicate. The outer-most list then combines all - of the combined filters with an OR disjunction. - """ - - if not filters: - # No filters to apply - return df - - def _handle_in(column: cudf.Series, value, *, negate) -> cudf.Series: - if not isinstance(value, (list, set, tuple)): - raise TypeError( - "Value of 'in'/'not in' filter must be a list, set, or tuple." - ) - return ~column.isin(value) if negate else column.isin(value) - - def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: - if value not in {np.nan, None}: - raise TypeError( - "Value of 'is'/'is not' filter must be np.nan or None." 
- ) - return ~column.isna() if negate else column.isna() - - handlers: dict[str, Callable] = { - "==": operator.eq, - "!=": operator.ne, - "<": operator.lt, - "<=": operator.le, - ">": operator.gt, - ">=": operator.ge, - "in": partial(_handle_in, negate=False), - "not in": partial(_handle_in, negate=True), - "is": partial(_handle_is, negate=False), - "is not": partial(_handle_is, negate=True), - } - - # Can re-set the index before returning if we filter - # out rows from a DataFrame with a default RangeIndex - # (to reduce memory usage) - reset_index = ( - isinstance(df.index, cudf.RangeIndex) - and df.index.name is None - and df.index.start == 0 - and df.index.step == 1 - ) - - try: - selection: cudf.Series = reduce( - operator.or_, - ( - reduce( - operator.and_, - ( - handlers[op](df[column], value) - for (column, op, value) in expr - ), - ) - for expr in filters - ), - ) - if reset_index: - return df[selection].reset_index(drop=True) - return df[selection] - except (KeyError, TypeError): - warnings.warn( - f"Row-wise filtering failed in read_parquet for {filters}" - ) - return df - - -@_performance_tracking -def _parquet_to_frame( - paths_or_buffers, - *args, - row_groups=None, - partition_keys=None, - partition_categories=None, - dataset_kwargs=None, - nrows=None, - skip_rows=None, - **kwargs, -): - # If this is not a partitioned read, only need - # one call to `_read_parquet` - if not partition_keys: - return _read_parquet( - paths_or_buffers, - nrows=nrows, - skip_rows=skip_rows, - *args, - row_groups=row_groups, - **kwargs, - ) - - if nrows is not None or skip_rows is not None: - raise NotImplementedError( - "nrows/skip_rows is not supported when reading a partitioned parquet dataset" - ) - - partition_meta = None - partitioning = (dataset_kwargs or {}).get("partitioning", None) - if hasattr(partitioning, "schema"): - partition_meta = cudf.DataFrame.from_arrow( - partitioning.schema.empty_table() - ) - - # For partitioned data, we need a distinct read for each - # unique set of partition keys. 
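# Worked sketch of the DNF composition above using plain cudf boolean masks
# instead of the private helper; frame contents and filters are made up.
import cudf

df = cudf.DataFrame({"a": [1, 2, None], "b": ["x", "x", "y"]})
filters = [[("a", ">", 1), ("b", "==", "x")], [("a", "is", None)]]
# Inner tuples AND together, the outer lists OR together:
mask = ((df["a"] > 1) & (df["b"] == "x")) | df["a"].isna()
print(df[mask])  # rows matching (a > 1 AND b == "x") OR (a is null)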
Therefore, we start by - # aggregating all paths with matching keys using a dict - plan = {} - for i, (keys, path) in enumerate(zip(partition_keys, paths_or_buffers)): - rgs = row_groups[i] if row_groups else None - tkeys = tuple(keys) - if tkeys in plan: - plan[tkeys][0].append(path) - if rgs is not None: - plan[tkeys][1].append(rgs) - else: - plan[tkeys] = ([path], None if rgs is None else [rgs]) - - dfs = [] - for part_key, (key_paths, key_row_groups) in plan.items(): - # Add new DataFrame to our list - dfs.append( - _read_parquet( - key_paths, - *args, - row_groups=key_row_groups, - **kwargs, - ) - ) - # Add partition columns to the last DataFrame - for name, value in part_key: - _len = len(dfs[-1]) - if partition_categories and name in partition_categories: - # Build the categorical column from `codes` - codes = as_column( - partition_categories[name].index(value), - length=_len, - ) - codes = as_unsigned_codes( - len(partition_categories[name]), codes - ) - dfs[-1][name] = CategoricalColumn( - data=None, - size=codes.size, - dtype=cudf.CategoricalDtype( - categories=partition_categories[name], ordered=False - ), - offset=codes.offset, - children=(codes,), - ) - else: - # Not building categorical columns, so - # `value` is already what we want - _dtype = ( - partition_meta[name].dtype - if partition_meta is not None - else None - ) - if pd.isna(value): - dfs[-1][name] = column_empty( - row_count=_len, - dtype=_dtype, - masked=True, - ) - else: - dfs[-1][name] = as_column( - value, - dtype=_dtype, - length=_len, - ) - - if len(dfs) > 1: - # Concatenate dfs and return. - # Assume we can ignore the index if it has no name. - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - res = cudf.concat(dfs, ignore_index=dfs[-1].index.name is None) - return res - else: - return dfs[0] - - -@_performance_tracking -def _read_parquet( - filepaths_or_buffers, - engine, - columns=None, - row_groups=None, - use_pandas_metadata=None, - nrows=None, - skip_rows=None, - allow_mismatched_pq_schemas=False, - *args, - **kwargs, -): - # Simple helper function to dispatch between - # cudf and pyarrow to read parquet data - if engine == "cudf": - if kwargs: - raise ValueError( - "cudf engine doesn't support the " - f"following keyword arguments: {list(kwargs.keys())}" - ) - if args: - raise ValueError( - "cudf engine doesn't support the " - f"following positional arguments: {list(args)}" - ) - if cudf.get_option("io.parquet.low_memory"): - return libparquet.read_parquet_chunked( - filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - nrows=nrows if nrows is not None else -1, - skip_rows=skip_rows if skip_rows is not None else 0, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, - ) - else: - if nrows is None: - nrows = -1 - if skip_rows is None: - skip_rows = 0 - return libparquet.read_parquet( - filepaths_or_buffers, - columns=columns, - row_groups=row_groups, - use_pandas_metadata=use_pandas_metadata, - nrows=nrows, - skip_rows=skip_rows, - allow_mismatched_pq_schemas=allow_mismatched_pq_schemas, - ) - else: - if ( - isinstance(filepaths_or_buffers, list) - and len(filepaths_or_buffers) == 1 - ): - filepaths_or_buffers = filepaths_or_buffers[0] - - return cudf.DataFrame.from_pandas( - pd.read_parquet( - filepaths_or_buffers, - columns=columns, - engine=engine, - *args, - **kwargs, - ) - ) - - -@ioutils.doc_to_parquet() -@_performance_tracking -def to_parquet( - df, - path, - engine="cudf", - compression="snappy", - 
index=None, - partition_cols=None, - partition_file_name=None, - partition_offsets=None, - statistics="ROWGROUP", - metadata_file_path=None, - int96_timestamps=False, - row_group_size_bytes=None, - row_group_size_rows=None, - max_page_size_bytes=None, - max_page_size_rows=None, - max_dictionary_size=None, - storage_options=None, - return_metadata=False, - force_nullable_schema=False, - header_version="1.0", - use_dictionary=True, - skip_compression=None, - column_encoding=None, - column_type_length=None, - output_as_binary=None, - store_schema=False, - *args, - **kwargs, -): - """{docstring}""" - - if engine == "cudf": - if kwargs: - raise ValueError( - "cudf engine doesn't support the " - f"following keyword arguments: {list(kwargs.keys())}" - ) - if args: - raise ValueError( - "cudf engine doesn't support the " - f"following positional arguments: {list(args)}" - ) - # Ensure that no columns dtype is 'category' - for col in df._column_names: - if partition_cols is None or col not in partition_cols: - if df[col].dtype.name == "category": - raise ValueError( - "'category' column dtypes are currently not " - + "supported by the gpu accelerated parquet writer" - ) - - if partition_cols: - if metadata_file_path is not None: - warnings.warn( - "metadata_file_path will be ignored/overwritten when " - "partition_cols are provided. To request returning the " - "metadata binary blob, pass `return_metadata=True`" - ) - - return write_to_dataset( - df, - filename=partition_file_name, - partition_cols=partition_cols, - root_path=path, - preserve_index=index, - compression=compression, - statistics=statistics, - int96_timestamps=int96_timestamps, - row_group_size_bytes=row_group_size_bytes, - row_group_size_rows=row_group_size_rows, - max_page_size_bytes=max_page_size_bytes, - max_page_size_rows=max_page_size_rows, - return_metadata=return_metadata, - storage_options=storage_options, - force_nullable_schema=force_nullable_schema, - header_version=header_version, - use_dictionary=use_dictionary, - skip_compression=skip_compression, - column_encoding=column_encoding, - column_type_length=column_type_length, - output_as_binary=output_as_binary, - store_schema=store_schema, - ) - - partition_info = ( - [ - (i, j - i) - for i, j in zip(partition_offsets, partition_offsets[1:]) - ] - if partition_offsets is not None - else None - ) - return _write_parquet( - df, - paths=path if is_list_like(path) else [path], - compression=compression, - index=index, - statistics=statistics, - metadata_file_path=metadata_file_path, - int96_timestamps=int96_timestamps, - row_group_size_bytes=row_group_size_bytes, - row_group_size_rows=row_group_size_rows, - max_page_size_bytes=max_page_size_bytes, - max_page_size_rows=max_page_size_rows, - max_dictionary_size=max_dictionary_size, - partitions_info=partition_info, - storage_options=storage_options, - force_nullable_schema=force_nullable_schema, - header_version=header_version, - use_dictionary=use_dictionary, - skip_compression=skip_compression, - column_encoding=column_encoding, - column_type_length=column_type_length, - output_as_binary=output_as_binary, - write_arrow_schema=store_schema, - ) - - else: - import pyarrow.parquet as pq - - if partition_offsets is not None: - warnings.warn( - "partition_offsets will be ignored when engine is not cudf" - ) - - # If index is empty set it to the expected default value of True - if index is None: - index = True - - pa_table = df.to_arrow(preserve_index=index) - return pq.write_to_dataset( - pa_table, - root_path=path, - 
partition_cols=partition_cols, - *args, - **kwargs, - ) - - -@ioutils.doc_merge_parquet_filemetadata() -def merge_parquet_filemetadata(filemetadata_list): - """{docstring}""" - - return libparquet.merge_filemetadata(filemetadata_list) - - -def _generate_filename(): - return uuid4().hex + ".parquet" - - -def _get_estimated_file_size(df): - # NOTE: This is purely a guesstimation method - # and the y = mx+c has been arrived - # after extensive experimentation of parquet file size - # vs dataframe sizes. - df_mem_usage = df.memory_usage().sum() - # Parquet file size of a dataframe with all unique values - # seems to be 1/1.5 times as that of on GPU for >10000 rows - # and 0.6 times else-wise. - # Y(file_size) = M(0.6) * X(df_mem_usage) + C(705) - file_size = int((df_mem_usage * 0.6) + 705) - # 1000 Bytes accounted for row-group metadata. - # A parquet file takes roughly ~810 Bytes of metadata per column. - file_size = file_size + 1000 + (810 * df.shape[1]) - return file_size - - -@_performance_tracking -def _get_partitioned( - df, - root_path, - partition_cols, - filename=None, - fs=None, - preserve_index=False, - storage_options=None, -): - fs = ioutils._ensure_filesystem( - fs, root_path, storage_options=storage_options - ) - fs.mkdirs(root_path, exist_ok=True) - - part_names, grouped_df, part_offsets = _get_groups_and_offsets( - df, partition_cols, preserve_index - ) - - full_paths = [] - metadata_file_paths = [] - for keys in part_names.itertuples(index=False): - subdir = fs.sep.join( - [ - _hive_dirname(name, val) - for name, val in zip(partition_cols, keys) - ] - ) - prefix = fs.sep.join([root_path, subdir]) - fs.mkdirs(prefix, exist_ok=True) - filename = filename or _generate_filename() - full_path = fs.sep.join([prefix, filename]) - full_paths.append(full_path) - metadata_file_paths.append(fs.sep.join([subdir, filename])) - - return full_paths, metadata_file_paths, grouped_df, part_offsets, filename - - -@_performance_tracking -def _get_groups_and_offsets( - df, - partition_cols, - preserve_index=False, - **kwargs, -): - if not (set(df._data) - set(partition_cols)): - warnings.warn("No data left to save outside partition columns") - - _, part_offsets, part_keys, grouped_df = df.groupby( - partition_cols, - dropna=False, - )._grouped() - if not preserve_index: - grouped_df.reset_index(drop=True, inplace=True) - grouped_df.drop(columns=partition_cols, inplace=True) - # Copy the entire keys df in one operation rather than using iloc - part_names = ( - part_keys.take(part_offsets[:-1]) - .to_pandas(nullable=True) - .to_frame(index=False) - ) - return part_names, grouped_df, part_offsets - - -ParquetWriter = libparquet.ParquetWriter - - -def _parse_bytes(s): - """Parse byte string to numbers - - Utility function vendored from Dask. - - >>> _parse_bytes('100') - 100 - >>> _parse_bytes('100 MB') - 100000000 - >>> _parse_bytes('100M') - 100000000 - >>> _parse_bytes('5kB') - 5000 - >>> _parse_bytes('5.4 kB') - 5400 - >>> _parse_bytes('1kiB') - 1024 - >>> _parse_bytes('1e6') - 1000000 - >>> _parse_bytes('1e6 kB') - 1000000000 - >>> _parse_bytes('MB') - 1000000 - >>> _parse_bytes(123) - 123 - >>> _parse_bytes('5 foos') - Traceback (most recent call last): - ... 
- ValueError: Could not interpret 'foos' as a byte unit - """ - if isinstance(s, (int, float)): - return int(s) - s = s.replace(" ", "") - if not any(char.isdigit() for char in s): - s = "1" + s - - for i in range(len(s) - 1, -1, -1): - if not s[i].isalpha(): - break - index = i + 1 - - prefix = s[:index] - suffix = s[index:] - - try: - n = float(prefix) - except ValueError as e: - raise ValueError( - "Could not interpret '%s' as a number" % prefix - ) from e - - try: - multiplier = BYTE_SIZES[suffix.lower()] - except KeyError as e: - raise ValueError( - "Could not interpret '%s' as a byte unit" % suffix - ) from e - - result = n * multiplier - return int(result) - - -class ParquetDatasetWriter: - """ - Write a parquet file or dataset incrementally - - Parameters - ---------- - path : str - A local directory path or S3 URL. Will be used as root directory - path while writing a partitioned dataset. - partition_cols : list - Column names by which to partition the dataset - Columns are partitioned in the order they are given - index : bool, default None - If ``True``, include the dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. If ``None``, - index(es) other than RangeIndex will be saved as columns. - compression : {'snappy', None}, default 'snappy' - Name of the compression to use. Use ``None`` for no compression. - statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP' - Level at which column statistics should be included in file. - max_file_size : int or str, default None - A file size that cannot be exceeded by the writer. - It is in bytes, if the input is int. - Size can also be a str in form or "10 MB", "1 GB", etc. - If this parameter is used, it is mandatory to pass - `file_name_prefix`. - file_name_prefix : str - This is a prefix to file names generated only when - `max_file_size` is specified. - storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the - key-value pairs are forwarded to ``urllib.request.Request`` as - header options. For other URLs (e.g. starting with "s3://", and - "gcs://") the key-value pairs are forwarded to ``fsspec.open``. - Please see ``fsspec`` and ``urllib`` for more details. - - - Examples - -------- - Using a context - - >>> df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) - >>> df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) - >>> with ParquetDatasetWriter("./dataset", partition_cols=["a"]) as cw: - ... cw.write_table(df1) - ... cw.write_table(df2) - - By manually calling ``close()`` - - >>> cw = ParquetDatasetWriter("./dataset", partition_cols=["a"]) - >>> cw.write_table(df1) - >>> cw.write_table(df2) - >>> cw.close() - - Both the methods will generate the same directory structure - - .. 
code-block:: none - - dataset/ - a=1 - .parquet - a=2 - .parquet - a=3 - .parquet - - """ - - @_performance_tracking - def __init__( - self, - path, - partition_cols, - index=None, - compression="snappy", - statistics="ROWGROUP", - max_file_size=None, - file_name_prefix=None, - storage_options=None, - ) -> None: - if isinstance(path, str) and path.startswith("s3://"): - self.fs_meta = {"is_s3": True, "actual_path": path} - self.dir_: tempfile.TemporaryDirectory | None = ( - tempfile.TemporaryDirectory() - ) - self.path = self.dir_.name - else: - self.fs_meta = {} - self.dir_ = None - self.path = path - - self.common_args = { - "index": index, - "compression": compression, - "statistics": statistics, - } - self.partition_cols = partition_cols - # Collection of `ParquetWriter`s, and the corresponding - # partition_col values they're responsible for - self._chunked_writers: list[ - tuple[libparquet.ParquetWriter, list[str], str] - ] = [] - # Map of partition_col values to their ParquetWriter's index - # in self._chunked_writers for reverse lookup - self.path_cw_map: dict[str, int] = {} - self.storage_options = storage_options - self.filename = file_name_prefix - self.max_file_size = max_file_size - if max_file_size is not None: - if file_name_prefix is None: - raise ValueError( - "file_name_prefix cannot be None if max_file_size is " - "passed" - ) - self.max_file_size = _parse_bytes(max_file_size) - - self._file_sizes: dict[str, int] = {} - - @_performance_tracking - def write_table(self, df): - """ - Write a dataframe to the file/dataset - """ - part_names, grouped_df, part_offsets = _get_groups_and_offsets( - df=df, - partition_cols=self.partition_cols, - preserve_index=self.common_args["index"], - ) - fs = ioutils._ensure_filesystem(None, self.path, None) - fs.mkdirs(self.path, exist_ok=True) - - full_paths = [] - metadata_file_paths = [] - full_offsets = [0] - - for idx, keys in enumerate(part_names.itertuples(index=False)): - subdir = fs.sep.join( - [ - f"{name}={val}" - for name, val in zip(self.partition_cols, keys) - ] - ) - prefix = fs.sep.join([self.path, subdir]) - fs.mkdirs(prefix, exist_ok=True) - current_offset = (part_offsets[idx], part_offsets[idx + 1]) - num_chunks = 1 - parts = 1 - - if self.max_file_size is not None: - # get the current partition - start, end = current_offset - sliced_df = grouped_df[start:end] - - current_file_size = _get_estimated_file_size(sliced_df) - if current_file_size > self.max_file_size: - # if the file is too large, compute metadata for - # smaller chunks - parts = math.ceil(current_file_size / self.max_file_size) - new_offsets = list( - range(start, end, int((end - start) / parts)) - )[1:] - new_offsets.append(end) - num_chunks = len(new_offsets) - parts = len(new_offsets) - full_offsets.extend(new_offsets) - else: - full_offsets.append(end) - - curr_file_num = 0 - num_chunks = 0 - while num_chunks < parts: - new_file_name = f"{self.filename}_{curr_file_num}.parquet" - new_full_path = fs.sep.join([prefix, new_file_name]) - - # Check if the same `new_file_name` exists and - # generate a `new_file_name` - while new_full_path in self._file_sizes and ( - self._file_sizes[new_full_path] - + (current_file_size / parts) - ) > (self.max_file_size): - curr_file_num += 1 - new_file_name = ( - f"{self.filename}_{curr_file_num}.parquet" - ) - new_full_path = fs.sep.join([prefix, new_file_name]) - - self._file_sizes[new_full_path] = self._file_sizes.get( - new_full_path, 0 - ) + (current_file_size / parts) - full_paths.append(new_full_path) - 
metadata_file_paths.append( - fs.sep.join([subdir, new_file_name]) - ) - num_chunks += 1 - curr_file_num += 1 - else: - self.filename = self.filename or _generate_filename() - full_path = fs.sep.join([prefix, self.filename]) - full_paths.append(full_path) - metadata_file_paths.append( - fs.sep.join([subdir, self.filename]) - ) - full_offsets.append(current_offset[1]) - - paths, metadata_file_paths, offsets = ( - full_paths, - metadata_file_paths, - full_offsets, - ) - existing_cw_batch = defaultdict(dict) - new_cw_paths = [] - partition_info = [(i, j - i) for i, j in zip(offsets, offsets[1:])] - - for path, part_info, meta_path in zip( - paths, - partition_info, - metadata_file_paths, - ): - if path in self.path_cw_map: # path is a currently open file - cw_idx = self.path_cw_map[path] - existing_cw_batch[cw_idx][path] = part_info - else: # path not currently handled by any chunked writer - new_cw_paths.append((path, part_info, meta_path)) - - # Write out the parts of grouped_df currently handled by existing cw's - for cw_idx, path_to_part_info_map in existing_cw_batch.items(): - cw = self._chunked_writers[cw_idx][0] - # match found paths with this cw's paths and nullify partition info - # for partition_col values not in this batch - this_cw_part_info = [ - path_to_part_info_map.get(path, (0, 0)) - for path in self._chunked_writers[cw_idx][1] - ] - cw.write_table(grouped_df, this_cw_part_info) - - if new_cw_paths: - # Create new cw for unhandled paths encountered in this write_table - new_paths, part_info, meta_paths = zip(*new_cw_paths) - self._chunked_writers.append( - ( - ParquetWriter(new_paths, **self.common_args), - new_paths, - meta_paths, - ) - ) - new_cw_idx = len(self._chunked_writers) - 1 - self.path_cw_map.update({k: new_cw_idx for k in new_paths}) - self._chunked_writers[-1][0].write_table(grouped_df, part_info) - - @_performance_tracking - def close(self, return_metadata=False): - """ - Close all open files and optionally return footer metadata as a binary - blob - """ - - metadata = [ - cw.close(metadata_file_path=meta_path if return_metadata else None) - for cw, _, meta_path in self._chunked_writers - ] - - if self.fs_meta.get("is_s3", False): - local_path = self.path - s3_path = self.fs_meta["actual_path"] - s3_file, _ = ioutils._get_filesystem_and_paths( - s3_path, storage_options=self.storage_options - ) - s3_file.put(local_path, s3_path, recursive=True) - shutil.rmtree(self.path) - - if self.dir_ is not None: - self.dir_.cleanup() - - if return_metadata: - return ( - merge_parquet_filemetadata(metadata) - if len(metadata) > 1 - else metadata[0] - ) - - def __enter__(self): - return self - - def __exit__(self, *args): - self.close() - - -def _hive_dirname(name, val): - # Simple utility to produce hive directory name - if pd.isna(val): - val = "__HIVE_DEFAULT_PARTITION__" - return f"{name}={val}" diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py deleted file mode 100644 index 5ce738cae0e..00000000000 --- a/python/cudf/cudf/io/text.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
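# Illustrative sketch of the size-capped dataset writer implemented above:
# when max_file_size is given, write_table splits each partition into files
# named "<file_name_prefix>_<n>.parquet". The output path, size limit and
# frame are placeholders.
import cudf
from cudf.io.parquet import ParquetDatasetWriter

df = cudf.DataFrame({"a": [1, 1, 2, 2], "b": [10, 20, 30, 40]})
with ParquetDatasetWriter(
    "./dataset",
    partition_cols=["a"],
    max_file_size="100MB",    # parsed by _parse_bytes above
    file_name_prefix="part",  # required whenever max_file_size is set
) as writer:
    writer.write_table(df)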
- -from io import BytesIO, StringIO - -import cudf -from cudf._lib import text as libtext -from cudf.utils import ioutils -from cudf.utils.performance_tracking import _performance_tracking - - -@_performance_tracking -@ioutils.doc_read_text() -def read_text( - filepath_or_buffer, - delimiter=None, - byte_range=None, - strip_delimiters=False, - compression=None, - compression_offsets=None, - storage_options=None, -): - """{docstring}""" - - if delimiter is None: - raise ValueError("delimiter needs to be provided") - - filepath_or_buffer = ioutils.get_reader_filepath_or_buffer( - path_or_data=filepath_or_buffer, - iotypes=(BytesIO, StringIO), - storage_options=storage_options, - ) - filepath_or_buffer = ioutils._select_single_source( - filepath_or_buffer, "read_text" - ) - - return cudf.Series._from_column( - libtext.read_text( - filepath_or_buffer, - delimiter=delimiter, - byte_range=byte_range, - strip_delimiters=strip_delimiters, - compression=compression, - compression_offsets=compression_offsets, - ) - ) diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py deleted file mode 100644 index df7bbe22a61..00000000000 --- a/python/cudf/cudf/options.py +++ /dev/null @@ -1,385 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import os -import textwrap -from contextlib import ContextDecorator -from dataclasses import dataclass -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from collections.abc import Callable, Container - - -@dataclass -class Option: - default: Any - value: Any - description: str - validator: Callable - - -_OPTIONS: dict[str, Option] = {} - - -def _env_get_int(name, default): - try: - return int(os.getenv(name, default)) - except (ValueError, TypeError): - return default - - -def _env_get_bool(name, default): - env = os.getenv(name) - if env is None: - return default - as_a_int = _env_get_int(name, None) - env = env.lower().strip() - if env == "true" or env == "on" or as_a_int: - return True - if env == "false" or env == "off" or as_a_int == 0: - return False - return default - - -def _register_option( - name: str, default_value: Any, description: str, validator: Callable -): - """Register an option. - - Parameters - ---------- - name : str - The name of the option. - default_value : Any - The default value of the option. - description : str - A text description of the option. - validator : Callable - Called on the option value to check its validity. Should raise an - error if the value is invalid. - - Raises - ------ - BaseException - Raised by validator if the value is invalid. - """ - validator(default_value) - _OPTIONS[name] = Option( - default_value, default_value, description, validator - ) - - -def get_option(name: str) -> Any: - """Get the value of option. - - Parameters - ---------- - key : str - The name of the option. - - Returns - ------- - The value of the option. - - Raises - ------ - KeyError - If option ``name`` does not exist. - """ - try: - return _OPTIONS[name].value - except KeyError: - raise KeyError(f'"{name}" does not exist.') - - -def set_option(name: str, val: Any): - """Set the value of option. - - Parameters - ---------- - name : str - The name of the option. - val : Any - The value to set. - - Raises - ------ - KeyError - If option ``name`` does not exist. - BaseException - Raised by validator if the value is invalid. 
- """ - try: - option = _OPTIONS[name] - except KeyError: - raise KeyError(f'"{name}" does not exist.') - option.validator(val) - option.value = val - - -def _build_option_description(name, opt): - return ( - f"{name}:\n" - f"\t{opt.description}\n" - f"\t[Default: {opt.default}] [Current: {opt.value}]" - ) - - -def describe_option(name: str | None = None): - """Prints the description of an option. - - If `name` is unspecified, prints the description of all available options. - - Parameters - ---------- - name : Optional[str] - The name of the option. - """ - names = _OPTIONS.keys() if name is None else [name] - for name in names: - print(_build_option_description(name, _OPTIONS[name])) - - -def _make_contains_validator(valid_options: Container) -> Callable: - """Return a validator that checks if a value is in `valid_options`.""" - - def _validator(val): - if val not in valid_options: - raise ValueError( - f"{val} is not a valid option. " - f"Must be one of {set(valid_options)}." - ) - - return _validator - - -def _cow_validator(val): - if val not in {False, True}: - raise ValueError( - f"{val} is not a valid option. Must be one of {{False, True}}." - ) - - -def _spill_validator(val): - if val not in {False, True}: - raise ValueError( - f"{val} is not a valid option. Must be one of {{False, True}}." - ) - - -def _integer_validator(val): - try: - int(val) - return True - except ValueError: - raise ValueError( - f"{val} is not a valid option. " f"Must be an integer." - ) - - -def _integer_and_none_validator(val): - try: - if val is None or int(val): - return - except ValueError: - raise ValueError( - f"{val} is not a valid option. " f"Must be an integer or None." - ) - - -_register_option( - "default_integer_bitwidth", - None, - textwrap.dedent( - """ - Default bitwidth when the dtype of an integer needs to be - inferred. If set to `None`, the API will align dtype with pandas. - APIs that respect this option include: - \t- cudf object constructors - \t- cudf.read_csv and cudf.read_json when `dtype` is not specified. - \t- APIs that require implicit conversion of cudf.RangeIndex to an - \t integer index. - \tValid values are None, 32 or 64. Default is None. - """ - ), - _make_contains_validator([None, 32, 64]), -) - - -_register_option( - "default_float_bitwidth", - None, - textwrap.dedent( - """ - Default bitwidth when the dtype of a float needs to be - inferred. If set to `None`, the API will align dtype with pandas. - APIs that respect this option include: - \t- cudf object constructors - \t- cudf.read_csv and cudf.read_json when `dtype` is not specified. - \tValid values are None, 32 or 64. Default is None. - """ - ), - _make_contains_validator([None, 32, 64]), -) - -_register_option( - "spill", - _env_get_bool("CUDF_SPILL", False), - textwrap.dedent( - """ - Enables spilling. - \tValid values are True or False. Default is False. - """ - ), - _spill_validator, -) - - -_register_option( - "copy_on_write", - _env_get_bool("CUDF_COPY_ON_WRITE", False), - textwrap.dedent( - """ - If set to `False`, disables copy-on-write. - If set to `True`, enables copy-on-write. - Read more at: :ref:`copy-on-write-user-doc` - \tValid values are True or False. Default is False. - """ - ), - _cow_validator, -) - - -_register_option( - "spill_on_demand", - _env_get_bool("CUDF_SPILL_ON_DEMAND", True), - textwrap.dedent( - """ - Enables spilling on demand using an RMM out-of-memory error handler. - This has no effect if spilling is disabled, see the "spill" option. - \tValid values are True or False. 
Default is True. - """ - ), - _make_contains_validator([False, True]), -) - -_register_option( - "spill_device_limit", - _env_get_int("CUDF_SPILL_DEVICE_LIMIT", None), - textwrap.dedent( - """ - Enforce a device memory limit in bytes. - This has no effect if spilling is disabled, see the "spill" option. - \tValid values are any positive integer or None (disabled). - \tDefault is None. - """ - ), - _integer_and_none_validator, -) - -_register_option( - "spill_stats", - _env_get_int("CUDF_SPILL_STATS", 0), - textwrap.dedent( - """ - If not 0, enables statistics at the specified level: - 0 - disabled (no overhead). - 1+ - duration and number of bytes spilled (very low overhead). - 2+ - a traceback for each time a spillable buffer is exposed - permanently (potential high overhead). - - Valid values are any positive integer. - Default is 0 (disabled). - """ - ), - _integer_validator, -) - -_register_option( - "mode.pandas_compatible", - False, - textwrap.dedent( - """ - If set to `False`, retains `cudf` specific behavior. - If set to `True`, enables pandas compatibility mode, - which will try to match pandas API behaviors in case of - any inconsistency. - \tValid values are True or False. Default is False. - """ - ), - _make_contains_validator([False, True]), -) - -_register_option( - "memory_profiling", - _env_get_bool("CUDF_MEMORY_PROFILING", False), - textwrap.dedent( - """ - If set to `False`, disables memory profiling. - If set to `True`, enables memory profiling. - Read more at: :ref:`memory-profiling-user-doc` - \tValid values are True or False. Default is False. - """ - ), - _make_contains_validator([False, True]), -) - -_register_option( - "io.parquet.low_memory", - False, - textwrap.dedent( - """ - If set to `False`, reads entire parquet in one go. - If set to `True`, reads parquet file in chunks. - \tValid values are True or False. Default is False. - """ - ), - _make_contains_validator([False, True]), -) - -_register_option( - "io.json.low_memory", - False, - textwrap.dedent( - """ - If set to `False`, reads entire json in one go. - If set to `True`, reads json file in chunks. - \tValid values are True or False. Default is False. - """ - ), - _make_contains_validator([False, True]), -) - - -class option_context(ContextDecorator): - """ - Context manager to temporarily set options in the `with` statement context. - - You need to invoke as ``option_context(pat, val, [(pat, val), ...])``. - - - Examples - -------- - >>> from cudf import option_context - >>> with option_context('mode.pandas_compatible', True, 'default_float_bitwidth', 32): - ... pass - """ # noqa: E501 - - def __init__(self, *args) -> None: - if len(args) % 2 != 0: - raise ValueError( - "Need to invoke as option_context(pat, val, " - "[(pat, val), ...])." - ) - - self.ops = tuple(zip(args[::2], args[1::2])) - - def __enter__(self) -> None: - self.undo = tuple((pat, get_option(pat)) for pat, _ in self.ops) - for pat, val in self.ops: - set_option(pat, val) - - def __exit__(self, *args) -> None: - for pat, val in self.undo: - set_option(pat, val) diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py deleted file mode 100644 index bacf1f7e77b..00000000000 --- a/python/cudf/cudf/pandas/__init__.py +++ /dev/null @@ -1,104 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import os -import warnings - -import pylibcudf -import rmm.mr - -from .fast_slow_proxy import is_proxy_object -from .magics import load_ipython_extension -from .profiler import Profiler - -__all__ = ["Profiler", "load_ipython_extension", "install", "is_proxy_object"] - - -LOADED = False - -_SUPPORTED_PREFETCHES = { - "column_view::get_data", - "mutable_column_view::get_data", - "gather", - "hash_join", -} - - -def _enable_managed_prefetching(rmm_mode, managed_memory_is_supported): - if managed_memory_is_supported and "managed" in rmm_mode: - for key in _SUPPORTED_PREFETCHES: - pylibcudf.experimental.enable_prefetching(key) - - -def install(): - """Enable Pandas Accelerator Mode.""" - from .module_accelerator import ModuleAccelerator - - loader = ModuleAccelerator.install("pandas", "cudf", "pandas") - global LOADED - LOADED = loader is not None - - # The default mode is "managed_pool" if UVM is supported, otherwise "pool" - managed_memory_is_supported = ( - pylibcudf.utils._is_concurrent_managed_access_supported() - ) - default_rmm_mode = ( - "managed_pool" if managed_memory_is_supported else "pool" - ) - rmm_mode = os.getenv("CUDF_PANDAS_RMM_MODE", default_rmm_mode) - - if "managed" in rmm_mode and not managed_memory_is_supported: - raise ValueError( - f"Managed memory is not supported on this system, so the requested {rmm_mode=} is invalid." - ) - - # Check if a non-default memory resource is set - current_mr = rmm.mr.get_current_device_resource() - if not isinstance(current_mr, rmm.mr.CudaMemoryResource): - warnings.warn( - f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", - UserWarning, - ) - return - - free_memory, _ = rmm.mr.available_device_memory() - free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) - new_mr = current_mr - - if rmm_mode == "pool": - new_mr = rmm.mr.PoolMemoryResource( - current_mr, - initial_pool_size=free_memory, - ) - elif rmm_mode == "async": - new_mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=free_memory) - elif rmm_mode == "managed": - new_mr = rmm.mr.PrefetchResourceAdaptor(rmm.mr.ManagedMemoryResource()) - elif rmm_mode == "managed_pool": - new_mr = rmm.mr.PrefetchResourceAdaptor( - rmm.mr.PoolMemoryResource( - rmm.mr.ManagedMemoryResource(), - initial_pool_size=free_memory, - ) - ) - elif rmm_mode != "cuda": - raise ValueError(f"Unsupported {rmm_mode=}") - - rmm.mr.set_current_device_resource(new_mr) - - _enable_managed_prefetching(rmm_mode, managed_memory_is_supported) - - -def pytest_load_initial_conftests(early_config, parser, args): - # We need to install ourselves before conftest.py import (which - # might import pandas) This hook is guaranteed to run before that - # happens see - # https://docs.pytest.org/en/7.1.x/reference/\ - # reference.html#pytest.hookspec.pytest_load_initial_conftests - try: - install() - except RuntimeError: - raise RuntimeError( - "An existing plugin has already loaded pandas. Interposing failed." - ) diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py deleted file mode 100644 index e0d3d9101a9..00000000000 --- a/python/cudf/cudf/pandas/__main__.py +++ /dev/null @@ -1,123 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -""" -Usage: - -python -m cudf.pandas -python -m cudf.pandas -m module -""" - -import argparse -import code -import runpy -import sys -import tempfile -from contextlib import contextmanager - -from . import install -from .profiler import Profiler, lines_with_profiling - - -@contextmanager -def profile(function_profile, line_profile, fn): - if fn is None and (line_profile or function_profile): - raise RuntimeError("Enabling the profiler requires a script name.") - if line_profile: - with open(fn) as f: - lines = f.readlines() - - with tempfile.NamedTemporaryFile(mode="w+b", suffix=".py") as f: - f.write(lines_with_profiling(lines, function_profile).encode()) - f.seek(0) - - yield f.name - elif function_profile: - with Profiler() as profiler: - yield fn - profiler.print_per_function_stats() - else: - yield fn - - -def main(): - parser = argparse.ArgumentParser( - prog="python -m cudf.pandas", - description=( - "Run a Python script with Pandas Accelerator Mode enabled. " - "In Pandas Accelerator Mode, all imports of pandas will " - "automatically use GPU accelerated cuDF equivalents where " - "possible." - ), - ) - - parser.add_argument( - "-m", - dest="module", - nargs=1, - ) - parser.add_argument( - "-c", - dest="cmd", - nargs=1, - ) - parser.add_argument( - "--profile", - action="store_true", - help="Perform per-function profiling of this script.", - ) - parser.add_argument( - "--line-profile", - action="store_true", - help="Perform per-line profiling of this script.", - ) - parser.add_argument( - "args", - nargs=argparse.REMAINDER, - help="Arguments to pass on to the script", - ) - - args = parser.parse_args() - - if args.cmd: - f = tempfile.NamedTemporaryFile(mode="w+b", suffix=".py") - f.write(args.cmd[0].encode()) - f.seek(0) - args.args.insert(0, f.name) - - install() - - script_name = args.args[0] if len(args.args) > 0 else None - with profile(args.profile, args.line_profile, script_name) as fn: - if script_name is not None: - args.args[0] = fn - if args.module: - (module,) = args.module - # run the module passing the remaining arguments - # as if it were run with python -m - sys.argv[:] = [module] + args.args # not thread safe? - runpy.run_module(module, run_name="__main__") - elif len(args.args) >= 1: - # Remove ourself from argv and continue - sys.argv[:] = args.args - runpy.run_path(args.args[0], run_name="__main__") - else: - if sys.stdin.isatty(): - banner = f"Python {sys.version} on {sys.platform}" - site_import = not sys.flags.no_site - if site_import: - cprt = 'Type "help", "copyright", "credits" or "license" for more information.' - banner += "\n" + cprt - else: - # Don't show prompts or banners if stdin is not a TTY - sys.ps1 = "" - sys.ps2 = "" - banner = "" - - # Launch an interactive interpreter - code.interact(banner=banner, exitmsg="") - - -if __name__ == "__main__": - main() diff --git a/python/cudf/cudf/pandas/_logger.py b/python/cudf/cudf/pandas/_logger.py deleted file mode 100644 index 68923c3e35c..00000000000 --- a/python/cudf/cudf/pandas/_logger.py +++ /dev/null @@ -1,80 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import json -import logging - -logging.basicConfig( - filename="cudf_pandas_unit_tests_debug.log", level=logging.INFO -) -logger = logging.getLogger() - - -class StructuredMessage: - # https://docs.python.org/3/howto/logging-cookbook.html#implementing-structured-logging - def __init__(self, debug_type: str, /, **kwargs) -> None: - self.debug_type = debug_type - self.kwargs = kwargs - - def __str__(self) -> str: - log = {"debug_type": self.debug_type} - return json.dumps({**log, **self.kwargs}) - - -def reprify(arg) -> str: - """Attempt to return arg's repr for logging.""" - try: - return repr(arg) - except Exception: - return "" - - -def log_fallback( - slow_args: tuple, slow_kwargs: dict, exception: Exception -) -> None: - """Log when a fast call falls back to the slow path.""" - caller = slow_args[0] - module = getattr(caller, "__module__", "") - obj_name = getattr(caller, "__qualname__", type(caller).__qualname__) - if module: - slow_object = f"{module}.{obj_name}" - else: - slow_object = obj_name - # TODO: Maybe use inspect.signature to map called args and kwargs - # to their keyword names, but a user calling an API incorrectly would - # break this. - caller_args = slow_args[1] - args_passed = ", ".join((reprify(arg) for arg in caller_args)) - args_types_passed = ", ".join((type(arg).__name__ for arg in caller_args)) - kwargs_passed = {} - kwargs_types_passed = "" - if len(slow_args) == 3: - caller_kwargs = slow_args[2] - if caller_kwargs: - fmt_kwargs = ", ".join( - f"{kwarg}={reprify(value)}" - for kwarg, value in caller_kwargs.items() - ) - kwargs_types_passed = ", ".join( - f"{kwarg}={type(value).__name__}" - for kwarg, value in caller_kwargs.items() - ) - args_passed = f"{args_passed}, {fmt_kwargs}" - kwargs_passed = { - kwarg: reprify(value) for kwarg, value in caller_kwargs.items() - } - message = StructuredMessage( - "LOG_FAST_FALLBACK", - failed_call=f"{slow_object}({args_passed})", - exception=type(exception).__name__, - exception_message=str(exception), - slow_object=slow_object, - args_passed=args_passed, - kwargs_passed=kwargs_passed, - args_types_passed=args_types_passed, - kwargs_types_passed=kwargs_types_passed, - ) - logger.info(message) diff --git a/python/cudf/cudf/pandas/_wrappers/__init__.py b/python/cudf/cudf/pandas/_wrappers/__init__.py deleted file mode 100644 index 1f7fe181d0e..00000000000 --- a/python/cudf/cudf/pandas/_wrappers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from . import numpy, pandas diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py deleted file mode 100644 index 66a51a83896..00000000000 --- a/python/cudf/cudf/pandas/_wrappers/common.py +++ /dev/null @@ -1,55 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -# Utility custom overrides for special methods/properties -from ..fast_slow_proxy import ( - _FastSlowAttribute, - _FastSlowProxy, - _maybe_wrap_result, - _slow_arg, -) - - -def array_method(self: _FastSlowProxy, *args, **kwargs): - return self._fsproxy_slow.__array__(*args, **kwargs) - - -def array_function_method(self, func, types, args, kwargs): - try: - return _FastSlowAttribute("__array_function__").__get__( - self, type(self) - )(func, types, args, kwargs) - except Exception: - # if something went wrong with __array_function__ we - # attempt to call the function directly on the slow - # object. This ensures that the function call is - # handled in the same way as if the slow object was - # passed directly to the function. - slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) - return _maybe_wrap_result( - func(*slow_args, **slow_kwargs), func, *args, **kwargs - ) - - -def arrow_array_method(self: _FastSlowProxy, *args, **kwargs): - import pyarrow as pa - - try: - return self._fsproxy_fast.to_arrow(*args, **kwargs) - except Exception: - return pa.array(self._fsproxy_slow, *args, **kwargs) - - -@property # type: ignore -def cuda_array_interface(self: _FastSlowProxy): - return self._fsproxy_fast.__cuda_array_interface__ - - -@property # type: ignore -def array_interface(self: _FastSlowProxy): - return self._fsproxy_slow.__array_interface__ - - -def custom_iter(self: _FastSlowProxy): - return iter(self._fsproxy_slow) diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py deleted file mode 100644 index d5e669cb58f..00000000000 --- a/python/cudf/cudf/pandas/_wrappers/numpy.py +++ /dev/null @@ -1,178 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import cupy -import cupy._core.flags -import numpy -from packaging import version - -from ..fast_slow_proxy import ( - _fast_slow_function_call, - _FastSlowAttribute, - is_proxy_object, - make_final_proxy_type, - make_intermediate_proxy_type, -) -from ..proxy_base import ProxyNDarrayBase -from .common import ( - array_interface, - array_method, - arrow_array_method, - cuda_array_interface, - custom_iter, -) - -# https://docs.cupy.dev/en/stable/reference/creation.html -_CONSTRUCTORS = frozenset( - [ - cupy.empty, - cupy.empty_like, - cupy.eye, - cupy.identity, - cupy.ones, - cupy.ones_like, - cupy.zeros, - cupy.zeros_like, - cupy.full, - cupy.full_like, - cupy.array, - cupy.asarray, - cupy.asanyarray, - cupy.ascontiguousarray, - cupy.copy, - cupy.frombuffer, - cupy.fromfile, - cupy.fromfunction, - cupy.fromiter, - cupy.fromstring, - cupy.loadtxt, - cupy.arange, - cupy.linspace, - cupy.logspace, - cupy.meshgrid, - cupy.diag, - cupy.diagflat, - cupy.tri, - cupy.tril, - cupy.triu, - cupy.vander, - ] -) - - -def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor): - """Wrap an ndarray in a proxy type - - Parameters - ---------- - cls - Proxy type for ndarray - arr - Concrete result ndarray (cupy or numpy) - constructor - Function that was called to construct the concrete array, used - to check against a denylist to avoid unwrapping. - - Returns - ------- - The scalar .item() wrapped in its numpy dtype if arr is a - zero-dimensional cupy array (and wasn't just constructed as such), - a new proxy type otherwise. 
- - Notes - ----- - Axis-reducing operations in numpy return scalar objects but - zero-dimensional arrays in cupy. This confuses downstream - libraries when they get a fast (device-based) zero-dim array when - they were expecting a scalar. To avoid this, if the provided array - is a cupy array, and its shape is zero, unwrap it. - """ - if ( - isinstance(arr, cupy.ndarray) - and arr.shape == () - and constructor not in _CONSTRUCTORS - ): - return arr.dtype.type(arr.item()) - else: - # Note, this super call means that the constructed ndarray - # class cannot be subclassed (because then super(cls, - # cls)._fsproxy_wrap produces an infinite loop). Really this - # should be super(ndarray, cls), but we don't have access to - # the ndarray type until after we need to pass this function - # in. So it works for now since without subclassing, - # super(ndarray, cls) == super(ndarray, ndarray) == super(cls, - # cls) - return super(cls, cls)._fsproxy_wrap(arr, constructor) - - -def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs): - result, _ = _fast_slow_function_call( - getattr(ufunc, method), - *inputs, - **kwargs, - ) - if isinstance(result, tuple): - if is_proxy_object(result[0]) and isinstance( - result[0]._fsproxy_wrapped, numpy.ndarray - ): - return tuple(numpy.asarray(x) for x in result) - elif is_proxy_object(result) and isinstance( - result._fsproxy_wrapped, numpy.ndarray - ): - return numpy.asarray(result) - return result - - -ndarray = make_final_proxy_type( - "ndarray", - cupy.ndarray, - numpy.ndarray, - fast_to_slow=cupy.ndarray.get, - slow_to_fast=cupy.asarray, - bases=(ProxyNDarrayBase,), - additional_attributes={ - "__array__": array_method, - # So that pa.array(wrapped-numpy-array) works - "__arrow_array__": arrow_array_method, - "__cuda_array_interface__": cuda_array_interface, - "__array_interface__": array_interface, - "__array_ufunc__": ndarray__array_ufunc__, - # ndarrays are unhashable - "__hash__": None, - # iter(cupy-array) produces an iterable of zero-dim device - # arrays, which is not usable in many settings (whereas - # iter(numpy-array) produces an iterable of scalars) - "__iter__": custom_iter, - # Special wrapping to handle scalar values - "_fsproxy_wrap": classmethod(wrap_ndarray), - "base": _FastSlowAttribute("base", private=True), - }, -) - - -flatiter = make_final_proxy_type( - "flatiter", - cupy.flatiter, - numpy.flatiter, - fast_to_slow=lambda fast: cupy.asnumpy(fast.base).flat, - slow_to_fast=lambda slow: cupy.asarray(slow).flat, - additional_attributes={ - "__array__": array_method, - }, -) - -if version.parse(numpy.__version__) >= version.parse("2.0"): - # NumPy 2 introduced `_core` and gives warnings for access to `core`. - from numpy._core.multiarray import flagsobj as _numpy_flagsobj -else: - from numpy.core.multiarray import flagsobj as _numpy_flagsobj - -# Mapping flags between slow and fast types -_ndarray_flags = make_intermediate_proxy_type( - "_ndarray_flags", - cupy._core.flags.Flags, - _numpy_flagsobj, -) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py deleted file mode 100644 index 6d03063fa27..00000000000 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ /dev/null @@ -1,1719 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -import abc -import copyreg -import importlib -import os -import pickle -import sys - -import pandas as pd -from pandas.tseries.holiday import ( - AbstractHolidayCalendar as pd_AbstractHolidayCalendar, - EasterMonday as pd_EasterMonday, - GoodFriday as pd_GoodFriday, - Holiday as pd_Holiday, - HolidayCalendarFactory as pd_HolidayCalendarFactory, - HolidayCalendarMetaClass as pd_HolidayCalendarMetaClass, - USColumbusDay as pd_USColumbusDay, - USFederalHolidayCalendar as pd_USFederalHolidayCalendar, - USLaborDay as pd_USLaborDay, - USMartinLutherKingJr as pd_USMartinLutherKingJr, - USMemorialDay as pd_USMemorialDay, - USPresidentsDay as pd_USPresidentsDay, - USThanksgivingDay as pd_USThanksgivingDay, -) - -import cudf -import cudf.core._compat - -from ..annotation import nvtx -from ..fast_slow_proxy import ( - _CUDF_PANDAS_NVTX_COLORS, - _DELETE, - _fast_slow_function_call, - _FastSlowAttribute, - _FunctionProxy, - _Unusable, - make_final_proxy_type as _make_final_proxy_type, - make_intermediate_proxy_type as _make_intermediate_proxy_type, - register_proxy_func, -) -from .common import ( - array_function_method, - array_method, - arrow_array_method, - cuda_array_interface, - custom_iter, -) - -from pandas.io.sas.sas7bdat import ( # isort: skip - SAS7BDATReader as pd_SAS7BDATReader, -) -from pandas.io.sas.sas_xport import ( # isort: skip - XportReader as pd_XportReader, -) - -# TODO(pandas2.1): Can import from pandas.api.typing -from pandas.core.resample import ( # isort: skip - Resampler as pd_Resampler, - TimeGrouper as pd_TimeGrouper, -) - -try: - from IPython import get_ipython - - ipython_shell = get_ipython() -except ImportError: - ipython_shell = None - -cudf.set_option("mode.pandas_compatible", True) - - -def _pandas_util_dir(): - # In pandas 2.0, pandas.util contains public APIs under - # __getattr__ but no __dir__ to find them - # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/util/__init__.py - return list(importlib.import_module("pandas.util").__dict__.keys()) + [ - "hash_array", - "hash_pandas_object", - "Appender", - "Substitution", - "cache_readonly", - ] - - -pd.util.__dir__ = _pandas_util_dir - - -def make_final_proxy_type( - name, - fast_type, - slow_type, - **kwargs, -): - assert "module" not in kwargs - return _make_final_proxy_type( - name, fast_type, slow_type, module=slow_type.__module__, **kwargs - ) - - -def make_intermediate_proxy_type(name, fast_type, slow_type): - return _make_intermediate_proxy_type( - name, fast_type, slow_type, module=slow_type.__module__ - ) - - -class _AccessorAttr: - """ - Descriptor that ensures that accessors like `.dt` and `.str` - return the corresponding accessor types when accessed on `Series` - and `Index` _types_ (not instances).n - - Attribute access for _instances_ uses the regular fast-then-slow - lookup defined in `__getattr__`. - """ - - def __init__(self, typ): - self._typ = typ - - def __set_name__(self, owner, name): - self._name = name - - def __get__(self, obj, cls=None): - if obj is None: - return self._typ - else: - return _FastSlowAttribute(self._name).__get__(obj, type(obj)) - - -def Timestamp_Timedelta__new__(cls, *args, **kwargs): - # Call fast/slow constructor - # This takes care of running __init__ as well, but must be paired - # with a removal of the defaulted __init__ that - # make_final_proxy_type provides. - # Timestamp & Timedelta don't always return same types as self, - # hence this method is needed. 
- self, _ = _fast_slow_function_call( - lambda cls, args, kwargs: cls(*args, **kwargs), - cls, - args, - kwargs, - ) - return self - - -Timedelta = make_final_proxy_type( - "Timedelta", - _Unusable, - pd.Timedelta, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "__hash__": _FastSlowAttribute("__hash__"), - "__new__": Timestamp_Timedelta__new__, - "__init__": _DELETE, - }, -) - - -Timestamp = make_final_proxy_type( - "Timestamp", - _Unusable, - pd.Timestamp, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "__hash__": _FastSlowAttribute("__hash__"), - "__new__": Timestamp_Timedelta__new__, - "__init__": _DELETE, - }, -) - -DatetimeProperties = make_intermediate_proxy_type( - "DatetimeProperties", - cudf.core.series.DatetimeProperties, - pd.core.indexes.accessors.DatetimeProperties, -) - -TimedeltaProperties = make_intermediate_proxy_type( - "TimedeltaProperties", - cudf.core.series.TimedeltaProperties, - pd.core.indexes.accessors.TimedeltaProperties, -) - -CombinedDatetimelikeProperties = make_intermediate_proxy_type( - "CombinedDatetimelikeProperties", - cudf.core.series.DatetimeProperties, - pd.core.indexes.accessors.CombinedDatetimelikeProperties, -) - -StringMethods = make_intermediate_proxy_type( - "StringMethods", - cudf.core.column.string.StringMethods, - pd.core.strings.accessor.StringMethods, -) - -_CategoricalAccessor = make_intermediate_proxy_type( - "CategoricalAccessor", - cudf.core.column.categorical.CategoricalAccessor, - pd.core.arrays.categorical.CategoricalAccessor, -) - - -def _DataFrame__dir__(self): - # Column names that are string identifiers are added to the dir of the - # DataFrame - # See https://github.com/pandas-dev/pandas/blob/43691a2f5d235b08f0f3aa813d8fdcb7c4ce1e47/pandas/core/indexes/base.py#L878 # noqa: E501 - _pd_df_dir = dir(pd.DataFrame) - return _pd_df_dir + [ - colname - for colname in self.columns - if isinstance(colname, str) and colname.isidentifier() - ] - - -def ignore_ipython_canary_check(self, **kwargs): - raise AttributeError( - "_ipython_canary_method_should_not_exist_ doesn't exist" - ) - - -DataFrame = make_final_proxy_type( - "DataFrame", - cudf.DataFrame, - pd.DataFrame, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - additional_attributes={ - "__array__": array_method, - "__dir__": _DataFrame__dir__, - "_constructor": _FastSlowAttribute("_constructor"), - "_constructor_sliced": _FastSlowAttribute("_constructor_sliced"), - "_accessors": set(), - "_ipython_canary_method_should_not_exist_": ignore_ipython_canary_check, - }, -) - - -def custom_repr_html(obj): - # This custom method is need to register a html format - # for ipython - return _fast_slow_function_call( - lambda obj: obj._repr_html_(), - obj, - )[0] - - -if ipython_shell: - # See: https://ipython.readthedocs.io/en/stable/config/integrating.html#formatters-for-third-party-types - html_formatter = ipython_shell.display_formatter.formatters["text/html"] - html_formatter.for_type(DataFrame, custom_repr_html) - - -Series = make_final_proxy_type( - "Series", - cudf.Series, - pd.Series, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - additional_attributes={ - "__array__": array_method, - "__array_function__": array_function_method, - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), - "__arrow_array__": arrow_array_method, - "__cuda_array_interface__": cuda_array_interface, - "__iter__": custom_iter, - "dt": 
_AccessorAttr(CombinedDatetimelikeProperties), - "str": _AccessorAttr(StringMethods), - "cat": _AccessorAttr(_CategoricalAccessor), - "_constructor": _FastSlowAttribute("_constructor"), - "_constructor_expanddim": _FastSlowAttribute("_constructor_expanddim"), - "_accessors": set(), - }, -) - - -def Index__new__(cls, *args, **kwargs): - # Call fast/slow constructor - # This takes care of running __init__ as well, but must be paired - # with a removal of the defaulted __init__ that - # make_final_proxy_type provides. - self, _ = _fast_slow_function_call( - lambda cls, args, kwargs: cls(*args, **kwargs), - cls, - args, - kwargs, - ) - return self - - -def Index__setattr__(self, name, value): - if name.startswith("_"): - object.__setattr__(self, name, value) - return - if name == "name": - setattr(self._fsproxy_wrapped, "name", value) - if name == "names": - setattr(self._fsproxy_wrapped, "names", value) - return _FastSlowAttribute("__setattr__").__get__(self, type(self))( - name, value - ) - - -Index = make_final_proxy_type( - "Index", - cudf.Index, - pd.Index, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - additional_attributes={ - "__array__": array_method, - "__array_function__": array_function_method, - "__arrow_array__": arrow_array_method, - "__cuda_array_interface__": cuda_array_interface, - "dt": _AccessorAttr(CombinedDatetimelikeProperties), - "str": _AccessorAttr(StringMethods), - "cat": _AccessorAttr(_CategoricalAccessor), - "__iter__": custom_iter, - "__init__": _DELETE, - "__new__": Index__new__, - "__setattr__": Index__setattr__, - "_constructor": _FastSlowAttribute("_constructor"), - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), - "_accessors": set(), - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - "name": _FastSlowAttribute("name"), - }, -) - -RangeIndex = make_final_proxy_type( - "RangeIndex", - cudf.RangeIndex, - pd.RangeIndex, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - additional_attributes={ - "__init__": _DELETE, - "__setattr__": Index__setattr__, - "name": _FastSlowAttribute("name"), - }, -) - -SparseDtype = make_final_proxy_type( - "SparseDtype", - _Unusable, - pd.SparseDtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -SparseArray = make_final_proxy_type( - "SparseDtype", - _Unusable, - pd.arrays.SparseArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), -) - -CategoricalIndex = make_final_proxy_type( - "CategoricalIndex", - cudf.CategoricalIndex, - pd.CategoricalIndex, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - additional_attributes={ - "__init__": _DELETE, - "__setattr__": Index__setattr__, - "name": _FastSlowAttribute("name"), - }, -) - -Categorical = make_final_proxy_type( - "Categorical", - _Unusable, - pd.Categorical, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), -) - -CategoricalDtype = make_final_proxy_type( - "CategoricalDtype", - cudf.CategoricalDtype, - pd.CategoricalDtype, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -DatetimeIndex = make_final_proxy_type( - "DatetimeIndex", - cudf.DatetimeIndex, - pd.DatetimeIndex, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - 
additional_attributes={ - "__init__": _DELETE, - "__setattr__": Index__setattr__, - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - "name": _FastSlowAttribute("name"), - }, -) - -DatetimeArray = make_final_proxy_type( - "DatetimeArray", - _Unusable, - pd.arrays.DatetimeArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - }, -) - -DatetimeTZDtype = make_final_proxy_type( - "DatetimeTZDtype", - _Unusable, - pd.DatetimeTZDtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -TimedeltaIndex = make_final_proxy_type( - "TimedeltaIndex", - cudf.TimedeltaIndex, - pd.TimedeltaIndex, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - additional_attributes={ - "__init__": _DELETE, - "__setattr__": Index__setattr__, - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - "name": _FastSlowAttribute("name"), - }, -) - -try: - from pandas.arrays import NumpyExtensionArray as pd_NumpyExtensionArray - - NumpyExtensionArray = make_final_proxy_type( - "NumpyExtensionArray", - _Unusable, - pd_NumpyExtensionArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "_ndarray": _FastSlowAttribute("_ndarray"), - "_dtype": _FastSlowAttribute("_dtype"), - }, - ) - -except ImportError: - from pandas.arrays import PandasArray as pd_PandasArray - - PandasArray = make_final_proxy_type( - "PandasArray", - _Unusable, - pd_PandasArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "_ndarray": _FastSlowAttribute("_ndarray"), - "_dtype": _FastSlowAttribute("_dtype"), - }, - ) - -TimedeltaArray = make_final_proxy_type( - "TimedeltaArray", - _Unusable, - pd.arrays.TimedeltaArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - }, -) - -PeriodIndex = make_final_proxy_type( - "PeriodIndex", - _Unusable, - pd.PeriodIndex, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - bases=(Index,), - additional_attributes={ - "__init__": _DELETE, - "__setattr__": Index__setattr__, - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - "name": _FastSlowAttribute("name"), - }, -) - -PeriodArray = make_final_proxy_type( - "PeriodArray", - _Unusable, - pd.arrays.PeriodArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), - }, -) - -PeriodDtype = make_final_proxy_type( - "PeriodDtype", - _Unusable, - pd.PeriodDtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), -) - -Period = make_final_proxy_type( - "Period", - _Unusable, - pd.Period, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - - -MultiIndex = make_final_proxy_type( - "MultiIndex", - cudf.MultiIndex, - pd.MultiIndex, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - additional_attributes={ - 
"__init__": _DELETE, - "__setattr__": Index__setattr__, - "names": _FastSlowAttribute("names"), - }, -) - -TimeGrouper = make_intermediate_proxy_type( - "TimeGrouper", - _Unusable, - pd_TimeGrouper, -) - -Grouper = make_final_proxy_type( - "Grouper", - cudf.Grouper, - pd.Grouper, - fast_to_slow=lambda fast: pd.Grouper( - **{ - k: getattr(fast, k) - for k in {"key", "level", "freq", "closed", "label"} - if getattr(fast, k) is not None - } - ), - slow_to_fast=lambda slow: cudf.Grouper( - **{ - k: getattr(slow, k) - for k in {"key", "level", "freq", "closed", "label"} - if getattr(slow, k) is not None - } - ), -) - -StringArray = make_final_proxy_type( - "StringArray", - _Unusable, - pd.arrays.StringArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - }, -) - -if cudf.core._compat.PANDAS_GE_210: - ArrowStringArrayNumpySemantics = make_final_proxy_type( - "ArrowStringArrayNumpySemantics", - _Unusable, - pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - ) - -ArrowStringArray = make_final_proxy_type( - "ArrowStringArray", - _Unusable, - pd.core.arrays.string_arrow.ArrowStringArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), -) - -StringDtype = make_final_proxy_type( - "StringDtype", - _Unusable, - pd.StringDtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "__hash__": _FastSlowAttribute("__hash__"), - "storage": _FastSlowAttribute("storage"), - }, -) - -BooleanArray = make_final_proxy_type( - "BooleanArray", - _Unusable, - pd.arrays.BooleanArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), - }, -) - -BooleanDtype = make_final_proxy_type( - "BooleanDtype", - _Unusable, - pd.BooleanDtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -IntegerArray = make_final_proxy_type( - "IntegerArray", - _Unusable, - pd.arrays.IntegerArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - }, -) - -Int8Dtype = make_final_proxy_type( - "Int8Dtype", - _Unusable, - pd.Int8Dtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - - -Int16Dtype = make_final_proxy_type( - "Int16Dtype", - _Unusable, - pd.Int16Dtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Int32Dtype = make_final_proxy_type( - "Int32Dtype", - _Unusable, - pd.Int32Dtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Int64Dtype = make_final_proxy_type( - "Int64Dtype", - _Unusable, - pd.Int64Dtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -UInt8Dtype = make_final_proxy_type( - "UInt8Dtype", - _Unusable, - pd.UInt8Dtype, - fast_to_slow=_Unusable(), - 
slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -UInt16Dtype = make_final_proxy_type( - "UInt16Dtype", - _Unusable, - pd.UInt16Dtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -UInt32Dtype = make_final_proxy_type( - "UInt32Dtype", - _Unusable, - pd.UInt32Dtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -UInt64Dtype = make_final_proxy_type( - "UInt64Dtype", - _Unusable, - pd.UInt64Dtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -IntervalIndex = make_final_proxy_type( - "IntervalIndex", - cudf.IntervalIndex, - pd.IntervalIndex, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - bases=(Index,), - additional_attributes={ - "__init__": _DELETE, - "__setattr__": Index__setattr__, - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - "name": _FastSlowAttribute("name"), - }, -) - -IntervalArray = make_final_proxy_type( - "IntervalArray", - _Unusable, - pd.arrays.IntervalArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - }, -) - -IntervalDtype = make_final_proxy_type( - "IntervalDtype", - cudf.IntervalDtype, - pd.IntervalDtype, - fast_to_slow=lambda fast: fast.to_pandas(), - slow_to_fast=cudf.from_pandas, - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Interval = make_final_proxy_type( - "Interval", - _Unusable, - pd.Interval, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -FloatingArray = make_final_proxy_type( - "FloatingArray", - _Unusable, - pd.arrays.FloatingArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), - "_data": _FastSlowAttribute("_data", private=True), - "_mask": _FastSlowAttribute("_mask", private=True), - }, -) - -Float32Dtype = make_final_proxy_type( - "Float32Dtype", - _Unusable, - pd.Float32Dtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Float64Dtype = make_final_proxy_type( - "Float64Dtype", - _Unusable, - pd.Float64Dtype, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -SeriesGroupBy = make_intermediate_proxy_type( - "SeriesGroupBy", - cudf.core.groupby.groupby.SeriesGroupBy, - pd.core.groupby.SeriesGroupBy, -) - -DataFrameGroupBy = make_intermediate_proxy_type( - "DataFrameGroupBy", - cudf.core.groupby.groupby.DataFrameGroupBy, - pd.core.groupby.DataFrameGroupBy, -) - -RollingGroupBy = make_intermediate_proxy_type( - "RollingGroupBy", - cudf.core.window.rolling.RollingGroupby, - pd.core.window.rolling.RollingGroupby, -) - -_SeriesIlocIndexer = make_intermediate_proxy_type( - "_SeriesIlocIndexer", - cudf.core.series._SeriesIlocIndexer, - pd.core.indexing._iLocIndexer, -) - -_DataFrameIlocIndexer = make_intermediate_proxy_type( - "_SeriesIlocIndexer", - cudf.core.dataframe._DataFrameIlocIndexer, - pd.core.indexing._iLocIndexer, -) - -_SeriesLocIndexer = 
make_intermediate_proxy_type( - "_SeriesLocIndexer", - cudf.core.series._SeriesLocIndexer, - pd.core.indexing._LocIndexer, -) - -_DataFrameLocIndexer = make_intermediate_proxy_type( - "_DataFrameLocIndexer", - cudf.core.dataframe._DataFrameLocIndexer, - pd.core.indexing._LocIndexer, -) - -_AtIndexer = make_intermediate_proxy_type( - "_AtIndexer", - cudf.core.dataframe._DataFrameAtIndexer, - pd.core.indexing._AtIndexer, -) - -_iAtIndexer = make_intermediate_proxy_type( - "_iAtIndexer", - cudf.core.dataframe._DataFrameiAtIndexer, - pd.core.indexing._iAtIndexer, -) - -FixedForwardWindowIndexer = make_final_proxy_type( - "FixedForwardWindowIndexer", - _Unusable, - pd.api.indexers.FixedForwardWindowIndexer, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), -) - -VariableOffsetWindowIndexer = make_final_proxy_type( - "VariableOffsetWindowIndexer", - _Unusable, - pd.api.indexers.VariableOffsetWindowIndexer, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), -) - -Window = make_intermediate_proxy_type( - "Window", - _Unusable, - pd.core.window.rolling.Window, -) - -Rolling = make_intermediate_proxy_type( - "Rolling", - cudf.core.window.Rolling, - pd.core.window.Rolling, -) - -ExponentialMovingWindow = make_intermediate_proxy_type( - "ExponentialMovingWindow", - cudf.core.window.ewm.ExponentialMovingWindow, - pd.core.window.ewm.ExponentialMovingWindow, -) - -ExponentialMovingWindowGroupby = make_intermediate_proxy_type( - "ExponentialMovingWindowGroupby", - _Unusable, - pd.core.window.ewm.ExponentialMovingWindowGroupby, -) - -EWMMeanState = make_intermediate_proxy_type( - "EWMMeanState", - _Unusable, - pd.core.window.online.EWMMeanState, -) - -Expanding = make_intermediate_proxy_type( - "Expanding", - _Unusable, - pd.core.window.expanding.Expanding, -) - -ExpandingGroupby = make_intermediate_proxy_type( - "ExpandingGroupby", - _Unusable, - pd.core.window.expanding.ExpandingGroupby, -) - -Resampler = make_intermediate_proxy_type( - "Resampler", cudf.core.resample._Resampler, pd_Resampler -) - -DataFrameResampler = make_intermediate_proxy_type( - "DataFrameResampler", cudf.core.resample.DataFrameResampler, pd_Resampler -) - -SeriesResampler = make_intermediate_proxy_type( - "SeriesResampler", cudf.core.resample.SeriesResampler, pd_Resampler -) - -StataReader = make_intermediate_proxy_type( - "StataReader", - _Unusable, - pd.io.stata.StataReader, -) - -HDFStore = make_final_proxy_type( - "HDFStore", - _Unusable, - pd.HDFStore, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -ExcelFile = make_final_proxy_type( - "ExcelFile", - _Unusable, - pd.ExcelFile, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -ExcelWriter = make_final_proxy_type( - "ExcelWriter", - _Unusable, - pd.ExcelWriter, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "__hash__": _FastSlowAttribute("__hash__"), - "__fspath__": _FastSlowAttribute("__fspath__"), - }, - bases=(os.PathLike,), - metaclasses=(abc.ABCMeta,), -) - -try: - from pandas.io.formats.style import Styler as pd_Styler # isort: skip - - Styler = make_final_proxy_type( - "Styler", - _Unusable, - pd_Styler, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "css": _FastSlowAttribute("css"), - "ctx": _FastSlowAttribute("ctx"), - "index": _FastSlowAttribute("ctx"), - "data": _FastSlowAttribute("data"), - "_display_funcs": 
_FastSlowAttribute("_display_funcs"), - "table_styles": _FastSlowAttribute("table_styles"), - }, - ) -except ImportError: - # Styler requires Jinja to be installed - pass - -_eval_func = _FunctionProxy(_Unusable(), pd.eval) - -register_proxy_func(pd.read_pickle)( - _FunctionProxy(_Unusable(), pd.read_pickle) -) - -register_proxy_func(pd.to_pickle)(_FunctionProxy(_Unusable(), pd.to_pickle)) - - -def _get_eval_locals_and_globals(level, local_dict=None, global_dict=None): - frame = sys._getframe(level + 3) - local_dict = frame.f_locals if local_dict is None else local_dict - global_dict = frame.f_globals if global_dict is None else global_dict - return local_dict, global_dict - - -@register_proxy_func(pd.core.computation.eval.eval) -@nvtx.annotate( - "CUDF_PANDAS_EVAL", - color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], - domain="cudf_pandas", -) -def _eval( - *args, - parser="pandas", - engine=None, - local_dict=None, - global_dict=None, - **kwargs, -): - # Custom implementation of to pre-process globals and - # locals before calling pd.eval. - level = kwargs.get("level", 0) - local_dict, global_dict = _get_eval_locals_and_globals( - level, local_dict, global_dict - ) - return _eval_func( - *args, - parser=parser, - engine=engine, - local_dict=local_dict, - global_dict=global_dict, - **kwargs, - ) - - -_orig_df_eval_method = DataFrame.eval - - -@register_proxy_func(pd.core.accessor.register_dataframe_accessor) -def _register_dataframe_accessor(name): - return pd.core.accessor._register_accessor(name, DataFrame) - - -@register_proxy_func(pd.core.accessor.register_series_accessor) -def _register_series_accessor(name): - return pd.core.accessor._register_accessor(name, Series) - - -@register_proxy_func(pd.core.accessor.register_index_accessor) -def _register_index_accessor(name): - return pd.core.accessor._register_accessor(name, Index) - - -@nvtx.annotate( - "CUDF_PANDAS_DATAFRAME_EVAL", - color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], - domain="cudf_pandas", -) -def _df_eval_method(self, *args, local_dict=None, global_dict=None, **kwargs): - level = kwargs.get("level", 0) - local_dict, global_dict = _get_eval_locals_and_globals( - level, local_dict, global_dict - ) - return _orig_df_eval_method( - self, *args, local_dict=local_dict, global_dict=global_dict, **kwargs - ) - - -_orig_query_eval_method = DataFrame.query - - -@nvtx.annotate( - "CUDF_PANDAS_DATAFRAME_QUERY", - color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], - domain="cudf_pandas", -) -def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs): - # `query` API internally calls `eval`, hence we are making use of - # helps of `eval` to populate locals and globals dict. 
- level = kwargs.get("level", 0) - local_dict, global_dict = _get_eval_locals_and_globals( - level, local_dict, global_dict - ) - return _orig_query_eval_method( - self, *args, local_dict=local_dict, global_dict=global_dict, **kwargs - ) - - -DataFrame.eval = _df_eval_method # type: ignore -DataFrame.query = _df_query_method # type: ignore - -_JsonReader = make_intermediate_proxy_type( - "_JsonReader", - _Unusable, - pd.io.json._json.JsonReader, -) - -_TextFileReader = make_intermediate_proxy_type( - "_TextFileReader", _Unusable, pd.io.parsers.readers.TextFileReader -) - -_XportReader = make_intermediate_proxy_type( - "_XportReader", _Unusable, pd_XportReader -) - -_SAS7BDATReader = make_intermediate_proxy_type( - "_SAS7BDATReader", _Unusable, pd_SAS7BDATReader -) - -USFederalHolidayCalendar = make_final_proxy_type( - "USFederalHolidayCalendar", - _Unusable, - pd_USFederalHolidayCalendar, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -HolidayCalendarMetaClass = make_final_proxy_type( - "HolidayCalendarMetaClass", - _Unusable, - pd_HolidayCalendarMetaClass, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - - -@register_proxy_func(pd_HolidayCalendarFactory) -def holiday_calendar_factory_wrapper(*args, **kwargs): - # Call the original HolidayCalendarFactory - result = _FunctionProxy(_Unusable(), pd_HolidayCalendarFactory)( - *args, **kwargs - ) - # Return the slow proxy of the result - return result._fsproxy_slow - - -AbstractHolidayCalendar = make_final_proxy_type( - "AbstractHolidayCalendar", - _Unusable, - pd_AbstractHolidayCalendar, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, - metaclasses=(pd_HolidayCalendarMetaClass,), -) - -Holiday = make_final_proxy_type( - "Holiday", - _Unusable, - pd_Holiday, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) -USThanksgivingDay = make_final_proxy_type( - "USThanksgivingDay", - _Unusable, - pd_USThanksgivingDay, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -USColumbusDay = make_final_proxy_type( - "USColumbusDay", - _Unusable, - pd_USColumbusDay, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -USLaborDay = make_final_proxy_type( - "USLaborDay", - _Unusable, - pd_USLaborDay, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -USMemorialDay = make_final_proxy_type( - "USMemorialDay", - _Unusable, - pd_USMemorialDay, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -USMartinLutherKingJr = make_final_proxy_type( - "USMartinLutherKingJr", - _Unusable, - pd_USMartinLutherKingJr, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -USPresidentsDay = make_final_proxy_type( - "USPresidentsDay", - _Unusable, - pd_USPresidentsDay, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - - -GoodFriday = make_final_proxy_type( - "GoodFriday", - _Unusable, - 
pd_GoodFriday, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -EasterMonday = make_final_proxy_type( - "EasterMonday", - _Unusable, - pd_EasterMonday, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -FY5253 = make_final_proxy_type( - "FY5253", - _Unusable, - pd.offsets.FY5253, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BDay = make_final_proxy_type( - "BDay", - _Unusable, - pd.offsets.BDay, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BMonthBegin = make_final_proxy_type( - "BMonthBegin", - _Unusable, - pd.offsets.BMonthBegin, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BMonthEnd = make_final_proxy_type( - "BMonthEnd", - _Unusable, - pd.offsets.BMonthEnd, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BQuarterBegin = make_final_proxy_type( - "BQuarterBegin", - _Unusable, - pd.offsets.BQuarterBegin, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BQuarterEnd = make_final_proxy_type( - "BQuarterEnd", - _Unusable, - pd.offsets.BQuarterEnd, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BusinessDay = make_final_proxy_type( - "BusinessDay", - _Unusable, - pd.offsets.BusinessDay, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BusinessHour = make_final_proxy_type( - "BusinessHour", - _Unusable, - pd.offsets.BusinessHour, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BusinessMonthBegin = make_final_proxy_type( - "BusinessMonthBegin", - _Unusable, - pd.offsets.BusinessMonthBegin, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BusinessMonthEnd = make_final_proxy_type( - "BusinessMonthEnd", - _Unusable, - pd.offsets.BusinessMonthEnd, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BYearBegin = make_final_proxy_type( - "BYearBegin", - _Unusable, - pd.offsets.BYearBegin, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BYearEnd = make_final_proxy_type( - "BYearEnd", - _Unusable, - pd.offsets.BYearEnd, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -CBMonthBegin = make_final_proxy_type( - "CBMonthBegin", - _Unusable, - pd.offsets.CBMonthBegin, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -CBMonthEnd = make_final_proxy_type( - "CBMonthEnd", - _Unusable, - pd.offsets.CBMonthEnd, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -CDay = make_final_proxy_type( - "CDay", - 
_Unusable, - pd.offsets.CDay, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -CustomBusinessDay = make_final_proxy_type( - "CustomBusinessDay", - _Unusable, - pd.offsets.CustomBusinessDay, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -CustomBusinessHour = make_final_proxy_type( - "CustomBusinessHour", - _Unusable, - pd.offsets.CustomBusinessHour, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -CustomBusinessMonthBegin = make_final_proxy_type( - "CustomBusinessMonthBegin", - _Unusable, - pd.offsets.CustomBusinessMonthBegin, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -CustomBusinessMonthEnd = make_final_proxy_type( - "CustomBusinessMonthEnd", - _Unusable, - pd.offsets.CustomBusinessMonthEnd, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -DateOffset = make_final_proxy_type( - "DateOffset", - _Unusable, - pd.offsets.DateOffset, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -BaseOffset = make_final_proxy_type( - "BaseOffset", - _Unusable, - pd.offsets.BaseOffset, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Day = make_final_proxy_type( - "Day", - _Unusable, - pd.offsets.Day, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Easter = make_final_proxy_type( - "Easter", - _Unusable, - pd.offsets.Easter, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -FY5253Quarter = make_final_proxy_type( - "FY5253Quarter", - _Unusable, - pd.offsets.FY5253Quarter, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Hour = make_final_proxy_type( - "Hour", - _Unusable, - pd.offsets.Hour, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -LastWeekOfMonth = make_final_proxy_type( - "LastWeekOfMonth", - _Unusable, - pd.offsets.LastWeekOfMonth, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Micro = make_final_proxy_type( - "Micro", - _Unusable, - pd.offsets.Micro, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Milli = make_final_proxy_type( - "Milli", - _Unusable, - pd.offsets.Milli, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Minute = make_final_proxy_type( - "Minute", - _Unusable, - pd.offsets.Minute, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - - -MonthBegin = make_final_proxy_type( - "MonthBegin", - _Unusable, - pd.offsets.MonthBegin, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - 
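# A minimal usage sketch (an assumption of this note: cudf.pandas has been
# activated, e.g. via `python -m cudf.pandas script.py` or `%load_ext
# cudf.pandas`). The offset proxies in this module wire `_Unusable()` in as
# the "fast" half, so the pandas implementation always runs, while
# construction and isinstance checks still go through the proxy types.
import pandas as pd

off = pd.offsets.MonthBegin(1)              # resolved via the MonthBegin proxy above
later = pd.Timestamp("2024-01-15") + off    # arithmetic falls back to pandas
assert isinstance(off, pd.offsets.BaseOffset)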
-MonthEnd = make_final_proxy_type( - "MonthEnd", - _Unusable, - pd.offsets.MonthEnd, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Nano = make_final_proxy_type( - "Nano", - _Unusable, - pd.offsets.Nano, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -QuarterBegin = make_final_proxy_type( - "QuarterBegin", - _Unusable, - pd.offsets.QuarterBegin, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -QuarterEnd = make_final_proxy_type( - "QuarterEnd", - _Unusable, - pd.offsets.QuarterEnd, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Second = make_final_proxy_type( - "Second", - _Unusable, - pd.offsets.Second, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -SemiMonthBegin = make_final_proxy_type( - "SemiMonthBegin", - _Unusable, - pd.offsets.SemiMonthBegin, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -SemiMonthEnd = make_final_proxy_type( - "SemiMonthEnd", - _Unusable, - pd.offsets.SemiMonthEnd, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Tick = make_final_proxy_type( - "Tick", - _Unusable, - pd.offsets.Tick, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Week = make_final_proxy_type( - "Week", - _Unusable, - pd.offsets.Week, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -WeekOfMonth = make_final_proxy_type( - "WeekOfMonth", - _Unusable, - pd.offsets.WeekOfMonth, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -YearBegin = make_final_proxy_type( - "YearBegin", - _Unusable, - pd.offsets.YearBegin, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -YearEnd = make_final_proxy_type( - "YearEnd", - _Unusable, - pd.offsets.YearEnd, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -Flags = make_final_proxy_type( - "Flags", - _Unusable, - pd.Flags, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -NamedAgg = make_final_proxy_type( - "NamedAgg", - _Unusable, - pd.NamedAgg, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={"__hash__": _FastSlowAttribute("__hash__")}, -) - -ArrowExtensionArray = make_final_proxy_type( - "ExtensionArray", - _Unusable, - pd.arrays.ArrowExtensionArray, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), -) - - -# The following are subclasses of `pandas.core.base.PandasObj`, -# excluding subclasses defined in `pandas.core.internals`. These are -# not strictly part of the Pandas public API, but they do appear as -# return types. 
- -_PANDAS_OBJ_FINAL_TYPES = [ - pd.core.arrays.sparse.array.SparseArray, - pd.core.indexes.frozen.FrozenList, - pd.core.indexes.category.CategoricalIndex, - pd.core.indexes.datetimelike.DatetimeTimedeltaMixin, - pd.core.indexes.datetimelike.DatetimeIndexOpsMixin, - pd.core.indexes.extension.NDArrayBackedExtensionIndex, - pd.core.generic.NDFrame, - pd.core.indexes.accessors.PeriodProperties, - pd.core.indexes.accessors.Properties, - pd.plotting._core.PlotAccessor, - pd.io.sql.SQLiteTable, - pd.io.sql.SQLTable, - pd.io.sql.SQLDatabase, - pd.io.sql.SQLiteDatabase, - pd.io.sql.PandasSQL, -] - -_PANDAS_OBJ_INTERMEDIATE_TYPES = [ - pd.core.groupby.groupby.GroupByPlot, - pd.core.groupby.groupby.GroupBy, - pd.core.groupby.groupby.BaseGroupBy, -] - -for typ in _PANDAS_OBJ_FINAL_TYPES: - if typ.__name__ in globals(): - # if we already defined a proxy type - # corresponding to this type, use that. - continue - globals()[typ.__name__] = make_final_proxy_type( - typ.__name__, - _Unusable, - typ, - fast_to_slow=_Unusable(), - slow_to_fast=_Unusable(), - additional_attributes={ - "__array__": array_method, - "__array_function__": array_function_method, - "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), - "__hash__": _FastSlowAttribute("__hash__"), - }, - ) - - -for typ in _PANDAS_OBJ_INTERMEDIATE_TYPES: - if typ.__name__ in globals(): - # if we already defined a proxy type - # corresponding to this type, use that. - continue - globals()[typ.__name__] = make_intermediate_proxy_type( - typ.__name__, - _Unusable, - typ, - ) - - -# timestamps and timedeltas are not proxied, but non-proxied -# pandas types are currently not picklable. Thus, we define -# custom reducer/unpicker functions for these types: -def _reduce_obj(obj): - from cudf.pandas.module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - # args can contain objects that are unpicklable - # when the module accelerator is disabled - # (freq is of a proxy type): - pickled_args = pickle.dumps(obj.__reduce__()) - - return _unpickle_obj, (pickled_args,) - - -def _unpickle_obj(pickled_args): - from cudf.pandas.module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - unpickler, args = pickle.loads(pickled_args) - obj = unpickler(*args) - return obj - - -copyreg.dispatch_table[pd.Timestamp] = _reduce_obj -# same reducer/unpickler can be used for Timedelta: -copyreg.dispatch_table[pd.Timedelta] = _reduce_obj diff --git a/python/cudf/cudf/pandas/annotation.py b/python/cudf/cudf/pandas/annotation.py deleted file mode 100644 index 30e2f3c5717..00000000000 --- a/python/cudf/cudf/pandas/annotation.py +++ /dev/null @@ -1,35 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -try: - import nvtx -except ImportError: - - class nvtx: # type: ignore - """Noop-stub with the same API as nvtx.""" - - push_range = lambda *args, **kwargs: None # noqa: E731 - pop_range = lambda *args, **kwargs: None # noqa: E731 - - class annotate: - """No-op annotation/context-manager""" - - def __init__( - self, - message: str | None = None, - color: str | None = None, - domain: str | None = None, - category: str | int | None = None, - ): - pass - - def __enter__(self): - return self - - def __exit__(self, *exc): - return False - - __call__ = lambda self, fn: fn # noqa: E731 diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py deleted file mode 100644 index c364d55e677..00000000000 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ /dev/null @@ -1,1330 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # noqa: E501 -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import functools -import inspect -import operator -import pickle -import types -import warnings -from collections.abc import Callable, Iterator -from enum import IntEnum -from typing import Any, Literal, Mapping - -import numpy as np - -from ..options import _env_get_bool -from ..testing import assert_eq -from .annotation import nvtx -from .proxy_base import ProxyNDarrayBase - - -def call_operator(fn, args, kwargs): - return fn(*args, **kwargs) - - -_CUDF_PANDAS_NVTX_COLORS = { - "COPY_SLOW_TO_FAST": 0xCA0020, - "COPY_FAST_TO_SLOW": 0xF4A582, - "EXECUTE_FAST": 0x92C5DE, - "EXECUTE_SLOW": 0x0571B0, -} - - -_WRAPPER_ASSIGNMENTS = tuple( - attr - for attr in functools.WRAPPER_ASSIGNMENTS - # Skip __doc__ because we assign it on class creation using exec_body - # callable that updates the namespace of the class. - # Skip __annotations__ because there are differences between Python - # versions on how it is initialized for a class that doesn't explicitly - # define it and we don't want to force eager evaluation of anything that - # would normally be lazy (mostly for consistency, shouldn't cause any - # significant issues). - if attr not in ("__annotations__", "__doc__") -) - - -def callers_module_name(): - # Call f_back twice since this function adds an extra frame - return inspect.currentframe().f_back.f_back.f_globals["__name__"] - - -class _State(IntEnum): - """Simple enum to track the type of wrapped object of a final proxy""" - - SLOW = 0 - FAST = 1 - - -class _Unusable: - """ - A totally unusable type. When a "fast" object is not available, - it's useful to set it to _Unusable() so that any operations - on it fail, and ensure fallback to the corresponding - "slow" object. - """ - - def __call__(self, *args: Any, **kwds: Any) -> Any: - raise NotImplementedError( - "Fast implementation not available. " - "Falling back to the slow implementation" - ) - - def __getattribute__(self, name: str) -> Any: - if name in {"__class__"}: # needed for type introspection - return super().__getattribute__(name) - raise TypeError("Unusable type. Falling back to the slow object") - - def __repr__(self) -> str: - raise AttributeError("Unusable type. Falling back to the slow object") - - -class _PickleConstructor: - """A pickleable object to support construction in __reduce__. - - This object is used to avoid having unpickling call __init__ on the - objects, instead only invoking __new__. 
__init__ may have required - arguments or otherwise perform invalid initialization that we could skip - altogether since we're going to overwrite the wrapped object. - """ - - def __init__(self, type_): - self._type = type_ - - def __call__(self): - return object.__new__(self._type) - - -_DELETE = object() - - -def make_final_proxy_type( - name: str, - fast_type: type, - slow_type: type, - *, - fast_to_slow: Callable, - slow_to_fast: Callable, - module: str | None = None, - additional_attributes: Mapping[str, Any] | None = None, - postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None, - bases: tuple = (), - metaclasses: tuple = (), -) -> type[_FinalProxy]: - """ - Defines a fast-slow proxy type for a pair of "final" fast and slow - types. Final types are types for which known operations exist for - converting an object of "fast" type to "slow" and vice-versa. - - Parameters - ---------- - name: str - The name of the class returned - fast_type: type - slow_type: type - fast_to_slow: callable - Function that accepts a single argument of type `fast_type` - and returns an object of type `slow_type` - slow_to_fast: callable - Function that accepts a single argument of type `slow_type` - and returns an object of type `fast_type` - additional_attributes - Mapping of additional attributes to add to the class - (optional), these will override any defaulted attributes (e.g. - ``__init__`). If you want to remove a defaulted attribute - completely, pass the special sentinel ``_DELETE`` as a value. - postprocess - Optional function called to allow the proxy to postprocess - itself when being wrapped up, called with the proxy object, - the unwrapped result object, and the function that was used to - construct said unwrapped object. See also `_maybe_wrap_result`. - bases - Optional tuple of base classes to insert into the mro. - metaclasses - Optional tuple of metaclasses to unify with the base proxy metaclass. - - Notes - ----- - As a side-effect, this function adds `fast_type` and `slow_type` - to a global mapping of final types to their corresponding proxy - types, accessible via `get_final_type_map()`. 
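# Sketch of a typical invocation of this factory, mirroring how the pandas
# wrapper module registers DataFrame (illustrative only; the real
# registration also passes extra keyword arguments such as
# additional_attributes).
import cudf
import pandas as pd

from cudf.pandas.fast_slow_proxy import (  # the helpers defined in this module
    is_proxy_object,
    make_final_proxy_type,
)

DataFrame = make_final_proxy_type(
    "DataFrame",
    cudf.DataFrame,                              # fast type
    pd.DataFrame,                                # slow type
    fast_to_slow=lambda fast: fast.to_pandas(),  # GPU -> CPU conversion
    slow_to_fast=cudf.from_pandas,               # CPU -> GPU conversion
)

df = DataFrame({"a": [1, 2, 3]})  # tries cudf.DataFrame first, else pandas
assert is_proxy_object(df)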
- """ - - def __init__(self, *args, **kwargs): - _fast_slow_function_call( - lambda cls, args, kwargs: setattr( - self, "_fsproxy_wrapped", cls(*args, **kwargs) - ), - type(self), - args, - kwargs, - ) - - @nvtx.annotate( - "COPY_SLOW_TO_FAST", - color=_CUDF_PANDAS_NVTX_COLORS["COPY_SLOW_TO_FAST"], - domain="cudf_pandas", - ) - def _fsproxy_slow_to_fast(self): - # if we are wrapping a slow object, - # convert it to a fast one - if self._fsproxy_state is _State.SLOW: - return slow_to_fast(self._fsproxy_wrapped) - return self._fsproxy_wrapped - - @nvtx.annotate( - "COPY_FAST_TO_SLOW", - color=_CUDF_PANDAS_NVTX_COLORS["COPY_FAST_TO_SLOW"], - domain="cudf_pandas", - ) - def _fsproxy_fast_to_slow(self): - # if we are wrapping a fast object, - # convert it to a slow one - if self._fsproxy_state is _State.FAST: - return fast_to_slow(self._fsproxy_wrapped) - return self._fsproxy_wrapped - - @property # type: ignore - def _fsproxy_state(self) -> _State: - return ( - _State.FAST - if isinstance(self._fsproxy_wrapped, self._fsproxy_fast_type) - else _State.SLOW - ) - - slow_dir = dir(slow_type) - cls_dict = { - "__init__": __init__, - "__doc__": inspect.getdoc(slow_type), - "_fsproxy_slow_dir": slow_dir, - "_fsproxy_fast_type": fast_type, - "_fsproxy_slow_type": slow_type, - "_fsproxy_slow_to_fast": _fsproxy_slow_to_fast, - "_fsproxy_fast_to_slow": _fsproxy_fast_to_slow, - "_fsproxy_state": _fsproxy_state, - } - - if additional_attributes is None: - additional_attributes = {} - for method in _SPECIAL_METHODS: - if getattr(slow_type, method, False): - cls_dict[method] = _FastSlowAttribute(method) - for k, v in additional_attributes.items(): - if v is _DELETE and k in cls_dict: - del cls_dict[k] - elif v is not _DELETE: - cls_dict[k] = v - - for slow_name in dir(slow_type): - if slow_name in cls_dict or slow_name.startswith("__"): - continue - else: - cls_dict[slow_name] = _FastSlowAttribute( - slow_name, private=slow_name.startswith("_") - ) - - metaclass = _FastSlowProxyMeta - if metaclasses: - metaclass = types.new_class( # type: ignore - f"{name}_Meta", - metaclasses + (_FastSlowProxyMeta,), - {}, - ) - cls = types.new_class( - name, - (*bases, _FinalProxy), - {"metaclass": metaclass}, - lambda ns: ns.update(cls_dict), - ) - functools.update_wrapper( - cls, - slow_type, - assigned=_WRAPPER_ASSIGNMENTS, - updated=(), - ) - cls.__module__ = module if module is not None else callers_module_name() - - final_type_map = get_final_type_map() - if fast_type is not _Unusable: - final_type_map[fast_type] = cls - final_type_map[slow_type] = cls - - return cls - - -def make_intermediate_proxy_type( - name: str, - fast_type: type, - slow_type: type, - *, - module: str | None = None, -) -> type[_IntermediateProxy]: - """ - Defines a proxy type for a pair of "intermediate" fast and slow - types. Intermediate types are the types of the results of - operations invoked on final types. - - As a side-effect, this function adds `fast_type` and `slow_type` - to a global mapping of intermediate types to their corresponding - proxy types, accessible via `get_intermediate_type_map()`. - - Parameters - ---------- - name: str - The name of the class returned - fast_type: type - slow_type: type - """ - - def __init__(self, *args, **kwargs): - # disallow __init__. An intermediate proxy type can only be - # instantiated from (possibly chained) operations on a final - # proxy type. 
- raise TypeError( - f"Cannot directly instantiate object of type {type(self)}" - ) - - @property # type: ignore - def _fsproxy_state(self): - return ( - _State.FAST - if isinstance(self._fsproxy_wrapped, self._fsproxy_fast_type) - else _State.SLOW - ) - - @nvtx.annotate( - "COPY_SLOW_TO_FAST", - color=_CUDF_PANDAS_NVTX_COLORS["COPY_SLOW_TO_FAST"], - domain="cudf_pandas", - ) - def _fsproxy_slow_to_fast(self): - if self._fsproxy_state is _State.SLOW: - return super(type(self), self)._fsproxy_slow_to_fast() - return self._fsproxy_wrapped - - @nvtx.annotate( - "COPY_FAST_TO_SLOW", - color=_CUDF_PANDAS_NVTX_COLORS["COPY_FAST_TO_SLOW"], - domain="cudf_pandas", - ) - def _fsproxy_fast_to_slow(self): - if self._fsproxy_state is _State.FAST: - return super(type(self), self)._fsproxy_fast_to_slow() - return self._fsproxy_wrapped - - slow_dir = dir(slow_type) - cls_dict = { - "__init__": __init__, - "__doc__": inspect.getdoc(slow_type), - "_fsproxy_slow_dir": slow_dir, - "_fsproxy_fast_type": fast_type, - "_fsproxy_slow_type": slow_type, - "_fsproxy_slow_to_fast": _fsproxy_slow_to_fast, - "_fsproxy_fast_to_slow": _fsproxy_fast_to_slow, - "_fsproxy_state": _fsproxy_state, - } - for method in _SPECIAL_METHODS: - if getattr(slow_type, method, False): - cls_dict[method] = _FastSlowAttribute(method) - - for slow_name in dir(slow_type): - if slow_name in cls_dict or slow_name.startswith("__"): - continue - else: - cls_dict[slow_name] = _FastSlowAttribute( - slow_name, private=slow_name.startswith("_") - ) - - for slow_name in getattr(slow_type, "_attributes", []): - if slow_name in cls_dict: - continue - else: - cls_dict[slow_name] = _FastSlowAttribute( - slow_name, private=slow_name.startswith("_") - ) - - cls = types.new_class( - name, - (_IntermediateProxy,), - {"metaclass": _FastSlowProxyMeta}, - lambda ns: ns.update(cls_dict), - ) - functools.update_wrapper( - cls, - slow_type, - assigned=_WRAPPER_ASSIGNMENTS, - updated=(), - ) - cls.__module__ = module if module is not None else callers_module_name() - - intermediate_type_map = get_intermediate_type_map() - if fast_type is not _Unusable: - intermediate_type_map[fast_type] = cls - intermediate_type_map[slow_type] = cls - - return cls - - -def register_proxy_func(slow_func: Callable): - """ - Decorator to register custom function as a proxy for slow_func. - - Parameters - ---------- - slow_func: Callable - The function to register a wrapper for. - - Returns - ------- - Callable - """ - - def wrapper(func): - registered_functions = get_registered_functions() - registered_functions[slow_func] = func - functools.update_wrapper(func, slow_func) - return func - - return wrapper - - -@functools.lru_cache(maxsize=None) -def get_final_type_map(): - """ - Return the mapping of all known fast and slow final types to their - corresponding proxy types. - """ - return dict() - - -@functools.lru_cache(maxsize=None) -def get_intermediate_type_map(): - """ - Return a mapping of all known fast and slow intermediate types to their - corresponding proxy types. - """ - return dict() - - -@functools.lru_cache(maxsize=None) -def get_registered_functions(): - return dict() - - -def _raise_attribute_error(obj, name): - """ - Raise an AttributeError with a message that is consistent with - the error raised by Python for a non-existent attribute on a - proxy object. 
- """ - raise AttributeError(f"'{obj}' object has no attribute '{name}'") - - -class _FastSlowProxyMeta(type): - """ - Metaclass used to dynamically find class attributes and - classmethods of fast-slow proxy types. - """ - - _fsproxy_slow_dir: list - _fsproxy_slow_type: type - _fsproxy_fast_type: type - - @property - def _fsproxy_slow(self) -> type: - return self._fsproxy_slow_type - - @property - def _fsproxy_fast(self) -> type: - return self._fsproxy_fast_type - - def __dir__(self): - # Try to return the cached dir of the slow object, but if it - # doesn't exist, fall back to the default implementation. - try: - return self._fsproxy_slow_dir - except AttributeError: - return type.__dir__(self) - - def __subclasscheck__(self, __subclass: type) -> bool: - if super().__subclasscheck__(__subclass): - return True - if hasattr(__subclass, "_fsproxy_slow"): - return issubclass(__subclass._fsproxy_slow, self._fsproxy_slow) - return False - - def __instancecheck__(self, __instance: Any) -> bool: - if super().__instancecheck__(__instance): - return True - elif hasattr(type(__instance), "_fsproxy_slow"): - return issubclass(type(__instance), self) - return False - - -class _FastSlowProxy: - """ - Base class for all fast=slow proxy types. - - A fast-slow proxy is proxy for a pair of types that provide "fast" - and "slow" implementations of the same API. At any time, a - fast-slow proxy wraps an object of either "fast" type, or "slow" - type. Operations invoked on the fast-slow proxy are first - delegated to the "fast" type, and if that fails, to the "slow" - type. - """ - - _fsproxy_wrapped: Any - - def _fsproxy_fast_to_slow(self) -> Any: - """ - If the wrapped object is of "fast" type, returns the - corresponding "slow" object. Otherwise, returns the wrapped - object as-is. - """ - raise NotImplementedError("Abstract base class") - - def _fsproxy_slow_to_fast(self) -> Any: - """ - If the wrapped object is of "slow" type, returns the - corresponding "fast" object. Otherwise, returns the wrapped - object as-is. - """ - raise NotImplementedError("Abstract base class") - - @property - def _fsproxy_fast(self) -> Any: - """ - Returns the wrapped object. If the wrapped object is of "slow" - type, replaces it with the corresponding "fast" object before - returning it. - """ - self._fsproxy_wrapped = self._fsproxy_slow_to_fast() - return self._fsproxy_wrapped - - @property - def _fsproxy_slow(self) -> Any: - """ - Returns the wrapped object. If the wrapped object is of "fast" - type, replaces it with the corresponding "slow" object before - returning it. - """ - self._fsproxy_wrapped = self._fsproxy_fast_to_slow() - return self._fsproxy_wrapped - - def __dir__(self): - # Try to return the cached dir of the slow object, but if it - # doesn't exist, fall back to the default implementation. - try: - return self._fsproxy_slow_dir - except AttributeError: - return object.__dir__(self) - - def __setattr__(self, name, value): - if name.startswith("_"): - object.__setattr__(self, name, value) - return - return _FastSlowAttribute("__setattr__").__get__(self, type(self))( - name, value - ) - - -class _FinalProxy(_FastSlowProxy): - """ - Proxy type for a pair of fast and slow "final" types for which - there is a known conversion from fast to slow, and vice-versa. - The conversion between fast and slow types is done using - user-provided conversion functions. - - Do not attempt to use this class directly. Instead, use - `make_final_proxy_type` to create subtypes. 
- """ - - @classmethod - def _fsproxy_wrap(cls, value, func): - """Default mechanism to wrap a value in a proxy type - - Parameters - ---------- - cls - The proxy type - value - The value to wrap up - func - The function called that constructed value - - Returns - ------- - A new proxied object - - Notes - ----- - _FinalProxy subclasses can override this classmethod if they - need particular behaviour when wrapped up. - """ - # TODO: Replace the if-elif-else using singledispatch helper function - base_class = _get_proxy_base_class(cls) - if base_class is object: - proxy = base_class.__new__(cls) - elif base_class is ProxyNDarrayBase: - proxy = base_class.__new__(cls, value) - else: - raise TypeError( - f"Cannot create an proxy instance of {cls.__name__} using base class {base_class.__name__}. " - f"Expected either 'object' or another type in 'PROXY_BASE_CLASSES'" - ) - proxy._fsproxy_wrapped = value - return proxy - - def __reduce__(self): - """ - In conjunction with `__proxy_setstate__`, this effectively enables - proxy types to be pickled and unpickled by pickling and unpickling - the underlying wrapped types. - """ - # Need a local import to avoid circular import issues - from .module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - pickled_wrapped_obj = pickle.dumps(self._fsproxy_wrapped) - return (_PickleConstructor(type(self)), (), pickled_wrapped_obj) - - def __setstate__(self, state): - # Need a local import to avoid circular import issues - from .module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - unpickled_wrapped_obj = pickle.loads(state) - self._fsproxy_wrapped = unpickled_wrapped_obj - - -class _IntermediateProxy(_FastSlowProxy): - """ - Proxy type for a pair of "intermediate" types that appear as - intermediate values when invoking operations on "final" types. - The conversion between fast and slow types is done by keeping - track of the sequence of operations that created the wrapped - object, and "playing back" that sequence starting from the "slow" - version of the originating _FinalProxy. - - Do not attempt to use this class directly. Instead, use - `make_intermediate_proxy_type` to create subtypes. - """ - - _method_chain: tuple[Callable, tuple, dict] - - @classmethod - def _fsproxy_wrap( - cls, - obj: Any, - method_chain: tuple[Callable, tuple, dict], - ): - """ - Parameters - ---------- - obj: The object to wrap - method_chain: A tuple of the form (func, args, kwargs) where - `func` is the function that was called to create `obj`, - and `args` and `kwargs` are the arguments that were passed - to `func`. 
- """ - proxy = object.__new__(cls) - proxy._fsproxy_wrapped = obj - proxy._method_chain = method_chain - return proxy - - @nvtx.annotate( - "COPY_SLOW_TO_FAST", - color=_CUDF_PANDAS_NVTX_COLORS["COPY_SLOW_TO_FAST"], - domain="cudf_pandas", - ) - def _fsproxy_slow_to_fast(self) -> Any: - func, args, kwargs = self._method_chain - args, kwargs = _fast_arg(args), _fast_arg(kwargs) - return func(*args, **kwargs) - - @nvtx.annotate( - "COPY_FAST_TO_SLOW", - color=_CUDF_PANDAS_NVTX_COLORS["COPY_FAST_TO_SLOW"], - domain="cudf_pandas", - ) - def _fsproxy_fast_to_slow(self) -> Any: - func, args, kwargs = self._method_chain - args, kwargs = _slow_arg(args), _slow_arg(kwargs) - return func(*args, **kwargs) - - def __reduce__(self): - """ - In conjunction with `__proxy_setstate__`, this effectively enables - proxy types to be pickled and unpickled by pickling and unpickling - the underlying wrapped types. - """ - # Need a local import to avoid circular import issues - from .module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - pickled_wrapped_obj = pickle.dumps(self._fsproxy_wrapped) - pickled_method_chain = pickle.dumps(self._method_chain) - return ( - _PickleConstructor(type(self)), - (), - (pickled_wrapped_obj, pickled_method_chain), - ) - - def __setstate__(self, state): - # Need a local import to avoid circular import issues - from .module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - unpickled_wrapped_obj = pickle.loads(state[0]) - unpickled_method_chain = pickle.loads(state[1]) - self._fsproxy_wrapped = unpickled_wrapped_obj - self._method_chain = unpickled_method_chain - - -class _CallableProxyMixin: - """ - Mixin class that implements __call__ for fast-slow proxies. - """ - - # For wrapped callables isinstance(self, FunctionType) should return True - __class__ = types.FunctionType # type: ignore - - def __call__(self, *args, **kwargs) -> Any: - result, _ = _fast_slow_function_call( - # We cannot directly call self here because we need it to be - # converted into either the fast or slow object (by - # _fast_slow_function_call) to avoid infinite recursion. - # TODO: When Python 3.11 is the minimum supported Python version - # this can use operator.call - call_operator, - self, - args, - kwargs, - ) - return result - - -class _FunctionProxy(_CallableProxyMixin): - """ - Proxy for a pair of fast and slow functions. - """ - - __name__: str - - def __init__( - self, - fast: Callable | _Unusable, - slow: Callable, - *, - assigned=None, - updated=None, - ): - self._fsproxy_fast = fast - self._fsproxy_slow = slow - if assigned is None: - assigned = functools.WRAPPER_ASSIGNMENTS - if updated is None: - updated = functools.WRAPPER_UPDATES - functools.update_wrapper( - self, - slow, - assigned=assigned, - updated=updated, - ) - - def __reduce__(self): - """ - In conjunction with `__proxy_setstate__`, this effectively enables - proxy types to be pickled and unpickled by pickling and unpickling - the underlying wrapped types. 
- """ - # Need a local import to avoid circular import issues - from .module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - pickled_fast = pickle.dumps(self._fsproxy_fast) - pickled_slow = pickle.dumps(self._fsproxy_slow) - return ( - _PickleConstructor(type(self)), - (), - (pickled_fast, pickled_slow), - ) - - def __setstate__(self, state): - # Need a local import to avoid circular import issues - from .module_accelerator import disable_module_accelerator - - with disable_module_accelerator(): - unpickled_fast = pickle.loads(state[0]) - unpickled_slow = pickle.loads(state[1]) - self._fsproxy_fast = unpickled_fast - self._fsproxy_slow = unpickled_slow - - -def is_bound_method(obj): - return inspect.ismethod(obj) and not inspect.isfunction(obj) - - -def is_function(obj): - return inspect.isfunction(obj) or isinstance(obj, types.FunctionType) - - -class _FastSlowAttribute: - """ - A descriptor type used to define attributes of fast-slow proxies. - """ - - _attr: Any - - def __init__(self, name: str, *, private: bool = False): - self._name = name - self._private = private - self._attr = None - self._doc = None - self._dir = None - - def __get__(self, instance, owner) -> Any: - from .module_accelerator import disable_module_accelerator - - if self._attr is None: - if self._private: - fast_attr = _Unusable() - else: - fast_attr = getattr( - owner._fsproxy_fast, self._name, _Unusable() - ) - - try: - slow_attr = getattr(owner._fsproxy_slow, self._name) - except AttributeError as e: - if instance is not None: - return _maybe_wrap_result( - getattr(instance._fsproxy_slow, self._name), - None, # type: ignore - ) - else: - raise e - - if _is_function_or_method(slow_attr): - self._attr = _MethodProxy(fast_attr, slow_attr) - else: - # for anything else, use a fast-slow attribute: - self._attr, _ = _fast_slow_function_call( - getattr, - owner, - self._name, - ) - - if isinstance( - self._attr, (property, functools.cached_property) - ): - with disable_module_accelerator(): - self._attr.__doc__ = inspect.getdoc(slow_attr) - - if instance is not None: - if isinstance(self._attr, _MethodProxy): - if is_bound_method(self._attr._fsproxy_slow): - return self._attr - else: - return types.MethodType(self._attr, instance) - else: - if self._private: - return _maybe_wrap_result( - getattr(instance._fsproxy_slow, self._name), - None, # type: ignore - ) - return _fast_slow_function_call( - getattr, - instance, - self._name, - )[0] - return self._attr - - -class _MethodProxy(_FunctionProxy): - def __init__(self, fast, slow): - super().__init__( - fast, - slow, - updated=functools.WRAPPER_UPDATES, - assigned=( - tuple(filter(lambda x: x != "__name__", _WRAPPER_ASSIGNMENTS)) - ), - ) - - def __dir__(self): - return self._fsproxy_slow.__dir__() - - @property - def __doc__(self): - return self._fsproxy_slow.__doc__ - - @property - def __name__(self): - return self._fsproxy_slow.__name__ - - @__name__.setter - def __name__(self, value): - try: - setattr(self._fsproxy_fast, "__name__", value) - except AttributeError: - pass - setattr(self._fsproxy_slow, "__name__", value) - - -def _assert_fast_slow_eq(left, right): - if _is_final_type(type(left)) or type(left) in NUMPY_TYPES: - assert_eq(left, right) - - -class ProxyFallbackError(Exception): - """Raised when fallback occurs""" - - pass - - -def _fast_function_call(): - """ - Placeholder fast function for pytest profiling purposes. 
- """ - return None - - -def _slow_function_call(): - """ - Placeholder slow function for pytest profiling purposes. - """ - return None - - -def _fast_slow_function_call( - func: Callable, - /, - *args, - **kwargs, -) -> Any: - """ - Call `func` with all `args` and `kwargs` converted to their - respective fast type. If that fails, call `func` with all - `args` and `kwargs` converted to their slow type. - - Wrap the result in a fast-slow proxy if it is a type we know how - to wrap. - """ - from .module_accelerator import disable_module_accelerator - - fast = False - try: - with nvtx.annotate( - "EXECUTE_FAST", - color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_FAST"], - domain="cudf_pandas", - ): - fast_args, fast_kwargs = _fast_arg(args), _fast_arg(kwargs) - result = func(*fast_args, **fast_kwargs) - if result is NotImplemented: - # try slow path - raise Exception() - fast = True - _fast_function_call() - if _env_get_bool("CUDF_PANDAS_DEBUGGING", False): - try: - with nvtx.annotate( - "EXECUTE_SLOW_DEBUG", - color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], - domain="cudf_pandas", - ): - slow_args, slow_kwargs = ( - _slow_arg(args), - _slow_arg(kwargs), - ) - with disable_module_accelerator(): - slow_result = func(*slow_args, **slow_kwargs) - except Exception as e: - warnings.warn( - "The result from pandas could not be computed. " - f"The exception was {e}." - ) - else: - try: - _assert_fast_slow_eq(result, slow_result) - except AssertionError as e: - warnings.warn( - "The results from cudf and pandas were different. " - f"The exception was {e}." - ) - except Exception as e: - warnings.warn( - "Pandas debugging mode failed. " - f"The exception was {e}." - ) - except Exception as err: - if _env_get_bool("CUDF_PANDAS_FAIL_ON_FALLBACK", False): - raise ProxyFallbackError( - f"The operation failed with cuDF, the reason was {type(err)}: {err}" - ) from err - with nvtx.annotate( - "EXECUTE_SLOW", - color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"], - domain="cudf_pandas", - ): - slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs) - if _env_get_bool("LOG_FAST_FALLBACK", False): - from ._logger import log_fallback - - log_fallback(slow_args, slow_kwargs, err) - _slow_function_call() - with disable_module_accelerator(): - result = func(*slow_args, **slow_kwargs) - return _maybe_wrap_result(result, func, *args, **kwargs), fast - - -def _transform_arg( - arg: Any, - attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"], - seen: set[int], -) -> Any: - """ - Transform "arg" into its corresponding slow (or fast) type. - """ - import numpy as np - - if isinstance(arg, (_FastSlowProxy, _FastSlowProxyMeta, _FunctionProxy)): - typ = getattr(arg, attribute_name) - if typ is _Unusable: - raise Exception("Cannot transform _Unusable") - return typ - elif isinstance(arg, types.ModuleType) and attribute_name in arg.__dict__: - return arg.__dict__[attribute_name] - elif isinstance(arg, list): - return type(arg)(_transform_arg(a, attribute_name, seen) for a in arg) - elif isinstance(arg, tuple): - # This attempts to handle arbitrary subclasses of tuple by - # assuming that if you've subclassed tuple with some special - # behaviour you'll also make the object pickleable by - # implementing the custom pickle protocol interface (either - # __getnewargs_ex__ or __getnewargs__). Perhaps this should - # use __reduce_ex__ instead... 
- if type(arg) is tuple: - # Must come first to avoid infinite recursion - return tuple(_transform_arg(a, attribute_name, seen) for a in arg) - elif hasattr(arg, "__getnewargs_ex__"): - # Partial implementation of to reconstruct with - # transformed pieces - # This handles scipy._lib._bunch._make_tuple_bunch - args, kwargs = ( - _transform_arg(a, attribute_name, seen) - for a in arg.__getnewargs_ex__() - ) - obj = type(arg).__new__(type(arg), *args, **kwargs) - if hasattr(obj, "__setstate__"): - raise NotImplementedError( - "Transforming tuple-like with __getnewargs_ex__ and " - "__setstate__ not implemented" - ) - if not hasattr(obj, "__dict__") and kwargs: - raise NotImplementedError( - "Transforming tuple-like with kwargs from " - "__getnewargs_ex__ and no __dict__ not implemented" - ) - obj.__dict__.update(kwargs) - return obj - elif hasattr(arg, "__getnewargs__"): - # This handles namedtuple, and would catch tuple if we - # didn't handle it above. - args = _transform_arg(arg.__getnewargs__(), attribute_name, seen) - return type(arg).__new__(type(arg), *args) - else: - # Hope we can just call the constructor with transformed entries. - return type(arg)( - _transform_arg(a, attribute_name, seen) for a in args - ) - elif isinstance(arg, dict): - return { - _transform_arg(k, attribute_name, seen): _transform_arg( - a, attribute_name, seen - ) - for k, a in arg.items() - } - elif isinstance(arg, np.ndarray) and arg.dtype == "O": - transformed = [ - _transform_arg(a, attribute_name, seen) for a in arg.flat - ] - # Keep the same memory layout as arg (the default is C_CONTIGUOUS) - if arg.flags["F_CONTIGUOUS"] and not arg.flags["C_CONTIGUOUS"]: - order = "F" - else: - order = "C" - result = np.empty(int(np.prod(arg.shape)), dtype=object, order=order) - result[...] = transformed - return result.reshape(arg.shape) - elif isinstance(arg, Iterator) and attribute_name == "_fsproxy_fast": - # this may include consumable objects like generators or - # IOBase objects, which we don't want unavailable to the slow - # path in case of fallback. So, we raise here and ensure the - # slow path is taken: - raise Exception() - elif isinstance(arg, types.FunctionType): - if id(arg) in seen: - # `arg` is mutually recursive with another function. We - # can't handle these cases yet: - return arg - seen.add(id(arg)) - return _replace_closurevars(arg, attribute_name, seen) - else: - return arg - - -def _fast_arg(arg: Any) -> Any: - """ - Transform "arg" into its corresponding fast type. - """ - seen: set[int] = set() - return _transform_arg(arg, "_fsproxy_fast", seen) - - -def _slow_arg(arg: Any) -> Any: - """ - Transform "arg" into its corresponding slow type. - """ - seen: set[int] = set() - return _transform_arg(arg, "_fsproxy_slow", seen) - - -def _maybe_wrap_result(result: Any, func: Callable, /, *args, **kwargs) -> Any: - """ - Wraps "result" in a fast-slow proxy if is a "proxiable" object. 
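# Sketch of how the environment switches read by _fast_slow_function_call
# above can make silent CPU fallback visible, e.g. in a test suite. An
# assumption here: the variables are parsed by _env_get_bool, so "True" (or
# "1") should enable them.
import os

os.environ["CUDF_PANDAS_FAIL_ON_FALLBACK"] = "True"  # raise ProxyFallbackError instead of falling back
# os.environ["CUDF_PANDAS_DEBUGGING"] = "True"       # also run pandas and compare results

import cudf.pandas

cudf.pandas.install()
import pandas as pd
# ...any pandas call that cuDF cannot execute will now raise ProxyFallbackError.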
- """ - if _is_final_type(result): - typ = get_final_type_map()[type(result)] - return typ._fsproxy_wrap(result, func) - elif _is_intermediate_type(result): - typ = get_intermediate_type_map()[type(result)] - return typ._fsproxy_wrap(result, method_chain=(func, args, kwargs)) - elif _is_final_class(result): - return get_final_type_map()[result] - elif isinstance(result, list): - return type(result)( - [ - _maybe_wrap_result(r, operator.getitem, result, i) - for i, r in enumerate(result) - ] - ) - elif isinstance(result, tuple): - wrapped = ( - _maybe_wrap_result(r, operator.getitem, result, i) - for i, r in enumerate(result) - ) - if hasattr(result, "_make"): - # namedtuple - return type(result)._make(wrapped) - else: - return type(result)(wrapped) - elif isinstance(result, Iterator): - return (_maybe_wrap_result(r, lambda x: x, r) for r in result) - else: - return result - - -def _is_final_type(result: Any) -> bool: - return type(result) in get_final_type_map() - - -def _is_final_class(result: Any) -> bool: - if not isinstance(result, type): - return False - return result in get_final_type_map() - - -def _is_intermediate_type(result: Any) -> bool: - return type(result) in get_intermediate_type_map() - - -def _is_function_or_method(obj: Any) -> bool: - res = isinstance( - obj, - ( - types.FunctionType, - types.BuiltinFunctionType, - types.MethodType, - types.WrapperDescriptorType, - types.MethodWrapperType, - types.MethodDescriptorType, - types.BuiltinMethodType, - ), - ) - if not res: - try: - return "cython_function_or_method" in str(type(obj)) - except Exception: - return False - return res - - -def _replace_closurevars( - f: types.FunctionType, - attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"], - seen: set[int], -) -> Callable[..., Any]: - """ - Return a copy of `f` with its closure variables replaced with - their corresponding slow (or fast) types. - """ - if f.__closure__: - # GH #254: If empty cells are present - which can happen in - # situations like when `f` is a method that invokes the - # "empty" `super()` - the call to `getclosurevars` below will - # fail. For now, we just return `f` in this case. If needed, - # we can consider populating empty cells with a placeholder - # value to allow the call to `getclosurevars` to succeed. - if any(c == types.CellType() for c in f.__closure__): - return f - - f_nonlocals, f_globals, _, _ = inspect.getclosurevars(f) - - g_globals = _transform_arg(f_globals, attribute_name, seen) - g_nonlocals = _transform_arg(f_nonlocals, attribute_name, seen) - - # if none of the globals/nonlocals were transformed, we - # can just return f: - if all(f_globals[k] is g_globals[k] for k in f_globals) and all( - g_nonlocals[k] is f_nonlocals[k] for k in f_nonlocals - ): - return f - - g_closure = tuple(types.CellType(val) for val in g_nonlocals.values()) - - # https://github.com/rapidsai/cudf/issues/15548 - new_g_globals = f.__globals__.copy() - new_g_globals.update(g_globals) - - g = types.FunctionType( - f.__code__, - new_g_globals, - name=f.__name__, - argdefs=f.__defaults__, - closure=g_closure, - ) - return functools.update_wrapper( - g, - f, - assigned=functools.WRAPPER_ASSIGNMENTS + ("__kwdefaults__",), - ) - - -def is_proxy_object(obj: Any) -> bool: - """Determine if an object is proxy object - - Parameters - ---------- - obj : object - Any python object. 
- - """ - if _FastSlowProxyMeta in type(type(obj)).__mro__: - return True - return False - - -def _get_proxy_base_class(cls): - """Returns the proxy base class if one exists""" - for proxy_class in PROXY_BASE_CLASSES: - if proxy_class in cls.__mro__: - return proxy_class - return object - - -PROXY_BASE_CLASSES: set[type] = { - ProxyNDarrayBase, -} - - -NUMPY_TYPES: set[str] = set(np.sctypeDict.values()) - - -_SPECIAL_METHODS: set[str] = { - "__abs__", - "__add__", - "__and__", - "__bool__", - "__call__", - "__getattr__", - "__complex__", - "__contains__", - "__copy__", - "__dataframe__", - "__deepcopy__", - "__delitem__", - "__delslice__", - "__divmod__", - "__enter__", - "__eq__", - "__exit__", - "__float__", - "__floordiv__", - "__format__", - "__ge__", - "__getitem__", - "__getslice__", - "__gt__", - # Added on a per-proxy basis - # https://github.com/rapidsai/xdf/pull/306#pullrequestreview-1636155428 - # "__hash__", - "__iadd__", - "__iand__", - "__iconcat__", - "__ifloordiv__", - "__ilshift__", - "__imatmul__", - "__imod__", - "__imul__", - "__int__", - "__invert__", - "__ior__", - "__ipow__", - "__irshift__", - "__isub__", - "__iter__", - "__itruediv__", - "__ixor__", - "__le__", - "__len__", - "__lshift__", - "__lt__", - "__matmul__", - "__mod__", - "__mul__", - "__ne__", - "__neg__", - "__next__", - "__or__", - "__pos__", - "__pow__", - "__radd__", - "__rand__", - "__rdivmod__", - "__repr__", - "__rfloordiv__", - "__rlshift__", - "__rmatmul__", - "__rmod__", - "__rmul__", - "__ror__", - "__round__", - "__rpow__", - "__rrshift__", - "__rshift__", - "__rsub__", - "__rtruediv__", - "__rxor__", - "__setitem__", - "__setslice__", - "__str__", - "__sub__", - "__truediv__", - "__xor__", -} diff --git a/python/cudf/cudf/pandas/magics.py b/python/cudf/cudf/pandas/magics.py deleted file mode 100644 index 1573682492d..00000000000 --- a/python/cudf/cudf/pandas/magics.py +++ /dev/null @@ -1,33 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - - -try: - from IPython.core.magic import Magics, cell_magic, magics_class - - from .profiler import Profiler, lines_with_profiling - - @magics_class - class CudfPandasMagics(Magics): - @cell_magic("cudf.pandas.profile") - def profile(self, _, cell): - with Profiler() as profiler: - get_ipython().run_cell(cell) # noqa: F821 - profiler.print_per_function_stats() - - @cell_magic("cudf.pandas.line_profile") - def line_profile(self, _, cell): - new_cell = lines_with_profiling(cell.split("\n")) - get_ipython().run_cell(new_cell) # noqa: F821 - - def load_ipython_extension(ip): - from . import install - - install() - ip.register_magics(CudfPandasMagics) - -except ImportError: - - def load_ipython_extension(ip): - pass diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py deleted file mode 100644 index f82e300e83d..00000000000 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ /dev/null @@ -1,631 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import contextlib -import functools -import importlib -import importlib.abc -import importlib.machinery -import os -import pathlib -import sys -import threading -import warnings -from abc import abstractmethod -from importlib._bootstrap import _ImportLockContext as ImportLock -from types import ModuleType -from typing import Any, ContextManager, NamedTuple - -from typing_extensions import Self - -from .fast_slow_proxy import ( - _FunctionProxy, - _is_function_or_method, - _Unusable, - get_final_type_map, - get_intermediate_type_map, - get_registered_functions, -) - - -def rename_root_module(module: str, root: str, new_root: str) -> str: - """ - Rename a module to a new root. - - Parameters - ---------- - module - Module to rename - root - Original root - new_root - New root - - Returns - ------- - New module name (if it matches root) otherwise original name. - """ - if module.startswith(root): - return new_root + module[len(root) :] - else: - return module - - -class DeducedMode(NamedTuple): - use_fast_lib: bool - slow_lib: str - fast_lib: str - - -def deduce_cudf_pandas_mode(slow_lib: str, fast_lib: str) -> DeducedMode: - """ - Determine if cudf.pandas should use the requested fast library. - - Parameters - ---------- - slow_lib - Name of the slow library - fast_lib - Name of the fast library - - Returns - ------- - Whether the fast library is being used, and the resulting names of - the "slow" and "fast" libraries. - """ - if "CUDF_PANDAS_FALLBACK_MODE" not in os.environ: - try: - importlib.import_module(fast_lib) - return DeducedMode( - use_fast_lib=True, slow_lib=slow_lib, fast_lib=fast_lib - ) - except Exception as e: - warnings.warn( - f"Exception encountered importing {fast_lib}: {e}." - f"Falling back to only using {slow_lib}." - ) - return DeducedMode( - use_fast_lib=False, slow_lib=slow_lib, fast_lib=slow_lib - ) - - -class ModuleAcceleratorBase( - importlib.abc.MetaPathFinder, importlib.abc.Loader -): - _instance: ModuleAcceleratorBase | None = None - mod_name: str - fast_lib: str - slow_lib: str - - # When walking the module tree and wrapping module attributes, - # we often will come across the same object more than once. We - # don't want to create separate wrappers for each - # instance, so we keep a registry of all module attributes - # that we can look up to see if we have already wrapped an - # attribute before - _wrapped_objs: dict[Any, Any] - - def __new__( - cls, - mod_name: str, - fast_lib: str, - slow_lib: str, - ): - """Build a custom module finder that will provide wrapped modules - on demand. - - Parameters - ---------- - mod_name - Import name to deliver modules under. - fast_lib - Name of package that provides "fast" implementation - slow_lib - Name of package that provides "slow" fallback implementation - """ - if ModuleAcceleratorBase._instance is not None: - raise RuntimeError( - "Only one instance of ModuleAcceleratorBase allowed" - ) - self = object.__new__(cls) - self.mod_name = mod_name - self.fast_lib = fast_lib - self.slow_lib = slow_lib - - # When walking the module tree and wrapping module attributes, - # we often will come across the same object more than once. 
We - # don't want to create separate wrappers for each - # instance, so we keep a registry of all module attributes - # that we can look up to see if we have already wrapped an - # attribute before - self._wrapped_objs = {} - self._wrapped_objs.update(get_final_type_map()) - self._wrapped_objs.update(get_intermediate_type_map()) - self._wrapped_objs.update(get_registered_functions()) - - ModuleAcceleratorBase._instance = self - return self - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}" - f"(fast={self.fast_lib}, slow={self.slow_lib})" - ) - - def find_spec( - self, fullname: str, path, target=None - ) -> importlib.machinery.ModuleSpec | None: - """Provide ourselves as a module loader. - - Parameters - ---------- - fullname - Name of module to be imported, if it starts with the name - that we are using to wrap, we will deliver ourselves as a - loader, otherwise defer to the standard Python loaders. - - Returns - ------- - A ModuleSpec with ourself as loader if we're interposing, - otherwise None to pass off to the next loader. - """ - if fullname == self.mod_name or fullname.startswith( - f"{self.mod_name}." - ): - return importlib.machinery.ModuleSpec( - name=fullname, - loader=self, - # Note, this influences the repr of the module, so we may want - # to change it if we ever want to control that. - origin=None, - loader_state=None, - is_package=True, - ) - return None - - def create_module(self, spec) -> ModuleType | None: - return None - - def exec_module(self, mod: ModuleType): - # importlib calls this function with the global import lock held. - self._populate_module(mod) - - @abstractmethod - def disabled(self) -> ContextManager: - pass - - def _postprocess_module( - self, - mod: ModuleType, - slow_mod: ModuleType, - fast_mod: ModuleType | None, - ) -> ModuleType: - """Ensure that the wrapped module satisfies required invariants. - - Parameters - ---------- - mod - Wrapped module to postprocess - slow_mod - Slow version that we are mimicking - fast_mod - Fast module that provides accelerated implementations (may - be None - - Returns - ------- - Checked and validated module - - Notes - ----- - The implementation of fast-slow proxies imposes certain - requirements on the wrapped modules that it delivers. This - function encodes those requirements and raises if the module - does not satisfy them. - - This post-processing routine should be kept up to date with any - requirements encoded by fast_slow_proxy.py - """ - mod.__dict__["_fsproxy_slow"] = slow_mod - if fast_mod is not None: - mod.__dict__["_fsproxy_fast"] = fast_mod - return mod - - @abstractmethod - def _populate_module(self, mod: ModuleType) -> ModuleType: - """Populate given module with appropriate attributes. - - This traverses the attributes of the slow module corresponding - to mod and mirrors those in the provided module in a wrapped - mode that attempts to execute them using the fast module first. - - Parameters - ---------- - mod - Module to populate - - Returns - ------- - ModuleType - Populated module - - Notes - ----- - In addition to the attributes of the slow module, - the returned module must have the following attributes: - - - '_fsproxy_slow': the corresponding slow module - - '_fsproxy_fast': the corresponding fast module - - This is necessary for correct rewriting of UDFs when calling - to the respective fast/slow libraries. - - The necessary invariants are checked and applied in - :meth:`_postprocess_module`. 
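# Sketch of the interception flow this machinery implements: installing the
# finder at the front of sys.meta_path means a subsequent `import pandas` is
# answered by find_spec/exec_module above, which populate a wrapper module
# that delivers real or proxied attributes depending on the caller.
import cudf.pandas

cudf.pandas.install()  # idempotent; registers the ModuleAccelerator
import pandas as pd    # this import is now served by the accelerator

pd.DataFrame           # a fast-slow proxy type, not pandas.core.frame.DataFrame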
- """ - pass - - def _wrap_attribute( - self, - slow_attr: Any, - fast_attr: Any | _Unusable, - name: str, - ) -> Any: - """ - Return the wrapped version of an attribute. - - Parameters - ---------- - slow_attr : Any - The attribute from the slow module - fast_mod : Any (or None) - The same attribute from the fast module, if it exists - name - Name of attribute - - Returns - ------- - Wrapped attribute - """ - wrapped_attr: Any - # TODO: what else should we make sure not to get from the fast - # library? - if name in {"__all__", "__dir__", "__file__", "__doc__"}: - wrapped_attr = slow_attr - elif self.fast_lib == self.slow_lib: - # no need to create a fast-slow wrapper - wrapped_attr = slow_attr - if any( - [ - slow_attr in get_registered_functions(), - slow_attr in get_final_type_map(), - slow_attr in get_intermediate_type_map(), - ] - ): - # attribute already registered in self._wrapped_objs - return self._wrapped_objs[slow_attr] - if isinstance(slow_attr, ModuleType) and slow_attr.__name__.startswith( - self.slow_lib - ): - # attribute is a submodule of the slow library, - # replace the string "{slow_lib}" in the submodule's - # name with "{self.mod_name}" - # now, attempt to import the wrapped module, which will - # recursively wrap all of its attributes: - return importlib.import_module( - rename_root_module( - slow_attr.__name__, self.slow_lib, self.mod_name - ) - ) - if slow_attr in self._wrapped_objs: - if type(fast_attr) is _Unusable: - # we don't want to replace a wrapped object that - # has a usable fast object with a wrapped object - # with a an unusable fast object. - return self._wrapped_objs[slow_attr] - if _is_function_or_method(slow_attr): - wrapped_attr = _FunctionProxy(fast_attr, slow_attr) - else: - wrapped_attr = slow_attr - return wrapped_attr - - @classmethod - @abstractmethod - def install( - cls, destination_module: str, fast_lib: str, slow_lib: str - ) -> Self | None: - """ - Install the loader in sys.meta_path. - - Parameters - ---------- - destination_module - Name under which the importer will kick in - fast_lib - Name of fast module - slow_lib - Name of slow module we are trying to mimic - - Returns - ------- - Instance of the class (or None if the loader was not installed) - - Notes - ----- - This function is idempotent. If called with the same arguments - a second time, it does not create a new loader, but instead - returns the existing loader from ``sys.meta_path``. - - """ - pass - - -class ModuleAccelerator(ModuleAcceleratorBase): - """ - A finder and loader that produces "accelerated" modules. - - When someone attempts to import the specified slow library with - this finder enabled, we intercept the import and deliver an - equivalent, accelerated, version of the module. This provides - attributes and modules that check if they are being used from - "within" the slow (or fast) library themselves. If this is the - case, the implementation is forwarded to the actual slow library - implementation, otherwise a proxy implementation is used (which - attempts to call the fast version first). - """ - - _denylist: tuple[str] - _use_fast_lib: bool - _use_fast_lib_lock: threading.RLock - _module_cache_prefix: str = "_slow_lib_" - - # TODO: Add possibility for either an explicit allow-list of - # libraries where the slow_lib should be wrapped, or, more likely - # a block-list that adds to the set of libraries where no proxying occurs. 
- def __new__( - cls, - fast_lib, - slow_lib, - ): - self = super().__new__( - cls, - slow_lib, - fast_lib, - slow_lib, - ) - # Import the real versions of the modules so that we can - # rewrite the sys.modules cache. - slow_module = importlib.import_module(slow_lib) - fast_module = importlib.import_module(fast_lib) - # Note, this is not thread safe, but install() below grabs the - # lock for the whole initialisation and modification of - # sys.meta_path. - for mod in sys.modules.copy(): - if mod.startswith(self.slow_lib): - sys.modules[self._module_cache_prefix + mod] = sys.modules[mod] - del sys.modules[mod] - self._denylist = (*slow_module.__path__, *fast_module.__path__) - - # Lock to manage temporarily disabling delivering wrapped attributes - self._use_fast_lib_lock = threading.RLock() - self._use_fast_lib = True - return self - - def _populate_module(self, mod: ModuleType): - mod_name = mod.__name__ - - # Here we attempt to import "_fsproxy_slow_lib.x.y.z", but - # "_fsproxy_slow_lib" does not exist anywhere as a real file, so - # how does this work? - # The importer attempts to import ".z" by first importing - # "_fsproxy_slow_lib.x.y", this recurses until we find - # "_fsproxy_slow_lib.x" (say), which does exist because we set that up - # in __init__. Now the importer looks at the __path__ - # attribute of "x" and uses that to find the relative location - # to look for "y". This __path__ points to the real location - # of "slow_lib.x". So, as long as we rewire the _already imported_ - # slow_lib modules in sys.modules to _fsproxy_slow_lib, when we - # get here this will find the right thing. - # The above exposition is for lazily imported submodules (e.g. - # avoiding circular imports by putting an import at function - # level). For everything that is eagerly imported when we do - # "import slow_lib" this import line is trivial because we - # immediately pull the correct result out of sys.modules. - slow_mod = importlib.import_module( - rename_root_module( - mod_name, - self.slow_lib, - self._module_cache_prefix + self.slow_lib, - ) - ) - try: - fast_mod = importlib.import_module( - rename_root_module(mod_name, self.slow_lib, self.fast_lib) - ) - except Exception: - fast_mod = None - - # The version that will be used if called within a denylist - # package - real_attributes = {} - # The version that will be used outside denylist packages - for key in slow_mod.__dir__(): - with warnings.catch_warnings(): - warnings.simplefilter("ignore", FutureWarning) - slow_attr = getattr(slow_mod, key) - fast_attr = getattr(fast_mod, key, _Unusable()) - real_attributes[key] = slow_attr - try: - wrapped_attr = self._wrap_attribute(slow_attr, fast_attr, key) - self._wrapped_objs[slow_attr] = wrapped_attr - except TypeError: - # slow_attr is not hashable - pass - - # Our module has (basically) no static attributes and instead - # always delivers them dynamically where the behaviour is - # dependent on the calling module. - setattr( - mod, - "__getattr__", - functools.partial( - self.getattr_real_or_wrapped, - real=real_attributes, - wrapped_objs=self._wrapped_objs, - loader=self, - ), - ) - - # ...but, we want to pretend like we expose the same attributes - # as the equivalent slow module - setattr(mod, "__dir__", slow_mod.__dir__) - - # We set __path__ to the real path so that importers like - # jinja2.PackageLoader("slow_mod") work correctly. 
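rename_root_module itself does not appear in this diff; judging from how it is called in the surrounding code, a plausible sketch of the helper is just a prefix swap on the dotted module name (an assumption, not the verbatim cudf implementation):

def rename_root_module(module: str, root: str, new_root: str) -> str:
    # e.g. rename_root_module("pandas.core.frame", "pandas", "_slow_lib_pandas")
    #      -> "_slow_lib_pandas.core.frame"
    assert module == root or module.startswith(f"{root}.")
    return new_root + module[len(root):]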
- if getattr(slow_mod, "__path__", False): - assert mod.__spec__ - mod.__path__ = slow_mod.__path__ - mod.__spec__.submodule_search_locations = [*slow_mod.__path__] - return self._postprocess_module(mod, slow_mod, fast_mod) - - @contextlib.contextmanager - def disabled(self): - """Return a context manager for disabling the module accelerator. - - Within the block, any wrapped objects will instead deliver - attributes from their real counterparts (as if the current - nested block were in the denylist). - - Returns - ------- - Context manager for disabling things - """ - try: - self._use_fast_lib_lock.acquire() - # The same thread might enter this context manager - # multiple times, so we need to remember the previous - # value - saved = self._use_fast_lib - self._use_fast_lib = False - yield - finally: - self._use_fast_lib = saved - self._use_fast_lib_lock.release() - - @staticmethod - def getattr_real_or_wrapped( - name: str, - *, - real: dict[str, Any], - wrapped_objs, - loader: ModuleAccelerator, - ) -> Any: - """ - Obtain an attribute from a module from either the real or - wrapped namespace. - - Parameters - ---------- - name - Attribute to return - real - Unwrapped "original" attributes - wrapped - Wrapped attributes - loader - Loader object that manages denylist and other skipping - - Returns - ------- - The requested attribute (either real or wrapped) - """ - with loader._use_fast_lib_lock: - # Have to hold the lock to read this variable since - # another thread might modify it. - # Modification has to happen with the lock held for the - # duration, so if someone else has modified things, then - # we block trying to acquire the lock (hence it is safe to - # release the lock after reading this value) - use_real = not loader._use_fast_lib - if not use_real: - # Only need to check the denylist if we're not turned off. - frame = sys._getframe() - # We cannot possibly be at the top level. - assert frame.f_back - calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) - use_real = _caller_in_denylist( - calling_module, tuple(loader._denylist) - ) - try: - if use_real: - return real[name] - else: - return wrapped_objs[real[name]] - except KeyError: - raise AttributeError(f"No attribute '{name}'") - except TypeError: - # real[name] is an unhashable type - return real[name] - - @classmethod - def install( - cls, - destination_module: str, - fast_lib: str, - slow_lib: str, - ) -> Self | None: - # This grabs the global _import_ lock to avoid concurrent - # threads modifying sys.modules. - # We also make sure that we finish installing ourselves in - # sys.meta_path before releasing the lock so that there isn't - # a race between our modification of sys.modules and someone - # else importing the slow_lib before we have added ourselves - # to the meta_path - with ImportLock(): - if destination_module != slow_lib: - raise RuntimeError( - f"Destination module '{destination_module}' must match" - f"'{slow_lib}' for this to work." - ) - mode = deduce_cudf_pandas_mode(slow_lib, fast_lib) - if mode.use_fast_lib: - importlib.import_module( - f".._wrappers.{mode.slow_lib}", __name__ - ) - try: - (self,) = ( - p - for p in sys.meta_path - if isinstance(p, cls) - and p.slow_lib == mode.slow_lib - and p.fast_lib == mode.fast_lib - ) - except ValueError: - self = cls(mode.fast_lib, mode.slow_lib) - sys.meta_path.insert(0, self) - return self - - -def disable_module_accelerator() -> contextlib.ExitStack: - """ - Temporarily disable any module acceleration. 
- """ - with contextlib.ExitStack() as stack: - for finder in sys.meta_path: - if isinstance(finder, ModuleAcceleratorBase): - stack.enter_context(finder.disabled()) - return stack.pop_all() - assert False # pacify type checker - - -# because this function gets called so often and is quite -# expensive to run, we cache the results: -@functools.lru_cache(maxsize=1024) -def _caller_in_denylist(calling_module, denylist): - CUDF_PANDAS_PATH = __file__.rsplit("/", 1)[0] - return not calling_module.is_relative_to(CUDF_PANDAS_PATH) and any( - calling_module.is_relative_to(path) for path in denylist - ) diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py deleted file mode 100644 index 0fb41fc0b26..00000000000 --- a/python/cudf/cudf/pandas/profiler.py +++ /dev/null @@ -1,313 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - -import inspect -import operator -import pickle -import sys -import time -from collections import defaultdict - -from rich.console import Console -from rich.syntax import Syntax -from rich.table import Table - -from .fast_slow_proxy import ( - _FinalProxy, - _FunctionProxy, - _IntermediateProxy, - _MethodProxy, -) - -# This text is used in contexts where the profiler is injected into the -# original code. The profiler is injected at the top of the cell, so the line -# numbers in the profiler results are offset by 2. -_profile_injection_text = """\ -from cudf.pandas import Profiler -with Profiler() as profiler: -{original_lines} - -# Patch the results to shift the line numbers back to the original before the -# profiler injection. -new_results = {{}} - -for (lineno, currfile, line), v in profiler._results.items(): - new_results[(lineno - 2, currfile, line)] = v - -profiler._results = new_results -profiler.print_per_line_stats() -{function_profile_printer} -""" - -_cpu_issue_text = """\ -Not all pandas operations ran on the GPU. \ -The following functions required CPU fallback: - -{cpu_functions_used} -""" - - -def format_cpu_functions_used(cpu_funcs): - output_str = "" - for each in cpu_funcs: - output_str += f"- {each}\n" - - # remove final newline character - output_str = output_str[:-1] - return output_str - - -def lines_with_profiling(lines, print_function_profile=False): - """Inject profiling code into the given lines of code.""" - cleaned_lines = "\n".join( - [(" " * 4) + line.replace("\t", " " * 4) for line in lines] - ) - return _profile_injection_text.format( - original_lines=cleaned_lines, - function_profile_printer="profiler.print_per_function_stats()" - if print_function_profile - else "", - ) - - -class Profiler: - _IGNORE_LIST = ["Profiler()", "settrace(None)"] - - def __init__(self): - self._results = {} - # Map func-name to list of calls (was_fast, time) - self._per_func_results = defaultdict(lambda: defaultdict(list)) - # Current fast_slow_function_call stack frame recording name - # and start time - self._call_stack = [] - self._currkey = None - self._timer = {} - self._currfile = None - self.start_time = None - self.end_time = None - - def __enter__(self, *args, **kwargs): - self.start_time = time.perf_counter() - self._oldtrace = sys.gettrace() - # Setting the global trace function with sys.settrace does not affect - # the current call stack, so in addition to this we must also set the - # current frame's f_trace attribute as done below. 
- sys.settrace(self._tracefunc) - - # Following excerpt from: - # https://docs.python.org/3/library/sys.html#sys.settrace - # For more fine-grained usage, it is possible - # to set a trace function by assigning - # frame.f_trace = tracefunc explicitly, rather than - # relying on it being set indirectly via the return - # value from an already installed trace function - # Hence we need to perform `f_trace = self._tracefunc` - # we need to `f_back` because current frame will be - # of this file. - frame = inspect.currentframe().f_back - self._currfile = frame.f_code.co_filename - self._f_back_oldtrace = frame.f_trace - frame.f_trace = self._tracefunc - return self - - def __exit__(self, *args, **kwargs): - sys.settrace(self._oldtrace) - inspect.currentframe().f_back.f_trace = self._f_back_oldtrace - self.end_time = time.perf_counter() - - @staticmethod - def get_namespaced_function_name( - func_obj: _FunctionProxy - | _MethodProxy - | type[_FinalProxy] - | type[_IntermediateProxy], - ): - if isinstance(func_obj, _MethodProxy): - return func_obj._fsproxy_slow.__qualname__ - elif isinstance(func_obj, _FunctionProxy) or issubclass( - func_obj, (_FinalProxy, _IntermediateProxy) - ): - return func_obj.__name__ - else: - raise NotImplementedError( - f"Don't know how to get namespaced name for {func_obj}" - ) - - def _tracefunc(self, frame, event, arg): - if event == "line" and frame.f_code.co_filename == self._currfile: - key = "".join(inspect.stack()[1].code_context) - if not any( - ignore_word in key for ignore_word in Profiler._IGNORE_LIST - ): - self._currkey = (frame.f_lineno, self._currfile, key) - self._results.setdefault(self._currkey, {}) - self._timer[self._currkey] = time.perf_counter() - elif ( - event == "call" - and frame.f_code.co_name == "_fast_slow_function_call" - ): - if self._currkey is not None: - self._timer[self._currkey] = time.perf_counter() - - # Store per-function information for free functions and methods - frame_locals = inspect.getargvalues(frame).locals - if ( - isinstance( - func_obj := frame_locals["args"][0], - (_MethodProxy, _FunctionProxy), - ) - or isinstance(func_obj, type) - and issubclass(func_obj, (_FinalProxy, _IntermediateProxy)) - ): - func_name = self.get_namespaced_function_name(func_obj) - self._call_stack.append((func_name, time.perf_counter())) - elif ( - event == "return" - and frame.f_code.co_name == "_fast_slow_function_call" - ): - if self._currkey is not None and arg is not None: - if arg[1]: # fast - run_time = time.perf_counter() - self._timer[self._currkey] - self._results[self._currkey]["gpu_time"] = ( - run_time - + self._results[self._currkey].get("gpu_time", 0) - ) - else: - run_time = time.perf_counter() - self._timer[self._currkey] - self._results[self._currkey]["cpu_time"] = ( - run_time - + self._results[self._currkey].get("cpu_time", 0) - ) - - frame_locals = inspect.getargvalues(frame).locals - if ( - isinstance( - func_obj := frame_locals["args"][0], - (_MethodProxy, _FunctionProxy), - ) - or isinstance(func_obj, type) - and issubclass(func_obj, (_FinalProxy, _IntermediateProxy)) - ): - func_name, start = self._call_stack.pop() - if arg is not None: - key = "gpu" if arg[1] else "cpu" - self._per_func_results[func_name][key].append( - time.perf_counter() - start - ) - - return self._tracefunc - - @property - def per_line_stats(self): - list_data = [] - for key, val in self._results.items(): - cpu_time = val.get("cpu_time", 0) - gpu_time = val.get("gpu_time", 0) - line_no, _, line = key - list_data.append([line_no, line, gpu_time, 
cpu_time]) - - return sorted(list_data, key=operator.itemgetter(0)) - - @property - def per_function_stats(self): - return self._per_func_results - - def print_per_line_stats(self): - table = Table() - table.add_column("Line no.") - table.add_column("Line") - table.add_column("GPU TIME(s)") - table.add_column("CPU TIME(s)") - for line_no, line, gpu_time, cpu_time in self.per_line_stats: - table.add_row( - str(line_no), - Syntax(str(line), "python"), - "" if gpu_time == 0 else "{:.9f}".format(gpu_time), - "" if cpu_time == 0 else "{:.9f}".format(cpu_time), - ) - time_elapsed = self.end_time - self.start_time - table.title = f"""\n\ - Total time elapsed: {time_elapsed:.3f} seconds - - Stats - """ - console = Console() - console.print(table) - - def print_per_function_stats(self): - cpu_funcs = [] - n_gpu_func_calls = 0 - n_cpu_func_calls = 0 - total_gpu_time = 0 - total_cpu_time = 0 - - table = Table() - for col in ( - "Function", - "GPU ncalls", - "GPU cumtime", - "GPU percall", - "CPU ncalls", - "CPU cumtime", - "CPU percall", - ): - table.add_column(col) - - for func_name, func_data in self.per_function_stats.items(): - gpu_times = func_data["gpu"] - cpu_times = func_data["cpu"] - table.add_row( - func_name, - f"{len(gpu_times)}", - f"{sum(gpu_times):.3f}", - f"{sum(gpu_times) / max(len(gpu_times), 1):.3f}", - f"{len(cpu_times)}", - f"{sum(cpu_times):.3f}", - f"{sum(cpu_times) / max(len(cpu_times), 1):.3f}", - ) - total_gpu_time += sum(gpu_times) - total_cpu_time += sum(cpu_times) - n_gpu_func_calls += len(gpu_times) - n_cpu_func_calls += len(cpu_times) - - if cpu_times and func_name not in cpu_funcs: - cpu_funcs.append(func_name) - - time_elapsed = self.end_time - self.start_time - table.title = f"""\n\ - Total time elapsed: {time_elapsed:.3f} seconds - {n_gpu_func_calls} GPU function calls in {total_gpu_time:.3f} seconds - {n_cpu_func_calls} CPU function calls in {total_cpu_time:.3f} seconds - - Stats - """ - console = Console() - console.print(table) - - if cpu_funcs: - call_to_action = ( - "To request GPU support for any of these functions, " - "please file a Github issue here: " - "[link=https://github.com/rapidsai/cudf/issues/new?assignees" - "=&labels=%3F+-+Needs+Triage%2C+feature+request&projects=" - "&template=pandas_function_request.md&title=%5BFEA%5D]" - "https://github.com/rapidsai/cudf/issues/new/choose" - "[/link]." - ) - console.print( - _cpu_issue_text.format( - cpu_functions_used=format_cpu_functions_used(cpu_funcs) - ) - ) - console.print(call_to_action) - - def dump_stats(self, file_name): - with open(file_name, "wb") as f: - pickle.dump(self, f) - - -def load_stats(file_name): - with open(file_name, "rb") as f: - return pickle.load(f) diff --git a/python/cudf/cudf/pandas/proxy_base.py b/python/cudf/cudf/pandas/proxy_base.py deleted file mode 100644 index 6f732834e94..00000000000 --- a/python/cudf/cudf/pandas/proxy_base.py +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -import cupy as cp -import numpy as np - - -class ProxyNDarrayBase(np.ndarray): - def __new__(cls, arr): - if isinstance(arr, cp.ndarray): - arr = arr.get() - if not isinstance(arr, np.ndarray): - raise TypeError( - "Unsupported array type. 
Must be numpy.ndarray or cupy.ndarray" - ) - return np.asarray(arr, dtype=arr.dtype).view(cls) - - def __array_finalize__(self, obj): - if obj is None: - return - self._fsproxy_wrapped = getattr(obj, "_fsproxy_wrapped", obj) diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py deleted file mode 100644 index 8870fbc5c28..00000000000 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ /dev/null @@ -1,71 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -""" -Prints the most common test failures for the given tests. - -Usage: - python analyze-test-failures.py - -Example: - python analyze-test-failures.py log.json frame/* -""" - -import json -import sys -from collections import Counter -from fnmatch import fnmatch - -from rich.console import Console -from rich.table import Table - -PANDAS_TEST_PREFIX = "pandas-tests/" - - -def count_failures(log_file_name, pattern): - counter = Counter() - with open(log_file_name) as f: - for line in f: - try: - line = json.loads(line) - except Exception: - continue - if ( - "location" in line - and line["when"] == "call" - and line["outcome"] == "failed" - ): - line_module_name = line["location"][0].removeprefix( - PANDAS_TEST_PREFIX - ) - if fnmatch(line_module_name, pattern): - if "longrepr" in line and line["longrepr"]: - if isinstance(line["longrepr"], (tuple, list)): - message = line["longrepr"][2].splitlines()[0] - elif isinstance(line["longrepr"], str): - message = line["longrepr"] - else: - message = line["longrepr"]["reprcrash"][ - "message" - ].splitlines()[0] - counter[message] += 1 - return counter - - -def render_results(results, num_rows=20): - table = Table() - table.add_column("Failure message") - table.add_column("Number of occurences") - - for msg, num in results.most_common(20): - table.add_row(msg, str(num)) - - console = Console() - console.print(table) - - -if __name__ == "__main__": - log_file_name = sys.argv[1] - pattern = sys.argv[2] - render_results(count_failures(log_file_name, pattern), num_rows=20) diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py deleted file mode 100644 index d12d2697729..00000000000 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ /dev/null @@ -1,96 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import contextlib -import json -import os -import sys -import traceback -from collections import defaultdict -from functools import wraps - -import pytest - - -def replace_kwargs(new_kwargs): - def wrapper(func): - @wraps(func) - def wrapped(*args, **kwargs): - kwargs.update(new_kwargs) - return func(*args, **kwargs) - - return wrapped - - return wrapper - - -@contextlib.contextmanager -def null_assert_warnings(*args, **kwargs): - try: - yield [] - finally: - pass - - -@pytest.fixture(scope="session", autouse=True) # type: ignore -def patch_testing_functions(): - tm.assert_produces_warning = null_assert_warnings - pytest.raises = replace_kwargs({"match": None})(pytest.raises) - - -# Dictionary to store function call counts -function_call_counts = {} # type: ignore - -# The specific functions to track -FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"} - - -def find_pytest_file(frame): - stack = traceback.extract_stack() - absolute_paths = [frame.filename for frame in stack] - for file in absolute_paths: - if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[ - -1 - ].startswith("test_"): - return str(file).rsplit("pandas-tests/", 1)[-1] - return None - - -def trace_calls(frame, event, arg): - if event != "call": - return - code = frame.f_code - func_name = code.co_name - - if func_name in FUNCTION_NAME: - filename = find_pytest_file(frame) - if filename is None: - return - if filename not in function_call_counts: - function_call_counts[filename] = defaultdict(int) - function_call_counts[filename][func_name] += 1 - - -def pytest_sessionstart(session): - # Set the profile function to trace calls - sys.setprofile(trace_calls) - - -def pytest_sessionfinish(session, exitstatus): - # Remove the profile function - sys.setprofile(None) - - -@pytest.hookimpl(trylast=True) -def pytest_unconfigure(config): - if hasattr(config, "workerinput"): - # Running in xdist worker, write the counts before exiting - worker_id = config.workerinput["workerid"] - output_file = f"function_call_counts_worker_{worker_id}.json" - with open(output_file, "w") as f: - json.dump(function_call_counts, f, indent=4) - print(f"Function call counts have been written to {output_file}") - - -sys.path.append(os.path.dirname(__file__)) diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh deleted file mode 100755 index 9b9ce026571..00000000000 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Run Pandas unit tests with cudf.pandas. -# -# Usage: -# run-pandas-tests.sh -# -# Examples -# Run a single test -# run-pandas-tests.sh -n auto -v tests/groupby/test_groupby_dropna.py -# Run all tests -# run-pandas-tests.sh --tb=line --report-log=log.json -# -# This script creates a `pandas-testing` directory if it doesn't exist - -set -euo pipefail - -# Grab the Pandas source corresponding to the version -# of Pandas installed. -PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)") - -# tests/io/test_clipboard.py::TestClipboard crashes pytest workers (possibly due to fixture patching clipboard functionality) -PYTEST_IGNORES="--ignore=tests/io/parser/common/test_read_errors.py \ ---ignore=tests/io/test_clipboard.py" - -mkdir -p pandas-testing -cd pandas-testing - -if [ ! 
-d "pandas" ]; then - git clone https://github.com/pandas-dev/pandas -fi -cd pandas && git clean -fdx && git checkout v$PANDAS_VERSION && cd ../ - - -if [ ! -d "pandas-tests" ]; then - # Copy just the tests out of the Pandas source tree. - # Not exactly sure why this is needed but Pandas - # imports fail if we don't do this: - mkdir -p pandas-tests - cp -r pandas/pandas/tests pandas-tests/ - # directory layout requirement - # conftest.py - # pyproject.toml - # tests/ - cp pandas/pandas/conftest.py pandas-tests/conftest.py - # Vendored from pandas/pyproject.toml - cat > pandas-tests/pyproject.toml << \EOF -[tool.pytest.ini_options] -xfail_strict = true -filterwarnings = [ - # Will be fixed in numba 0.56: https://github.com/numba/numba/issues/7758 - "ignore:`np.MachAr` is deprecated:DeprecationWarning:numba", -] -markers = [ - "single_cpu: tests that should run on a single cpu only", - "slow: mark a test as slow", - "network: mark a test as network", - "db: tests requiring a database (mysql or postgres)", - "clipboard: mark a pd.read_clipboard test", - "arm_slow: mark a test as slow for arm64 architecture", - "skip_ubsan: Tests known to fail UBSAN check", -] -EOF - - # Substitute `pandas.tests` with a relative import. - # This will depend on the location of the test module relative to - # the pandas-tests directory. - for hit in $(find . -iname '*.py' | xargs grep "pandas.tests" | cut -d ":" -f 1 | sort | uniq); do - # Get the relative path to the test module - test_module=$(echo $hit | cut -d "/" -f 2-) - # Get the number of directories to go up - num_dirs=$(echo $test_module | grep -o "/" | wc -l) - num_dots=$(($num_dirs - 2)) - # Construct the relative import - relative_import=$(printf "%0.s." $(seq 1 $num_dots)) - # Replace the import - sed -i "s/pandas.tests/${relative_import}/g" $hit - done -fi - -# append the contents of patch-confest.py to conftest.py -cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py - -# Run the tests -cd pandas-tests/ - - -# TODO: Needs motoserver/moto container running on http://localhost:5000 -TEST_THAT_NEED_MOTO_SERVER="not test_styler_to_s3 \ -and not test_with_s3_url[None] \ -and not test_with_s3_url[gzip] \ -and not test_with_s3_url[bz2] \ -and not test_with_s3_url[zip] \ -and not test_with_s3_url[xz] \ -and not test_with_s3_url[tar] \ -and not test_s3_permission_output[etree] \ -and not test_read_s3_jsonl \ -and not test_s3_parser_consistency \ -and not test_to_s3 \ -and not test_parse_public_s3a_bucket \ -and not test_parse_public_s3_bucket_nrows \ -and not test_parse_public_s3_bucket_chunked \ -and not test_parse_public_s3_bucket_chunked_python \ -and not test_parse_public_s3_bucket_python \ -and not test_infer_s3_compression \ -and not test_parse_public_s3_bucket_nrows_python \ -and not test_read_s3_fails_private \ -and not test_read_csv_handles_boto_s3_object \ -and not test_read_csv_chunked_download \ -and not test_read_s3_with_hash_in_key \ -and not test_read_feather_s3_file_path \ -and not test_parse_public_s3_bucket \ -and not test_parse_private_s3_bucket \ -and not test_parse_public_s3n_bucket \ -and not test_read_with_creds_from_pub_bucket \ -and not test_read_without_creds_from_pub_bucket \ -and not test_from_s3_csv \ -and not test_s3_protocols[s3] \ -and not test_s3_protocols[s3a] \ -and not test_s3_protocols[s3n] \ -and not test_s3_parquet \ -and not test_s3_roundtrip_explicit_fs \ -and not test_s3_roundtrip \ -and not test_s3_roundtrip_for_dir[partition_col0] \ -and not 
test_s3_roundtrip_for_dir[partition_col1] \ -and not test_s3_roundtrip" - -TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \ -and not test_large_string_pyarrow \ -and not test_interchange_from_corrected_buffer_dtypes \ -and not test_eof_states \ -and not test_array_tz" - -# TODO: Remove "not db" once a postgres & mysql container is set up on the CI -PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \ - -v -m "not single_cpu and not db" \ - -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \ - --import-mode=importlib \ - ${PYTEST_IGNORES} \ - "$@" || [ $? = 1 ] # Exit success if exit code was 1 (permit test failures but not other errors) - -mv *.json .. -cd .. -rm -rf pandas-testing/pandas-tests/ diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py deleted file mode 100644 index 4ea0b3b4413..00000000000 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ /dev/null @@ -1,156 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. -# All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -""" -Summarizes the test results per module. - -Examples: - python summarize-test-results.py log.json - python summarize-test-results.py log.json --output json - python summarize-test-results.py log.json --output table -""" - -import argparse -import glob -import json -import os - -from rich.console import Console -from rich.table import Table - -PANDAS_TEST_PREFIX = "pandas-tests/" - - -def get_per_module_results(log_file_name): - per_module_results = {} - with open(log_file_name) as f: - for line in f: - try: - line = json.loads(line) - except Exception: - line = {} - if "outcome" in line: - outcome = line["outcome"] - # outcome can be "passed", "failed", or "skipped". - # Depending on other fields, it can indicate - # an errored, xpassed, or xfailed test. 
- if line.get("when", None) != "call": - # when != call indicates test setup or teardown - if outcome == "failed": - # if the test failed during setup or teardown, - # it counts as an "errored" test: - outcome = "errored" - else: - # we don't care about other outcomes during - # setup or teardown - continue - else: - if line.get("wasxfail", False) and outcome == "passed": - # it's an xpassed test - outcome = "failed" - module_name = ( - line["nodeid"] - .split("::")[0] - .removeprefix(PANDAS_TEST_PREFIX) - ) - per_module_results.setdefault(module_name, {}) - per_module_results[module_name].setdefault("total", 0) - per_module_results[module_name].setdefault(outcome, 0) - per_module_results[module_name]["total"] += 1 - per_module_results[module_name][outcome] += 1 - - directory = os.path.dirname(log_file_name) - pattern = os.path.join(directory, "function_call_counts_worker_*.json") - matching_files = glob.glob(pattern) - function_call_counts = {} - - for file in matching_files: - with open(file) as f: - function_call_count = json.load(f) - if not function_call_counts: - function_call_counts.update(function_call_count) - else: - for key, value in function_call_count.items(): - if key not in function_call_counts: - function_call_counts[key] = value - else: - if "_slow_function_call" not in function_call_counts[key]: - function_call_counts[key]["_slow_function_call"] = 0 - if "_fast_function_call" not in function_call_counts[key]: - function_call_counts[key]["_fast_function_call"] = 0 - function_call_counts[key]["_slow_function_call"] += ( - value.get("_slow_function_call", 0) - ) - function_call_counts[key]["_fast_function_call"] += ( - value.get("_fast_function_call", 0) - ) - - for key, value in per_module_results.items(): - if key in function_call_counts: - per_module_results[key]["_slow_function_call"] = ( - function_call_counts[key].get("_slow_function_call", 0) - ) - per_module_results[key]["_fast_function_call"] = ( - function_call_counts[key].get("_fast_function_call", 0) - ) - else: - per_module_results[key]["_slow_function_call"] = 0 - per_module_results[key]["_fast_function_call"] = 0 - return per_module_results - - -def sort_results(results): - sorted_keys = sorted( - results, key=lambda key: results[key].get("failed", 0) - ) - return {key: results[key] for key in sorted_keys} - - -def print_results_as_json(results): - print(json.dumps(results, indent=4)) - - -def print_results_as_table(results): - table = Table() - table.add_column("Test module") - table.add_column("Total tests") - table.add_column("Passed tests") - table.add_column("Failed tests") - table.add_column("Errored tests") - table.add_column("Skipped tests") - totals = {"total": 0, "passed": 0, "failed": 0, "errored": 0, "skipped": 0} - for module_name, row in results.items(): - values = [] - for key in ("total", "passed", "failed", "errored", "skipped"): - totals[key] += row.get(key, 0) - values.append(row.get(key, 0)) - table.add_row(module_name, *map(str, values)) - table.add_section() - table.add_row( - "total={}, passed={}, failed={}, errored={}, skipped={}".format( - *map(str, totals.values()) - ) - ) - console = Console() - console.print(table) - - -if __name__ == "__main__": - # parse arguments - parser = argparse.ArgumentParser() - parser.add_argument( - "log_file_name", nargs=1, help="The input log file name" - ) - parser.add_argument( - "--output", - choices=["json", "table"], - default="table", - help="The output format", - ) - args = parser.parse_args() - results = 
sort_results(get_per_module_results(args.log_file_name[0])) - if args.output == "json": - print_results_as_json(results) - else: - print_results_as_table(results) diff --git a/python/cudf/cudf/testing/__init__.py b/python/cudf/cudf/testing/__init__.py deleted file mode 100644 index 4e92b43b9f9..00000000000 --- a/python/cudf/cudf/testing/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.testing.testing import ( - assert_eq, - assert_frame_equal, - assert_index_equal, - assert_neq, - assert_series_equal, -) diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py deleted file mode 100644 index 8cb9efa873c..00000000000 --- a/python/cudf/cudf/testing/_utils.py +++ /dev/null @@ -1,392 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import itertools -import string -import time -from collections import abc -from contextlib import contextmanager -from decimal import Decimal - -import numpy as np -import pandas as pd -import pytest -from numba.core.typing import signature as nb_signature -from numba.core.typing.templates import AbstractTemplate -from numba.cuda.cudadecl import registry as cuda_decl_registry -from numba.cuda.cudaimpl import lower as cuda_lower - -import cudf -from cudf._lib.null_mask import bitmask_allocation_size_bytes -from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion -from cudf.core.udf.strings_lowering import cast_string_view_to_udf_string -from cudf.core.udf.strings_typing import StringView, string_view, udf_string -from cudf.utils import dtypes as dtypeutils - -supported_numpy_dtypes = [ - "bool", - "int8", - "int16", - "int32", - "int64", - "float32", - "float64", - "datetime64[ms]", - "datetime64[us]", -] - -SIGNED_INTEGER_TYPES = sorted(list(dtypeutils.SIGNED_INTEGER_TYPES)) -UNSIGNED_TYPES = sorted(list(dtypeutils.UNSIGNED_TYPES)) -INTEGER_TYPES = sorted(list(dtypeutils.INTEGER_TYPES)) -FLOAT_TYPES = sorted(list(dtypeutils.FLOAT_TYPES)) -SIGNED_TYPES = sorted(list(dtypeutils.SIGNED_TYPES)) -NUMERIC_TYPES = sorted(list(dtypeutils.NUMERIC_TYPES)) -DATETIME_TYPES = sorted(list(dtypeutils.DATETIME_TYPES)) -TIMEDELTA_TYPES = sorted(list(dtypeutils.TIMEDELTA_TYPES)) -OTHER_TYPES = sorted(list(dtypeutils.OTHER_TYPES)) -ALL_TYPES = sorted(list(dtypeutils.ALL_TYPES)) - -SERIES_OR_INDEX_NAMES = [ - None, - pd.NA, - cudf.NA, - np.nan, - float("NaN"), - "abc", - 1, - pd.NaT, - np.datetime64("nat"), - np.timedelta64("NaT"), - np.timedelta64(10, "D"), - np.timedelta64(5, "D"), - np.datetime64("1970-01-01 00:00:00.000000001"), - np.datetime64("1970-01-01 00:00:00.000000002"), - pd.Timestamp(1), - pd.Timestamp(2), - pd.Timedelta(1), - pd.Timedelta(2), - Decimal("NaN"), - Decimal("1.2"), - np.int64(1), - np.int32(1), - np.float32(1), - pd.Timestamp(1), -] - - -def set_random_null_mask_inplace(series, null_probability=0.5, seed=None): - """Randomly nullify elements in series with the provided probability.""" - probs = [null_probability, 1 - null_probability] - rng = np.random.default_rng(seed=seed) - mask = rng.choice([False, True], size=len(series), p=probs) - series.iloc[mask] = None - - -# TODO: This function should be removed. Anywhere that it is being used should -# instead be generating a random boolean array (bytemask) and use the public -# APIs to set those elements to None. 
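The replacement suggested in the TODO above could look roughly like this, using only public APIs (a sketch mirroring set_random_null_mask_inplace; the helper name and the 0.5 default are illustrative):

import numpy as np

import cudf


def with_random_nulls(sr: cudf.Series, null_probability=0.5, seed=None) -> cudf.Series:
    # Draw a boolean bytemask and null out the selected rows via .iloc,
    # instead of hand-packing a bitmask.
    rng = np.random.default_rng(seed)
    mask = rng.random(len(sr)) < null_probability
    out = sr.copy()
    out.iloc[mask] = None
    return out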
-def random_bitmask(size): - """ - Parameters - ---------- - size : int - number of bits - """ - sz = bitmask_allocation_size_bytes(size) - data = np.random.randint(0, 255, dtype="u1", size=sz) - return data.view("i1") - - -def expand_bits_to_bytes(arr): - def fix_binary(bstr): - bstr = bstr[2:] - diff = 8 - len(bstr) - return ("0" * diff + bstr)[::-1] - - ba = bytearray(arr.data) - return list(map(int, "".join(map(fix_binary, map(bin, ba))))) - - -def count_zero(arr): - arr = np.asarray(arr) - return np.count_nonzero(arr == 0) - - -def assert_exceptions_equal( - lfunc, - rfunc, - lfunc_args_and_kwargs=None, - rfunc_args_and_kwargs=None, - check_exception_type=True, -): - """Compares if two functions ``lfunc`` and ``rfunc`` raise - same exception or not. - - Parameters - ---------- - lfunc : callable - A callable function to obtain the Exception. - rfunc : callable - A callable function to compare the Exception - obtained by calling ``rfunc``. - lfunc_args_and_kwargs : tuple, default None - Tuple containing positional arguments at first position, - and key-word arguments at second position that need to be passed into - ``lfunc``. If the tuple is of length 1, it must either contain - positional arguments(as a Sequence) or key-word arguments(as a Mapping - dict). - rfunc_args_and_kwargs : tuple, default None - Tuple containing positional arguments at first position, - and key-word arguments at second position that need to be passed into - ``rfunc``. If the tuple is of length 1, it must either contain - positional arguments(as a Sequence) or key-word arguments(as a Mapping - dict). - check_exception_type : boolean, default True - Whether to compare the exception types raised by ``lfunc`` - with ``rfunc`` exception type or not. If False, ``rfunc`` - is simply evaluated against `Exception` type. - - Returns - ------- - None - If exceptions raised by ``lfunc`` and - ``rfunc`` match. - - Raises - ------ - AssertionError - If call to ``lfunc`` doesn't raise any Exception. - """ - - lfunc_args, lfunc_kwargs = _get_args_kwars_for_assert_exceptions( - lfunc_args_and_kwargs - ) - rfunc_args, rfunc_kwargs = _get_args_kwars_for_assert_exceptions( - rfunc_args_and_kwargs - ) - - try: - lfunc(*lfunc_args, **lfunc_kwargs) - except KeyboardInterrupt: - raise - except Exception as e: - with pytest.raises(type(e) if check_exception_type else Exception): - rfunc(*rfunc_args, **rfunc_kwargs) - else: - raise AssertionError("Expected to fail with an Exception.") - - -def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): - if func_args_and_kwargs is None: - return [], {} - else: - if len(func_args_and_kwargs) == 1: - func_args, func_kwargs = [], {} - if isinstance(func_args_and_kwargs[0], abc.Sequence): - func_args = func_args_and_kwargs[0] - elif isinstance(func_args_and_kwargs[0], abc.Mapping): - func_kwargs = func_args_and_kwargs[0] - else: - raise ValueError( - "length 1 func_args_and_kwargs must be " - "either a Sequence or a Mapping" - ) - elif len(func_args_and_kwargs) == 2: - if not isinstance(func_args_and_kwargs[0], abc.Sequence): - raise ValueError( - "Positional argument at 1st position of " - "func_args_and_kwargs should be a sequence." - ) - if not isinstance(func_args_and_kwargs[1], abc.Mapping): - raise ValueError( - "Key-word argument at 2nd position of " - "func_args_and_kwargs should be a dictionary mapping." 
- ) - - func_args, func_kwargs = func_args_and_kwargs - else: - raise ValueError("func_args_and_kwargs must be of length 1 or 2") - return func_args, func_kwargs - - -def gen_rand(dtype, size, **kwargs): - dtype = cudf.dtype(dtype) - if dtype.kind == "f": - res = np.random.random(size=size).astype(dtype) - if kwargs.get("positive_only", False): - return res - else: - return res * 2 - 1 - elif dtype == np.int8 or dtype == np.int16: - low = kwargs.get("low", -32) - high = kwargs.get("high", 32) - return np.random.randint(low=low, high=high, size=size).astype(dtype) - elif dtype.kind == "i": - low = kwargs.get("low", -10000) - high = kwargs.get("high", 10000) - return np.random.randint(low=low, high=high, size=size).astype(dtype) - elif dtype == np.uint8 or dtype == np.uint16: - low = kwargs.get("low", 0) - high = kwargs.get("high", 32) - return np.random.randint(low=low, high=high, size=size).astype(dtype) - elif dtype.kind == "u": - low = kwargs.get("low", 0) - high = kwargs.get("high", 128) - return np.random.randint(low=low, high=high, size=size).astype(dtype) - elif dtype.kind == "b": - low = kwargs.get("low", 0) - high = kwargs.get("high", 2) - return np.random.randint(low=low, high=high, size=size).astype( - np.bool_ - ) - elif dtype.kind == "M": - low = kwargs.get("low", 0) - time_unit, _ = np.datetime_data(dtype) - high = kwargs.get( - "high", - int(1e18) / _unit_to_nanoseconds_conversion[time_unit], - ) - return pd.to_datetime( - np.random.randint(low=low, high=high, size=size), unit=time_unit - ) - elif dtype.kind in ("O", "U"): - low = kwargs.get("low", 10) - high = kwargs.get("high", 11) - nchars = np.random.randint(low=low, high=high, size=1)[0] - char_options = np.array(list(string.ascii_letters + string.digits)) - all_chars = "".join(np.random.choice(char_options, nchars * size)) - return np.array( - [all_chars[nchars * i : nchars * (i + 1)] for i in range(size)] - ) - - raise NotImplementedError(f"dtype.kind={dtype.kind}") - - -def gen_rand_series(dtype, size, **kwargs): - values = gen_rand(dtype, size, **kwargs) - if kwargs.get("has_nulls", False): - return cudf.Series.from_masked_array(values, random_bitmask(size)) - - return cudf.Series(values) - - -def _decimal_series(input, dtype): - return cudf.Series( - [x if x is None else Decimal(x) for x in input], - dtype=dtype, - ) - - -@contextmanager -def does_not_raise(): - yield - - -def assert_column_memory_eq( - lhs: cudf.core.column.ColumnBase, rhs: cudf.core.column.ColumnBase -): - """Assert the memory location and size of `lhs` and `rhs` are equivalent. - - Both data pointer and mask pointer are checked. Also recursively check for - children to the same constraints. Also fails check if the number of - children mismatches at any level. 
- """ - - def get_ptr(x) -> int: - return x.get_ptr(mode="read") if x else 0 - - assert get_ptr(lhs.base_data) == get_ptr(rhs.base_data) - assert get_ptr(lhs.base_mask) == get_ptr(rhs.base_mask) - assert lhs.base_size == rhs.base_size - assert lhs.offset == rhs.offset - assert lhs.size == rhs.size - assert len(lhs.base_children) == len(rhs.base_children) - for lhs_child, rhs_child in zip(lhs.base_children, rhs.base_children): - assert_column_memory_eq(lhs_child, rhs_child) - if isinstance(lhs, cudf.core.column.CategoricalColumn) and isinstance( - rhs, cudf.core.column.CategoricalColumn - ): - assert_column_memory_eq(lhs.categories, rhs.categories) - assert_column_memory_eq(lhs.codes, rhs.codes) - - -def assert_column_memory_ne( - lhs: cudf.core.column.ColumnBase, rhs: cudf.core.column.ColumnBase -): - try: - assert_column_memory_eq(lhs, rhs) - except AssertionError: - return - raise AssertionError("lhs and rhs holds the same memory.") - - -parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( - "left_dtype,right_dtype", - list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), -) - - -@contextmanager -def expect_warning_if(condition, warning=FutureWarning, *args, **kwargs): - """Catch a warning using pytest.warns if the expect_warning is True. - - All arguments are forwarded to pytest.warns if expect_warning is True. - """ - if condition: - with pytest.warns(warning, *args, **kwargs): - yield - else: - yield - - -def sv_to_udf_str(sv): - """ - Cast a string_view object to a udf_string object - - This placeholder function never runs in python - It exists only for numba to have something to replace - with the typing and lowering code below - - This is similar conceptually to needing a translation - engine to emit an expression in target language "B" when - there is no equivalent in the source language "A" to - translate from. This function effectively defines the - expression in language "A" and the associated typing - and lowering describe the translation process, despite - the expression having no meaning in language "A" - """ - pass - - -@cuda_decl_registry.register_global(sv_to_udf_str) -class StringViewToUDFStringDecl(AbstractTemplate): - def generic(self, args, kws): - if isinstance(args[0], StringView) and len(args) == 1: - return nb_signature(udf_string, string_view) - - -@cuda_lower(sv_to_udf_str, string_view) -def sv_to_udf_str_testing_lowering(context, builder, sig, args): - return cast_string_view_to_udf_string( - context, builder, sig.args[0], sig.return_type, args[0] - ) - - -class cudf_timeout: - """ - Context manager to raise a TimeoutError after a specified number of seconds. - """ - - def __init__(self, timeout): - self.timeout = timeout - - def __enter__(self): - self.start_time = time.perf_counter() - - def __exit__(self, *args): - elapsed_time = ( - time.perf_counter() - self.start_time - ) # Calculate elapsed time - if elapsed_time >= self.timeout: - raise TimeoutError( - f"Expected to finish in {self.timeout=} seconds but took {elapsed_time=} seconds" - ) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py deleted file mode 100644 index 13c194d6be0..00000000000 --- a/python/cudf/cudf/testing/dataset_generator.py +++ /dev/null @@ -1,850 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -# This module is for generating "synthetic" datasets. It was originally -# designed for testing filtered reading. 
Generally, it should be useful -# if you want to generate data where certain phenomena (e.g., cardinality) -# are exaggerated. - -import copy -import random -import string -import uuid -from multiprocessing import Pool - -import numpy as np -import pandas as pd -import pyarrow as pa -from pyarrow import parquet as pq - -import cudf -from cudf.utils.dtypes import np_to_pa_dtype - - -class ColumnParameters: - """Parameters for generating column of data - - Attributes - ---------- - cardinality : int or None - Size of a random set of values that generated data is sampled from. - The values in the random set are derived from the given generator. - If cardinality is None, the Iterable returned by the given generator - is invoked for each value to be generated. - null_frequency : 0.1 - Probability of a generated value being null - generator : Callable - Function for generating random data. - is_sorted : bool - Sort this column. Columns are sorted in same order as ColumnParameters - instances stored in column_params of Parameters. If there are one or - more columns marked as sorted, the generated PyArrow Table will be - converted to a Pandas DataFrame to do the sorting. This may implicitly - convert numbers to floats in the presence of nulls. - dtype : optional - a numpy dtype to control the format of the data - """ - - def __init__( - self, - cardinality=100, - null_frequency=0.1, - generator=lambda: [ - _generate_string(string.ascii_letters, random.randint(4, 8)) - for _ in range(100) - ], - is_sorted=True, - dtype=None, - ): - self.cardinality = cardinality - self.null_frequency = null_frequency - self.generator = generator - self.is_sorted = is_sorted - self.dtype = dtype - - -class Parameters: - """Parameters for random dataset generation - - Attributes - ---------- - num_rows : int - Number of rows to generate - column_parameters : List[ColumnParams] - ColumnParams for each column - seed : int or None, default None - Seed for random data generation - """ - - def __init__( - self, - num_rows=2048, - column_parameters=None, - seed=None, - ): - self.num_rows = num_rows - if column_parameters is None: - column_parameters = [] - self.column_parameters = column_parameters - self.seed = seed - - -def _write(tbl, path, format): - if format["name"] == "parquet": - if isinstance(tbl, pa.Table): - pq.write_table(tbl, path, row_group_size=format["row_group_size"]) - elif isinstance(tbl, pd.DataFrame): - tbl.to_parquet(path, row_group_size=format["row_group_size"]) - - -def _generate_column(column_params, num_rows): - # If cardinality is specified, we create a set to sample from. - # Otherwise, we simply use the given generator to generate each value. 
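A usage sketch for the two parameter classes above (values are illustrative; assumes a cudf build that still ships this module):

import numpy as np

from cudf.testing import dataset_generator as dg

params = dg.Parameters(
    num_rows=1_000,
    column_parameters=[
        dg.ColumnParameters(
            cardinality=10,
            null_frequency=0.2,
            generator=lambda: list(range(10)),
            is_sorted=False,
            dtype=np.dtype("int64"),
        )
    ],
    seed=42,
)
dg.generate("example.parquet", params, use_threads=False)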
- - if column_params.cardinality is not None: - # Construct set of values to sample from where - # set size = cardinality - - if ( - isinstance(column_params.dtype, str) - and column_params.dtype == "category" - ): - vals = pa.array( - column_params.generator, - size=column_params.cardinality, - safe=False, - ) - return pa.DictionaryArray.from_arrays( - dictionary=vals, - indices=np.random.randint( - low=0, high=len(vals), size=num_rows - ), - mask=np.random.choice( - [True, False], - size=num_rows, - p=[ - column_params.null_frequency, - 1 - column_params.null_frequency, - ], - ) - if column_params.null_frequency > 0.0 - else None, - ) - - if hasattr(column_params.dtype, "to_arrow"): - arrow_type = column_params.dtype.to_arrow() - elif column_params.dtype is not None: - arrow_type = np_to_pa_dtype(cudf.dtype(column_params.dtype)) - else: - arrow_type = None - - if isinstance(column_params.dtype, cudf.StructDtype): - vals = pa.StructArray.from_arrays( - column_params.generator, - names=column_params.dtype.fields.keys(), - mask=pa.array( - np.random.choice( - [True, False], - size=num_rows, - p=[ - column_params.null_frequency, - 1 - column_params.null_frequency, - ], - ) - ) - if column_params.null_frequency > 0.0 - else None, - ) - return vals - elif not isinstance(arrow_type, pa.lib.Decimal128Type): - vals = pa.array( - column_params.generator, - size=column_params.cardinality, - safe=False, - type=arrow_type, - ) - vals = pa.array( - np.random.choice(column_params.generator, size=num_rows) - if isinstance(arrow_type, pa.lib.Decimal128Type) - else np.random.choice(vals, size=num_rows), - mask=np.random.choice( - [True, False], - size=num_rows, - p=[ - column_params.null_frequency, - 1 - column_params.null_frequency, - ], - ) - if column_params.null_frequency > 0.0 - else None, - size=num_rows, - safe=False, - type=None - if isinstance(arrow_type, pa.lib.Decimal128Type) - else arrow_type, - ) - if isinstance(arrow_type, pa.lib.Decimal128Type): - vals = vals.cast(arrow_type, safe=False) - return vals - else: - # Generate data for current column - return pa.array( - column_params.generator, - mask=np.random.choice( - [True, False], - size=num_rows, - p=[ - column_params.null_frequency, - 1 - column_params.null_frequency, - ], - ) - if column_params.null_frequency > 0.0 - else None, - size=num_rows, - safe=False, - ) - - -def generate( - path, - parameters, - format=None, - use_threads=True, -): - """ - Generate dataset using given parameters and write to given format - - Parameters - ---------- - path : str or file-like object - Path to write to - parameters : Parameters - Parameters specifying how to randomly generate data - format : Dict - Format to write - """ - if format is None: - format = {"name": "parquet", "row_group_size": 64} - df = get_dataframe(parameters, use_threads) - - # Write - _write(df, path, format) - - -def get_dataframe(parameters, use_threads): - # Initialize seeds - if parameters.seed is not None: - np.random.seed(parameters.seed) - - # For each column, invoke the data generator - for column_params in parameters.column_parameters: - column_params.generator = column_params.generator() - - # Get schema for each column - table_fields = [] - for i, column_params in enumerate(parameters.column_parameters): - if ( - isinstance(column_params.dtype, str) - and column_params.dtype == "category" - ): - arrow_type = pa.dictionary( - index_type=pa.int64(), - value_type=np_to_pa_dtype( - cudf.dtype(type(next(iter(column_params.generator)))) - ), - ) - elif 
hasattr(column_params.dtype, "to_arrow"): - arrow_type = column_params.dtype.to_arrow() - else: - arrow_type = np_to_pa_dtype( - cudf.dtype(type(next(iter(column_params.generator)))) - if column_params.dtype is None - else column_params.dtype - ) - table_fields.append( - pa.field( - name=str(i), - type=arrow_type, - nullable=column_params.null_frequency > 0, - ) - ) - - schema = pa.schema(table_fields) - - # Initialize column data and which columns should be sorted - column_data = [None] * len(parameters.column_parameters) - columns_to_sort = [ - str(i) - for i, column_params in enumerate(parameters.column_parameters) - if column_params.is_sorted - ] - # Generate data - if not use_threads: - for i, column_params in enumerate(parameters.column_parameters): - column_data[i] = _generate_column( - column_params, parameters.num_rows - ) - else: - pool = Pool(pa.cpu_count()) - column_data = pool.starmap( - _generate_column, - [ - (column_params, parameters.num_rows) - for i, column_params in enumerate(parameters.column_parameters) - ], - ) - pool.close() - pool.join() - # Convert to Pandas DataFrame and sort columns appropriately - tbl = pa.Table.from_arrays( - column_data, - schema=schema, - ) - if columns_to_sort: - tbl = tbl.to_pandas() - tbl = tbl.sort_values(columns_to_sort) - tbl = pa.Table.from_pandas(tbl, schema) - return tbl - - -def rand_dataframe( - dtypes_meta, rows, seed=random.randint(0, 2**32 - 1), use_threads=True -): - """ - Generates a random table. - - Parameters - ---------- - dtypes_meta : List of dict - Specifies list of dtype meta data. dtype meta data should - be a dictionary of the form example: - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10} - `"str"` dtype can contain an extra key `max_string_length` to - control the maximum size of the strings being generated in each row. - If not specified, it will default to 1000. - rows : int - Specifies the number of rows to be generated. - seed : int - Specifies the `seed` value to be utilized by all downstream - random data generation APIs. 
- use_threads : bool - Indicates whether to use threads pools to build the columns - - Returns - ------- - PyArrow Table - A Table with columns of corresponding dtypes mentioned in `dtypes_meta` - """ - # Apply seed - random.seed(seed) - np.random.seed(seed) - - column_params = [] - for meta in dtypes_meta: - dtype = copy.deepcopy(meta["dtype"]) - null_frequency = copy.deepcopy(meta["null_frequency"]) - cardinality = copy.deepcopy(meta["cardinality"]) - - if dtype == "list": - lists_max_length = meta["lists_max_length"] - nesting_max_depth = meta["nesting_max_depth"] - value_type = meta["value_type"] - nesting_depth = np.random.randint(1, nesting_max_depth) - - dtype = cudf.core.dtypes.ListDtype(value_type) - - # Determining the `dtype` from the `value_type` - # and the nesting_depth - i = 1 - while i < nesting_depth: - dtype = cudf.core.dtypes.ListDtype(dtype) - i += 1 - - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=list_generator( - dtype=value_type, - size=cardinality, - nesting_depth=nesting_depth, - lists_max_length=lists_max_length, - ), - is_sorted=False, - dtype=dtype, - ) - ) - elif dtype == "struct": - nesting_max_depth = meta["nesting_max_depth"] - max_types_at_each_level = meta["max_types_at_each_level"] - max_null_frequency = meta["max_null_frequency"] - nesting_depth = np.random.randint(1, nesting_max_depth) - structDtype = create_nested_struct_type( - max_types_at_each_level=max_types_at_each_level, - nesting_level=nesting_depth, - ) - - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=struct_generator( - dtype=structDtype, - cardinality=cardinality, - size=rows, - max_null_frequency=max_null_frequency, - ), - is_sorted=False, - dtype=structDtype, - ) - ) - elif dtype == "decimal64": - max_precision = meta.get( - "max_precision", cudf.Decimal64Dtype.MAX_PRECISION - ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) - dtype = cudf.Decimal64Dtype(precision=precision, scale=scale) - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), - is_sorted=False, - dtype=dtype, - ) - ) - elif dtype == "decimal32": - max_precision = meta.get( - "max_precision", cudf.Decimal32Dtype.MAX_PRECISION - ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) - dtype = cudf.Decimal32Dtype(precision=precision, scale=scale) - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), - is_sorted=False, - dtype=dtype, - ) - ) - elif dtype == "decimal128": - max_precision = meta.get( - "max_precision", cudf.Decimal128Dtype.MAX_PRECISION - ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) - dtype = cudf.Decimal128Dtype(precision=precision, scale=scale) - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), - is_sorted=False, - dtype=dtype, - ) - ) - elif dtype == "category": - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=lambda cardinality=cardinality: [ - _unique_string() for _ in range(cardinality) - ], - is_sorted=False, - dtype="category", - ) - ) - 
else: - dtype = cudf.dtype(dtype) - if dtype.kind in ("i", "u"): - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=int_generator( - dtype=dtype, - size=cardinality, - min_bound=meta.get("min_bound", None), - max_bound=meta.get("max_bound", None), - ), - is_sorted=False, - dtype=dtype, - ) - ) - elif dtype.kind == "f": - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=float_generator( - dtype=dtype, - size=cardinality, - min_bound=meta.get("min_bound", None), - max_bound=meta.get("max_bound", None), - ), - is_sorted=False, - dtype=dtype, - ) - ) - elif dtype.kind in ("U", "O"): - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=lambda cardinality=cardinality: [ - _generate_string( - string.printable, - np.random.randint( - low=0, - high=meta.get("max_string_length", 1000), - size=1, - )[0], - ) - for _ in range(cardinality) - ], - is_sorted=False, - dtype=dtype, - ) - ) - elif dtype.kind == "M": - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=datetime_generator( - dtype=dtype, - size=cardinality, - min_bound=meta.get("min_bound", None), - max_bound=meta.get("max_bound", None), - ), - is_sorted=False, - dtype=cudf.dtype(dtype), - ) - ) - elif dtype.kind == "m": - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=timedelta_generator( - dtype=dtype, - size=cardinality, - min_bound=meta.get("min_bound", None), - max_bound=meta.get("max_bound", None), - ), - is_sorted=False, - dtype=cudf.dtype(dtype), - ) - ) - elif dtype.kind == "b": - column_params.append( - ColumnParameters( - cardinality=cardinality, - null_frequency=null_frequency, - generator=boolean_generator(cardinality), - is_sorted=False, - dtype=cudf.dtype(dtype), - ) - ) - else: - raise TypeError(f"Unsupported dtype: {dtype}") - # TODO: Add List column support once - # https://github.com/rapidsai/cudf/pull/6075 - # is merged. 
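For reference, a minimal sketch (not part of the deleted file) of the `dtypes_meta` entries the branches above consume; each key below is one the deleted code reads via `meta[...]` or `meta.get(...)`:

    example_dtypes_meta = [
        # numeric dtypes accept optional generation bounds
        {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10,
         "min_bound": 0, "max_bound": 100},
        # "str" accepts an optional cap on generated string length (default 1000)
        {"dtype": "str", "null_frequency": 0.1, "cardinality": 5,
         "max_string_length": 50},
        # "list" needs an element type plus length / nesting limits
        {"dtype": "list", "null_frequency": 0.2, "cardinality": 10,
         "value_type": "int64", "lists_max_length": 4, "nesting_max_depth": 2},
        # decimal dtypes accept an optional precision cap
        {"dtype": "decimal64", "null_frequency": 0.0, "cardinality": 10,
         "max_precision": 12},
    ]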
- - df = get_dataframe( - Parameters( - num_rows=rows, - column_parameters=column_params, - seed=seed, - ), - use_threads=use_threads, - ) - - return df - - -def int_generator(dtype, size, min_bound=None, max_bound=None): - """ - Generator for int data - """ - if min_bound is not None and max_bound is not None: - low, high = min_bound, max_bound - else: - iinfo = np.iinfo(dtype) - low, high = iinfo.min, iinfo.max - - return lambda: np.random.randint( - low=low, - high=high, - size=size, - dtype=dtype, - ) - - -def float_generator(dtype, size, min_bound=None, max_bound=None): - """ - Generator for float data - """ - if min_bound is not None and max_bound is not None: - low, high = min_bound, max_bound - return lambda: np.random.uniform( - low=low, - high=high, - size=size, - ) - else: - finfo = np.finfo(dtype) - return ( - lambda: np.random.uniform( - low=finfo.min / 2, - high=finfo.max / 2, - size=size, - ) - * 2 - ) - - -def datetime_generator(dtype, size, min_bound=None, max_bound=None): - """ - Generator for datetime data - """ - if min_bound is not None and max_bound is not None: - low, high = min_bound, max_bound - else: - iinfo = np.iinfo("int64") - low, high = iinfo.min + 1, iinfo.max - - return lambda: np.random.randint( - low=np.datetime64(low, "ns").astype(dtype).astype("int"), - high=np.datetime64(high, "ns").astype(dtype).astype("int"), - size=size, - ) - - -def timedelta_generator(dtype, size, min_bound=None, max_bound=None): - """ - Generator for timedelta data - """ - if min_bound is not None and max_bound is not None: - low, high = min_bound, max_bound - else: - iinfo = np.iinfo("int64") - low, high = iinfo.min + 1, iinfo.max - - return lambda: np.random.randint( - low=np.timedelta64(low, "ns").astype(dtype).astype("int"), - high=np.timedelta64(high, "ns").astype(dtype).astype("int"), - size=size, - ) - - -def boolean_generator(size): - """ - Generator for bool data - """ - return lambda: np.random.choice(a=[False, True], size=size) - - -def decimal_generator(dtype, size): - max_integral = 10 ** (dtype.precision - dtype.scale) - 1 - max_float = (10**dtype.scale - 1) if dtype.scale != 0 else 0 - return lambda: ( - np.random.uniform( - low=-max_integral, - high=max_integral + (max_float / 10**dtype.scale), - size=size, - ) - ) - - -def get_values_for_nested_data(dtype, lists_max_length=None, size=None): - """ - Returns list of values based on dtype. - """ - if size is None: - cardinality = np.random.randint(0, lists_max_length) - else: - cardinality = size - - dtype = cudf.dtype(dtype) - if dtype.kind in ("i", "u"): - values = int_generator(dtype=dtype, size=cardinality)() - elif dtype.kind == "f": - values = float_generator(dtype=dtype, size=cardinality)() - elif dtype.kind in ("U", "O"): - values = [ - _generate_string( - string.printable, - 100, - ) - for _ in range(cardinality) - ] - elif dtype.kind == "M": - values = datetime_generator(dtype=dtype, size=cardinality)().astype( - dtype - ) - elif dtype.kind == "m": - values = timedelta_generator(dtype=dtype, size=cardinality)().astype( - dtype - ) - elif dtype.kind == "b": - values = boolean_generator(cardinality)().astype(dtype) - else: - raise TypeError(f"Unsupported dtype: {dtype}") - - return values - - -def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): - """ - Helper to create random list of lists with `nesting_depth` and - specified value type `dtype`. 
- """ - nesting_depth -= 1 - if nesting_depth >= 0: - L = np.random.randint(1, lists_max_length) - for i in range(L): - top_level_list.append( - make_lists( - dtype=dtype, - lists_max_length=lists_max_length, - nesting_depth=nesting_depth, - top_level_list=[], - ) - ) - else: - top_level_list = get_values_for_nested_data( - dtype=dtype, lists_max_length=lists_max_length - ) - # To ensure numpy arrays are not passed as input to - # list constructor, returning a python list object here. - if isinstance(top_level_list, np.ndarray): - top_level_list = top_level_list.tolist() - - return top_level_list - - -def make_array_for_struct(dtype, cardinality, size, max_null_frequency): - """ - Helper to create a pa.array with `size` and `dtype` - for a `StructArray`. - """ - - null_frequency = np.random.uniform(low=0, high=max_null_frequency) - local_cardinality = max(np.random.randint(low=0, high=cardinality), 1) - data = get_values_for_nested_data( - dtype=dtype.type.to_pandas_dtype(), size=local_cardinality - ) - vals = np.random.choice(data, size=size) - - return pa.array( - vals, - mask=np.random.choice( - [True, False], - size=size, - p=[null_frequency, 1 - null_frequency], - ) - if null_frequency > 0.0 - else None, - size=size, - safe=False, - type=dtype.type, - ) - - -def get_nested_lists(dtype, size, nesting_depth, lists_max_length): - """ - Returns a list of nested lists with random nesting - depth and random nested lists length. - """ - list_of_lists = [] - - while len(list_of_lists) <= size: - list_of_lists.extend( - make_lists( - dtype=dtype, - lists_max_length=lists_max_length, - nesting_depth=nesting_depth, - top_level_list=[], - ) - ) - - return list_of_lists - - -def get_nested_structs(dtype, cardinality, size, max_null_frequency): - """ - Returns a list of arrays with random data - corresponding to the dtype provided. 
- ``dtype`` here should be a ``cudf.StructDtype`` - """ - list_of_arrays = [] - - for name, col_dtype in dtype.fields.items(): - if isinstance(col_dtype, cudf.StructDtype): - result_arrays = get_nested_structs( - col_dtype, cardinality, size, max_null_frequency - ) - result_arrays = pa.StructArray.from_arrays( - result_arrays, names=col_dtype.fields.keys() - ) - else: - result_arrays = make_array_for_struct( - dtype=dtype._typ[name], - cardinality=cardinality, - size=size, - max_null_frequency=max_null_frequency, - ) - list_of_arrays.append(result_arrays) - - return list_of_arrays - - -def list_generator(dtype, size, nesting_depth, lists_max_length): - """ - Generator for list data - """ - return lambda: get_nested_lists( - dtype=dtype, - size=size, - nesting_depth=nesting_depth, - lists_max_length=lists_max_length, - ) - - -def struct_generator(dtype, cardinality, size, max_null_frequency): - """ - Generator for struct data - """ - return lambda: get_nested_structs( - dtype=dtype, - cardinality=cardinality, - size=size, - max_null_frequency=max_null_frequency, - ) - - -def create_nested_struct_type(max_types_at_each_level, nesting_level): - dtypes_list = cudf.utils.dtypes.ALL_TYPES - picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level) - type_dict = {} - for name, type_ in enumerate(picked_types): - if type_ == "struct": - type_dict[str(name)] = create_nested_struct_type( - max_types_at_each_level, nesting_level - 1 - ) - else: - type_dict[str(name)] = cudf.dtype(type_) - return cudf.StructDtype(type_dict) - - -def _generate_string(str_seq: str, length: int = 10) -> str: - return "".join(random.choices(str_seq, k=length)) - - -def _unique_string() -> str: - return str(uuid.uuid4()).replace("-", "") diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py deleted file mode 100644 index 668e7a77454..00000000000 --- a/python/cudf/cudf/testing/testing.py +++ /dev/null @@ -1,812 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
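A minimal usage sketch of the generator module removed above (assuming it was importable as `cudf.testing.dataset_generator`; per its docstring, `rand_dataframe` returns a PyArrow Table):

    from cudf.testing import dataset_generator as dg  # assumed import path

    tbl = dg.rand_dataframe(
        dtypes_meta=[{"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}],
        rows=100,
        seed=0,
        use_threads=False,  # generate columns serially
    )
    pdf = tbl.to_pandas()  # pyarrow.Table -> pandas.DataFrame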
- -from __future__ import annotations - -import warnings - -import cupy as cp -import numpy as np -import pandas as pd -from pandas import testing as tm - -import cudf -from cudf._lib.unary import is_nan -from cudf.api.types import is_numeric_dtype, is_string_dtype -from cudf.core.missing import NA, NaT - - -def dtype_can_compare_equal_to_other(dtype): - # return True if values of this dtype can compare - # as equal to equal values of a different dtype - return not ( - is_string_dtype(dtype) - or isinstance( - dtype, - ( - cudf.IntervalDtype, - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype, - ), - ) - ) - - -def _check_isinstance(left, right, obj): - if not isinstance(left, obj): - raise AssertionError( - f"{obj} Expected type {obj}, found {type(left)} instead" - ) - elif not isinstance(right, obj): - raise AssertionError( - f"{obj} Expected type {obj}, found {type(right)} instead" - ) - - -def raise_assert_detail(obj, message, left, right, diff=None): - msg = f"""{obj} are different - -{message} -[left]: {left} -[right]: {right}""" - - if diff is not None: - msg += f"\n[diff]: {diff}" - - raise AssertionError(msg) - - -def _check_types( - left, right, check_categorical=True, exact="equiv", obj="Index" -): - if not exact or exact == "equiv": - if ( - isinstance(left, cudf.RangeIndex) - and ( - isinstance(right, cudf.Index) - and hasattr(right, "dtype") - and right.dtype.kind == "i" - ) - ) or ( - isinstance(right, cudf.RangeIndex) - and ( - isinstance(left, cudf.Index) - and hasattr(left, "dtype") - and left.dtype.kind == "i" - ) - ): - return - - if type(left) != type(right): - raise_assert_detail( - obj, "Class types are different", f"{type(left)}", f"{type(right)}" - ) - - if ( - exact - and not isinstance(left, cudf.MultiIndex) - and isinstance(left.dtype, cudf.CategoricalDtype) - ): - if left.dtype != right.dtype: - raise_assert_detail( - obj, "Categorical difference", f"{left}", f"{right}" - ) - - -def assert_column_equal( - left, - right, - check_dtype=True, - check_column_type="equiv", - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - check_category_order=True, - rtol=1e-05, - atol=1e-08, - obj="ColumnBase", -): - """ - Check that left and right columns are equal - - This function is intended to compare two columns and output - any differences. Additional parameters allow varying the strictness - of the equality checks performed. - - Parameters - ---------- - left : Column - left Column to compare - right : Column - right Column to compare - check_dtype : bool, default True - Whether to check the Column dtype is identical. - check_column_type : bool or {'equiv'}, default 'equiv' - Whether to check the columns class, dtype and - inferred_type are identical. Currently it is idle, - and similar to pandas. - check_exact : bool, default False - Whether to compare number exactly. - check_datetime_like_compat : bool, default False - Compare datetime-like which is comparable ignoring dtype. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - check_category_order : bool, default True - Whether to compare category order of internal Categoricals - rtol : float, default 1e-5 - Relative tolerance. Only used when `check_exact` is False. - atol : float, default 1e-8 - Absolute tolerance. Only used when `check_exact` is False. - obj : str, default 'ColumnBase' - Specify object name being compared, internally used to - show appropriate assertion message. 
- """ - if check_dtype is True: - if ( - isinstance(left.dtype, cudf.CategoricalDtype) - and isinstance(right.dtype, cudf.CategoricalDtype) - and not check_categorical - ): - pass - else: - if type(left) != type(right) or left.dtype != right.dtype: - msg1 = f"{left.dtype}" - msg2 = f"{right.dtype}" - raise_assert_detail(obj, "Dtypes are different", msg1, msg2) - else: - if left.null_count == len(left) and right.null_count == len(right): - return True - - if check_datetimelike_compat: - if left.dtype.kind == "M": - right = right.astype(left.dtype) - elif right.dtype.kind == "M": - left = left.astype(right.dtype) - - if left.dtype.kind == "M": - if not left.equals(right): - raise AssertionError( - f"[datetimelike_compat=True] {left.values} " - f"is not equal to {right.values}." - ) - return - - if check_exact and check_categorical: - if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance( - right.dtype, cudf.CategoricalDtype - ): - left_cat = left.categories - right_cat = right.categories - - if check_category_order: - assert_index_equal( - left_cat, - right_cat, - exact=check_dtype, - check_exact=True, - check_categorical=False, - rtol=rtol, - atol=atol, - ) - assert_column_equal( - left.codes, - right.codes, - check_dtype=check_dtype, - check_exact=True, - check_categorical=False, - check_category_order=False, - rtol=rtol, - atol=atol, - ) - - if left.ordered != right.ordered: - msg1 = f"{left.ordered}" - msg2 = f"{right.ordered}" - raise_assert_detail( - f"{obj} category", "Orders are different", msg1, msg2 - ) - - if ( - not check_dtype - and isinstance(left.dtype, cudf.CategoricalDtype) - and isinstance(right.dtype, cudf.CategoricalDtype) - ): - left = left.astype(left.categories.dtype) - right = right.astype(right.categories.dtype) - columns_equal = False - if left.size == right.size == 0: - columns_equal = True - elif not ( - ( - not dtype_can_compare_equal_to_other(left.dtype) - and is_numeric_dtype(right.dtype) - ) - or ( - is_numeric_dtype(left.dtype) - and not dtype_can_compare_equal_to_other(right.dtype) - ) - ): - try: - # nulls must be in the same places for all dtypes - columns_equal = cp.all( - left.isnull().values == right.isnull().values - ) - - if ( - columns_equal - and not check_exact - and is_numeric_dtype(left.dtype) - ): - # non-null values must be the same - columns_equal = cp.allclose( - left.apply_boolean_mask( - left.isnull().unary_operator("not") - ).values, - right.apply_boolean_mask( - right.isnull().unary_operator("not") - ).values, - ) - if columns_equal and ( - left.dtype.kind == right.dtype.kind == "f" - ): - columns_equal = cp.all( - is_nan(left).values == is_nan(right).values - ) - else: - columns_equal = left.equals(right) - except TypeError as e: - if str(e) != "Categoricals can only compare with the same type": - raise e - else: - columns_equal = False - if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance( - right.dtype, cudf.CategoricalDtype - ): - left = left.astype(left.categories.dtype) - right = right.astype(right.categories.dtype) - if not columns_equal: - try: - ldata = str([val for val in left.to_pandas(nullable=True)]) - rdata = str([val for val in right.to_pandas(nullable=True)]) - except NotImplementedError: - ldata = str([val for val in left.to_pandas(nullable=False)]) - rdata = str([val for val in right.to_pandas(nullable=False)]) - try: - diff = 0 - for i in range(left.size): - if not null_safe_scalar_equals(left[i], right[i]): - diff += 1 - diff = diff * 100.0 / left.size - except BaseException: - diff = 100.0 - 
raise_assert_detail(
-            obj,
-            f"values are different ({np.round(diff, 5)} %)",
-            ldata,
-            rdata,
-        )
-
-
-def null_safe_scalar_equals(left, right):
-    if left in {NA, NaT, np.nan} or right in {NA, NaT, np.nan}:
-        return left is right
-    return left == right
-
-
-def assert_index_equal(
-    left,
-    right,
-    exact="equiv",
-    check_names: bool = True,
-    check_exact: bool = True,
-    check_categorical: bool = True,
-    check_order: bool = True,
-    rtol: float = 1e-5,
-    atol: float = 1e-8,
-    obj: str = "Index",
-):
-    """
-    Check that left and right Index are equal
-
-    This function is intended to compare two Index and output
-    any differences. Additional parameters allow varying the strictness
-    of the equality checks performed.
-
-    Parameters
-    ----------
-    left : Index
-        left Index to compare
-    right : Index
-        right Index to compare
-    exact : bool or {'equiv'}, default 'equiv'
-        Whether to check the Index class, dtype and inferred_type
-        are identical. If 'equiv', then RangeIndex can be substituted
-        for Index with an int8/int32/int64 dtype as well.
-    check_names : bool, default True
-        Whether to check the names attribute.
-    check_exact : bool, default True
-        Whether to compare number exactly.
-    check_categorical : bool, default True
-        Whether to compare internal Categorical exactly.
-    check_order : bool, default True
-        Whether to compare the order of index entries as
-        well as their values.
-        If True, both indexes must contain the same elements,
-        in the same order.
-        If False, both indexes must contain the same elements,
-        but in any order.
-    rtol : float, default 1e-5
-        Relative tolerance. Only used when `check_exact` is False.
-    atol : float, default 1e-8
-        Absolute tolerance. Only used when `check_exact` is False.
-    obj : str, default 'Index'
-        Specify object name being compared, internally used to
-        show appropriate assertion message.
-
-    Examples
-    --------
-    >>> import cudf
-    >>> id1 = cudf.Index([1, 2, 3, 4], name="a")
-    >>> id2 = cudf.Index([1, 2, 3, 5])
-    >>> cudf.testing.assert_index_equal(id1, id2)
-    ......
-    ......
-    AssertionError: ColumnBase are different
-
-    values are different (25.0 %)
-    [left]: [1 2 3 4]
-    [right]: [1 2 3 5]
-
-    >>> id2 = cudf.Index([1, 2, 3, 4], name="b")
-    >>> cudf.testing.assert_index_equal(id1, id2)
-    ......
-    ......
- AssertionError: Index are different - - name mismatch - [left]: a - [right]: b - - This will pass without any hitch: - - >>> id2 = cudf.Index([1, 2, 3, 4], name="a") - >>> cudf.testing.assert_index_equal(id1, id2) - """ - - # instance validation - _check_isinstance(left, right, cudf.BaseIndex) - - _check_types( - left, right, exact=exact, check_categorical=check_categorical, obj=obj - ) - - if len(left) != len(right): - raise_assert_detail( - obj, "lengths are different", f"{len(left)}", f"{len(right)}" - ) - - # If order doesn't matter then sort the index entries - if not check_order: - left = left.sort_values() - right = right.sort_values() - - if isinstance(left, cudf.MultiIndex): - if left.nlevels != right.nlevels: - raise AssertionError( - "Number of levels mismatch, " - f"left has {left.nlevels} levels and right has {right.nlevels}" - ) - - for level in range(left.nlevels): - llevel = cudf.Index._from_column( - left._columns[level], name=left.names[level] - ) - rlevel = cudf.Index._from_column( - right._columns[level], name=right.names[level] - ) - mul_obj = f"MultiIndex level [{level}]" - assert_index_equal( - llevel, - rlevel, - exact=check_exact, - check_names=check_names, - check_exact=check_exact, - check_order=check_order, - rtol=rtol, - atol=atol, - obj=mul_obj, - ) - return - assert_column_equal( - left._columns[0], - right._columns[0], - check_dtype=exact, - check_exact=check_exact, - check_categorical=check_categorical, - obj=obj, - ) - - # metadata comparison - if check_names and (left.name != right.name): - raise_assert_detail( - obj, "name mismatch", f"{left.name}", f"{right.name}" - ) - - -def assert_series_equal( - left, - right, - check_dtype=True, - check_index_type="equiv", - check_series_type=True, - check_names=True, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - check_category_order=True, - rtol=1e-5, - atol=1e-8, - obj="Series", -): - """ - Check that left and right Series are equal - - This function is intended to compare two Series and output - any differences. Additional parameters allow varying the strictness - of the equality checks performed. - - Parameters - ---------- - left : Series - left Series to compare - right : Series - right Series to compare - check_dtype : bool, default True - Whether to check the Series dtype is identical. - check_index_type : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. - check_series_type : bool, default True - Whether to check the series class, dtype and - inferred_type are identical. Currently it is idle, - and similar to pandas. - check_names : bool, default True - Whether to check that the names attribute for both the index - and column attributes of the Series is identical. - check_exact : bool, default False - Whether to compare number exactly. - check_datetime_like_compat : bool, default False - Compare datetime-like which is comparable ignoring dtype. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - check_category_order : bool, default True - Whether to compare category order of internal Categoricals - rtol : float, default 1e-5 - Relative tolerance. Only used when `check_exact` is False. - atol : float, default 1e-8 - Absolute tolerance. Only used when `check_exact` is False. - obj : str, default 'Series' - Specify object name being compared, internally used to - show appropriate assertion message. 
- - Examples - -------- - >>> import cudf - >>> sr1 = cudf.Series([1, 2, 3, 4], name="a") - >>> sr2 = cudf.Series([1, 2, 3, 5], name="b") - >>> cudf.testing.assert_series_equal(sr1, sr2) - ...... - ...... - AssertionError: ColumnBase are different - - values are different (25.0 %) - [left]: [1 2 3 4] - [right]: [1 2 3 5] - - >>> sr2 = cudf.Series([1, 2, 3, 4], name="b") - >>> cudf.testing.assert_series_equal(sr1, sr2) - ...... - ...... - AssertionError: Series are different - - name mismatch - [left]: a - [right]: b - - This will pass without any hitch: - - >>> sr2 = cudf.Series([1, 2, 3, 4], name="a") - >>> cudf.testing.assert_series_equal(sr1, sr2) - """ - - # instance validation - _check_isinstance(left, right, cudf.Series) - - if len(left) != len(right): - msg1 = f"{len(left)}, {left.index}" - msg2 = f"{len(right)}, {right.index}" - raise_assert_detail(obj, "Series length are different", msg1, msg2) - - # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f"{obj}.index", - ) - - assert_column_equal( - left._column, - right._column, - check_dtype=check_dtype, - check_column_type=check_series_type, - check_exact=check_exact, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - check_category_order=check_category_order, - rtol=rtol, - atol=atol, - ) - - # metadata comparison - if check_names and (left.name != right.name): - raise_assert_detail( - obj, "name mismatch", f"{left.name}", f"{right.name}" - ) - - -def assert_frame_equal( - left, - right, - check_dtype=True, - check_index_type="equiv", - check_column_type="equiv", - check_frame_type=True, - check_names=True, - by_blocks=False, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - check_like=False, - rtol=1e-5, - atol=1e-8, - obj="DataFrame", -): - """ - Check that left and right DataFrame are equal - - This function is intended to compare two DataFrame and output - any differences. Additional parameters allow varying the strictness - of the equality checks performed. - - Parameters - ---------- - left : DataFrame - left DataFrame to compare - right : DataFrame - right DataFrame to compare - check_dtype : bool, default True - Whether to check the DataFrame dtype is identical. - check_index_type : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. - check_column_type : bool, default True - Whether to check the column class, dtype and - inferred_type are identical. Currently it is idle, - and similar to pandas. - check_frame_type : bool, default True - Whether to check the DataFrame class is identical. - check_names : bool, default True - Whether to check that the names attribute for both the index and - column attributes of the DataFrame is identical. - check_exact : bool, default False - Whether to compare number exactly. - by_blocks : bool, default False - Not supported - check_exact : bool, default False - Whether to compare number exactly. - check_datetime_like_compat : bool, default False - Compare datetime-like which is comparable ignoring dtype. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - check_like : bool, default False - If True, ignore the order of index & columns. 
- Note: index labels must match their respective - rows (same as in columns) - same labels must be with the same data. - rtol : float, default 1e-5 - Relative tolerance. Only used when `check_exact` is False. - atol : float, default 1e-8 - Absolute tolerance. Only used when `check_exact` is False. - obj : str, default 'DataFrame' - Specify object name being compared, internally used to - show appropriate assertion message. - - Examples - -------- - >>> import cudf - >>> df1 = cudf.DataFrame({"a":[1, 2], "b":[1.0, 2.0]}, index=[1, 2]) - >>> df2 = cudf.DataFrame({"a":[1, 2], "b":[1.0, 2.0]}, index=[2, 3]) - >>> cudf.testing.assert_frame_equal(df1, df2) - ...... - ...... - AssertionError: ColumnBase are different - - values are different (100.0 %) - [left]: [1 2] - [right]: [2 3] - - >>> df2 = cudf.DataFrame({"a":[1, 2], "c":[1.0, 2.0]}, index=[1, 2]) - >>> cudf.testing.assert_frame_equal(df1, df2) - ...... - ...... - AssertionError: DataFrame.columns are different - - DataFrame.columns values are different (50.0 %) - [left]: Index(['a', 'b'], dtype='object') - right]: Index(['a', 'c'], dtype='object') - - >>> df2 = cudf.DataFrame({"a":[1, 2], "b":[1.0, 3.0]}, index=[1, 2]) - >>> cudf.testing.assert_frame_equal(df1, df2) - ...... - ...... - AssertionError: Column name="b" are different - - values are different (50.0 %) - [left]: [1. 2.] - [right]: [1. 3.] - - This will pass without any hitch: - - >>> df2 = cudf.DataFrame({"a":[1, 2], "b":[1.0, 2.0]}, index=[1, 2]) - >>> cudf.testing.assert_frame_equal(df1, df2) - """ - _check_isinstance(left, right, cudf.DataFrame) - - if check_frame_type: - assert isinstance(left, type(right)) - - # shape comparison - if left.shape != right.shape: - raise AssertionError("left and right shape mismatch") - - if check_like: - left, right = left.reindex(index=right.index), right - right = right[list(left._column_names)] - - # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f"{obj}.index", - ) - - pd.testing.assert_index_equal( - left._data.to_pandas_index(), - right._data.to_pandas_index(), - exact=check_column_type, - check_names=check_names, - check_exact=check_exact, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f"{obj}.columns", - ) - - for col in left._column_names: - assert_column_equal( - left._data[col], - right._data[col], - check_dtype=check_dtype, - check_exact=check_exact, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - rtol=rtol, - atol=atol, - obj=f'Column name="{col}"', - ) - - -def assert_eq(left, right, **kwargs): - """Assert that two cudf-like things are equivalent - - Parameters - ---------- - left - Object to compare - right - Object to compare - kwargs - Keyword arguments to control behaviour of comparisons. See - :func:`assert_frame_equal`, :func:`assert_series_equal`, and - :func:`assert_index_equal`. - - Notes - ----- - This equality test works for pandas/cudf dataframes/series/indexes/scalars - in the same way, and so makes it easier to perform parametrized testing - without switching between assert_frame_equal/assert_series_equal/... - functions. - - Raises - ------ - AssertionError - If the two objects do not compare equal. - """ - # dtypes that we support but Pandas doesn't will convert to - # `object`. 
Check equality before that happens: - if kwargs.get("check_dtype", True): - if hasattr(left, "dtype") and hasattr(right, "dtype"): - if isinstance( - left.dtype, cudf.core.dtypes._BaseDtype - ) and not isinstance( - left.dtype, cudf.CategoricalDtype - ): # leave categorical comparison to Pandas - assert_eq(left.dtype, right.dtype) - - if hasattr(left, "to_pandas"): - left = left.to_pandas() - if hasattr(right, "to_pandas"): - right = right.to_pandas() - if isinstance(left, cp.ndarray): - left = cp.asnumpy(left) - if isinstance(right, cp.ndarray): - right = cp.asnumpy(right) - - if isinstance(left, (pd.DataFrame, pd.Series, pd.Index)): - # TODO: A warning is emitted from the function - # pandas.testing.assert_[series, frame, index]_equal for some inputs: - # "DeprecationWarning: elementwise comparison failed; this will raise - # an error in the future." - # or "FutureWarning: elementwise ..." - # This warning comes from a call from pandas to numpy. It is ignored - # here because it cannot be fixed within cudf. - with warnings.catch_warnings(): - warnings.simplefilter( - "ignore", (DeprecationWarning, FutureWarning) - ) - if isinstance(left, pd.DataFrame): - tm.assert_frame_equal(left, right, **kwargs) - elif isinstance(left, pd.Series): - tm.assert_series_equal(left, right, **kwargs) - else: - tm.assert_index_equal(left, right, **kwargs) - - elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - if left.dtype.kind == "f" and right.dtype.kind == "f": - assert np.allclose(left, right, equal_nan=True) - else: - assert np.array_equal(left, right) - else: - # Use the overloaded __eq__ of the operands - if left == right: - return True - elif any(np.issubdtype(type(x), np.floating) for x in (left, right)): - np.testing.assert_almost_equal(left, right) - else: - np.testing.assert_equal(left, right) - return True - - -def assert_neq(left, right, **kwargs): - """Assert that two cudf-like things are not equal. - - Provides the negation of the meaning of :func:`assert_eq`. - """ - __tracebackhide__ = True - try: - assert_eq(left, right, **kwargs) - except AssertionError: - pass - else: - raise AssertionError diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py deleted file mode 100644 index 437bc4cba67..00000000000 --- a/python/cudf/cudf/tests/conftest.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import itertools -import os -import pathlib - -import cupy as cp -import numpy as np -import pytest - -import rmm # noqa: F401 - -import cudf -from cudf.testing import assert_eq - -_CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent) - - -@pytest.fixture(scope="session") -def datadir(): - return pathlib.Path(__file__).parent / "data" - - -@pytest.fixture( - params=itertools.product([0, 2, None], [0.3, None]), - ids=lambda arg: f"n={arg[0]}-frac={arg[1]}", -) -def sample_n_frac(request): - """ - Specific to `test_sample*` tests. - """ - n, frac = request.param - if n is not None and frac is not None: - pytest.skip("Cannot specify both n and frac.") - return n, frac - - -def shape_checker(expected, got): - assert expected.shape == got.shape - - -def exact_checker(expected, got): - assert_eq(expected, got) - - -@pytest.fixture( - params=[ - (None, None, shape_checker), - (42, 42, shape_checker), - (np.random.RandomState(42), np.random.RandomState(42), exact_checker), - ], - ids=["None", "IntSeed", "NumpyRandomState"], -) -def random_state_tuple_axis_1(request): - """ - Specific to `test_sample*_axis_1` tests. 
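The deleted conftest.py above imports `assert_eq` from `cudf.testing`; a hedged sketch (not part of the diff) of how that helper from the removed testing.py is typically called:

    import pandas as pd

    import cudf
    from cudf.testing import assert_eq

    gdf = cudf.DataFrame({"a": [1, 2, 3]})
    pdf = pd.DataFrame({"a": [1, 2, 3]})
    assert_eq(gdf, pdf)            # cudf and pandas objects compare the same way
    assert_eq(gdf["a"], pdf["a"])  # also works for Series, Index, and scalars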
-    A pytest fixture of valid `random_state` parameter pairs for pandas
-    and cudf. Valid parameter combinations, and what to check for each pair
-    are listed below:
-
-    pandas: None, seed(int), np.random.RandomState
-    cudf: None, seed(int), np.random.RandomState
-    ------
-    check: shape, shape, exact result
-
-    Each column above stands for one valid parameter combination and check.
-    """
-
-    return request.param
-
-
-@pytest.fixture(
-    params=[
-        (None, None, shape_checker),
-        (42, 42, shape_checker),
-        (np.random.RandomState(42), np.random.RandomState(42), exact_checker),
-        (np.random.RandomState(42), cp.random.RandomState(42), shape_checker),
-    ],
-    ids=["None", "IntSeed", "NumpyRandomState", "CupyRandomState"],
-)
-def random_state_tuple_axis_0(request):
-    """
-    Specific to `test_sample*_axis_0` tests.
-    A pytest fixture of valid `random_state` parameter pairs for pandas
-    and cudf. Valid parameter combinations, and what to check for each pair
-    are listed below:
-
-    pandas: None, seed(int), np.random.RandomState, np.random.RandomState
-    cudf: None, seed(int), np.random.RandomState, cp.random.RandomState
-    ------
-    check: shape, shape, exact result, shape
-
-    Each column above stands for one valid parameter combination and check.
-    """
-
-    return request.param
-
-
-@pytest.fixture(params=[None, "builtin_list", "ndarray"])
-def make_weights_axis_0(request):
-    """Specific to `test_sample*_axis_0` tests.
-    Only testing weights array that matches type with random state.
-    """
-
-    if request.param is None:
-        return lambda *_: (None, None)
-    elif request.param == "builtin_list":
-        return lambda size, _: ([1] * size, [1] * size)
-    else:
-
-        def wrapped(size, numpy_weights_for_cudf):
-            # Uniform distribution, non-normalized
-            if numpy_weights_for_cudf:
-                return np.ones(size), np.ones(size)
-            else:
-                return np.ones(size), cp.ones(size)
-
-        return wrapped
-
-
-# To set and remove the NO_EXTERNAL_ONLY_APIS environment variable we must use
-# the sessionstart and sessionfinish hooks rather than a simple autouse,
-# session-scope fixture because we need to set these variables before collection
-# occurs because the environment variable will be checked as soon as cudf is
-# imported anywhere.
-def pytest_sessionstart(session):
-    """
-    Called after the Session object has been created and
-    before performing collection and entering the run test loop.
-    """
-    os.environ["NO_EXTERNAL_ONLY_APIS"] = "1"
-    os.environ["_CUDF_TEST_ROOT"] = _CURRENT_DIRECTORY
-
-
-def pytest_sessionfinish(session, exitstatus):
-    """
-    Called after whole test run finished, right before
-    returning the exit status to the system.
- """ - try: - del os.environ["NO_EXTERNAL_ONLY_APIS"] - del os.environ["_CUDF_TEST_ROOT"] - except KeyError: - pass - - -@pytest.fixture(params=[32, 64]) -def default_integer_bitwidth(request): - old_default = cudf.get_option("default_integer_bitwidth") - cudf.set_option("default_integer_bitwidth", request.param) - yield request.param - cudf.set_option("default_integer_bitwidth", old_default) - - -@pytest.fixture(params=[32, 64]) -def default_float_bitwidth(request): - old_default = cudf.get_option("default_float_bitwidth") - cudf.set_option("default_float_bitwidth", request.param) - yield request.param - cudf.set_option("default_float_bitwidth", old_default) - - -@pytest.hookimpl(tryfirst=True, hookwrapper=True) -def pytest_runtest_makereport(item, call): - """Hook to make result information available in fixtures - - This makes it possible for a pytest.fixture to access the current test - state through `request.node.report`. - See the `manager` fixture in `test_spilling.py` for an example. - - Pytest doc: - """ - outcome = yield - rep = outcome.get_result() - - # Set a report attribute for each phase of a call, which can - # be "setup", "call", "teardown" - setattr(item, "report", {rep.when: rep}) diff --git a/python/cudf/cudf/tests/data/__init__.py b/python/cudf/cudf/tests/data/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/data/avro/__init__.py b/python/cudf/cudf/tests/data/avro/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/data/avro/example.avro b/python/cudf/cudf/tests/data/avro/example.avro deleted file mode 100644 index 4a95d1abe86..00000000000 Binary files a/python/cudf/cudf/tests/data/avro/example.avro and /dev/null differ diff --git a/python/cudf/cudf/tests/data/ipums.pkl b/python/cudf/cudf/tests/data/ipums.pkl deleted file mode 100644 index 5c8e896487d..00000000000 Binary files a/python/cudf/cudf/tests/data/ipums.pkl and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.AllNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.AllNulls.orc deleted file mode 100644 index 1c661e1c6f0..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.AllNulls.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.EmptyListStripe.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.EmptyListStripe.orc deleted file mode 100644 index edc1094a186..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.EmptyListStripe.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.NullStructStripe.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.NullStructStripe.orc deleted file mode 100644 index fe5f57af14c..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.NullStructStripe.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyList.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyList.orc deleted file mode 100644 index 53c323436d6..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyList.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyMap.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyMap.orc deleted file mode 100644 index 1bb4079c492..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneEmptyMap.orc and /dev/null differ diff --git 
a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneNullStruct.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneNullStruct.orc deleted file mode 100644 index a457b8285bd..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.Hive.OneNullStruct.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet b/python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet deleted file mode 100644 index a80ce5fbd25..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NestedStructDataFrame.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NestedStructDataFrame.orc deleted file mode 100644 index 6cfb2238150..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.NestedStructDataFrame.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc deleted file mode 100644 index 2103e0212fc..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.IntWithNulls.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc deleted file mode 100644 index e57da851820..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc deleted file mode 100644 index 32d0c85dd25..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc deleted file mode 100644 index 1c6e53a0b92..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.NoIndStrm.StructWithNoNulls.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.RLEv2.orc deleted file mode 100644 index 26535e09549..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.RLEv2.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.EmptyDecompData.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.EmptyDecompData.orc deleted file mode 100644 index a13b19efa86..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.EmptyDecompData.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.NestedNotNullableStruct.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.NestedNotNullableStruct.orc deleted file mode 100644 index 91efff903d0..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.Spark.NestedNotNullableStruct.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.apache_timestamp.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.apache_timestamp.orc deleted file mode 100644 index dd51856c3f7..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.apache_timestamp.orc and /dev/null differ diff --git 
a/python/cudf/cudf/tests/data/orc/TestOrcFile.boolean_corruption_PR_6636.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.boolean_corruption_PR_6636.orc deleted file mode 100644 index 86b07f024cc..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.boolean_corruption_PR_6636.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.boolean_corruption_PR_6702.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.boolean_corruption_PR_6702.orc deleted file mode 100644 index 8705022b3e8..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.boolean_corruption_PR_6702.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.multiple.values.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.multiple.values.orc deleted file mode 100644 index 65af4677941..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.multiple.values.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.orc deleted file mode 100644 index cb0f7b9d767..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.runpos.issue.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.runpos.issue.orc deleted file mode 100644 index 72139acd88a..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.runpos.issue.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.same.values.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.same.values.orc deleted file mode 100644 index b0a24a44c89..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.decimal.same.values.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.demo-12-zlib.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.demo-12-zlib.orc deleted file mode 100644 index 862dd27af27..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.demo-12-zlib.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.emptyFile.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.emptyFile.orc deleted file mode 100644 index ecdadcbff13..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.emptyFile.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.gmt.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.gmt.orc deleted file mode 100644 index 7256130d79d..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.gmt.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.int16.rle.size.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.int16.rle.size.orc deleted file mode 100644 index 2e96e40cda7..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.int16.rle.size.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.int_decimal.precision_19.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.int_decimal.precision_19.orc deleted file mode 100644 index 20df4c1a92f..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.int_decimal.precision_19.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.largeTimestamps.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.largeTimestamps.orc deleted file mode 100644 index 095b7372c89..00000000000 Binary files 
a/python/cudf/cudf/tests/data/orc/TestOrcFile.largeTimestamps.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.lima_timezone.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.lima_timezone.orc deleted file mode 100644 index f40e2d40944..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.lima_timezone.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.nulls-at-end-snappy.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.nulls-at-end-snappy.orc deleted file mode 100644 index 2099c484fde..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.nulls-at-end-snappy.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc deleted file mode 100644 index 4fb0beff868..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.test1.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.testDate1900.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.testDate1900.orc deleted file mode 100644 index f51ffdbd03a..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.testDate1900.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.testDate2038.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.testDate2038.orc deleted file mode 100644 index cd11fa8a4e9..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.testDate2038.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.testPySparkStruct.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.testPySparkStruct.orc deleted file mode 100644 index 7748e901bce..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.testPySparkStruct.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.testSnappy.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.testSnappy.orc deleted file mode 100644 index aa6cc9c9ba1..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.testSnappy.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.testStringAndBinaryStatistics.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.testStringAndBinaryStatistics.orc deleted file mode 100644 index 4282c2a1965..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.testStringAndBinaryStatistics.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.testStripeLevelStats.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.testStripeLevelStats.orc deleted file mode 100644 index 7073bfad071..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.testStripeLevelStats.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.issue.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.issue.orc deleted file mode 100644 index 55494803196..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.issue.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/__init__.py b/python/cudf/cudf/tests/data/orc/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/data/orc/nodata.orc b/python/cudf/cudf/tests/data/orc/nodata.orc deleted file mode 100644 index 0f53e73b0ed..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/nodata.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/orc/uncompressed_snappy.orc 
b/python/cudf/cudf/tests/data/orc/uncompressed_snappy.orc deleted file mode 100644 index fc2b0e2cc6d..00000000000 Binary files a/python/cudf/cudf/tests/data/orc/uncompressed_snappy.orc and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/__init__.py b/python/cudf/cudf/tests/data/parquet/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/data/parquet/bad_dict.parquet b/python/cudf/cudf/tests/data/parquet/bad_dict.parquet deleted file mode 100644 index 5008ac0b22b..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/bad_dict.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/binary_decimal.parquet b/python/cudf/cudf/tests/data/parquet/binary_decimal.parquet deleted file mode 100644 index 24b45d23495..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/binary_decimal.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/brotli_int16.parquet b/python/cudf/cudf/tests/data/parquet/brotli_int16.parquet deleted file mode 100644 index e52ad4e8d11..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/brotli_int16.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet b/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet deleted file mode 100644 index 7f6006a75bf..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/delta_byte_arr.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/delta_encoding.parquet b/python/cudf/cudf/tests/data/parquet/delta_encoding.parquet deleted file mode 100644 index ea6952e5bcd..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/delta_encoding.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/fixed_len_byte_array.parquet b/python/cudf/cudf/tests/data/parquet/fixed_len_byte_array.parquet deleted file mode 100644 index b0ee8f2e4d2..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/fixed_len_byte_array.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/mixed_compression.parquet b/python/cudf/cudf/tests/data/parquet/mixed_compression.parquet deleted file mode 100644 index 663db076717..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/mixed_compression.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/nested-unsigned-malformed.parquet b/python/cudf/cudf/tests/data/parquet/nested-unsigned-malformed.parquet deleted file mode 100644 index 6e1d59aedca..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/nested-unsigned-malformed.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/nested_column_map.parquet b/python/cudf/cudf/tests/data/parquet/nested_column_map.parquet deleted file mode 100644 index 8f6966ed68b..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/nested_column_map.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet b/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet deleted file mode 100644 index 7440d357a12..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/nested_decimal128_file.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/one_level_list.parquet b/python/cudf/cudf/tests/data/parquet/one_level_list.parquet deleted file mode 100644 index f10d3a10290..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/one_level_list.parquet and 
/dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet b/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet deleted file mode 100644 index cd5acd04594..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/one_level_list2.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/one_level_list3.parquet b/python/cudf/cudf/tests/data/parquet/one_level_list3.parquet deleted file mode 100644 index 788e2c05743..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/one_level_list3.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/rle_boolean_encoding.parquet b/python/cudf/cudf/tests/data/parquet/rle_boolean_encoding.parquet deleted file mode 100644 index 6a6de0a9422..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/rle_boolean_encoding.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/spark_decimal.parquet b/python/cudf/cudf/tests/data/parquet/spark_decimal.parquet deleted file mode 100644 index 8672d8bd531..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/spark_decimal.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/spark_timestamp.snappy.parquet b/python/cudf/cudf/tests/data/parquet/spark_timestamp.snappy.parquet deleted file mode 100644 index 41cecca0e06..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/spark_timestamp.snappy.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/spark_zstd.parquet b/python/cudf/cudf/tests/data/parquet/spark_zstd.parquet deleted file mode 100644 index 99b584aa557..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/spark_zstd.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/trailing_nans.parquet b/python/cudf/cudf/tests/data/parquet/trailing_nans.parquet deleted file mode 100644 index a6046ec01f6..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/trailing_nans.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet b/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet deleted file mode 100644 index efde6ff11bf..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/parquet/zstd_huff_tables_bug.parquet b/python/cudf/cudf/tests/data/parquet/zstd_huff_tables_bug.parquet deleted file mode 100644 index 4fb66fd86fc..00000000000 Binary files a/python/cudf/cudf/tests/data/parquet/zstd_huff_tables_bug.parquet and /dev/null differ diff --git a/python/cudf/cudf/tests/data/pkl/__init__.py b/python/cudf/cudf/tests/data/pkl/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl deleted file mode 100644 index 1ec077d10f7..00000000000 Binary files a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl and /dev/null differ diff --git a/python/cudf/cudf/tests/data/sas/cars.sas7bdat b/python/cudf/cudf/tests/data/sas/cars.sas7bdat deleted file mode 100644 index ca5d3474c36..00000000000 Binary files a/python/cudf/cudf/tests/data/sas/cars.sas7bdat and /dev/null differ diff --git a/python/cudf/cudf/tests/data/subword_tokenizer_data/__init__.py b/python/cudf/cudf/tests/data/subword_tokenizer_data/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 
diff --git a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/__init__.py b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt
deleted file mode 100644
index 84b13c9d946..00000000000
--- a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt
+++ /dev/null
@@ -1,4382 +0,0 @@
-26899
-27424
-875
[... remainder of the deleted vocab-hash.txt contents (4,382 lines of numeric hash-table data for the sampled BERT subword-tokenizer test vocabulary) omitted ...]
-2723374776553770162 -105448017 -14284319595218536933 -356832576945 -1987904546 -2789363555876800106 -17063697102470777209 -6584302816815089825 -5727354422913010657 -13944415416121166662 -28311895 -11906248855590275274 -3707523343842937215 -18412827985276633157 -821232589 -18415907 -2676033356038210923 -17257283880273643533 -18331556279224644 -9117971362513815455 -18411981923171237924 -309541536868 -113312346 -46072191 -103416376 -27920126869375123 -160760449 -361131345578 -9234597529149245860 -14835085562484362568 -4585257123188181630 -1413046597527538184 -6208295874376239521 -13217980679449939250 -1966081057 -6101795981361546864 -16384272 -10370417990725208293 -4196703391028741586 -6488236 -63832509 -5153885660580611393 -6155045912821630127 -5197393273132877515 -2625321593698126810 -10720606758114626648 -9870724570745030 -30740204914804024 -91488775 -7792373120121047026 -3579577413 -5458848587100981064 -755605599842665887 -17404805271631431757 -417019921504 -9386257335747873389 -817169327 -18413391979390173264 -71696881 -8328637003859953646 -14665059300281706 -6101796011455220816 -4456589 -13070886371126478108 -8733200714257204941 -10913926882465549337 -29330183088310857 -61800865 -14949273699027977966 -1873618523431110190 -3573803894998305775 -5569296709751605280 -5835546375651263675 -9870724568714358 -42008942 -1746899701160150410 -9664889374910385451 -7406761759861377295 -2625321597996894992 -365428082633 -11888218815508973537 -6311975551774360856 -1408369638 -6101795942670075923 -15515140772745448064 -27638058877519937 -13361048879788721990 -2430665780 -22217020 -538313489 -927164962728314711 -69665238 -27638084672424186 -2573543627316201844 -12320990 -2424942 -18413392009483845719 -3660444556051220001 -18412545947378450486 -154665586 -9870724566681132 -546177847 -2229804632046437624 -5245848917148372136 -15906307047154976446 -827351178595273968 -5780604350074062990 -6350640494756627870 -9198943117821938833 -2676033351739180486 -1192315303887243384 -67633599 -6205475723246636047 -17419818910382754661 -162529937 -17083693235326683482 -105185869 -8912366315847026281 -5249797202674912471 -2446394423 -1461650414 -257426098 -17299513133793348673 -4451048243670025981 -14597841535548131734 -14130457194541352666 -15290525359355331959 -9195012299735698785 -524354306 -429916226796 -6153353788611431303 -1728578573 -6153071806602085789 -2676033356037948725 -8257735 -2785415326238575484 -1873618489038408278 -8072726556923202784 -7731878007432940921 -16271603835638319461 -11229884474259868248 -5835546388547569431 -2704904949958969710 -103154228 -2625321589399096275 -6887529782530082437 -45810044 -16365628939578247566 -4408861808311732424 -3554388240579364748 -3431353251379022211 -4131548706499659810 -3229097897723824621 -818938814 -16122124 -10831084194895235709 -6226088 -6366071472254485645 -10441809166173275876 -9538952396691934382 -5994450030541998229 -6835382734606174906 -4397798273518472097 -2625321593697864817 -9870724570481756 -17782439637510195701 -31304332299601191 -4074350515307087985 -10758418391935682553 -11405246090117384413 -196018851 -17943317531894613402 -15289397375426759758 -1801651221 -12716605781588708278 -5353476789790574588 -1873618450346936800 -14462121002204464918 -2785415309041207732 -71434733 -10770155859627543824 -1873618476141841211 -5780604362970367638 -2530739313276357975 -14090480 -5567604589840172352 -296644709200 -11266915032714840583 -4194441 -2200512120787569683 -2549492329236335496 -6211116016906930204 -99090988 -9625506809262378259 -13237708535585570818 
-490103571663 -14541340640523322842 -9870724568450966 -1793158821936040552 -9486667438472824267 -21954873 -538051341 -1398211555 -5408700909154273182 -5356297014005859746 -8444237263823374707 -69403090 -2599235317101562153 -15897859265386515143 -6097847713031849822 -2162794 -9796067026192895123 -13117159209037203716 -164299420 -17088031212435737557 -8099682237308012832 -8971880411373045432 -3099205763721988894 -9870724566418979 -545915701 -13237708565679243273 -4449074137450482853 -18115860927276518423 -5247593352907982888 -16533468055605152863 -1873618458944474091 -19923244 -3188833116656765520 -2676033351738918494 -4501477955215362649 -17621268784989013395 -14581169549127125939 -6206321707968234614 -33278352538406314 -516227820 -6890349946557761313 -1411918553413126104 -162267790 -2474797953316292924 -1694703987789596868 -18172096623373846790 -28766090095429261 -1223976979390989739 -3221822110943152678 -104923721 -15185362616787929146 -10003084053115964048 -2625321585100065781 -437798118096833445 -1815348248 -31304323701802109 -152371807 -14046027923586223423 -2021331689141374237 -20869691006257762 -13044533461223476582 -16778219695595128445 -12057002331826554305 -17465760298758178660 -7576852735584046364 -129168850403198609 -820708298 -17891616 -1873618489038145001 -7995587 -11911353550167017696 -4522983015860209939 -12612941966326959190 -102892081 -2625321589398833886 -45547899 -11548493110908749415 -4076606693818764590 -7851156332894489575 -12779163922391107832 -5991347884505304103 -1095239150174145285 -3863606920688567965 -10771469979967884371 -15859976 -14312864964518020808 -17245750799710423012 -5963940 -10655291933708585535 -4162099616697747321 -63308215 -1873618519131818153 -30176189305784773 -53412232 -318140582948 -15611911946388048179 -12640696470018459947 -30176223702288623 -9870724570219682 -33278412725750974 -1409876968 -28766150282773591 -1873618450346674286 -15290243360148359553 -14036340911856223966 -6365225461738636619 -816645035 -417019398489 -6206321673575531611 -12057284352529139627 -71172585 -13828334 -7528870385169533979 -5832726134240118664 -2785415334835848520 -2572415553107265488 -61276571 -3932293 -9870724568188981 -1873618549225491555 -2360543918673038210 -98828841 -12512221777814685432 -17939922315943150958 -6045857707735386835 -21692726 -4502324038816629924 -11490081257974859839 -17639632887023929831 -1316357237551401394 -6101795994259359091 -11796695 -69140942 -18411699889572151318 -12074216556992400767 -1320813529 -8618954206934993224 -164037275 -4160546838840674266 -12591757708863407913 -555549513 -9870724566156739 -154141293 -32714414313178248 -545653553 -223556471268 -12613788024133322735 -812581780 -5778348150066318224 -1500709877 -6741138607599781046 -9227353569080969220 -515965674 -13884327378110449525 -18411699919665823773 -16340493341965880015 -162005644 -620757861 -21997756618049241 -17007720368052373541 -13001845694847518363 -227855238971 -17629469 -1737950228 -9288263741171697848 -20305615210743190 -1873618489037883086 -18613533990193666 -7733439 -313841551493 -15288551330518206781 -17302333254828493968 -6153071832396467338 -2979056014524680527 -8857706336766199103 -2625321589398571980 -45285754 -5991347884505041337 -4502324004423927097 -16874702537456224943 -14911447610171655366 -13944990587222231178 -3308118261903721908 -18413109975884759113 -8412057600244518110 -15597828 -2538734651 -818414521 -17082847207615236134 -18276979644936029994 -5701792 -63046067 -5882159696614657105 -1410790466305853323 -18412263913779363880 -32714379920475611 
-539325825270679628 -1873618519131556994 -13536993689470216 -9870724569957729 -43254135 -5153885686374731086 -9387385384162626351 -8336200085500660803 -5303047104041388600 -5512098595943810546 -5717788221838658971 -2324121364801391676 -12012735189037878155 -2192639020 -1873618476141316771 -70910437 -3670145 -2219404100148201532 -2544580112253650683 -61014424 -6155045921420412650 -18412263943873036335 -1873618549225229533 -9870724567926898 -98566694 -29894215892535509 -155910777 -6366353527348399255 -9956242218935388443 -31586340104504804 -219257441372 -13522668389390157414 -18411417881767641102 -11534547 -279448847671 -7242736046355514492 -68878794 -814351263 -1192315299587689576 -2524775482 -34124461934314600 -507839197 -5539270545646881104 -4974759074281293673 -5337229686545450161 -153879145 -12644080653952551280 -30458205707308380 -545391405 -17877509356004052233 -17520266449292560845 -11065487246536017596 -2011949215506761725 -6155045882728942511 -812319634 -1130753852548581517 -573047641 -5299098874402571932 -18413674000091971675 -18331556280207363 -17269866578628118199 -15289397293744523027 -161743496 -10649664295314066054 -6051485356288903427 -4347925833116091776 -30458188511970924 -104399431 -10184384893691038634 -7401639761433855789 -1308623824 -563151692 -2625321610894444316 -7239069803025663720 -11434534198373320614 -1873618441748613384 -5622264654903379074 -29330122899915877 -15636380174699072146 -820184006 -2597848126 -10233694917695638297 -14585410861575638263 -7471291 -85348920764927349 -6366353492955694732 -18413674030185644130 -4127600472562141528 -35127645 -5780604337176709161 -541328159 -2524806001290315567 -13850612818404510827 -18412827968080248897 -15335680 -3493395603981665996 -17858552114457937219 -62783919 -3875793754648151904 -5564423899624572258 -292345154665 -3489447322753895731 -18411981905974853664 -5439644 -42991988 -9870724569695611 -12269921124804135698 -559088458 -33278386930321618 -15289397353931868100 -214958409445 -6219166245997316001 -15289397379726773461 -30458248699315998 -23200068 -12163381674616883890 -70648289 -9000175594581527004 -806224763 -89657146100418951 -15475002888547338265 -3407997 -60752278 -18411981936068526119 -14267039342724252928 -13726068525522684375 -1873618527730862181 -4504298213822565083 -155648632 -98304546 -9870724567665640 -13681696359428851594 -219257178788 -24535844054893958 -50011031689890353 -10532987940533372886 -11272401 -23407795639356361 -68616647 -814089116 -15635925519041823968 -1998521381 -163512984 -797977540607610221 -32150286927595340 -4709060078846741586 -5967447917778832244 -5885976078596834724 -2625321606595414132 -153616999 -1744643526947965735 -17461812017531651650 -987047180239768912 -30740239306197230 -15288833278135765839 -525337347 -5885976155981547843 -18413391992287461459 -10532987970627045461 -56689033 -5722409915131627177 -114033243 -10159956468397444373 -18412545930182066226 -5349367342193968413 -13819010092172884 -104137283 -17953636526298302297 -2224234517276395067 -2789363555875490728 -2625321610894182276 -12426051065400527122 -9355193091131312182 -30740222110861163 -14361095630442006439 -3137288237381257087 -17105177 -819921860 -7209143 -1727529996 -810025856 -805679481429165719 -17298949057997047589 -21997713627284659 -16120716880803858984 -33560368941433940 -1535706104 -10229733804179524009 -18412545960275738681 -9714916620294556051 -4078298775038527628 -5461104765611607541 -210659378559 -92209676 -13418544886826534789 -14264208172476401284 -1917322269 -197001895 -24969554 
-5405598728725530322 -15073532 -817890229 -72417787 -1873618471842024407 -17091318705916150977 -5946696443085589628 -5177496 -5847102830955857465 -62521771 -1873618523431831649 -5835546371351184527 -14824583848163281869 -42729843 -9870724569433729 -5780604315680310424 -16385074671182940805 -214958147231 -3007753865419557454 -491586249 -17943317531893566468 -1801912319444323213 -22937920 -539034393 -27356055371580547 -1873618476140792146 -5198803303557629187 -6103488088376871190 -13041896 -1733362705 -70386141 -2306802734 -643826540 -3145849 -14637903957824965363 -519242494 -60490131 -805962615 -5784522635265967958 -1873618527730601376 -18301216972082383618 -11644189250161151139 -2625321602296383846 -9870724567402585 -98042399 -15741861301866530650 -494403323033 -6729754102968812754 -546898751 -6208295835683456476 -33560403333875446 -14409153078548760239 -15530271666638163275 -1873618458945456185 -16951650337051970851 -5144036663261072615 -813826970 -12133908888583014197 -68354499 -11010253 -279448324634 -14749580058850363919 -6633286351216577743 -2089265852158774334 -8929038315166239946 -31586271318836879 -13678484518713821516 -105906772 -96010773 -2625321606595152102 -153354852 -10831360821402142464 -5652457623480305518 -8503320935775669540 -16483453074211931840 -363084051790629688 -544867112 -258146996 -5944020284604679310 -5782296431293302176 -28484176870181368 -23407778443758207 -3973491023432910866 -5778348175860436286 -1873618514834032208 -5438906422044199526 -103875135 -7697026996393675938 -1709507593 -161219206 -13237708548482859013 -3701601059573925529 -879419277503368073 -3822179681402096264 -5565348445721659362 -532291916112267238 -256115374 -1460339693 -13351948495571782591 -14665351642484132 -3008657884776564221 -2341393787733871788 -16904712944497920326 -3967850626592737364 -16843031 -4131548702199581670 -6946995 -809763710 -1928986057181235415 -11964228788262537512 -2989761681675848960 -1873618519132801026 -7276444624641068235 -5994450030542718433 -12284124821458521275 -111739480 -4076606646528706921 -13650504529854072320 -15804734059994287439 -14425661019905001872 -2395604016 -14465116522071263669 -210659116497 -15290243360149343057 -15777957523720635747 -10167863869407233224 -18331517588211470 -12884708026702235763 -14811384 -72155640 -7042731044489660311 -15288269305517836796 -5675796551176948530 -14264208198271043974 -1495860210 -5787083718919720300 -25099894056749168 -683965395648908415 -62259623 -4915348 -12974919760129952993 -6155045917120857525 -1873618523431569790 -9013091190501541709 -4392112055939237960 -2625321597997353452 -15897908900500866947 -6177363174264606048 -15872788267758849077 -491324104 -33560399034844286 -22675774 -17542946455516547053 -2431124533 -538772246 -27920040887322186 -8704274751914773568 -12085352355710699032 -6153353775713551670 -70123993 -27356081166223293 -7885152524183078888 -60227983 -2883701 -11700344903086704893 -7329667560521271617 -518980348 -5833854255738521265 -8618954206935976415 -3901910077209972079 -1713308683 -1992881785903908578 -4530582984922301900 -16130159995999161574 -155124341 -2625321602296121720 -1884114794138700522 -5778348218852443426 -97780251 -4240022615453076686 -6097847786116483627 -6361518319333476776 -30540122 -28484146776247610 -546636604 -5741055947585816645 -6100103891543657570 -8807886331112851129 -813564822 -10223260478367337870 -746324852 -15287423226215073909 -11226550812567014265 -1491796976 -8097653480026868144 -5995296157134227520 -1873618532029106835 -1539245050 -48300418 -331037869860 -95748625 
[... remainder of the preceding deleted data file (large integer values) ...]
diff --git a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab.txt b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab.txt
deleted file mode 100644
index 57c08778e36..00000000000
--- a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab.txt
+++ /dev/null
@@ -1,3500 +0,0 @@
[... 3500 deleted vocabulary entries: [PAD], [unused1]-[unused100], [UNK], [CLS], [SEP], [MASK], punctuation, digits, accented and non-Latin Unicode characters, and cased English wordpieces with ##-prefixed subword continuations ...]
diff --git a/python/cudf/cudf/tests/data/subword_tokenizer_data/test_sentences.txt b/python/cudf/cudf/tests/data/subword_tokenizer_data/test_sentences.txt
deleted file mode 100644
index 6111117192a..00000000000
--- a/python/cudf/cudf/tests/data/subword_tokenizer_data/test_sentences.txt
+++ /dev/null
@@ -1,100 +0,0 @@
[... 100 deleted test sentences: public-domain passages selected from Project Gutenberg plus Unicode samples such as "力加勝北区ᴵᴺᵀᵃছজটডণত" used to check that the subword tokenizer handles non-ASCII text ...]
Hagar, the witch, chanted an awful incantation over her kettleful of simmering toads, with weird effect.''' \ No newline at end of file diff --git a/python/cudf/cudf/tests/data/text/__init__.py b/python/cudf/cudf/tests/data/text/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/data/text/chess.pgn b/python/cudf/cudf/tests/data/text/chess.pgn deleted file mode 100644 index 6f516e5c640..00000000000 --- a/python/cudf/cudf/tests/data/text/chess.pgn +++ /dev/null @@ -1,16 +0,0 @@ -[Event "F/S Return Match"] -[Site "Belgrade, Serbia JUG"] -[Date "1992.11.04"] -[Round "29"] -[White "Fischer, Robert J."] -[Black "Spassky, Boris V."] -[Result "1/2-1/2"] - -1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 {This opening is called the Ruy Lopez.} -4. Ba4 Nf6 5. O-O Be7 6. Re1 b5 7. Bb3 d6 8. c3 O-O 9. h3 Nb8 10. d4 Nbd7 -11. c4 c6 12. cxb5 axb5 13. Nc3 Bb7 14. Bg5 b4 15. Nb1 h6 16. Bh4 c5 17. dxe5 -Nxe4 18. Bxe7 Qxe7 19. exd6 Qf6 20. Nbd2 Nxd6 21. Nc4 Nxc4 22. Bxc4 Nb6 -23. Ne5 Rae8 24. Bxf7+ Rxf7 25. Nxf7 Rxe1+ 26. Qxe1 Kxf7 27. Qe3 Qg5 28. Qxg5 -hxg5 29. b3 Ke6 30. a3 Kd6 31. axb4 cxb4 32. Ra5 Nd5 33. f3 Bc8 34. Kf2 Bf5 -35. Ra7 g6 36. Ra6+ Kc5 37. Ke1 Nf4 38. g3 Nxh3 39. Kd2 Kb5 40. Rd6 Kc5 41. Ra6 -Nf2 42. g4 Bd3 43. Re6 1/2-1/2 diff --git a/python/cudf/cudf/tests/data/text/chess.pgn.gz b/python/cudf/cudf/tests/data/text/chess.pgn.gz deleted file mode 100644 index f03d0d0f73d..00000000000 Binary files a/python/cudf/cudf/tests/data/text/chess.pgn.gz and /dev/null differ diff --git a/python/cudf/cudf/tests/dataframe/__init__.py b/python/cudf/cudf/tests/dataframe/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/dataframe/test_attributes.py b/python/cudf/cudf/tests/dataframe/test_attributes.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_attributes.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_binary_operations.py b/python/cudf/cudf/tests/dataframe/test_binary_operations.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_binary_operations.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_combining.py b/python/cudf/cudf/tests/dataframe/test_combining.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_combining.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_computation.py b/python/cudf/cudf/tests/dataframe/test_computation.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_computation.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_constructing.py b/python/cudf/cudf/tests/dataframe/test_constructing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_constructing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_conversion.py b/python/cudf/cudf/tests/dataframe/test_conversion.py deleted file mode 100644 index d1de7245634..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_conversion.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - - -def test_convert_dtypes(): - data = { - "a": [1, 2, 3], - "b": [1, 2, 3], - "c": [1.1, 2.2, 3.3], - "d": [1.0, 2.0, 3.0], - "e": [1.0, 2.0, 3.0], - "f": ["a", "b", "c"], - "g": ["a", "b", "c"], - "h": ["2001-01-01", "2001-01-02", "2001-01-03"], - } - dtypes = [ - "int8", - "int64", - "float32", - "float32", - "float64", - "str", - "category", - "datetime64[ns]", - ] - nullable_columns = list("abcdef") - non_nullable_columns = list(set(data.keys()).difference(nullable_columns)) - - df = pd.DataFrame( - { - k: pd.Series(v, dtype=d) - for k, v, d in zip(data.keys(), data.values(), dtypes) - } - ) - gdf = cudf.DataFrame.from_pandas(df) - expect = df[nullable_columns].convert_dtypes() - got = gdf[nullable_columns].convert_dtypes().to_pandas(nullable=True) - assert_eq(expect, got) - - with pytest.raises(NotImplementedError): - # category and datetime64[ns] are not nullable - gdf[non_nullable_columns].convert_dtypes().to_pandas(nullable=True) diff --git a/python/cudf/cudf/tests/dataframe/test_function_application.py b/python/cudf/cudf/tests/dataframe/test_function_application.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_function_application.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_indexing.py b/python/cudf/cudf/tests/dataframe/test_indexing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_indexing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_io_serialization.py b/python/cudf/cudf/tests/dataframe/test_io_serialization.py deleted file mode 100644 index 57948afe1d8..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_io_serialization.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-import contextlib -from io import BytesIO - -import pandas as pd -import pyarrow as pa -import pyarrow.parquet as pq -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.mark.parametrize( - "index", - [range(1, 11), list(range(1, 11)), range(1, 11)[::2]], - ids=["RangeIndex", "IntIndex", "StridedRange"], -) -@pytest.mark.parametrize("write_index", [False, True, None]) -@pytest.mark.parametrize("empty", [False, True], ids=["nonempty", "empty"]) -def test_dataframe_parquet_roundtrip(index, write_index, empty): - if empty: - data = {} - else: - data = {"a": [i * 2 for i in index]} - df = cudf.DataFrame(data=data, index=index) - pf = pd.DataFrame(data=data, index=index) - gpu_buf = BytesIO() - cpu_buf = BytesIO() - - df.to_parquet(gpu_buf, index=write_index) - pf.to_parquet(cpu_buf, index=write_index) - gpu_table = pq.read_table(gpu_buf) - cpu_table = pq.read_table(cpu_buf) - metadata_equal = ( - gpu_table.schema.pandas_metadata == cpu_table.schema.pandas_metadata - ) - if empty and write_index is not False: - # https://github.com/rapidsai/cudf/issues/15372 - ctx = pytest.raises(AssertionError) - else: - ctx = contextlib.nullcontext() - with ctx: - assert metadata_equal - - gpu_read = cudf.read_parquet(gpu_buf) - cpu_read = cudf.read_parquet(cpu_buf) - with ctx: - assert_eq(gpu_read, cpu_read) - - -@pytest.mark.parametrize("preserve_index", [False, True, None]) -def test_dataframe_to_arrow_preserve_index(preserve_index): - df = cudf.DataFrame({"x": ["cat", "dog"] * 5}) - pf = df.to_pandas() - expect = pa.Table.from_pandas(pf, preserve_index=preserve_index).schema - got = df.to_arrow(preserve_index=preserve_index).schema - assert expect == got diff --git a/python/cudf/cudf/tests/dataframe/test_missing.py b/python/cudf/cudf/tests/dataframe/test_missing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_missing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_reindexing.py b/python/cudf/cudf/tests/dataframe/test_reindexing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_reindexing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_reshaping.py b/python/cudf/cudf/tests/dataframe/test_reshaping.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_reshaping.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_selecting.py b/python/cudf/cudf/tests/dataframe/test_selecting.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_selecting.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_sorting.py b/python/cudf/cudf/tests/dataframe/test_sorting.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_sorting.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/dataframe/test_timeseries.py b/python/cudf/cudf/tests/dataframe/test_timeseries.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/dataframe/test_timeseries.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
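Editor's note, not part of the patch: the deleted test_io_serialization.py above exercised a Parquet round-trip and a pandas-metadata comparison. A condensed, standalone sketch of that same check is given below for reference; it is illustrative only, assumes cudf, pandas, and pyarrow are available, and uses a small hypothetical frame rather than the parametrized fixtures of the removed test.

    # Minimal sketch: write the same data from cudf and pandas to Parquet
    # and compare the pandas metadata embedded in the Arrow schema, as the
    # deleted test_dataframe_parquet_roundtrip did.
    from io import BytesIO

    import pandas as pd
    import pyarrow.parquet as pq

    import cudf

    index = range(1, 11)                      # hypothetical RangeIndex
    data = {"a": [i * 2 for i in index]}
    gdf = cudf.DataFrame(data=data, index=index)
    pdf = pd.DataFrame(data=data, index=index)

    gpu_buf, cpu_buf = BytesIO(), BytesIO()
    gdf.to_parquet(gpu_buf, index=True)
    pdf.to_parquet(cpu_buf, index=True)

    # Both writers should record equivalent pandas metadata in the schema.
    assert (
        pq.read_table(gpu_buf).schema.pandas_metadata
        == pq.read_table(cpu_buf).schema.pandas_metadata
    )
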
diff --git a/python/cudf/cudf/tests/general_functions/__init__.py b/python/cudf/cudf/tests/general_functions/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/general_functions/test_conversion.py b/python/cudf/cudf/tests/general_functions/test_conversion.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/general_functions/test_conversion.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/general_functions/test_data_manipulation.py b/python/cudf/cudf/tests/general_functions/test_data_manipulation.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/general_functions/test_data_manipulation.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/general_functions/test_datetimelike.py b/python/cudf/cudf/tests/general_functions/test_datetimelike.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/general_functions/test_datetimelike.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/general_utilities/__init__.py b/python/cudf/cudf/tests/general_utilities/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/general_utilities/test_testing.py b/python/cudf/cudf/tests/general_utilities/test_testing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/general_utilities/test_testing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/groupby/__init__.py b/python/cudf/cudf/tests/groupby/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py deleted file mode 100644 index dc20a27177a..00000000000 --- a/python/cudf/cudf/tests/groupby/test_agg.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-import numpy as np -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.mark.parametrize( - "empty", - [True, False], - ids=["empty", "nonempty"], -) -def test_agg_count_dtype(empty): - df = cudf.DataFrame({"a": [1, 2, 1], "c": ["a", "b", "c"]}) - if empty: - df = df.iloc[:0] - result = df.groupby("a").agg({"c": "count"}) - assert result["c"].dtype == np.dtype("int64") - - -@pytest.mark.parametrize("attr", ["agg", "aggregate"]) -def test_series_agg(attr): - df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) - pdf = df.to_pandas() - agg = getattr(df.groupby("a")["a"], attr)("count") - pd_agg = getattr(pdf.groupby(["a"])["a"], attr)("count") - - assert agg.ndim == pd_agg.ndim - - -@pytest.mark.parametrize("func", ["sum", "prod", "mean", "count"]) -@pytest.mark.parametrize("attr", ["agg", "aggregate"]) -def test_dataframe_agg(attr, func): - df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) - pdf = df.to_pandas() - - agg = getattr(df.groupby("a"), attr)(func) - pd_agg = getattr(pdf.groupby(["a"]), attr)(func) - - assert_eq(agg, pd_agg) - - agg = getattr(df.groupby("a"), attr)({"b": func}) - pd_agg = getattr(pdf.groupby(["a"]), attr)({"b": func}) - - assert_eq(agg, pd_agg) - - agg = getattr(df.groupby("a"), attr)([func]) - pd_agg = getattr(pdf.groupby(["a"]), attr)([func]) - - assert_eq(agg, pd_agg) - - agg = getattr(df.groupby("a"), attr)(foo=("b", func), bar=("a", func)) - pd_agg = getattr(pdf.groupby(["a"]), attr)( - foo=("b", func), bar=("a", func) - ) - - assert_eq(agg, pd_agg) - - agg = getattr(df.groupby("a"), attr)( - foo=cudf.NamedAgg(column="b", aggfunc=func), - bar=cudf.NamedAgg(column="a", aggfunc=func), - ) - pd_agg = getattr(pdf.groupby(["a"]), attr)( - foo=("b", func), bar=("a", func) - ) - - assert_eq(agg, pd_agg) - - -def test_dataframe_agg_with_invalid_kwarg(): - with pytest.raises(TypeError, match="Invalid keyword argument"): - df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]}) - df.groupby("a").agg(foo=set()) diff --git a/python/cudf/cudf/tests/groupby/test_computation.py b/python/cudf/cudf/tests/groupby/test_computation.py deleted file mode 100644 index 630fcdc4dce..00000000000 --- a/python/cudf/cudf/tests/groupby/test_computation.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) -def test_rank_return_type_compatible_mode(method): - # in compatible mode, rank() always returns floats - pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5]}) - with cudf.option_context("mode.pandas_compatible", True): - df = cudf.from_pandas(pdf) - result = df.groupby("a").rank(method=method) - expect = pdf.groupby("a").rank(method=method) - assert_eq(expect, result) - assert result["b"].dtype == "float64" diff --git a/python/cudf/cudf/tests/groupby/test_function_application.py b/python/cudf/cudf/tests/groupby/test_function_application.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/groupby/test_function_application.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/groupby/test_groupby_obj.py b/python/cudf/cudf/tests/groupby/test_groupby_obj.py deleted file mode 100644 index ab2b16d263c..00000000000 --- a/python/cudf/cudf/tests/groupby/test_groupby_obj.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
-from numpy.testing import assert_array_equal - -import cudf -from cudf.testing import assert_eq - - -def test_groupby_14955(): - # https://github.com/rapidsai/cudf/issues/14955 - df = cudf.DataFrame({"a": [1, 2] * 2}, index=[0] * 4) - agg = df.groupby("a") - pagg = df.to_pandas().groupby("a") - for key in agg.groups: - assert_array_equal(pagg.indices[key], agg.indices[key].get()) - assert_eq(pagg.get_group(key), agg.get_group(key)) diff --git a/python/cudf/cudf/tests/groupby/test_indexing.py b/python/cudf/cudf/tests/groupby/test_indexing.py deleted file mode 100644 index 43b6183fca5..00000000000 --- a/python/cudf/cudf/tests/groupby/test_indexing.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -import cudf -from cudf.testing import assert_eq - - -def test_rank_return_type_compatible_mode(): - # in compatible mode, rank() always returns floats - df = cudf.DataFrame({"a": range(10), "b": [0] * 10}, index=[0] * 10) - pdf = df.to_pandas() - expect = pdf.groupby("b").get_group(0) - result = df.groupby("b").get_group(0) - assert_eq(expect, result) diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py deleted file mode 100644 index a009802bab0..00000000000 --- a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -import numpy as np -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.fixture(params=[False, True], ids=["without_nulls", "with_nulls"]) -def with_nulls(request): - return request.param - - -@pytest.mark.parametrize("nrows", [30, 300, 300_000]) -@pytest.mark.parametrize("nkeys", [1, 2, 4]) -def test_groupby_maintain_order_random(nrows, nkeys, with_nulls): - key_names = [f"key{key}" for key in range(nkeys)] - key_values = [np.random.randint(100, size=nrows) for _ in key_names] - value = np.random.randint(-100, 100, size=nrows) - df = cudf.DataFrame(dict(zip(key_names, key_values), value=value)) - if with_nulls: - for key in key_names: - df.loc[df[key] == 1, key] = None - with cudf.option_context("mode.pandas_compatible", True): - got = df.groupby(key_names, sort=False).agg({"value": "sum"}) - expect = ( - df.to_pandas().groupby(key_names, sort=False).agg({"value": "sum"}) - ) - assert_eq(expect, got, check_index_type=not with_nulls) diff --git a/python/cudf/cudf/tests/groupby/test_stats.py b/python/cudf/cudf/tests/groupby/test_stats.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/groupby/test_stats.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/groupby/test_transform.py b/python/cudf/cudf/tests/groupby/test_transform.py deleted file mode 100644 index f7138036ddf..00000000000 --- a/python/cudf/cudf/tests/groupby/test_transform.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
-import itertools - -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.fixture(params=[False, True], ids=["no-null-keys", "null-keys"]) -def keys_null(request): - return request.param - - -@pytest.fixture(params=[False, True], ids=["no-null-values", "null-values"]) -def values_null(request): - return request.param - - -@pytest.fixture -def df(keys_null, values_null): - keys = ["a", "b", "a", "c", "b", "b", "c", "a"] - r = range(len(keys)) - if keys_null: - keys[::3] = itertools.repeat(None, len(r[::3])) - values = list(range(len(keys))) - if values_null: - values[1::3] = itertools.repeat(None, len(r[1::3])) - return cudf.DataFrame({"key": keys, "values": values}) - - -@pytest.mark.parametrize("agg", ["cumsum", "cumprod", "max", "sum", "prod"]) -def test_transform_broadcast(agg, df): - pf = df.to_pandas() - got = df.groupby("key").transform(agg) - expect = pf.groupby("key").transform(agg) - assert_eq(got, expect, check_dtype=False) - - -def test_transform_invalid(): - df = cudf.DataFrame({"key": [1, 1], "values": [4, 5]}) - with pytest.raises(TypeError): - df.groupby("key").transform({"values": "cumprod"}) diff --git a/python/cudf/cudf/tests/indexes/__init__.py b/python/cudf/cudf/tests/indexes/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/indexes/datetime/__init__.py b/python/cudf/cudf/tests/indexes/datetime/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/indexes/datetime/test_components.py b/python/cudf/cudf/tests/indexes/datetime/test_components.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/datetime/test_components.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/datetime/test_constructing.py b/python/cudf/cudf/tests/indexes/datetime/test_constructing.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/datetime/test_constructing.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/datetime/test_conversion.py b/python/cudf/cudf/tests/indexes/datetime/test_conversion.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/indexes/datetime/test_conversion.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py deleted file mode 100644 index 4c0ce2ed191..00000000000 --- a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-import zoneinfo - -import pandas as pd - -import cudf -from cudf.testing import assert_eq - - -def test_slice_datetimetz_index(): - tz = zoneinfo.ZoneInfo("US/Eastern") - data = ["2001-01-01", "2001-01-02", None, None, "2001-01-03"] - pidx = pd.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize(tz) - idx = cudf.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize(tz) - expected = pidx[1:4] - got = idx[1:4] - assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py deleted file mode 100644 index 7cc629270b1..00000000000 --- a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -import zoneinfo - -import pandas as pd - -import cudf -from cudf.testing import assert_eq - - -def test_tz_localize(): - tz = zoneinfo.ZoneInfo("America/New_York") - pidx = pd.date_range("2001-01-01", "2001-01-02", freq="1s") - pidx = pidx.astype("= PANDAS_CURRENT_SUPPORTED_VERSION - and request.node.callspec.id == "None-2-data3", - reason="https://github.com/pandas-dev/pandas/issues/57390", - ) - ) - gdf = DataFrame(data) - pdf = gdf.to_pandas(nullable=True) - - with pytest.warns(FutureWarning): - expect = pdf.applymap(func, na_action=na_action) - with pytest.warns(FutureWarning): - got = gdf.applymap(func, na_action=na_action) - - assert_eq(expect, got, check_dtype=False) - - -def test_applymap_raise_cases(): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - - def f(x, some_kwarg=0): - return x + some_kwarg - - with pytest.warns(FutureWarning): - with pytest.raises(NotImplementedError): - df.applymap(f, some_kwarg=1) - - with pytest.warns(FutureWarning): - with pytest.raises(ValueError): - df.applymap(f, na_action="some_invalid_option") diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py deleted file mode 100644 index 979c936a182..00000000000 --- a/python/cudf/cudf/tests/test_array_function.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - - -# To determine if NEP18 is available in the current version of NumPy we simply -# attempt to concatenate an object with `__array_function__` defined and see if -# NumPy invokes the protocol or not. Taken from dask array -# https://github.com/dask/dask/blob/master/dask/array/utils.py#L352-L363 -# TODO: Unclear if this is still necessary. NEP 18 was introduced as the -# default in 1.17 (https://github.com/numpy/numpy/releases/tag/v1.17.0) almost -# 3 years ago, and it was originally introduced one version before in 1.16 -# (although not enabled by default then). Can we safely assume that testers -# will have a sufficiently new version of numpy to run these tests? 
-class _Test: - def __array_function__(self, *args, **kwargs): - return True - - -try: - np.concatenate([_Test()]) -except ValueError: - missing_arrfunc_cond = True -else: - missing_arrfunc_cond = False - -del _Test - -missing_arrfunc_reason = "NEP-18 support is not available in NumPy" - -np.random.seed(0) - - -@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "func", - [ - lambda x: np.mean(x), - lambda x: np.sum(x), - lambda x: np.var(x, ddof=1), - lambda x: np.unique(x), - lambda x: np.dot(x, x), - lambda x: np.linalg.norm(x), - ], -) -def test_array_func_cudf_series(func): - np_ar = np.random.random(100) - cudf_ser = cudf.Series(np_ar) - expect = func(np_ar) - got = func(cudf_ser) - if np.isscalar(expect): - assert_eq(expect, got) - else: - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "func", - [ - lambda x: np.mean(x, axis=0), - lambda x: np.sum(x, axis=0), - lambda x: np.var(x, ddof=1, axis=0), - lambda x: np.dot(x, x.transpose()), - lambda x: np.all(x), - lambda x: np.any(x), - lambda x: np.prod(x, axis=0), - lambda x: np.prod(x, axis=1), - ], -) -def test_array_func_cudf_dataframe(func): - pd_df = pd.DataFrame(np.random.uniform(size=(100, 10))) - cudf_df = cudf.from_pandas(pd_df) - expect = func(pd_df) - got = func(cudf_df) - assert_eq(expect, got) - - -@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "func", - [ - lambda x: np.cov(x, x), - lambda x: np.linalg.norm(x), - lambda x: np.linalg.det(x), - ], -) -def test_array_func_missing_cudf_dataframe(func): - pd_df = pd.DataFrame(np.random.uniform(size=(100, 10))) - cudf_df = cudf.from_pandas(pd_df) - with pytest.raises(TypeError): - func(cudf_df) - - -@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "func", - [ - lambda x: np.unique(x), - ], -) -def test_array_func_cudf_index(func): - np_ar = np.random.random(100) - cudf_index = cudf.Index(cudf.Series(np_ar)) - expect = func(np_ar) - got = func(cudf_index) - if np.isscalar(expect): - assert_eq(expect, got) - else: - assert_eq(expect, got.to_numpy()) - - -@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "func", - [ - lambda x: np.cov(x, x), - lambda x: np.linalg.norm(x), - lambda x: np.linalg.det(x), - ], -) -def test_array_func_missing_cudf_index(func): - np_ar = np.random.random(100) - cudf_index = cudf.Index(cudf.Series(np_ar)) - with pytest.raises(TypeError): - func(cudf_index) - - -@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "func", - [ - lambda x: np.cov(x, x), - lambda x: np.dot(x, x), - lambda x: np.linalg.norm(x), - lambda x: np.linalg.det(x), - ], -) -def test_array_func_missing_cudf_multi_index(func): - levels = [["a", "b"], ["c", "d"]] - codes = [[0, 1], [1, 0]] - - cudf_multi_index = cudf.MultiIndex(levels, codes) - with pytest.raises(TypeError): - func(cudf_multi_index) - - -@pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -def test_list_input_array_func(): - ar = np.array([1, 2, 3]) - - s = cudf.Series(ar) - with pytest.raises(TypeError): - np.concatenate([s, s, s]) - - s = cudf.Series(ar, index=[1, 2, 3]) - with pytest.raises(TypeError): - np.concatenate([s, s, s]) diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py deleted file mode 
100644 index 41b9188f036..00000000000 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ /dev/null @@ -1,461 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import operator -import warnings -from contextlib import contextmanager -from functools import reduce - -import cupy as cp -import numpy as np -import pytest - -import cudf -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_LT_300, - PANDAS_VERSION, -) -from cudf.testing import assert_eq -from cudf.testing._utils import expect_warning_if, set_random_null_mask_inplace - -_UFUNCS = [ - obj - for obj in (getattr(np, name) for name in dir(np)) - if isinstance(obj, np.ufunc) -] - - -@contextmanager -def _hide_ufunc_warnings(ufunc): - # pandas raises warnings for some inputs to the following ufuncs: - name = ufunc.__name__ - if name in { - "arccos", - "arccosh", - "arcsin", - "arctanh", - "fmod", - "log", - "log10", - "log2", - "reciprocal", - }: - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - f"invalid value encountered in {name}", - category=RuntimeWarning, - ) - warnings.filterwarnings( - "ignore", - f"divide by zero encountered in {name}", - category=RuntimeWarning, - ) - yield - elif name in { - "bitwise_and", - "bitwise_or", - "bitwise_xor", - }: - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Operation between non boolean Series with different " - "indexes will no longer return a boolean result in " - "a future version. Cast both Series to object type " - "to maintain the prior behavior.", - category=FutureWarning, - ) - yield - else: - yield - - -@pytest.mark.parametrize("ufunc", _UFUNCS) -def test_ufunc_index(request, ufunc): - # Note: This test assumes that all ufuncs are unary or binary. - fname = ufunc.__name__ - request.applymarker( - pytest.mark.xfail( - condition=not hasattr(cp, fname), - reason=f"cupy has no support for '{fname}'", - ) - ) - request.applymarker( - pytest.mark.xfail( - condition=fname == "matmul" and PANDAS_LT_300, - reason="Fixed by https://github.com/pandas-dev/pandas/pull/57079", - ) - ) - - N = 100 - # Avoid zeros in either array to skip division by 0 errors. Also limit the - # scale to avoid issues with overflow, etc. We use ints because some - # operations (like bitwise ops) are not defined for floats. - pandas_args = args = [ - cudf.Index( - cp.random.randint(low=1, high=10, size=N), - ) - for _ in range(ufunc.nin) - ] - - got = ufunc(*args) - - with _hide_ufunc_warnings(ufunc): - expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) - - if ufunc.nout > 1: - for g, e in zip(got, expect): - assert_eq(g, e, check_exact=False) - else: - assert_eq(got, expect, check_exact=False) - - -@pytest.mark.parametrize( - "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and] -) -@pytest.mark.parametrize("reflect", [True, False]) -def test_binary_ufunc_index_array(ufunc, reflect): - N = 100 - # Avoid zeros in either array to skip division by 0 errors. Also limit the - # scale to avoid issues with overflow, etc. We use ints because some - # operations (like bitwise ops) are not defined for floats. 
- args = [cudf.Index(cp.random.rand(N)) for _ in range(ufunc.nin)] - - arg1 = args[1].to_cupy() - - if reflect: - got = ufunc(arg1, args[0]) - expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) - else: - got = ufunc(args[0], arg1) - expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) - - if ufunc.nout > 1: - for g, e in zip(got, expect): - if reflect: - assert (cp.asnumpy(g) == e).all() - else: - assert_eq(g, e, check_exact=False) - else: - if reflect: - assert (cp.asnumpy(got) == expect).all() - else: - assert_eq(got, expect, check_exact=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize("ufunc", _UFUNCS) -@pytest.mark.parametrize("has_nulls", [True, False]) -@pytest.mark.parametrize("indexed", [True, False]) -def test_ufunc_series(request, ufunc, has_nulls, indexed): - # Note: This test assumes that all ufuncs are unary or binary. - fname = ufunc.__name__ - request.applymarker( - pytest.mark.xfail( - condition=( - indexed - and fname - in { - "greater", - "greater_equal", - "less", - "less_equal", - "not_equal", - "equal", - } - ), - reason="Comparison operators do not support misaligned indexes.", - ) - ) - request.applymarker( - pytest.mark.xfail( - condition=ufunc == np.matmul and has_nulls, - reason="Can't call cupy on column with nulls", - ) - ) - # If we don't have explicit dispatch and cupy doesn't support the operator, - # we expect a failure - request.applymarker( - pytest.mark.xfail( - condition=not hasattr(cp, fname), - reason=f"cupy has no support for '{fname}'", - ) - ) - - request.applymarker( - pytest.mark.xfail( - condition=fname.startswith("bitwise") and indexed and has_nulls, - reason="https://github.com/pandas-dev/pandas/issues/52500", - ) - ) - - N = 100 - # Avoid zeros in either array to skip division by 0 errors. Also limit the - # scale to avoid issues with overflow, etc. We use ints because some - # operations (like bitwise ops) are not defined for floats. - pandas_args = args = [ - cudf.Series( - cp.random.randint(low=1, high=10, size=N), - index=cp.random.choice(range(N), N, False) if indexed else None, - ) - for _ in range(ufunc.nin) - ] - - if has_nulls: - # Converting nullable integer cudf.Series to pandas will produce a - # float pd.Series, so instead we replace nulls with an arbitrary - # integer value, precompute the mask, and then reapply it afterwards. - for arg in args: - set_random_null_mask_inplace(arg) - pandas_args = [arg.fillna(0) for arg in args] - - # Note: Different indexes must be aligned before the mask is computed. - # This requires using an internal function (_align_indices), and that - # is unlikely to change for the foreseeable future. 
- aligned = ( - cudf.core.series._align_indices(args, allow_non_unique=True) - if indexed and ufunc.nin == 2 - else args - ) - mask = reduce(operator.or_, (a.isna() for a in aligned)).to_pandas() - - got = ufunc(*args) - - with _hide_ufunc_warnings(ufunc): - expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) - - if ufunc.nout > 1: - for g, e in zip(got, expect): - if has_nulls: - e[mask] = np.nan - assert_eq(g, e, check_exact=False) - else: - if has_nulls: - with expect_warning_if( - fname - in ( - "isfinite", - "isinf", - "isnan", - "logical_and", - "logical_not", - "logical_or", - "logical_xor", - "signbit", - "equal", - "greater", - "greater_equal", - "less", - "less_equal", - "not_equal", - ) - ): - expect[mask] = np.nan - assert_eq(got, expect, check_exact=False) - - -@pytest.mark.parametrize( - "ufunc", [np.add, np.greater, np.greater_equal, np.logical_and] -) -@pytest.mark.parametrize("has_nulls", [True, False]) -@pytest.mark.parametrize("indexed", [True, False]) -@pytest.mark.parametrize("reflect", [True, False]) -def test_binary_ufunc_series_array( - request, ufunc, has_nulls, indexed, reflect -): - fname = ufunc.__name__ - request.applymarker( - pytest.mark.xfail( - condition=reflect and has_nulls, - reason=( - "When cupy is the left operand there is no way for us to " - "avoid calling its binary operators, which cannot handle " - "cudf objects that contain nulls." - ), - ) - ) - # The way cudf casts nans in arrays to nulls during binops with cudf - # objects is currently incompatible with pandas. - request.applymarker( - pytest.mark.xfail( - condition=( - fname in {"greater", "greater_equal", "logical_and"} - and has_nulls - ), - reason=( - "cudf and pandas incompatible casting nans " - "to nulls in binops" - ), - ) - ) - N = 100 - # Avoid zeros in either array to skip division by 0 errors. Also limit the - # scale to avoid issues with overflow, etc. We use ints because some - # operations (like bitwise ops) are not defined for floats. - args = [ - cudf.Series( - cp.random.rand(N), - index=cp.random.choice(range(N), N, False) if indexed else None, - ) - for _ in range(ufunc.nin) - ] - - if has_nulls: - # Converting nullable integer cudf.Series to pandas will produce a - # float pd.Series, so instead we replace nulls with an arbitrary - # integer value, precompute the mask, and then reapply it afterwards. - for arg in args: - set_random_null_mask_inplace(arg) - - # Cupy doesn't support nulls, so we fill with nans before converting. 
- args[1] = args[1].fillna(cp.nan) - mask = args[0].isna().to_pandas() - - arg1 = args[1].to_cupy() - - if reflect: - got = ufunc(arg1, args[0]) - expect = ufunc(args[1].to_numpy(), args[0].to_pandas()) - else: - got = ufunc(args[0], arg1) - expect = ufunc(args[0].to_pandas(), args[1].to_numpy()) - - if ufunc.nout > 1: - for g, e in zip(got, expect): - if has_nulls: - e[mask] = np.nan - if reflect: - assert (cp.asnumpy(g) == e).all() - else: - assert_eq(g, e, check_exact=False) - else: - if has_nulls: - expect[mask] = np.nan - if reflect: - assert (cp.asnumpy(got) == expect).all() - else: - assert_eq(got, expect, check_exact=False) - - -@pytest.mark.parametrize( - "func", - [np.add], -) -def test_ufunc_cudf_series_error_with_out_kwarg(func): - cudf_s1 = cudf.Series(data=[-1, 2, 3, 0]) - cudf_s2 = cudf.Series(data=[-1, 2, 3, 0]) - cudf_s3 = cudf.Series(data=[0, 0, 0, 0]) - # this throws a value-error because of presence of out kwarg - with pytest.raises(TypeError): - func(x1=cudf_s1, x2=cudf_s2, out=cudf_s3) - - -# Skip matmul since it requires aligned shapes. -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul)) -@pytest.mark.parametrize("has_nulls", [True, False]) -@pytest.mark.parametrize("indexed", [True, False]) -def test_ufunc_dataframe(request, ufunc, has_nulls, indexed): - # Note: This test assumes that all ufuncs are unary or binary. - fname = ufunc.__name__ - request.applymarker( - pytest.mark.xfail( - condition=( - indexed - and fname - in { - "greater", - "greater_equal", - "less", - "less_equal", - "not_equal", - "equal", - } - ), - reason="Comparison operators do not support misaligned indexes.", - ) - ) - # If we don't have explicit dispatch and cupy doesn't support the operator, - # we expect a failure - request.applymarker( - pytest.mark.xfail( - condition=not hasattr(cp, fname), - reason=f"cupy has no support for '{fname}'", - ) - ) - - N = 100 - # Avoid zeros in either array to skip division by 0 errors. Also limit the - # scale to avoid issues with overflow, etc. We use ints because some - # operations (like bitwise ops) are not defined for floats. - # TODO: Add tests of mismatched columns etc. - pandas_args = args = [ - cudf.DataFrame( - {"foo": cp.random.randint(low=1, high=10, size=N)}, - index=cp.random.choice(range(N), N, False) if indexed else None, - ) - for _ in range(ufunc.nin) - ] - - if has_nulls: - # Converting nullable integer cudf.Series to pandas will produce a - # float pd.Series, so instead we replace nulls with an arbitrary - # integer value, precompute the mask, and then reapply it afterwards. - for arg in args: - set_random_null_mask_inplace(arg["foo"]) - pandas_args = [arg.copy() for arg in args] - for arg in pandas_args: - arg["foo"] = arg["foo"].fillna(0) - - # Note: Different indexes must be aligned before the mask is computed. - # This requires using an internal function (_align_indices), and that - # is unlikely to change for the foreseeable future. 
- aligned = ( - cudf.core.dataframe._align_indices(*args) - if indexed and ufunc.nin == 2 - else args - ) - mask = reduce( - operator.or_, (a["foo"].isna() for a in aligned) - ).to_pandas() - - got = ufunc(*args) - - with _hide_ufunc_warnings(ufunc): - expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) - - if ufunc.nout > 1: - for g, e in zip(got, expect): - if has_nulls: - e[mask] = np.nan - assert_eq(g, e, check_exact=False) - else: - if has_nulls: - with expect_warning_if( - fname - in ( - "isfinite", - "isinf", - "isnan", - "logical_and", - "logical_not", - "logical_or", - "logical_xor", - "signbit", - "equal", - "greater", - "greater_equal", - "less", - "less_equal", - "not_equal", - ) - ): - expect[mask] = np.nan - assert_eq(got, expect, check_exact=False) diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py deleted file mode 100644 index 5acdf36de80..00000000000 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ /dev/null @@ -1,656 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations - -import datetime -import io -import pathlib - -import fastavro -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.testing import assert_eq -from cudf.testing.dataset_generator import rand_dataframe - - -def cudf_from_avro_util(schema: dict, records: list) -> cudf.DataFrame: - schema = [] if schema is None else fastavro.parse_schema(schema) - buffer = io.BytesIO() - fastavro.writer(buffer, schema, records) - buffer.seek(0) - return cudf.read_avro(buffer) - - -avro_type_params = [ - ("boolean", "bool"), - ("int", "int32"), - ("long", "int64"), - ("float", "float32"), - ("double", "float64"), - ("bytes", "str"), - ("string", "str"), -] - - -@pytest.mark.parametrize("avro_type, expected_dtype", avro_type_params) -@pytest.mark.parametrize("namespace", [None, "root_ns"]) -@pytest.mark.parametrize("nullable", [True, False]) -def test_can_detect_dtype_from_avro_type( - avro_type, expected_dtype, namespace, nullable -): - avro_type = avro_type if not nullable else ["null", avro_type] - - schema = fastavro.parse_schema( - { - "type": "record", - "name": "test", - "namespace": namespace, - "fields": [{"name": "prop", "type": avro_type}], - } - ) - - actual = cudf_from_avro_util(schema, []) - - expected = cudf.DataFrame( - {"prop": cudf.Series(None, None, expected_dtype)} - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("avro_type, expected_dtype", avro_type_params) -@pytest.mark.parametrize("namespace", [None, "root_ns"]) -@pytest.mark.parametrize("nullable", [True, False]) -def test_can_detect_dtype_from_avro_type_nested( - avro_type, expected_dtype, namespace, nullable -): - avro_type = avro_type if not nullable else ["null", avro_type] - - schema_leaf = { - "name": "leaf", - "type": 
"record", - "fields": [{"name": "prop3", "type": avro_type}], - } - - schema_child = { - "name": "child", - "type": "record", - "fields": [{"name": "prop2", "type": schema_leaf}], - } - - schema_root = { - "name": "root", - "type": "record", - "namespace": namespace, - "fields": [{"name": "prop1", "type": schema_child}], - } - - actual = cudf_from_avro_util(schema_root, []) - - col_name = "{ns}child.{ns}leaf.prop3".format( - ns="" if namespace is None else namespace + "." - ) - - expected = cudf.DataFrame( - {col_name: cudf.Series(None, None, expected_dtype)} - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "avro_type, cudf_type, avro_val, cudf_val", - [ - ("boolean", "bool", True, True), - ("boolean", "bool", False, False), - ("int", "int32", 1234, 1234), - ("long", "int64", 1234, 1234), - ("float", "float32", 12.34, 12.34), - ("double", "float64", 12.34, 12.34), - ("string", "str", "heyϴ", "heyϴ"), - # ("bytes", "str", "heyϴ", "heyϴ"), - ], -) -def test_can_parse_single_value(avro_type, cudf_type, avro_val, cudf_val): - schema_root = { - "name": "root", - "type": "record", - "fields": [{"name": "prop", "type": ["null", avro_type]}], - } - - records = [ - {"prop": avro_val}, - ] - - actual = cudf_from_avro_util(schema_root, records) - - expected = cudf.DataFrame( - {"prop": cudf.Series(data=[cudf_val], dtype=cudf_type)} - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("avro_type, cudf_type", avro_type_params) -def test_can_parse_single_null(avro_type, cudf_type): - schema_root = { - "name": "root", - "type": "record", - "fields": [{"name": "prop", "type": ["null", avro_type]}], - } - - records = [{"prop": None}] - - actual = cudf_from_avro_util(schema_root, records) - - expected = cudf.DataFrame( - {"prop": cudf.Series(data=[None], dtype=cudf_type)} - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("avro_type, cudf_type", avro_type_params) -def test_can_parse_no_data(avro_type, cudf_type): - schema_root = { - "name": "root", - "type": "record", - "fields": [{"name": "prop", "type": ["null", avro_type]}], - } - - records = [] - - actual = cudf_from_avro_util(schema_root, records) - - expected = cudf.DataFrame({"prop": cudf.Series(data=[], dtype=cudf_type)}) - - assert_eq(expected, actual) - - -@pytest.mark.xfail( - reason="cudf avro reader is unable to parse zero-field metadata." -) -@pytest.mark.parametrize("avro_type, cudf_type", avro_type_params) -def test_can_parse_no_fields(avro_type, cudf_type): - schema_root = { - "name": "root", - "type": "record", - "fields": [], - } - - records = [] - - actual = cudf_from_avro_util(schema_root, records) - - expected = cudf.DataFrame() - - assert_eq(expected, actual) - - -def test_can_parse_no_schema(): - schema_root = None - records = [] - actual = cudf_from_avro_util(schema_root, records) - expected = cudf.DataFrame() - assert_eq(expected, actual) - - -@pytest.mark.parametrize("rows", [0, 1, 10, 1000]) -@pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) -def test_avro_compression(rows, codec): - schema = { - "name": "root", - "type": "record", - "fields": [ - {"name": "0", "type": "int"}, - {"name": "1", "type": "string"}, - ], - } - - # N.B. rand_dataframe() is brutally slow for some reason. Switching to - # np.random() speeds things up by a factor of 10. 
- # See also: https://github.com/rapidsai/cudf/issues/13128 - df = rand_dataframe( - [ - {"dtype": "int32", "null_frequency": 0, "cardinality": 1000}, - { - "dtype": "str", - "null_frequency": 0, - "cardinality": 100, - "max_string_length": 10, - }, - ], - rows, - seed=0, - ) - expected_df = cudf.DataFrame.from_arrow(df) - - records = df.to_pandas().to_dict(orient="records") - - buffer = io.BytesIO() - fastavro.writer(buffer, schema, records, codec=codec) - buffer.seek(0) - got_df = cudf.read_avro(buffer) - - assert_eq(expected_df, got_df) - - -avro_logical_type_params = [ - # (avro logical type, avro primitive type, cudf expected dtype) - ("date", "int", "datetime64[s]"), -] - - -@pytest.mark.parametrize( - "logical_type, primitive_type, expected_dtype", avro_logical_type_params -) -@pytest.mark.parametrize("namespace", [None, "root_ns"]) -@pytest.mark.parametrize("nullable", [True, False]) -@pytest.mark.parametrize("prepend_null", [True, False]) -def test_can_detect_dtypes_from_avro_logical_type( - logical_type, - primitive_type, - expected_dtype, - namespace, - nullable, - prepend_null, -): - avro_type = [{"logicalType": logical_type, "type": primitive_type}] - if nullable: - if prepend_null: - avro_type.insert(0, "null") - else: - avro_type.append("null") - - schema = fastavro.parse_schema( - { - "type": "record", - "name": "test", - "namespace": namespace, - "fields": [{"name": "prop", "type": avro_type}], - } - ) - - actual = cudf_from_avro_util(schema, []) - - expected = cudf.DataFrame( - {"prop": cudf.Series(None, None, expected_dtype)} - ) - - assert_eq(expected, actual) - - -def get_days_from_epoch(date: datetime.date | None) -> int | None: - if date is None: - return None - return (date - datetime.date(1970, 1, 1)).days - - -@pytest.mark.parametrize("namespace", [None, "root_ns"]) -@pytest.mark.parametrize("nullable", [True, False]) -@pytest.mark.parametrize("prepend_null", [True, False]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas (datetime(9999, ...) too large)", -) -def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null): - avro_type = {"logicalType": "date", "type": "int"} - if nullable: - if prepend_null: - avro_type = ["null", avro_type] - else: - avro_type = [avro_type, "null"] - - schema_dict = { - "type": "record", - "name": "test", - "fields": [ - {"name": "o_date", "type": avro_type}, - ], - } - - if namespace: - schema_dict["namespace"] = namespace - - schema = fastavro.parse_schema(schema_dict) - - # Insert some None values in no particular order. These will get converted - # into avro "nulls" by the fastavro writer (or filtered out if we're not - # nullable). The first and last dates are epoch min/max values, the rest - # are arbitrarily chosen. 
- dates = [ - None, - datetime.date(1970, 1, 1), - datetime.date(1970, 1, 2), - datetime.date(1981, 10, 25), - None, - None, - datetime.date(2012, 5, 18), - None, - datetime.date(2019, 9, 3), - None, - datetime.date(9999, 12, 31), - ] - - if not nullable: - dates = [date for date in dates if date is not None] - - days_from_epoch = [get_days_from_epoch(date) for date in dates] - - records = [{"o_date": day} for day in days_from_epoch] - - actual = cudf_from_avro_util(schema, records) - - expected = cudf.DataFrame( - {"o_date": cudf.Series(dates, dtype="datetime64[s]")} - ) - - assert_eq(expected, actual) - - -def test_alltypes_plain_avro(): - # During development of the logical type support, the Java avro tests were - # triggering CUDA kernel crashes (null pointer dereferences). We were able - # to replicate the behavior in a C++ test case, and then subsequently came - # up with this Python unit test to also trigger the problematic code path. - # - # So, unlike the other tests, this test is inherently reactive in nature, - # added simply to verify we fixed the problematic code path that was - # causing CUDA kernel crashes. - # - # See https://github.com/rapidsai/cudf/pull/12788#issuecomment-1468822875 - # for more information. - relpath = "../../../../java/src/test/resources/alltypes_plain.avro" - path = pathlib.Path(__file__).parent.joinpath(relpath).resolve() - assert path.is_file(), path - path = str(path) - - with open(path, "rb") as f: - reader = fastavro.reader(f) - records = [record for record in reader] - - # For reference: - # - # >>> from pprint import pprint - # >>> pprint(reader.writer_schema) - # {'fields': [{'name': 'id', 'type': ['int', 'null']}, - # {'name': 'bool_col', 'type': ['boolean', 'null']}, - # {'name': 'tinyint_col', 'type': ['int', 'null']}, - # {'name': 'smallint_col', 'type': ['int', 'null']}, - # {'name': 'int_col', 'type': ['int', 'null']}, - # {'name': 'bigint_col', 'type': ['long', 'null']}, - # {'name': 'float_col', 'type': ['float', 'null']}, - # {'name': 'double_col', 'type': ['double', 'null']}, - # {'name': 'date_string_col', 'type': ['bytes', 'null']}, - # {'name': 'string_col', 'type': ['bytes', 'null']}, - # {'name': 'timestamp_col', - # 'type': [{'logicalType': 'timestamp-micros', - # 'type': 'long'}, - # 'null']}], - # 'name': 'topLevelRecord', - # 'type': 'record'} - # - # >>> pprint(records[0]) - # {'bigint_col': 0, - # 'bool_col': True, - # 'date_string_col': b'03/01/09', - # 'double_col': 0.0, - # 'float_col': 0.0, - # 'id': 4, - # 'int_col': 0, - # 'smallint_col': 0, - # 'string_col': b'0', - # 'timestamp_col': datetime.datetime(2009, 3, 1, 0, 0, - # tzinfo=datetime.timezone.utc), - # 'tinyint_col': 0} - - # Nothing particularly special about these columns, other than them being - # the ones that @davidwendt used to coerce the crash. - columns = ["bool_col", "int_col", "timestamp_col"] - - # This next line would trigger the fatal CUDA kernel crash. - actual = cudf.read_avro(path, columns=columns) - - # If we get here, we haven't crashed, obviously. Verify the returned data - # frame meets our expectations. We need to fiddle with the dtypes of the - # expected data frame in order to correctly match the schema definition and - # our corresponding read_avro()-returned data frame. 
- - data = [{column: row[column] for column in columns} for row in records] - - # discard timezone information as we don't support it: - expected = pd.DataFrame(data) - expected["timestamp_col"].dt.tz_localize(None) - - # The fastavro.reader supports the `'logicalType': 'timestamp-micros'` used - # by the 'timestamp_col' column, which is converted into Python - # datetime.datetime() objects (see output of pprint(records[0]) above). - # As we don't support that logical type yet in cudf, we need to convert to - # int64, then divide by 1000 to convert from nanoseconds to microseconds. - timestamps = expected["timestamp_col"].astype("int64") - timestamps //= 1000 - expected["timestamp_col"] = timestamps - - # Furthermore, we need to force the 'int_col' into an int32, per the schema - # definition. (It ends up as an int64 due to cudf.DataFrame() defaulting - # all Python int values to int64 sans a dtype= override.) - expected["int_col"] = expected["int_col"].astype("int32") - - assert_eq(actual, expected) - - -def multiblock_testname_ids(param): - (total_rows, num_rows, skip_rows, sync_interval) = param - return f"{total_rows=}-{num_rows=}-{skip_rows=}-{sync_interval=}" - - -# The following values are used to test various boundary conditions associated -# with multiblock avro files. Each tuple consists of four values: total number -# of rows to generate, number of rows to limit the result set to, number of -# rows to skip, and number of rows per block. If the total number of rows and -# number of rows (i.e. first and second tuple elements) are equal, it means -# that all rows will be returned. If the rows per block also equals the first -# two numbers, it means that a single block will be used. -@pytest.fixture( - ids=multiblock_testname_ids, - params=[ - (10, 10, 9, 9), - (10, 10, 9, 5), - (10, 10, 9, 3), - (10, 10, 9, 2), - (10, 10, 9, 10), - (10, 10, 8, 2), - (10, 10, 5, 5), - (10, 10, 2, 9), - (10, 10, 2, 2), - (10, 10, 1, 9), - (10, 10, 1, 5), - (10, 10, 1, 2), - (10, 10, 1, 10), - (10, 10, 10, 9), - (10, 10, 10, 5), - (10, 10, 10, 2), - (10, 10, 10, 10), - (10, 10, 0, 9), - (10, 10, 0, 5), - (10, 10, 0, 2), - (10, 10, 0, 10), - (100, 100, 99, 10), - (100, 100, 90, 90), - (100, 100, 90, 89), - (100, 100, 90, 88), - (100, 100, 90, 87), - (100, 100, 90, 5), - (100, 100, 89, 90), - (100, 100, 87, 90), - (100, 100, 50, 7), - (100, 100, 50, 31), - (10, 1, 8, 9), - (100, 1, 99, 10), - (100, 1, 98, 10), - (100, 1, 97, 10), - (100, 3, 90, 87), - (100, 4, 90, 5), - (100, 2, 89, 90), - (100, 9, 87, 90), - (100, 20, 50, 7), - (100, 10, 50, 31), - (100, 20, 50, 31), - (100, 30, 50, 31), - (256, 256, 0, 256), - (256, 256, 0, 32), - (256, 256, 0, 31), - (256, 256, 0, 33), - (256, 256, 31, 32), - (256, 256, 32, 31), - (256, 256, 31, 33), - (512, 512, 0, 32), - (512, 512, 0, 31), - (512, 512, 0, 33), - (512, 512, 31, 32), - (512, 512, 32, 31), - (512, 512, 31, 33), - (1024, 1024, 0, 1), - (1024, 1024, 0, 3), - (1024, 1024, 0, 7), - (1024, 1024, 0, 8), - (1024, 1024, 0, 9), - (1024, 1024, 0, 15), - (1024, 1024, 0, 16), - (1024, 1024, 0, 17), - (1024, 1024, 0, 32), - (1024, 1024, 0, 31), - (1024, 1024, 0, 33), - (1024, 1024, 31, 32), - (1024, 1024, 32, 31), - (1024, 1024, 31, 33), - (16384, 16384, 0, 31), - (16384, 16384, 0, 32), - (16384, 16384, 0, 33), - (16384, 16384, 0, 16384), - ], -) -def total_rows_and_num_rows_and_skip_rows_and_rows_per_block(request): - return request.param - - -# N.B. 
The float32 and float64 types are chosen specifically to exercise -# the only path in the avro reader GPU code that can process multiple -# rows in parallel (via warp-level parallelism). See the logic around -# the line `if (cur + min_row_size * rows_remaining == end)` in -# gpuDecodeAvroColumnData(). -@pytest.mark.parametrize("dtype", ["str", "float32", "float64"]) -@pytest.mark.parametrize( - "use_sync_interval", - [True, False], - ids=["use_sync_interval", "ignore_sync_interval"], -) -@pytest.mark.parametrize("codec", ["null", "deflate", "snappy"]) -def test_avro_reader_multiblock( - dtype, - codec, - use_sync_interval, - total_rows_and_num_rows_and_skip_rows_and_rows_per_block, -): - ( - total_rows, - num_rows, - skip_rows, - rows_per_block, - ) = total_rows_and_num_rows_and_skip_rows_and_rows_per_block - - assert total_rows >= num_rows - assert rows_per_block <= total_rows - - limit_rows = num_rows != total_rows - if limit_rows: - assert total_rows >= num_rows + skip_rows - - if dtype == "str": - avro_type = "string" - - # Generate a list of strings, each of which is a 6-digit number, padded - # with leading zeros. This data set was very useful during development - # of the multiblock avro reader logic, as you get implicit feedback as - # to what may have gone wrong when the test fails, based on the - # expected vs actual values. - values = [f"{i:0>6}" for i in range(0, total_rows)] - - # Strings are encoded in avro with a zigzag-encoded length prefix, and - # then the string data. As all of our strings are fixed at length 6, - # we only need one byte to encode the length prefix (0xc). Thus, our - # bytes per row is 6 + 1 = 7. - bytes_per_row = len(values[0]) + 1 - assert bytes_per_row == 7, bytes_per_row - else: - assert dtype in ("float32", "float64") - avro_type = "float" if dtype == "float32" else "double" - np.random.seed(0) - # We don't use rand_dataframe() here, because it increases the - # execution time of each test by a factor of 10 or more (it appears - # to use a very costly approach to generating random data). - # See also: https://github.com/rapidsai/cudf/issues/13128 - values = np.random.rand(total_rows).astype(dtype) - bytes_per_row = values.dtype.itemsize - - # The sync_interval is the number of bytes between sync blocks. We know - # how many bytes we need per row, so we can calculate the number of bytes - # per block by multiplying the number of rows per block by the bytes per - # row. This is the sync interval. - total_bytes_per_block = rows_per_block * bytes_per_row - sync_interval = total_bytes_per_block - - source_df = cudf.DataFrame({"0": pd.Series(values)}) - - if limit_rows: - expected_df = source_df[skip_rows : skip_rows + num_rows].reset_index( - drop=True - ) - else: - expected_df = source_df[skip_rows:].reset_index(drop=True) - - records = source_df.to_pandas().to_dict(orient="records") - - schema = { - "name": "root", - "type": "record", - "fields": [ - {"name": "0", "type": avro_type}, - ], - } - - if use_sync_interval: - kwds = {"sync_interval": sync_interval} - else: - kwds = {} - - kwds["codec"] = codec - - buffer = io.BytesIO() - fastavro.writer(buffer, schema, records, **kwds) - buffer.seek(0) - - if not limit_rows: - # Explicitly set num_rows to None if we want to read all rows. This - # ensures we exercise the logic behind a read_avro() call where the - # caller doesn't specify the number of rows desired (which will be the - # most common use case). 
- num_rows = None - actual_df = cudf.read_avro(buffer, skiprows=skip_rows, num_rows=num_rows) - - assert_eq(expected_df, actual_df) diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py deleted file mode 100644 index 2e8519509e2..00000000000 --- a/python/cudf/cudf/tests/test_binops.py +++ /dev/null @@ -1,3415 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import decimal -import operator -import random -import warnings -from itertools import combinations_with_replacement, product - -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import Index, Series -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.core.buffer.spill_manager import get_global_manager -from cudf.testing import _utils as utils, assert_eq -from cudf.utils.dtypes import ( - BOOL_TYPES, - DATETIME_TYPES, - FLOAT_TYPES, - INTEGER_TYPES, - NUMERIC_TYPES, - TIMEDELTA_TYPES, -) - -STRING_TYPES = {"str"} - -_binops = [ - operator.add, - operator.sub, - operator.mul, - operator.floordiv, - operator.truediv, - operator.mod, - operator.pow, -] - -_binops_compare = [ - operator.eq, - operator.ne, - operator.lt, - operator.le, - operator.gt, - operator.ge, -] - -_bitwise_binops = [operator.and_, operator.or_, operator.xor] - -_int_types = [ - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - "uint32", -] - -_cmpops = [ - operator.lt, - operator.gt, - operator.le, - operator.ge, - operator.eq, - operator.ne, -] - -_reflected_ops = [ - lambda x: 1 + x, - lambda x: 2 * x, - lambda x: 2 - x, - lambda x: 2 // x, - lambda x: 2 / x, - lambda x: 3 + x, - lambda x: 3 * x, - lambda x: 3 - x, - lambda x: 3 // x, - lambda x: 3 / x, - lambda x: 3 % x, - lambda x: -1 + x, - lambda x: -2 * x, - lambda x: -2 - x, - lambda x: -2 // x, - lambda x: -2 / x, - lambda x: -3 + x, - lambda x: -3 * x, - lambda x: -3 - x, - lambda x: -3 // x, - lambda x: -3 / x, - lambda x: -3 % x, - lambda x: 0 + x, - lambda x: 0 * x, - lambda x: 0 - x, - lambda x: 0 // x, - lambda x: 0 / x, -] - -_operators_arithmetic = [ - "add", - "radd", - "sub", - "rsub", - "mul", - "rmul", - "mod", - "rmod", - "pow", - "rpow", - "div", - "divide", - "floordiv", - "rfloordiv", - "truediv", - "rtruediv", -] - -_operators_comparison = ["eq", "ne", "lt", "le", "gt", "ge"] - - -_cudf_scalar_reflected_ops = [ - lambda x: cudf.Scalar(1) + x, - lambda x: cudf.Scalar(2) * x, - lambda x: cudf.Scalar(2) - x, - lambda x: cudf.Scalar(2) // x, - lambda x: cudf.Scalar(2) / x, - lambda x: cudf.Scalar(3) + x, - lambda x: cudf.Scalar(3) * x, - lambda x: cudf.Scalar(3) - x, - lambda x: cudf.Scalar(3) // x, - lambda x: cudf.Scalar(3) / x, - lambda x: cudf.Scalar(3) % x, - lambda x: cudf.Scalar(-1) + x, - lambda x: cudf.Scalar(-2) * x, - lambda x: cudf.Scalar(-2) - x, - lambda x: cudf.Scalar(-2) // x, - lambda x: cudf.Scalar(-2) / x, - lambda x: cudf.Scalar(-3) + x, - lambda x: cudf.Scalar(-3) * x, - lambda x: cudf.Scalar(-3) - x, - lambda x: cudf.Scalar(-3) // x, - lambda x: cudf.Scalar(-3) / x, - lambda x: cudf.Scalar(-3) % x, - lambda x: cudf.Scalar(0) + x, - lambda x: cudf.Scalar(0) * x, - lambda x: cudf.Scalar(0) - x, - lambda x: cudf.Scalar(0) // x, - lambda x: cudf.Scalar(0) / x, -] - - -pytest_xfail = pytest.mark.xfail -pytestmark = pytest.mark.spilling - -# If spilling is enabled globally, we skip many test permutations -# to reduce running time. 
-if get_global_manager() is not None:
- _binops = _binops[:1]
- _binops_compare = _binops_compare[:1]
- _int_types = _int_types[-1:]
- _cmpops = _cmpops[:1]
- _reflected_ops = _reflected_ops[:1]
- _operators_arithmetic = _operators_arithmetic[:1]
- _operators_comparison = _operators_comparison[:1]
- _cudf_scalar_reflected_ops = _cudf_scalar_reflected_ops[:1]
- DATETIME_TYPES = {"datetime64[ms]"} # noqa: F811
- NUMERIC_TYPES = {"float32"} # noqa: F811
- FLOAT_TYPES = {"float64"} # noqa: F811
- INTEGER_TYPES = {"int16"} # noqa: F811
- TIMEDELTA_TYPES = {"timedelta64[s]"} # noqa: F811
- # To save time, we skip tests marked "pytest.mark.xfail"
- pytest_xfail = pytest.mark.skipif
-
-
-@pytest.mark.parametrize("obj_class", ["Series", "Index"])
-@pytest.mark.parametrize("binop", _binops)
-def test_series_binop(binop, obj_class):
- nelem = 1000
- arr1 = utils.gen_rand("float64", nelem) * 10000
- # Keeping a low value because CUDA 'pow' has 2 full range error
- arr2 = utils.gen_rand("float64", nelem) * 10
-
- sr1 = Series(arr1)
- sr2 = Series(arr2)
-
- if obj_class == "Index":
- sr1 = Index(sr1)
- sr2 = Index(sr2)
-
- result = binop(sr1, sr2)
- expect = binop(pd.Series(arr1), pd.Series(arr2))
-
- if obj_class == "Index":
- result = Series(result)
-
- assert_eq(result, expect)
-
-
-@pytest.mark.parametrize("binop", _binops)
-def test_series_binop_concurrent(binop):
- def func(index):
- arr = np.random.random(100) * 10
- sr = Series(arr)
-
- result = binop(sr.astype("int32"), sr)
- expect = binop(arr.astype("int32"), arr)
-
- np.testing.assert_almost_equal(result.to_numpy(), expect, decimal=5)
-
- from concurrent.futures import ThreadPoolExecutor
-
- indices = range(10)
- with ThreadPoolExecutor(4) as e: # four worker threads
- list(e.map(func, indices))
-
-
-@pytest.mark.parametrize("use_cudf_scalar", [False, True])
-@pytest.mark.parametrize("obj_class", ["Series", "Index"])
-@pytest.mark.parametrize("nelem,binop", list(product([1, 2, 100], _binops)))
-def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar):
- arr = np.random.random(nelem)
- rhs = random.choice(arr).item()
-
- sr = Series(arr)
- if obj_class == "Index":
- sr = Index(sr)
-
- if use_cudf_scalar:
- result = binop(sr, cudf.Scalar(rhs))
- else:
- result = binop(sr, rhs)
-
- if obj_class == "Index":
- result = Series(result)
-
- np.testing.assert_almost_equal(result.to_numpy(), binop(arr, rhs))
-
-
-@pytest.mark.parametrize("obj_class", ["Series", "Index"])
-@pytest.mark.parametrize("binop", _bitwise_binops)
-@pytest.mark.parametrize(
- "lhs_dtype,rhs_dtype", list(product(_int_types, _int_types))
-)
-def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
- arr1 = (np.random.random(100) * 100).astype(lhs_dtype)
- sr1 = Series(arr1)
-
- arr2 = (np.random.random(100) * 100).astype(rhs_dtype)
- sr2 = Series(arr2)
-
- if obj_class == "Index":
- sr1 = Index(sr1)
- sr2 = Index(sr2)
-
- result = binop(sr1, sr2)
-
- if obj_class == "Index":
- result = Series(result)
-
- np.testing.assert_almost_equal(result.to_numpy(), binop(arr1, arr2))
-
-
-@pytest.mark.parametrize("obj_class", ["Series", "Index"])
-@pytest.mark.parametrize("cmpop", _cmpops)
-@pytest.mark.parametrize(
- "dtype", ["int8", "int32", "int64", "float32", "float64", "datetime64[ms]"]
-)
-def test_series_compare(cmpop, obj_class, dtype):
- arr1 = np.random.randint(0, 100, 100).astype(dtype)
- arr2 = np.random.randint(0, 100, 100).astype(dtype)
- sr1 = Series(arr1)
- sr2 = Series(arr2)
-
- if obj_class == "Index":
- sr1 = Index(sr1)
- sr2 = Index(sr2)
-
- 
result1 = cmpop(sr1, sr1) - result2 = cmpop(sr2, sr2) - result3 = cmpop(sr1, sr2) - - if obj_class == "Index": - result1 = Series(result1) - result2 = Series(result2) - result3 = Series(result3) - - np.testing.assert_equal(result1.to_numpy(), cmpop(arr1, arr1)) - np.testing.assert_equal(result2.to_numpy(), cmpop(arr2, arr2)) - np.testing.assert_equal(result3.to_numpy(), cmpop(arr1, arr2)) - - -@pytest.mark.parametrize( - "dtype,val", - [("int8", 200), ("int32", 2**32), ("uint8", -128), ("uint64", -1)], -) -@pytest.mark.parametrize( - "op", - [ - operator.eq, - operator.ne, - operator.lt, - operator.le, - operator.gt, - operator.ge, - ], -) -@pytest.mark.parametrize("reverse", [False, True]) -def test_series_compare_integer(dtype, val, op, reverse): - # Tests that these actually work, even though they are out of bound. - force_cast_val = np.array(val).astype(dtype) - sr = Series( - [np.iinfo(dtype).min, np.iinfo(dtype).max, force_cast_val, None], - dtype=dtype, - ) - - if reverse: - _op = op - - def op(x, y): - return _op(y, x) - - # We expect the same result as comparing to a value within range (e.g. 0) - # except that a NULL value evaluates to False - if op(0, val): - expected = Series([True, True, True, None]) - else: - expected = Series([False, False, False, None]) - - res = op(sr, val) - assert_eq(res, expected) - - -def _series_compare_nulls_typegen(): - return [ - *combinations_with_replacement(DATETIME_TYPES, 2), - *combinations_with_replacement(TIMEDELTA_TYPES, 2), - *combinations_with_replacement(NUMERIC_TYPES, 2), - *combinations_with_replacement(STRING_TYPES, 2), - ] - - -@pytest.mark.parametrize("cmpop", _cmpops) -@pytest.mark.parametrize("dtypes", _series_compare_nulls_typegen()) -def test_series_compare_nulls(cmpop, dtypes): - ltype, rtype = dtypes - - ldata = [1, 2, None, None, 5] - rdata = [2, 1, None, 4, None] - - lser = Series(ldata, dtype=ltype) - rser = Series(rdata, dtype=rtype) - - lmask = ~lser.isnull() - rmask = ~rser.isnull() - - expect_mask = np.logical_and(lmask, rmask) - expect = cudf.Series([None] * 5, dtype="bool") - expect[expect_mask] = cmpop(lser[expect_mask], rser[expect_mask]) - - got = cmpop(lser, rser) - assert_eq(expect, got) - - -@pytest.fixture -def str_series_cmp_data(): - return pd.Series(["a", "b", None, "d", "e", None], dtype="string") - - -@pytest.fixture(ids=[op.__name__ for op in _cmpops], params=_cmpops) -def str_series_compare_str_cmpop(request): - return request.param - - -@pytest.fixture(ids=["eq", "ne"], params=[operator.eq, operator.ne]) -def str_series_compare_num_cmpop(request): - return request.param - - -@pytest.fixture(ids=["int", "float", "bool"], params=[1, 1.5, True]) -def cmp_scalar(request): - return request.param - - -def test_str_series_compare_str( - str_series_cmp_data, str_series_compare_str_cmpop -): - expect = str_series_compare_str_cmpop(str_series_cmp_data, "a") - got = str_series_compare_str_cmpop( - Series.from_pandas(str_series_cmp_data), "a" - ) - - assert_eq(expect, got.to_pandas(nullable=True)) - - -def test_str_series_compare_str_reflected( - str_series_cmp_data, str_series_compare_str_cmpop -): - expect = str_series_compare_str_cmpop("a", str_series_cmp_data) - got = str_series_compare_str_cmpop( - "a", Series.from_pandas(str_series_cmp_data) - ) - - assert_eq(expect, got.to_pandas(nullable=True)) - - -def test_str_series_compare_num( - str_series_cmp_data, str_series_compare_num_cmpop, cmp_scalar -): - expect = str_series_compare_num_cmpop(str_series_cmp_data, cmp_scalar) - got = str_series_compare_num_cmpop( 
- Series.from_pandas(str_series_cmp_data), cmp_scalar - ) - - assert_eq(expect, got.to_pandas(nullable=True)) - - -def test_str_series_compare_num_reflected( - str_series_cmp_data, str_series_compare_num_cmpop, cmp_scalar -): - expect = str_series_compare_num_cmpop(cmp_scalar, str_series_cmp_data) - got = str_series_compare_num_cmpop( - cmp_scalar, Series.from_pandas(str_series_cmp_data) - ) - - assert_eq(expect, got.to_pandas(nullable=True)) - - -@pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize("nelem", [1, 2, 100]) -@pytest.mark.parametrize("cmpop", _cmpops) -@pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES + ["datetime64[ms]"]) -@pytest.mark.parametrize("use_cudf_scalar", [True, False]) -def test_series_compare_scalar( - nelem, cmpop, obj_class, dtype, use_cudf_scalar -): - arr1 = np.random.randint(0, 100, 100).astype(dtype) - sr1 = Series(arr1) - rhs = random.choice(arr1).item() - - if use_cudf_scalar: - rhs = cudf.Scalar(rhs) - - if obj_class == "Index": - sr1 = Index(sr1) - - result1 = cmpop(sr1, rhs) - result2 = cmpop(rhs, sr1) - - if obj_class == "Index": - result1 = Series(result1) - result2 = Series(result2) - - np.testing.assert_equal(result1.to_numpy(), cmpop(arr1, rhs)) - np.testing.assert_equal(result2.to_numpy(), cmpop(rhs, arr1)) - - -_nulls = ["none", "some"] - - -@pytest.mark.parametrize("nelem", [1, 7, 8, 9, 32, 64, 128]) -@pytest.mark.parametrize("lhs_nulls,rhs_nulls", list(product(_nulls, _nulls))) -def test_validity_add(nelem, lhs_nulls, rhs_nulls): - np.random.seed(0) - # LHS - lhs_data = np.random.random(nelem) - if lhs_nulls == "some": - lhs_mask = utils.random_bitmask(nelem) - lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem] - lhs_null_count = utils.count_zero(lhs_bitmask) - assert lhs_null_count >= 0 - lhs = Series.from_masked_array(lhs_data, lhs_mask) - assert lhs.null_count == lhs_null_count - else: - lhs = Series(lhs_data) - # RHS - rhs_data = np.random.random(nelem) - if rhs_nulls == "some": - rhs_mask = utils.random_bitmask(nelem) - rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem] - rhs_null_count = utils.count_zero(rhs_bitmask) - assert rhs_null_count >= 0 - rhs = Series.from_masked_array(rhs_data, rhs_mask) - assert rhs.null_count == rhs_null_count - else: - rhs = Series(rhs_data) - # Result - res = lhs + rhs - if lhs_nulls == "some" and rhs_nulls == "some": - res_mask = np.asarray( - utils.expand_bits_to_bytes(lhs_mask & rhs_mask), dtype=np.bool_ - )[:nelem] - if lhs_nulls == "some" and rhs_nulls == "none": - res_mask = np.asarray( - utils.expand_bits_to_bytes(lhs_mask), dtype=np.bool_ - )[:nelem] - if lhs_nulls == "none" and rhs_nulls == "some": - res_mask = np.asarray( - utils.expand_bits_to_bytes(rhs_mask), dtype=np.bool_ - )[:nelem] - # Fill NA values - na_value = -10000 - got = res.fillna(na_value).to_numpy() - expect = lhs_data + rhs_data - if lhs_nulls == "some" or rhs_nulls == "some": - expect[~res_mask] = na_value - - np.testing.assert_array_equal(expect, got) - - -@pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize( - "binop,lhs_dtype,rhs_dtype", - list( - product( - [operator.add, operator.mul], - utils.NUMERIC_TYPES, - utils.NUMERIC_TYPES, - ) - ), -) -def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class): - nelem = 10 - lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype) - rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype) - - sr1 = Series(lhs) - sr2 = Series(rhs) - - if obj_class == "Index": - sr1 = Index(sr1) - sr2 
= Index(sr2) - - result = binop(Series(sr1), Series(sr2)) - - if obj_class == "Index": - result = Series(result) - - np.testing.assert_almost_equal(result.to_numpy(), binop(lhs, rhs)) - - -@pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize( - "cmpop,lhs_dtype,rhs_dtype", - list(product(_cmpops, utils.NUMERIC_TYPES, utils.NUMERIC_TYPES)), -) -def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class): - nelem = 5 - lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype) - rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype) - - sr1 = Series(lhs) - sr2 = Series(rhs) - - if obj_class == "Index": - sr1 = Index(sr1) - sr2 = Index(sr2) - - result = cmpop(Series(sr1), Series(sr2)) - - if obj_class == "Index": - result = Series(result) - - np.testing.assert_array_equal(result.to_numpy(), cmpop(lhs, rhs)) - - -@pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize( - "func, dtype", list(product(_reflected_ops, utils.NUMERIC_TYPES)) -) -def test_series_reflected_ops_scalar(func, dtype, obj_class): - # create random series - np.random.seed(12) - random_series = utils.gen_rand(dtype, 100, low=10) - - # gpu series - gs = Series(random_series) - - # class typing - if obj_class == "Index": - gs = Index(gs) - - try: - gs_result = func(gs) - except OverflowError: - # An error is fine, if pandas raises the same error: - with pytest.raises(OverflowError): - func(random_series) - - return - - # class typing - if obj_class == "Index": - gs = Series(gs) - - # pandas - ps_result = func(random_series) - - # verify - np.testing.assert_allclose(ps_result, gs_result.to_numpy()) - - -@pytest.mark.parametrize( - "func, dtype", list(product(_reflected_ops, utils.NUMERIC_TYPES)) -) -def test_cudf_scalar_reflected_ops_scalar(func, dtype): - value = 42 - scalar = cudf.Scalar(42) - - expected = func(value) - actual = func(scalar).value - - assert np.isclose(expected, actual) - - -@pytest.mark.parametrize("obj_class", ["Series", "Index"]) -@pytest.mark.parametrize( - "funcs, dtype", - list( - product( - list(zip(_reflected_ops, _cudf_scalar_reflected_ops)), - utils.NUMERIC_TYPES, - ) - ), -) -def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class): - cpu_func, gpu_func = funcs - - # create random series - np.random.seed(12) - random_series = utils.gen_rand(dtype, 100, low=10) - - # gpu series - gs = Series(random_series) - - # class typing - if obj_class == "Index": - gs = Index(gs) - - try: - gs_result = gpu_func(gs) - except OverflowError: - # An error is fine, if pandas raises the same error: - with pytest.raises(OverflowError): - cpu_func(random_series) - - return - - # class typing - if obj_class == "Index": - gs = Series(gs) - - # pandas - ps_result = cpu_func(random_series) - - # verify - np.testing.assert_allclose(ps_result, gs_result.to_numpy()) - - -@pytest.mark.parametrize("binop", _binops) -def test_different_shapes_and_columns(binop): - # TODO: support `pow()` on NaN values. 
Particularly, the cases: - # `pow(1, NaN) == 1` and `pow(NaN, 0) == 1` - if binop is operator.pow: - return - - # Empty frame on the right side - pd_frame = binop(pd.DataFrame({"x": [1, 2]}), pd.DataFrame({})) - cd_frame = binop(cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({})) - assert_eq(cd_frame, pd_frame) - - # Empty frame on the left side - pd_frame = pd.DataFrame({}) + pd.DataFrame({"x": [1, 2]}) - cd_frame = cudf.DataFrame({}) + cudf.DataFrame({"x": [1, 2]}) - assert_eq(cd_frame, pd_frame) - - # Note: the below rely on a discrepancy between cudf and pandas - # While pandas inserts columns in alphabetical order, cudf inserts in the - # order of whichever column comes first. So the following code will not - # work if the names of columns are reversed i.e. ('y', 'x') != ('x', 'y') - - # More rows on the left side - pd_frame = pd.DataFrame({"x": [1, 2, 3]}) + pd.DataFrame({"y": [1, 2]}) - cd_frame = cudf.DataFrame({"x": [1, 2, 3]}) + cudf.DataFrame({"y": [1, 2]}) - assert_eq(cd_frame, pd_frame) - - # More rows on the right side - pd_frame = pd.DataFrame({"x": [1, 2]}) + pd.DataFrame({"y": [1, 2, 3]}) - cd_frame = cudf.DataFrame({"x": [1, 2]}) + cudf.DataFrame({"y": [1, 2, 3]}) - assert_eq(cd_frame, pd_frame) - - -@pytest.mark.parametrize("binop", _binops) -def test_different_shapes_and_same_columns(binop): - # TODO: support `pow()` on NaN values. Particularly, the cases: - # `pow(1, NaN) == 1` and `pow(NaN, 0) == 1` - if binop is operator.pow: - return - - pd_frame = binop( - pd.DataFrame({"x": [1, 2]}), pd.DataFrame({"x": [1, 2, 3]}) - ) - cd_frame = binop( - cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({"x": [1, 2, 3]}) - ) - # cast x as float64 so it matches pandas dtype - cd_frame["x"] = cd_frame["x"].astype(np.float64) - assert_eq(cd_frame, pd_frame) - - -@pytest.mark.parametrize("binop", _binops) -def test_different_shapes_and_columns_with_unaligned_indices(binop): - # TODO: support `pow()` on NaN values. 
Particularly, the cases: - # `pow(1, NaN) == 1` and `pow(NaN, 0) == 1` - if binop is operator.pow: - return - - # Test with a RangeIndex - pdf1 = pd.DataFrame({"x": [4, 3, 2, 1], "y": [7, 3, 8, 6]}) - # Test with an Index - pdf2 = pd.DataFrame( - {"x": [1, 2, 3, 7], "y": [4, 5, 6, 7]}, index=[0, 1, 3, 4] - ) - # Test with an Index in a different order - pdf3 = pd.DataFrame( - {"x": [4, 5, 6, 7], "y": [1, 2, 3, 7], "z": [0, 5, 3, 7]}, - index=[0, 3, 5, 3], - ) - gdf1 = cudf.DataFrame.from_pandas(pdf1) - gdf2 = cudf.DataFrame.from_pandas(pdf2) - gdf3 = cudf.DataFrame.from_pandas(pdf3) - - pd_frame = binop(binop(pdf1, pdf2), pdf3) - cd_frame = binop(binop(gdf1, gdf2), gdf3) - # cast x and y as float64 so it matches pandas dtype - cd_frame["x"] = cd_frame["x"].astype(np.float64) - cd_frame["y"] = cd_frame["y"].astype(np.float64) - assert_eq(cd_frame, pd_frame) - - pdf1 = pd.DataFrame({"x": [1, 1]}, index=["a", "a"]) - pdf2 = pd.DataFrame({"x": [2]}, index=["a"]) - gdf1 = cudf.DataFrame.from_pandas(pdf1) - gdf2 = cudf.DataFrame.from_pandas(pdf2) - pd_frame = binop(pdf1, pdf2) - cd_frame = binop(gdf1, gdf2) - assert_eq(pd_frame, cd_frame) - - -@pytest.mark.parametrize( - "df2", - [ - cudf.DataFrame({"a": [3, 2, 1]}, index=[3, 2, 1]), - cudf.DataFrame([3, 2]), - ], -) -@pytest.mark.parametrize("binop", [operator.eq, operator.ne]) -def test_df_different_index_shape(df2, binop): - df1 = cudf.DataFrame([1, 2, 3], index=[1, 2, 3]) - - pdf1 = df1.to_pandas() - pdf2 = df2.to_pandas() - - utils.assert_exceptions_equal( - lfunc=binop, - rfunc=binop, - lfunc_args_and_kwargs=([pdf1, pdf2],), - rfunc_args_and_kwargs=([df1, df2],), - ) - - -@pytest.mark.parametrize("op", [operator.eq, operator.ne]) -def test_boolean_scalar_binop(op): - psr = pd.Series(np.random.choice([True, False], 10)) - gsr = cudf.from_pandas(psr) - assert_eq(op(psr, True), op(gsr, True)) - assert_eq(op(psr, False), op(gsr, False)) - - # cuDF scalar - assert_eq(op(psr, True), op(gsr, cudf.Scalar(True))) - assert_eq(op(psr, False), op(gsr, cudf.Scalar(False))) - - -@pytest.mark.parametrize("func", _operators_arithmetic) -@pytest.mark.parametrize("has_nulls", [True, False]) -@pytest.mark.parametrize("fill_value", [None, 27]) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) -def test_operator_func_between_series(dtype, func, has_nulls, fill_value): - count = 1000 - gdf_series_a = utils.gen_rand_series( - dtype, count, has_nulls=has_nulls, stride=10000 - ) - gdf_series_b = utils.gen_rand_series( - dtype, count, has_nulls=has_nulls, stride=100 - ) - pdf_series_a = gdf_series_a.to_pandas() - pdf_series_b = gdf_series_b.to_pandas() - - gdf_result = getattr(gdf_series_a, func)( - gdf_series_b, fill_value=fill_value - ) - pdf_result = getattr(pdf_series_a, func)( - pdf_series_b, fill_value=fill_value - ) - - assert_eq(pdf_result, gdf_result) - - -@pytest.mark.parametrize("func", _operators_arithmetic) -@pytest.mark.parametrize("has_nulls", [True, False]) -@pytest.mark.parametrize("fill_value", [None, 27]) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) -@pytest.mark.parametrize("use_cudf_scalar", [False, True]) -def test_operator_func_series_and_scalar( - dtype, func, has_nulls, fill_value, use_cudf_scalar -): - count = 1000 - scalar = 59 - gdf_series = utils.gen_rand_series( - dtype, count, has_nulls=has_nulls, stride=10000 - ) - pdf_series = gdf_series.to_pandas() - - gdf_series_result = getattr(gdf_series, func)( - cudf.Scalar(scalar) if use_cudf_scalar else scalar, - fill_value=fill_value, - ) - pdf_series_result = 
getattr(pdf_series, func)(
- np.array(scalar)[()] if use_cudf_scalar else scalar,
- fill_value=fill_value,
- )
-
- assert_eq(pdf_series_result, gdf_series_result)
-
-
-_permu_values = [0, 1, None, np.nan]
-
-
-@pytest.mark.parametrize("fill_value", _permu_values)
-@pytest.mark.parametrize("scalar_a", _permu_values)
-@pytest.mark.parametrize("scalar_b", _permu_values)
-@pytest.mark.parametrize("func", _operators_comparison)
-@pytest.mark.parametrize("dtype", ["float32", "float64"])
-def test_operator_func_between_series_logical(
- dtype, func, scalar_a, scalar_b, fill_value
-):
- gdf_series_a = Series([scalar_a], nan_as_null=False).astype(dtype)
- gdf_series_b = Series([scalar_b], nan_as_null=False).astype(dtype)
-
- pdf_series_a = gdf_series_a.to_pandas(nullable=True)
- pdf_series_b = gdf_series_b.to_pandas(nullable=True)
-
- gdf_series_result = getattr(gdf_series_a, func)(
- gdf_series_b, fill_value=fill_value
- )
- pdf_series_result = getattr(pdf_series_a, func)(
- pdf_series_b, fill_value=fill_value
- )
- expect = pdf_series_result
- got = gdf_series_result.to_pandas(nullable=True)
-
- # If fill_value is np.nan, things break down a bit,
- # because setting a NaN into a pandas nullable float
- # array still gets transformed to <NA>. As such,
- # pd_series_with_nulls.fillna(np.nan) has no effect.
- if (
- (pdf_series_a.isnull().sum() != pdf_series_b.isnull().sum())
- and np.isscalar(fill_value)
- and np.isnan(fill_value)
- ):
- with pytest.raises(AssertionError):
- assert_eq(expect, got)
- return
- assert_eq(expect, got)
-
-
-@pytest.mark.parametrize("dtype", ["float32", "float64"])
-@pytest.mark.parametrize("func", _operators_comparison)
-@pytest.mark.parametrize("has_nulls", [True, False])
-@pytest.mark.parametrize("scalar", [-59.0, np.nan, 0, 59.0])
-@pytest.mark.parametrize("fill_value", [None, 1.0])
-@pytest.mark.parametrize("use_cudf_scalar", [False, True])
-def test_operator_func_series_and_scalar_logical(
- request, dtype, func, has_nulls, scalar, fill_value, use_cudf_scalar
-):
- request.applymarker(
- pytest.mark.xfail(
- PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION
- and fill_value == 1.0
- and scalar is np.nan
- and (has_nulls or (not has_nulls and func not in {"eq", "ne"})),
- reason="https://github.com/pandas-dev/pandas/issues/57447",
- )
- )
- if has_nulls:
- gdf_series = cudf.Series([-1.0, 0, cudf.NA, 1.1], dtype=dtype)
- else:
- gdf_series = cudf.Series([-1.0, 0, 10.5, 1.1], dtype=dtype)
- pdf_series = gdf_series.to_pandas(nullable=True)
- gdf_series_result = getattr(gdf_series, func)(
- cudf.Scalar(scalar) if use_cudf_scalar else scalar,
- fill_value=fill_value,
- )
- pdf_series_result = getattr(pdf_series, func)(
- scalar, fill_value=fill_value
- )
-
- expect = pdf_series_result
- got = gdf_series_result.to_pandas(nullable=True)
-
- assert_eq(expect, got)
-
-
-@pytest.mark.parametrize("func", _operators_arithmetic)
-@pytest.mark.parametrize("nulls", _nulls)
-@pytest.mark.parametrize("fill_value", [None, 27])
-@pytest.mark.parametrize("other", ["df", "scalar"])
-def test_operator_func_dataframe(func, nulls, fill_value, other):
- num_rows = 100
- num_cols = 3
-
- def gen_df():
- pdf = pd.DataFrame()
- from string import ascii_lowercase
-
- cols = np.random.choice(num_cols + 5, num_cols, replace=False)
-
- for i in range(num_cols):
- colname = ascii_lowercase[cols[i]]
- data = utils.gen_rand("float64", num_rows) * 10000
- if nulls == "some":
- idx = np.random.choice(
- num_rows, size=int(num_rows / 2), replace=False
- )
- data[idx] = np.nan
- pdf[colname] = data
- return 
pdf - - pdf1 = gen_df() - pdf2 = gen_df() if other == "df" else 59.0 - gdf1 = cudf.DataFrame.from_pandas(pdf1) - gdf2 = cudf.DataFrame.from_pandas(pdf2) if other == "df" else 59.0 - - got = getattr(gdf1, func)(gdf2, fill_value=fill_value) - expect = getattr(pdf1, func)(pdf2, fill_value=fill_value)[list(got._data)] - - assert_eq(expect, got) - - -@pytest.mark.parametrize("func", _operators_comparison) -@pytest.mark.parametrize("nulls", _nulls) -@pytest.mark.parametrize("other", ["df", "scalar"]) -def test_logical_operator_func_dataframe(func, nulls, other): - np.random.seed(0) - num_rows = 100 - num_cols = 3 - - def gen_df(): - pdf = pd.DataFrame() - from string import ascii_lowercase - - cols = np.random.choice(num_cols + 5, num_cols, replace=False) - - for i in range(num_cols): - colname = ascii_lowercase[cols[i]] - data = utils.gen_rand("float64", num_rows) * 10000 - if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) - data[idx] = np.nan - pdf[colname] = data - return pdf - - pdf1 = gen_df() - pdf2 = gen_df() if other == "df" else 59.0 - gdf1 = cudf.DataFrame.from_pandas(pdf1) - gdf2 = cudf.DataFrame.from_pandas(pdf2) if other == "df" else 59.0 - - got = getattr(gdf1, func)(gdf2) - expect = getattr(pdf1, func)(pdf2)[list(got._data)] - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "func", - [op for op in _operators_arithmetic if op not in {"rmod", "rfloordiv"}] - + _operators_comparison - + [ - pytest.param( - "rmod", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/12162" - ), - ), - pytest.param( - "rfloordiv", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/12162" - ), - ), - ], -) -@pytest.mark.parametrize("rhs", [0, 1, 2, 128]) -def test_binop_bool_uint(func, rhs): - psr = pd.Series([True, False, False]) - gsr = cudf.from_pandas(psr) - assert_eq( - getattr(psr, func)(rhs), getattr(gsr, func)(rhs), check_dtype=False - ) - - -@pytest.mark.parametrize( - "series_dtype", (np.int8, np.uint8, np.int64, np.uint64) -) -@pytest.mark.parametrize( - "divisor_dtype", - ( - np.int8, - np.uint8, - np.int64, - np.uint64, - ), -) -@pytest.mark.parametrize("scalar_divisor", [False, True]) -def test_floordiv_zero_float64(series_dtype, divisor_dtype, scalar_divisor): - sr = pd.Series([1, 2, 3], dtype=series_dtype) - cr = cudf.from_pandas(sr) - - if scalar_divisor: - pd_div = divisor_dtype(0) - cudf_div = cudf.Scalar(0, dtype=divisor_dtype) - else: - pd_div = pd.Series([0], dtype=divisor_dtype) - cudf_div = cudf.from_pandas(pd_div) - assert_eq(sr // pd_div, cr // cudf_div) - - -@pytest.mark.parametrize("scalar_divisor", [False, True]) -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12162") -def test_floordiv_zero_bool(scalar_divisor): - sr = pd.Series([True, True, False], dtype=np.bool_) - cr = cudf.from_pandas(sr) - - if scalar_divisor: - pd_div = np.bool_(0) - cudf_div = cudf.Scalar(0, dtype=np.bool_) - else: - pd_div = pd.Series([0], dtype=np.bool_) - cudf_div = cudf.from_pandas(pd_div) - - with pytest.raises((NotImplementedError, ZeroDivisionError)): - # Pandas does raise - sr // pd_div - with pytest.raises((NotImplementedError, ZeroDivisionError)): - # Cudf does not - cr // cudf_div - - -@pytest.mark.parametrize( - "dtype", - ( - pytest.param( - np.bool_, - marks=pytest_xfail( - reason=( - "Pandas handling of division by zero-bool is too strange" - ) - ), - ), - np.int8, - np.uint8, - np.int64, - np.uint64, - np.float32, - np.float64, - ), -) -def 
test_rmod_zero_nan(dtype): - sr = pd.Series([1, 1, 0], dtype=dtype) - cr = cudf.from_pandas(sr) - assert_eq(1 % sr, 1 % cr) - expected_dtype = np.float64 if cr.dtype.kind != "f" else dtype - assert_eq(1 % cr, cudf.Series([0, 0, None], dtype=expected_dtype)) - - -def test_series_misc_binop(): - pds = pd.Series([1, 2, 4], name="abc xyz") - gds = cudf.Series([1, 2, 4], name="abc xyz") - - assert_eq(pds + 1, gds + 1) - assert_eq(1 + pds, 1 + gds) - - assert_eq(pds + pds, gds + gds) - - pds1 = pd.Series([1, 2, 4], name="hello world") - gds1 = cudf.Series([1, 2, 4], name="hello world") - - assert_eq(pds + pds1, gds + gds1) - assert_eq(pds1 + pds, gds1 + gds) - - assert_eq(pds1 + pds + 5, gds1 + gds + 5) - - -def test_int8_float16_binop(): - a = cudf.Series([1], dtype="int8") - b = np.float16(2) - expect = cudf.Series([0.5]) - got = a / b - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize("dtype", ["int64", "float64", "str"]) -def test_vector_to_none_binops(dtype): - data = Series([1, 2, 3, None], dtype=dtype) - - expect = Series([None] * 4).astype(dtype) - got = data + None - - assert_eq(expect, got) - - -def dtype_scalar(val, dtype): - if dtype == "str": - return str(val) - dtype = cudf.dtype(dtype) - if dtype.type in {np.datetime64, np.timedelta64}: - res, _ = np.datetime_data(dtype) - return dtype.type(val, res) - else: - return dtype.type(val) - - -def make_scalar_add_data(): - valid = set() - - # to any int, we may add any kind of - # other int, float, datetime timedelta, or bool - valid |= set( - product( - INTEGER_TYPES, - FLOAT_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, - ) - ) - - # to any float, we may add any int, float, or bool - valid |= set( - product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES) - ) - - # to any datetime, we may add any int, timedelta, or bool - valid |= set( - product(DATETIME_TYPES, INTEGER_TYPES | TIMEDELTA_TYPES | BOOL_TYPES) - ) - - # to any timedelta, we may add any int, datetime, other timedelta, or bool - valid |= set( - product(TIMEDELTA_TYPES, INTEGER_TYPES | DATETIME_TYPES | BOOL_TYPES) - ) - - # to any bool, we may add any int, float, datetime, timedelta, or bool - valid |= set( - product( - BOOL_TYPES, - INTEGER_TYPES - | FLOAT_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | BOOL_TYPES, - ) - ) - - # to any string, we may add any other string - valid |= {("str", "str")} - - return sorted(list(valid)) - - -def make_invalid_scalar_add_data(): - invalid = set() - - # we can not add a datetime to a float - invalid |= set(product(FLOAT_TYPES, DATETIME_TYPES)) - - # We can not add a timedelta to a float - invalid |= set(product(FLOAT_TYPES, TIMEDELTA_TYPES)) - - # we can not add a float to any datetime - invalid |= set(product(DATETIME_TYPES, FLOAT_TYPES)) - - # can can not add a datetime to a datetime - invalid |= set(product(DATETIME_TYPES, DATETIME_TYPES)) - - # can not add a timedelta to a float - invalid |= set(product(FLOAT_TYPES, TIMEDELTA_TYPES)) - - return sorted(list(invalid)) - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_add_data()) -def test_scalar_add(dtype_l, dtype_r): - test_value = 1 - - lval_host = dtype_scalar(test_value, dtype=dtype_l) - rval_host = dtype_scalar(test_value, dtype=dtype_r) - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - # expect = np.add(lval_host, rval_host) - expect = lval_host + rval_host - got = lval_gpu + rval_gpu - - assert expect == got.value - if not dtype_l == dtype_r == "str": - assert 
expect.dtype == got.dtype - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_invalid_scalar_add_data()) -def test_scalar_add_invalid(dtype_l, dtype_r): - test_value = 1 - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - with pytest.raises(TypeError): - lval_gpu + rval_gpu - - -def make_scalar_difference_data(): - valid = set() - - # from an int, we may subtract any int, float, timedelta, - # or boolean value - valid |= set( - product( - INTEGER_TYPES, - INTEGER_TYPES | FLOAT_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, - ) - ) - - # from any float, we may subtract any int, float, or bool - valid |= set( - product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES) - ) - - # from any datetime we may subtract any int, datetime, timedelta, or bool - valid |= set( - product( - DATETIME_TYPES, - INTEGER_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, - ) - ) - - # from any timedelta we may subtract any int, timedelta, or bool - valid |= set( - product(TIMEDELTA_TYPES, INTEGER_TYPES | TIMEDELTA_TYPES | BOOL_TYPES) - ) - - # from any bool we may subtract any int, float or timedelta - valid |= set( - product(BOOL_TYPES, INTEGER_TYPES | FLOAT_TYPES | TIMEDELTA_TYPES) - ) - - return sorted(list(valid)) - - -def make_scalar_difference_data_invalid(): - invalid = set() - - # we can't subtract a datetime from an int - invalid |= set(product(INTEGER_TYPES, DATETIME_TYPES)) - - # we can't subtract a datetime or timedelta from a float - invalid |= set(product(FLOAT_TYPES, DATETIME_TYPES | TIMEDELTA_TYPES)) - - # we can't subtract a float from a datetime or timedelta - invalid |= set(product(DATETIME_TYPES | TIMEDELTA_TYPES, FLOAT_TYPES)) - - # We can't subtract a datetime from a timedelta - invalid |= set(product(TIMEDELTA_TYPES, DATETIME_TYPES)) - - # we can't subtract a datetime or bool from a bool - invalid |= set(product(BOOL_TYPES, BOOL_TYPES | DATETIME_TYPES)) - - return sorted(list(invalid)) - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_difference_data()) -def test_scalar_difference(dtype_l, dtype_r): - test_value = 1 - - lval_host = dtype_scalar(test_value, dtype=dtype_l) - rval_host = dtype_scalar(test_value, dtype=dtype_r) - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - expect = lval_host - rval_host - got = lval_gpu - rval_gpu - - assert expect == got.value - assert expect.dtype == got.dtype - - -@pytest.mark.parametrize( - "dtype_l,dtype_r", make_scalar_difference_data_invalid() -) -def test_scalar_difference_invalid(dtype_l, dtype_r): - test_value = 1 - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - with pytest.raises(TypeError): - lval_gpu - rval_gpu - - -def make_scalar_product_data(): - valid = set() - - # we can multiply an int, or bool by any int, float, timedelta, or bool - valid |= set( - product( - INTEGER_TYPES | BOOL_TYPES, - INTEGER_TYPES | FLOAT_TYPES | TIMEDELTA_TYPES | BOOL_TYPES, - ) - ) - - # we can multiply any timedelta by any int, or bool - valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | BOOL_TYPES)) - - # we can multiply a float by any int, float, or bool - valid |= set( - product(FLOAT_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES) - ) - - return sorted(list(valid)) - - -def make_scalar_product_data_invalid(): - invalid = set() - - # can't multiply a ints, floats, datetimes, timedeltas, - # or bools by datetimes - invalid |= set( - product( - INTEGER_TYPES - | 
FLOAT_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | BOOL_TYPES, - DATETIME_TYPES, - ) - ) - - # can't multiply datetimes with anything really - invalid |= set( - product( - DATETIME_TYPES, - INTEGER_TYPES - | FLOAT_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | BOOL_TYPES, - ) - ) - - # can't multiply timedeltas by timedeltas - invalid |= set(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES)) - - return sorted(list(invalid)) - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_product_data()) -def test_scalar_product(dtype_l, dtype_r): - test_value = 1 - - lval_host = dtype_scalar(test_value, dtype=dtype_l) - rval_host = dtype_scalar(test_value, dtype=dtype_r) - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - expect = lval_host * rval_host - got = lval_gpu * rval_gpu - - assert expect == got.value - assert expect.dtype == got.dtype - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_product_data_invalid()) -def test_scalar_product_invalid(dtype_l, dtype_r): - test_value = 1 - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - with pytest.raises(TypeError): - lval_gpu * rval_gpu - - -def make_scalar_floordiv_data(): - valid = set() - - # we can divide ints and floats by other ints, floats, or bools - valid |= set( - product( - INTEGER_TYPES | FLOAT_TYPES, - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, - ) - ) - - # we can divide timedeltas by ints, floats or other timedeltas - valid |= set( - product(TIMEDELTA_TYPES, INTEGER_TYPES | FLOAT_TYPES | TIMEDELTA_TYPES) - ) - - # we can divide bools by ints, floats or bools - valid |= set(product(BOOL_TYPES, INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES)) - - return sorted(list(valid)) - - -def make_scalar_floordiv_data_invalid(): - invalid = set() - - # we can't numeric types into datelike types - invalid |= set( - product( - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, - DATETIME_TYPES | TIMEDELTA_TYPES, - ) - ) - - # we can't divide datetime types into anything - invalid |= set( - product( - DATETIME_TYPES, - INTEGER_TYPES - | FLOAT_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | BOOL_TYPES, - ) - ) - - # we can't divide timedeltas into bools, or datetimes - invalid |= set(product(TIMEDELTA_TYPES, BOOL_TYPES | DATETIME_TYPES)) - - return sorted(list(invalid)) - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_floordiv_data()) -def test_scalar_floordiv(dtype_l, dtype_r): - test_value = 1 - - lval_host = dtype_scalar(test_value, dtype=dtype_l) - rval_host = dtype_scalar(test_value, dtype=dtype_r) - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - expect = lval_host // rval_host - got = lval_gpu // rval_gpu - - assert expect == got.value - assert expect.dtype == got.dtype - - -@pytest.mark.parametrize( - "dtype_l,dtype_r", make_scalar_floordiv_data_invalid() -) -def test_scalar_floordiv_invalid(dtype_l, dtype_r): - test_value = 1 - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - with pytest.raises(TypeError): - lval_gpu // rval_gpu - - -def make_scalar_truediv_data(): - valid = set() - - # we can true divide ints, floats, or bools by other - # ints, floats or bools - valid |= set( - product( - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, - ) - ) - - # we can true divide timedeltas by ints floats or timedeltas - valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES 
| TIMEDELTA_TYPES)) - - return sorted(list(valid)) - - -def make_scalar_truediv_data_invalid(): - invalid = set() - - # we can't divide ints, floats or bools by datetimes - # or timedeltas - invalid |= set( - product( - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, - DATETIME_TYPES | TIMEDELTA_TYPES, - ) - ) - - # we cant true divide datetime types by anything - invalid |= set( - product( - DATETIME_TYPES, - INTEGER_TYPES - | FLOAT_TYPES - | DATETIME_TYPES - | TIMEDELTA_TYPES - | BOOL_TYPES, - ) - ) - - # we cant true divide timedeltas by datetimes or bools or floats - invalid |= set( - product(TIMEDELTA_TYPES, DATETIME_TYPES | BOOL_TYPES | FLOAT_TYPES) - ) - - return sorted(list(invalid)) - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_truediv_data()) -def test_scalar_truediv(dtype_l, dtype_r): - test_value = 1 - - lval_host = dtype_scalar(test_value, dtype=dtype_l) - rval_host = dtype_scalar(test_value, dtype=dtype_r) - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - expect = np.true_divide(lval_host, rval_host) - got = lval_gpu / rval_gpu - - assert expect == got.value - - # numpy bug - - if np.dtype(dtype_l).itemsize <= 2 and np.dtype(dtype_r).itemsize <= 2: - assert expect.dtype == "float64" and got.dtype == "float32" - else: - assert expect.dtype == got.dtype - # assert expect.dtype == got.dtype - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_truediv_data_invalid()) -def test_scalar_truediv_invalid(dtype_l, dtype_r): - test_value = 1 - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - with pytest.raises(TypeError): - lval_gpu / rval_gpu - - -def make_scalar_remainder_data(): - valid = set() - - # can mod numeric types with each other - valid |= set( - product( - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, - ) - ) - - # can mod timedeltas by other timedeltas - valid |= set(product(TIMEDELTA_TYPES, TIMEDELTA_TYPES)) - - return sorted(list(valid)) - - -def make_scalar_remainder_data_invalid(): - invalid = set() - - # numeric types cant be modded against timedeltas - # or datetimes. 
Also, datetimes can't be modded - # against datetimes or timedeltas - invalid |= set( - product( - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES | DATETIME_TYPES, - DATETIME_TYPES | TIMEDELTA_TYPES, - ) - ) - - # datetime and timedelta types cant be modded against - # any numeric types - invalid |= set( - product( - DATETIME_TYPES | TIMEDELTA_TYPES, - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, - ) - ) - - # timedeltas cant mod with datetimes - invalid |= set(product(TIMEDELTA_TYPES, DATETIME_TYPES)) - - return sorted(list(invalid)) - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_remainder_data()) -def test_scalar_remainder(dtype_l, dtype_r): - test_value = 1 - - lval_host = dtype_scalar(test_value, dtype=dtype_l) - rval_host = dtype_scalar(test_value, dtype=dtype_r) - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - expect = lval_host % rval_host - got = lval_gpu % rval_gpu - - assert expect == got.value - assert expect.dtype == got.dtype - - -@pytest.mark.parametrize( - "dtype_l,dtype_r", make_scalar_remainder_data_invalid() -) -def test_scalar_remainder_invalid(dtype_l, dtype_r): - test_value = 1 - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - with pytest.raises(TypeError): - lval_gpu % rval_gpu - - -def make_scalar_power_data(): - # only numeric values form valid operands for power - return sorted( - product( - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, - INTEGER_TYPES | FLOAT_TYPES | BOOL_TYPES, - ) - ) - - -def make_scalar_power_data_invalid(): - invalid = set() - - # datetimes and timedeltas cant go in exponents - invalid |= set( - product( - INTEGER_TYPES - | FLOAT_TYPES - | TIMEDELTA_TYPES - | DATETIME_TYPES - | BOOL_TYPES, - DATETIME_TYPES | TIMEDELTA_TYPES, - ) - ) - - # datetimes and timedeltas may not be raised to - # any exponent of any dtype - invalid |= set( - product( - DATETIME_TYPES | TIMEDELTA_TYPES, - DATETIME_TYPES - | TIMEDELTA_TYPES - | INTEGER_TYPES - | FLOAT_TYPES - | BOOL_TYPES, - ) - ) - - return sorted(list(invalid)) - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_power_data()) -def test_scalar_power(dtype_l, dtype_r): - test_value = 1 - - lval_host = dtype_scalar(test_value, dtype=dtype_l) - rval_host = dtype_scalar(test_value, dtype=dtype_r) - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - expect = lval_host**rval_host - got = lval_gpu**rval_gpu - - assert expect == got.value - assert expect.dtype == got.dtype - - -@pytest.mark.parametrize("dtype_l,dtype_r", make_scalar_power_data_invalid()) -def test_scalar_power_invalid(dtype_l, dtype_r): - test_value = 1 - - lval_gpu = cudf.Scalar(test_value, dtype=dtype_l) - rval_gpu = cudf.Scalar(test_value, dtype=dtype_r) - - with pytest.raises(TypeError): - lval_gpu**rval_gpu - - -def make_scalar_null_binops_data(): - return ( - [(operator.add, *dtypes) for dtypes in make_scalar_add_data()] - + [(operator.sub, *dtypes) for dtypes in make_scalar_difference_data()] - + [(operator.mul, *dtypes) for dtypes in make_scalar_product_data()] - + [(operator.add, *dtypes) for dtypes in make_scalar_add_data()] - + [ - (operator.floordiv, *dtypes) - for dtypes in make_scalar_floordiv_data() - ] - + [ - (operator.truediv, *dtypes) - for dtypes in make_scalar_truediv_data() - ] - + [(operator.mod, *dtypes) for dtypes in make_scalar_remainder_data()] - + [(operator.pow, *dtypes) for dtypes in make_scalar_power_data()] - ) - - 
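As a quick illustration of what the valid/invalid dtype tables above encode (a minimal sketch, not part of the deleted test file, assuming a working cudf installation exposing the cudf.Scalar API these tests exercise):

import cudf

# int32 % float64 is listed as valid in make_scalar_remainder_data(),
# so the remainder of two cudf.Scalars simply yields another Scalar.
res = cudf.Scalar(7, dtype="int32") % cudf.Scalar(2, dtype="float64")
print(res.value, res.dtype)

# datetime64 % timedelta64 is listed in make_scalar_remainder_data_invalid(),
# and test_scalar_remainder_invalid() expects cudf to reject it with TypeError.
try:
    cudf.Scalar(1, dtype="datetime64[s]") % cudf.Scalar(1, dtype="timedelta64[s]")
except TypeError as err:
    print("rejected as expected:", err)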
-@pytest.mark.parametrize("op,dtype_l,dtype_r", make_scalar_null_binops_data()) -def test_scalar_null_binops(op, dtype_l, dtype_r): - lhs = cudf.Scalar(cudf.NA, dtype=dtype_l) - rhs = cudf.Scalar(cudf.NA, dtype=dtype_r) - - result = op(lhs, rhs) - assert result.value is (cudf.NaT if result.dtype.kind in "mM" else cudf.NA) - - # make sure dtype is the same as had there been a valid scalar - valid_lhs = cudf.Scalar(1, dtype=dtype_l) - valid_rhs = cudf.Scalar(1, dtype=dtype_r) - - valid_result = op(valid_lhs, valid_rhs) - assert result.dtype == valid_result.dtype - - -@pytest.mark.parametrize("n_periods", [0, 1, -1, 12, -12]) -@pytest.mark.parametrize( - "frequency", - [ - "months", - "years", - "days", - "hours", - "minutes", - "seconds", - "microseconds", - "nanoseconds", - ], -) -@pytest.mark.parametrize( - "dtype, components", - [ - ["datetime64[ns]", "00.012345678"], - ["datetime64[us]", "00.012345"], - ["datetime64[ms]", "00.012"], - ["datetime64[s]", "00"], - ], -) -@pytest.mark.parametrize("op", [operator.add, operator.sub]) -def test_datetime_dateoffset_binaryop( - request, n_periods, frequency, dtype, components, op -): - request.applymarker( - pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION - and dtype in {"datetime64[ms]", "datetime64[s]"} - and frequency == "microseconds" - and n_periods == 0, - reason="https://github.com/pandas-dev/pandas/issues/57448", - ) - ) - if ( - not PANDAS_GE_220 - and dtype in {"datetime64[ms]", "datetime64[s]"} - and frequency in ("microseconds", "nanoseconds") - and n_periods != 0 - ): - pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") - if ( - not PANDAS_GE_220 - and dtype == "datetime64[us]" - and frequency == "nanoseconds" - and n_periods != 0 - ): - pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") - - date_col = [ - f"2000-01-01 00:00:{components}", - f"2000-01-31 00:00:{components}", - f"2000-02-29 00:00:{components}", - ] - gsr = cudf.Series(date_col, dtype=dtype) - psr = gsr.to_pandas() - - kwargs = {frequency: n_periods} - - goffset = cudf.DateOffset(**kwargs) - poffset = pd.DateOffset(**kwargs) - - expect = op(psr, poffset) - got = op(gsr, goffset) - - assert_eq(expect, got) - - expect = op(psr, -poffset) - got = op(gsr, -goffset) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "date_col", - [ - [ - "2000-01-01 00:00:00.012345678", - "2000-01-31 00:00:00.012345678", - "2000-02-29 00:00:00.012345678", - ] - ], -) -@pytest.mark.parametrize( - "kwargs", - [ - {"months": 2, "years": 5}, - {"microseconds": 1, "seconds": 1}, - {"months": 2, "years": 5, "seconds": 923, "microseconds": 481}, - {"milliseconds": 4}, - {"milliseconds": 4, "years": 2}, - {"nanoseconds": 12}, - ], -) -@pytest.mark.filterwarnings( - "ignore:Non-vectorized DateOffset:pandas.errors.PerformanceWarning" -) -@pytest.mark.filterwarnings( - "ignore:Discarding nonzero nanoseconds:UserWarning" -) -@pytest.mark.parametrize("op", [operator.add, operator.sub]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_datetime_dateoffset_binaryop_multiple(request, date_col, kwargs, op): - gsr = cudf.Series(date_col, dtype="datetime64[ns]") - psr = gsr.to_pandas() - - poffset = pd.DateOffset(**kwargs) - goffset = cudf.DateOffset(**kwargs) - - expect = op(psr, poffset) - got = op(gsr, goffset) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("n_periods", [0, 1, -1, 12, -12]) -@pytest.mark.parametrize( - "frequency", - [ - 
"months", - "years", - "days", - "hours", - "minutes", - "seconds", - "microseconds", - "nanoseconds", - ], -) -@pytest.mark.parametrize( - "dtype, components", - [ - ["datetime64[ns]", "00.012345678"], - ["datetime64[us]", "00.012345"], - ["datetime64[ms]", "00.012"], - ["datetime64[s]", "00"], - ], -) -def test_datetime_dateoffset_binaryop_reflected( - n_periods, frequency, dtype, components -): - if ( - not PANDAS_GE_220 - and dtype in {"datetime64[ms]", "datetime64[s]"} - and frequency in ("microseconds", "nanoseconds") - and n_periods != 0 - ): - pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") - if ( - not PANDAS_GE_220 - and dtype == "datetime64[us]" - and frequency == "nanoseconds" - and n_periods != 0 - ): - pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595") - - date_col = [ - f"2000-01-01 00:00:{components}", - f"2000-01-31 00:00:{components}", - f"2000-02-29 00:00:{components}", - ] - gsr = cudf.Series(date_col, dtype=dtype) - psr = gsr.to_pandas() # converts to nanos - - kwargs = {frequency: n_periods} - - goffset = cudf.DateOffset(**kwargs) - poffset = pd.DateOffset(**kwargs) - - expect = poffset + psr - got = goffset + gsr - - # TODO: Remove check_dtype once we get some clarity on: - # https://github.com/pandas-dev/pandas/issues/57448 - assert_eq(expect, got, check_dtype=False) - - with pytest.raises(TypeError): - poffset - psr - - with pytest.raises(TypeError): - goffset - gsr - - -@pytest.mark.parametrize("frame", [cudf.Series, cudf.Index, cudf.DataFrame]) -@pytest.mark.parametrize( - "dtype", ["int", "str", "datetime64[s]", "timedelta64[s]", "category"] -) -def test_binops_with_lhs_numpy_scalar(frame, dtype): - data = [1, 2, 3, 4, 5] - - data = ( - frame({"a": data}, dtype=dtype) - if isinstance(frame, cudf.DataFrame) - else frame(data, dtype=dtype) - ) - - if dtype == "datetime64[s]": - val = cudf.dtype(dtype).type(4, "s") - elif dtype == "timedelta64[s]": - val = cudf.dtype(dtype).type(4, "s") - elif dtype == "category": - val = np.int64(4) - elif dtype == "str": - val = str(4) - else: - val = cudf.dtype(dtype).type(4) - - # Compare equality with series on left side to dispatch to the pandas/cudf - # __eq__ operator and avoid a DeprecationWarning from numpy. 
- expected = data.to_pandas() == val - got = data == val - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "dtype", - [ - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float32", - "float64", - "datetime64[ns]", - "datetime64[us]", - "datetime64[ms]", - "datetime64[s]", - "timedelta64[ns]", - "timedelta64[us]", - "timedelta64[ms]", - "timedelta64[s]", - ], -) -@pytest.mark.parametrize("op", _operators_comparison) -def test_binops_with_NA_consistent(dtype, op): - data = [1, 2, 3] - sr = cudf.Series(data, dtype=dtype) - - result = getattr(sr, op)(cudf.NA) - if dtype in NUMERIC_TYPES: - if op == "ne": - expect_all = True - else: - expect_all = False - assert (result == expect_all).all() - elif dtype in DATETIME_TYPES & TIMEDELTA_TYPES: - assert result._column.null_count == len(data) - - -@pytest.mark.parametrize( - "op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype", - [ - ( - operator.add, - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["3.0", "4.0"], - cudf.Decimal64Dtype(scale=2, precision=4), - ), - ( - operator.add, - 2, - cudf.Decimal64Dtype(scale=2, precision=3), - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["3.5", "4.0"], - cudf.Decimal64Dtype(scale=2, precision=4), - ), - ( - operator.add, - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["2.25", "1.005"], - cudf.Decimal64Dtype(scale=3, precision=4), - ["3.75", "3.005"], - cudf.Decimal64Dtype(scale=3, precision=5), - ), - ( - operator.add, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=17), - ["0.1", "0.2"], - cudf.Decimal64Dtype(scale=3, precision=4), - ["100.1", "200.2"], - cudf.Decimal128Dtype(scale=3, precision=23), - ), - ( - operator.sub, - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=1, precision=2), - ["2.25", "1.005"], - cudf.Decimal64Dtype(scale=3, precision=4), - ["-0.75", "0.995"], - cudf.Decimal64Dtype(scale=3, precision=5), - ), - ( - operator.sub, - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=1, precision=2), - ["2.25", "1.005"], - cudf.Decimal64Dtype(scale=3, precision=4), - ["-0.75", "0.995"], - cudf.Decimal64Dtype(scale=3, precision=5), - ), - ( - operator.sub, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=10), - ["0.1", "0.2"], - cudf.Decimal64Dtype(scale=6, precision=10), - ["99.9", "199.8"], - cudf.Decimal128Dtype(scale=6, precision=19), - ), - ( - operator.sub, - 2, - cudf.Decimal64Dtype(scale=3, precision=4), - ["2.25", "1.005"], - cudf.Decimal64Dtype(scale=3, precision=4), - ["-0.25", "0.995"], - cudf.Decimal64Dtype(scale=3, precision=5), - ), - ( - operator.mul, - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["1.5", "3.0"], - cudf.Decimal64Dtype(scale=3, precision=4), - ["2.25", "6.0"], - cudf.Decimal64Dtype(scale=5, precision=8), - ), - ( - operator.mul, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - ["0.1", "0.2"], - cudf.Decimal64Dtype(scale=3, precision=4), - ["10.0", "40.0"], - cudf.Decimal64Dtype(scale=1, precision=8), - ), - ( - operator.mul, - ["1000", "2000"], - cudf.Decimal64Dtype(scale=-3, precision=4), - ["0.343", "0.500"], - cudf.Decimal64Dtype(scale=3, precision=3), - ["343.0", "1000.0"], - cudf.Decimal64Dtype(scale=0, precision=8), - ), - ( - operator.mul, - 200, - cudf.Decimal64Dtype(scale=3, precision=6), - ["0.343", "0.500"], - cudf.Decimal64Dtype(scale=3, precision=6), - ["68.60", "100.0"], - cudf.Decimal64Dtype(scale=6, precision=13), - ), - ( - 
operator.truediv, - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=4), - ["1.5", "3.0"], - cudf.Decimal64Dtype(scale=1, precision=4), - ["1.0", "0.6"], - cudf.Decimal64Dtype(scale=7, precision=10), - ), - ( - operator.truediv, - ["110", "200"], - cudf.Decimal64Dtype(scale=-1, precision=3), - ["0.1", "0.2"], - cudf.Decimal64Dtype(scale=2, precision=4), - ["1000.0", "1000.0"], - cudf.Decimal64Dtype(scale=6, precision=12), - ), - ( - operator.truediv, - ["132.86", "15.25"], - cudf.Decimal64Dtype(scale=4, precision=14), - ["2.34", "8.50"], - cudf.Decimal64Dtype(scale=2, precision=8), - ["56.77", "1.79"], - cudf.Decimal128Dtype(scale=13, precision=25), - ), - ( - operator.truediv, - 20, - cudf.Decimal128Dtype(scale=2, precision=6), - ["20", "20"], - cudf.Decimal128Dtype(scale=2, precision=6), - ["1.0", "1.0"], - cudf.Decimal128Dtype(scale=9, precision=15), - ), - ( - operator.add, - ["1.5", None, "2.0"], - cudf.Decimal64Dtype(scale=1, precision=2), - ["1.5", None, "2.0"], - cudf.Decimal64Dtype(scale=1, precision=2), - ["3.0", None, "4.0"], - cudf.Decimal64Dtype(scale=1, precision=3), - ), - ( - operator.add, - ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=3), - ["2.25", "1.005"], - cudf.Decimal64Dtype(scale=3, precision=4), - ["3.75", None], - cudf.Decimal64Dtype(scale=3, precision=5), - ), - ( - operator.sub, - ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=3), - ["2.25", None], - cudf.Decimal64Dtype(scale=3, precision=4), - ["-0.75", None], - cudf.Decimal64Dtype(scale=3, precision=5), - ), - ( - operator.sub, - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["2.25", None], - cudf.Decimal64Dtype(scale=3, precision=4), - ["-0.75", None], - cudf.Decimal64Dtype(scale=3, precision=5), - ), - ( - operator.mul, - ["1.5", None], - cudf.Decimal64Dtype(scale=2, precision=3), - ["1.5", None], - cudf.Decimal64Dtype(scale=3, precision=4), - ["2.25", None], - cudf.Decimal64Dtype(scale=5, precision=8), - ), - ( - operator.mul, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=10), - ["0.1", None], - cudf.Decimal64Dtype(scale=3, precision=12), - ["10.0", None], - cudf.Decimal128Dtype(scale=1, precision=23), - ), - ( - operator.eq, - ["0.18", "0.42"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.18", "0.21"], - cudf.Decimal64Dtype(scale=2, precision=3), - [True, False], - bool, - ), - ( - operator.eq, - ["0.18", "0.42"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.1800", "0.2100"], - cudf.Decimal64Dtype(scale=4, precision=5), - [True, False], - bool, - ), - ( - operator.eq, - ["100", None], - cudf.Decimal64Dtype(scale=-2, precision=3), - ["100", "200"], - cudf.Decimal64Dtype(scale=-1, precision=4), - [True, None], - bool, - ), - ( - operator.ne, - ["0.06", "0.42"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.18", "0.42"], - cudf.Decimal64Dtype(scale=2, precision=3), - [True, False], - bool, - ), - ( - operator.ne, - ["1.33", "1.21"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.1899", "1.21"], - cudf.Decimal64Dtype(scale=4, precision=5), - [True, False], - bool, - ), - ( - operator.ne, - ["300", None], - cudf.Decimal64Dtype(scale=-2, precision=3), - ["110", "5500"], - cudf.Decimal64Dtype(scale=-1, precision=4), - [True, None], - bool, - ), - ( - operator.lt, - ["0.18", "0.42", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.10", "0.87", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - [False, True, False], - bool, - ), - ( - operator.lt, - ["0.18", "0.42", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - 
["0.1000", "0.8700", "1.0000"], - cudf.Decimal64Dtype(scale=4, precision=5), - [False, True, False], - bool, - ), - ( - operator.lt, - ["200", None, "100"], - cudf.Decimal64Dtype(scale=-2, precision=3), - ["100", "200", "100"], - cudf.Decimal64Dtype(scale=-1, precision=4), - [False, None, False], - bool, - ), - ( - operator.gt, - ["0.18", "0.42", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.10", "0.87", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - [True, False, False], - bool, - ), - ( - operator.gt, - ["0.18", "0.42", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.1000", "0.8700", "1.0000"], - cudf.Decimal64Dtype(scale=4, precision=5), - [True, False, False], - bool, - ), - ( - operator.gt, - ["300", None, "100"], - cudf.Decimal64Dtype(scale=-2, precision=3), - ["100", "200", "100"], - cudf.Decimal64Dtype(scale=-1, precision=4), - [True, None, False], - bool, - ), - ( - operator.le, - ["0.18", "0.42", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.10", "0.87", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - [False, True, True], - bool, - ), - ( - operator.le, - ["0.18", "0.42", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.1000", "0.8700", "1.0000"], - cudf.Decimal64Dtype(scale=4, precision=5), - [False, True, True], - bool, - ), - ( - operator.le, - ["300", None, "100"], - cudf.Decimal64Dtype(scale=-2, precision=3), - ["100", "200", "100"], - cudf.Decimal64Dtype(scale=-1, precision=4), - [False, None, True], - bool, - ), - ( - operator.ge, - ["0.18", "0.42", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.10", "0.87", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - [True, False, True], - bool, - ), - ( - operator.ge, - ["0.18", "0.42", "1.00"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["0.1000", "0.8700", "1.0000"], - cudf.Decimal64Dtype(scale=4, precision=5), - [True, False, True], - bool, - ), - ( - operator.ge, - ["300", None, "100"], - cudf.Decimal64Dtype(scale=-2, precision=3), - ["100", "200", "100"], - cudf.Decimal64Dtype(scale=-1, precision=4), - [True, None, True], - bool, - ), - ], -) -def test_binops_decimal(op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype): - if isinstance(lhs, (int, float)): - a = cudf.Scalar(lhs, l_dtype) - else: - a = utils._decimal_series(lhs, l_dtype) - b = utils._decimal_series(rhs, r_dtype) - expect = ( - utils._decimal_series(expect, expect_dtype) - if isinstance( - expect_dtype, - (cudf.Decimal64Dtype, cudf.Decimal32Dtype, cudf.Decimal128Dtype), - ) - else cudf.Series(expect, dtype=expect_dtype) - ) - - got = op(a, b) - assert expect.dtype == got.dtype - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "op,lhs,l_dtype,rhs,r_dtype,expect,expect_dtype", - [ - ( - "radd", - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=2, precision=3), - ["3.0", "4.0"], - cudf.Decimal64Dtype(scale=2, precision=4), - ), - ( - "rsub", - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=10), - ["0.1", "0.2"], - cudf.Decimal64Dtype(scale=6, precision=10), - ["-99.9", "-199.8"], - cudf.Decimal128Dtype(scale=6, precision=19), - ), - ( - "rmul", - ["1000", "2000"], - cudf.Decimal64Dtype(scale=-3, precision=4), - ["0.343", "0.500"], - cudf.Decimal64Dtype(scale=3, precision=3), - ["343.0", "1000.0"], - cudf.Decimal64Dtype(scale=0, precision=8), - ), - ( - "rtruediv", - ["1.5", "0.5"], - cudf.Decimal64Dtype(scale=3, precision=6), - ["1.5", "2.0"], - cudf.Decimal64Dtype(scale=3, precision=6), - ["1.0", 
"4.0"], - cudf.Decimal64Dtype(scale=10, precision=16), - ), - ], -) -def test_binops_reflect_decimal( - op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype -): - a = utils._decimal_series(lhs, l_dtype) - b = utils._decimal_series(rhs, r_dtype) - expect = utils._decimal_series(expect, expect_dtype) - - got = getattr(a, op)(b) - assert expect.dtype == got.dtype - assert_eq(expect, got) - - -@pytest.mark.parametrize("powers", [0, 1, 2, 3]) -def test_binops_decimal_pow(powers): - s = cudf.Series( - [ - decimal.Decimal("1.324324"), - None, - decimal.Decimal("2"), - decimal.Decimal("3"), - decimal.Decimal("5"), - ] - ) - ps = s.to_pandas() - - assert_eq(s**powers, ps**powers, check_dtype=False) - - -def test_binops_raise_error(): - s = cudf.Series([decimal.Decimal("1.324324")]) - - with pytest.raises(TypeError): - s // 1 - - -@pytest.mark.parametrize( - "args", - [ - ( - operator.eq, - ["100", "41", None], - cudf.Decimal64Dtype(scale=0, precision=5), - [100, 42, 12], - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), - ), - ( - operator.eq, - ["100.000", "42.001", None], - cudf.Decimal64Dtype(scale=3, precision=6), - [100, 42, 12], - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), - ), - ( - operator.eq, - ["100", "40", None], - cudf.Decimal64Dtype(scale=-1, precision=3), - [100, 42, 12], - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), - ), - ( - operator.ne, - ["100", "42", "24", None], - cudf.Decimal64Dtype(scale=0, precision=3), - [100, 40, 24, 12], - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, True, False, None], dtype=bool), - ), - ( - operator.ne, - ["10.1", "88", "11", None], - cudf.Decimal64Dtype(scale=1, precision=3), - [10, 42, 11, 12], - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, True, False, None], dtype=bool), - ), - ( - operator.ne, - ["100.000", "42", "23.999", None], - cudf.Decimal64Dtype(scale=3, precision=6), - [100, 42, 24, 12], - cudf.Series([False, False, True, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), - ), - ( - operator.lt, - ["100", "40", "28", None], - cudf.Decimal64Dtype(scale=0, precision=3), - [100, 42, 24, 12], - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), - ), - ( - operator.lt, - ["100.000", "42.002", "23.999", None], - cudf.Decimal64Dtype(scale=3, precision=6), - [100, 42, 24, 12], - cudf.Series([False, False, True, None], dtype=bool), - cudf.Series([False, True, False, None], dtype=bool), - ), - ( - operator.lt, - ["100", "40", "10", None], - cudf.Decimal64Dtype(scale=-1, precision=3), - [100, 42, 8, 12], - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), - ), - ( - operator.gt, - ["100", "42", "20", None], - cudf.Decimal64Dtype(scale=0, precision=3), - [100, 40, 24, 12], - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), - ), - ( - operator.gt, - ["100.000", "42.002", "23.999", None], - cudf.Decimal64Dtype(scale=3, precision=6), - [100, 42, 24, 12], - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), - ), - ( - operator.gt, - ["100", "40", "10", None], - cudf.Decimal64Dtype(scale=-1, precision=3), - [100, 42, 8, 12], - cudf.Series([False, False, True, None], dtype=bool), - cudf.Series([False, 
True, False, None], dtype=bool), - ), - ( - operator.le, - ["100", "40", "28", None], - cudf.Decimal64Dtype(scale=0, precision=3), - [100, 42, 24, 12], - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, False, True, None], dtype=bool), - ), - ( - operator.le, - ["100.000", "42.002", "23.999", None], - cudf.Decimal64Dtype(scale=3, precision=6), - [100, 42, 24, 12], - cudf.Series([True, False, True, None], dtype=bool), - cudf.Series([True, True, False, None], dtype=bool), - ), - ( - operator.le, - ["100", "40", "10", None], - cudf.Decimal64Dtype(scale=-1, precision=3), - [100, 42, 8, 12], - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, False, True, None], dtype=bool), - ), - ( - operator.ge, - ["100", "42", "20", None], - cudf.Decimal64Dtype(scale=0, precision=3), - [100, 40, 24, 12], - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, False, True, None], dtype=bool), - ), - ( - operator.ge, - ["100.000", "42.002", "23.999", None], - cudf.Decimal64Dtype(scale=3, precision=6), - [100, 42, 24, 12], - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, False, True, None], dtype=bool), - ), - ( - operator.ge, - ["100", "40", "10", None], - cudf.Decimal64Dtype(scale=-1, precision=3), - [100, 42, 8, 12], - cudf.Series([True, False, True, None], dtype=bool), - cudf.Series([True, True, False, None], dtype=bool), - ), - ], -) -@pytest.mark.parametrize("integer_dtype", utils.INTEGER_TYPES) -@pytest.mark.parametrize("reflected", [True, False]) -def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): - """ - Tested compare operations: - eq, lt, gt, le, ge - Each operation has 3 decimal data setups, with scale from {==0, >0, <0}. - Decimal precisions are sufficient to hold the digits. - For each decimal data setup, there is at least one row that lead to one - of the following compare results: {True, False, None}. 
- """ - if not reflected: - op, ldata, ldtype, rdata, expected, _ = args - else: - op, ldata, ldtype, rdata, _, expected = args - - lhs = utils._decimal_series(ldata, ldtype) - rhs = cudf.Series(rdata, dtype=integer_dtype) - - if reflected: - rhs, lhs = lhs, rhs - - actual = op(lhs, rhs) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "args", - [ - ( - operator.add, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal(1), - ["101", "201"], - cudf.Decimal64Dtype(scale=0, precision=6), - False, - ), - ( - operator.add, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - 1, - ["101", "201"], - cudf.Decimal64Dtype(scale=0, precision=6), - False, - ), - ( - operator.add, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal("1.5"), - ["101.5", "201.5"], - cudf.Decimal64Dtype(scale=1, precision=7), - False, - ), - ( - operator.add, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal(1), - ["101", "201"], - cudf.Decimal64Dtype(scale=0, precision=6), - True, - ), - ( - operator.add, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - 1, - ["101", "201"], - cudf.Decimal64Dtype(scale=0, precision=6), - True, - ), - ( - operator.add, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal("1.5"), - ["101.5", "201.5"], - cudf.Decimal64Dtype(scale=1, precision=7), - True, - ), - ( - operator.mul, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - 1, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=5), - False, - ), - ( - operator.mul, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal(2), - ["200", "400"], - cudf.Decimal64Dtype(scale=-2, precision=5), - False, - ), - ( - operator.mul, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal("1.5"), - ["150", "300"], - cudf.Decimal64Dtype(scale=-1, precision=6), - False, - ), - ( - operator.mul, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - 1, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=5), - True, - ), - ( - operator.mul, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal(2), - ["200", "400"], - cudf.Decimal64Dtype(scale=-2, precision=5), - True, - ), - ( - operator.mul, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal("1.5"), - ["150", "300"], - cudf.Decimal64Dtype(scale=-1, precision=6), - True, - ), - ( - operator.truediv, - ["1000", "2000"], - cudf.Decimal64Dtype(scale=-2, precision=4), - 1, - ["1000", "2000"], - cudf.Decimal64Dtype(scale=6, precision=12), - False, - ), - ( - operator.truediv, - ["100", "200"], - cudf.Decimal64Dtype(scale=2, precision=5), - decimal.Decimal(2), - ["50", "100"], - cudf.Decimal64Dtype(scale=6, precision=9), - False, - ), - ( - operator.truediv, - ["35.23", "54.91"], - cudf.Decimal64Dtype(scale=2, precision=4), - decimal.Decimal("1.5"), - ["23.4", "36.6"], - cudf.Decimal64Dtype(scale=6, precision=9), - False, - ), - ( - operator.truediv, - ["100", "200"], - cudf.Decimal64Dtype(scale=2, precision=5), - 1, - ["0", "0"], - cudf.Decimal64Dtype(scale=6, precision=9), - True, - ), - ( - operator.truediv, - ["1.2", "0.5"], - cudf.Decimal64Dtype(scale=1, precision=6), - decimal.Decimal(20), - ["10", "40"], - cudf.Decimal64Dtype(scale=7, precision=10), - True, - ), - ( - operator.truediv, - ["1.22", "5.24"], - cudf.Decimal64Dtype(scale=2, precision=3), - decimal.Decimal("8.55"), - ["7", "1"], - 
cudf.Decimal64Dtype(scale=6, precision=9), - True, - ), - ( - operator.sub, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal(2), - ["98", "198"], - cudf.Decimal64Dtype(scale=0, precision=6), - False, - ), - ( - operator.sub, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal("2.5"), - ["97.5", "197.5"], - cudf.Decimal64Dtype(scale=1, precision=7), - False, - ), - ( - operator.sub, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - 4, - ["96", "196"], - cudf.Decimal64Dtype(scale=0, precision=6), - False, - ), - ( - operator.sub, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal(2), - ["-98", "-198"], - cudf.Decimal64Dtype(scale=0, precision=6), - True, - ), - ( - operator.sub, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - 4, - ["-96", "-196"], - cudf.Decimal64Dtype(scale=0, precision=6), - True, - ), - ( - operator.sub, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal("2.5"), - ["-97.5", "-197.5"], - cudf.Decimal64Dtype(scale=1, precision=7), - True, - ), - ( - operator.sub, - ["100", "200"], - cudf.Decimal64Dtype(scale=-2, precision=3), - decimal.Decimal("2.5"), - ["-97.5", "-197.5"], - cudf.Decimal64Dtype(scale=1, precision=7), - True, - ), - ], -) -def test_binops_decimal_scalar(args): - op, lhs, l_dtype, rhs, expect, expect_dtype, reflect = args - - def decimal_series(input, dtype): - return cudf.Series( - [x if x is None else decimal.Decimal(x) for x in input], - dtype=dtype, - ) - - lhs = decimal_series(lhs, l_dtype) - expect = decimal_series(expect, expect_dtype) - - if reflect: - lhs, rhs = rhs, lhs - - got = op(lhs, rhs) - assert expect.dtype == got.dtype - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "args", - [ - ( - operator.eq, - ["100.00", "41", None], - cudf.Decimal64Dtype(scale=0, precision=5), - 100, - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), - ), - ( - operator.eq, - ["100.123", "41", None], - cudf.Decimal64Dtype(scale=3, precision=6), - decimal.Decimal("100.123"), - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), - ), - ( - operator.eq, - ["100.123", "41", None], - cudf.Decimal64Dtype(scale=3, precision=6), - cudf.Scalar(decimal.Decimal("100.123")), - cudf.Series([True, False, None], dtype=bool), - cudf.Series([True, False, None], dtype=bool), - ), - ( - operator.ne, - ["100.00", "41", None], - cudf.Decimal64Dtype(scale=2, precision=5), - 100, - cudf.Series([False, True, None], dtype=bool), - cudf.Series([False, True, None], dtype=bool), - ), - ( - operator.ne, - ["100.123", "120.21", None], - cudf.Decimal64Dtype(scale=3, precision=6), - decimal.Decimal("100.123"), - cudf.Series([False, True, None], dtype=bool), - cudf.Series([False, True, None], dtype=bool), - ), - ( - operator.ne, - ["100.123", "41", "120.21", None], - cudf.Decimal64Dtype(scale=3, precision=6), - cudf.Scalar(decimal.Decimal("100.123")), - cudf.Series([False, True, True, None], dtype=bool), - cudf.Series([False, True, True, None], dtype=bool), - ), - ( - operator.gt, - ["100.00", "41", "120.21", None], - cudf.Decimal64Dtype(scale=2, precision=5), - 100, - cudf.Series([False, False, True, None], dtype=bool), - cudf.Series([False, True, False, None], dtype=bool), - ), - ( - operator.gt, - ["100.123", "41", "120.21", None], - cudf.Decimal64Dtype(scale=3, precision=6), - decimal.Decimal("100.123"), - cudf.Series([False, False, True, None], 
dtype=bool), - cudf.Series([False, True, False, None], dtype=bool), - ), - ( - operator.gt, - ["100.123", "41", "120.21", None], - cudf.Decimal64Dtype(scale=3, precision=6), - cudf.Scalar(decimal.Decimal("100.123")), - cudf.Series([False, False, True, None], dtype=bool), - cudf.Series([False, True, False, None], dtype=bool), - ), - ( - operator.ge, - ["100.00", "41", "120.21", None], - cudf.Decimal64Dtype(scale=2, precision=5), - 100, - cudf.Series([True, False, True, None], dtype=bool), - cudf.Series([True, True, False, None], dtype=bool), - ), - ( - operator.ge, - ["100.123", "41", "120.21", None], - cudf.Decimal64Dtype(scale=3, precision=6), - decimal.Decimal("100.123"), - cudf.Series([True, False, True, None], dtype=bool), - cudf.Series([True, True, False, None], dtype=bool), - ), - ( - operator.ge, - ["100.123", "41", "120.21", None], - cudf.Decimal64Dtype(scale=3, precision=6), - cudf.Scalar(decimal.Decimal("100.123")), - cudf.Series([True, False, True, None], dtype=bool), - cudf.Series([True, True, False, None], dtype=bool), - ), - ( - operator.lt, - ["100.00", "41", "120.21", None], - cudf.Decimal64Dtype(scale=2, precision=5), - 100, - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), - ), - ( - operator.lt, - ["100.123", "41", "120.21", None], - cudf.Decimal64Dtype(scale=3, precision=6), - decimal.Decimal("100.123"), - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), - ), - ( - operator.lt, - ["100.123", "41", "120.21", None], - cudf.Decimal64Dtype(scale=3, precision=6), - cudf.Scalar(decimal.Decimal("100.123")), - cudf.Series([False, True, False, None], dtype=bool), - cudf.Series([False, False, True, None], dtype=bool), - ), - ( - operator.le, - ["100.00", "41", "120.21", None], - cudf.Decimal64Dtype(scale=2, precision=5), - 100, - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, False, True, None], dtype=bool), - ), - ( - operator.le, - ["100.123", "41", "120.21", None], - cudf.Decimal64Dtype(scale=3, precision=6), - decimal.Decimal("100.123"), - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, False, True, None], dtype=bool), - ), - ( - operator.le, - ["100.123", "41", "120.21", None], - cudf.Decimal64Dtype(scale=3, precision=6), - cudf.Scalar(decimal.Decimal("100.123")), - cudf.Series([True, True, False, None], dtype=bool), - cudf.Series([True, False, True, None], dtype=bool), - ), - ], -) -@pytest.mark.parametrize("reflected", [True, False]) -def test_binops_decimal_scalar_compare(args, reflected): - """ - Tested compare operations: - eq, lt, gt, le, ge - Each operation has 3 data setups: pyints, Decimal, and - decimal cudf.Scalar - For each data setup, there is at least one row that lead to one of the - following compare results: {True, False, None}. 
- """ - if not reflected: - op, ldata, ldtype, rdata, expected, _ = args - else: - op, ldata, ldtype, rdata, _, expected = args - - lhs = utils._decimal_series(ldata, ldtype) - rhs = rdata - - if reflected: - rhs, lhs = lhs, rhs - - actual = op(lhs, rhs) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "dtype", - [ - "uint8", - "uint16", - "uint32", - "uint64", - "int8", - "int16", - "int32", - "int64", - "float32", - "float64", - "str", - "datetime64[ns]", - "datetime64[us]", - "datetime64[ms]", - "datetime64[s]", - "timedelta64[ns]", - "timedelta64[us]", - "timedelta64[ms]", - "timedelta64[s]", - ], -) -@pytest.mark.parametrize("null_scalar", [None, cudf.NA, np.datetime64("NaT")]) -@pytest.mark.parametrize("cmpop", _cmpops) -def test_column_null_scalar_comparison(dtype, null_scalar, cmpop): - # This test is meant to validate that comparing - # a series of any dtype with a null scalar produces - # a new series where all the elements are . - - if isinstance(null_scalar, np.datetime64): - if cudf.dtype(dtype).kind not in "mM": - pytest.skip() - null_scalar = null_scalar.astype(dtype) - - dtype = cudf.dtype(dtype) - - data = [1, 2, 3, 4, 5] - sr = cudf.Series(data, dtype=dtype) - result = cmpop(sr, null_scalar) - - assert result.isnull().all() - - -@pytest.mark.parametrize("fn", ["eq", "ne", "lt", "gt", "le", "ge"]) -def test_equality_ops_index_mismatch(fn): - a = cudf.Series( - [1, 2, 3, None, None, 4], index=["a", "b", "c", "d", "e", "f"] - ) - b = cudf.Series( - [-5, 4, 3, 2, 1, 0, 19, 11], - index=["aa", "b", "c", "d", "e", "f", "y", "z"], - ) - - pa = a.to_pandas(nullable=True) - pb = b.to_pandas(nullable=True) - expected = getattr(pa, fn)(pb) - actual = getattr(a, fn)(b).to_pandas(nullable=True) - - assert_eq(expected, actual) - - -def generate_test_null_equals_columnops_data(): - # Generate tuples of: - # (left_data, right_data, compare_bool - # where compare_bool is the correct answer to - # if the columns should compare as null equals - - def set_null_cases(column_l, column_r, case): - if case == "neither": - return column_l, column_r - elif case == "left": - column_l[1] = None - elif case == "right": - column_r[1] = None - elif case == "both": - column_l[1] = None - column_r[1] = None - else: - raise ValueError("Unknown null case") - return column_l, column_r - - null_cases = ["neither", "left", "right", "both"] - data = [1, 2, 3] - - results = [] - # TODO: Numeric types can be cross compared as null equal - for dtype in ( - list(NUMERIC_TYPES) - + list(DATETIME_TYPES) - + list(TIMEDELTA_TYPES) - + list(STRING_TYPES) - + ["category"] - ): - for case in null_cases: - left = cudf.Series(data, dtype=dtype) - right = cudf.Series(data, dtype=dtype) - if case in {"left", "right"}: - answer = False - else: - answer = True - left, right = set_null_cases(left, right, case) - results.append((left._column, right._column, answer, case)) - - return results - - -@pytest.mark.parametrize( - "lcol,rcol,ans,case", generate_test_null_equals_columnops_data() -) -def test_null_equals_columnops(lcol, rcol, ans, case): - assert lcol.equals(rcol).all() == ans - - -def test_add_series_to_dataframe(): - """Verify that missing columns result in NaNs, not NULLs.""" - assert cp.all( - cp.isnan( - ( - cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - + cudf.Series([1, 2, 3], index=["a", "b", "c"]) - )["c"] - ) - ) - - -@pytest.mark.parametrize("obj_class", [cudf.Series, cudf.Index]) -@pytest.mark.parametrize("binop", _binops) -def test_binops_cupy_array(obj_class, binop): - # Skip 0 to not 
deal with NaNs from division. - data = range(1, 100) - lhs = obj_class(data) - rhs = cp.array(data) - assert (binop(lhs, rhs) == binop(lhs, lhs)).all() - - -@pytest.mark.parametrize("binop", _binops + _binops_compare) -@pytest.mark.parametrize("data", [None, [-9, 7], [5, -2], [12, 18]]) -@pytest.mark.parametrize("scalar", [1, 3, 12, np.nan]) -def test_empty_column(binop, data, scalar): - gdf = cudf.DataFrame(columns=["a", "b"]) - if data is not None: - gdf["a"] = data - - pdf = gdf.to_pandas() - - got = binop(gdf, scalar) - expected = binop(pdf, scalar) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame( - [[1, 2, 3, 4], [5, 6, 7, 8], [10, 11, 12, 13], [14, 15, 16, 17]] - ), - pytest.param( - cudf.DataFrame([[1, None, None, 4], [5, 6, 7, None]]), - marks=pytest_xfail( - reason="Cannot access Frame.values if frame contains nulls" - ), - ), - cudf.DataFrame( - [ - [1.2, 2.3, 3.4, 4.5], - [5.6, 6.7, 7.8, 8.9], - [7.43, 4.2, 23.2, 23.2], - [9.1, 2.4, 4.5, 65.34], - ] - ), - cudf.Series([14, 15, 16, 17]), - cudf.Series([14.15, 15.16, 16.17, 17.18]), - ], -) -@pytest.mark.parametrize( - "other", - [ - cudf.DataFrame([[9, 10], [11, 12], [13, 14], [15, 16]]), - cudf.DataFrame( - [[9.4, 10.5], [11.6, 12.7], [13.8, 14.9], [15.1, 16.2]] - ), - cudf.Series([5, 6, 7, 8]), - cudf.Series([5.6, 6.7, 7.8, 8.9]), - np.array([5, 6, 7, 8]), - [25.5, 26.6, 27.7, 28.8], - ], -) -def test_binops_dot(df, other): - pdf = df.to_pandas() - host_other = other.to_pandas() if hasattr(other, "to_pandas") else other - - expected = pdf @ host_other - got = df @ other - - assert_eq(expected, got) - - -def test_binop_dot_preserve_index(): - ser = cudf.Series(range(2), index=["A", "B"]) - df = cudf.DataFrame(np.eye(2), columns=["A", "B"], index=["A", "B"]) - result = ser @ df - expected = ser.to_pandas() @ df.to_pandas() - assert_eq(result, expected) - - -def test_binop_series_with_repeated_index(): - # GH: #11094 - psr1 = pd.Series([1, 1], index=["a", "a"]) - psr2 = pd.Series([1], index=["a"]) - gsr1 = cudf.from_pandas(psr1) - gsr2 = cudf.from_pandas(psr2) - expected = psr1 - psr2 - got = gsr1 - gsr2 - assert_eq(expected, got) - - -def test_binop_integer_power_series_series(): - # GH: #10178 - gs_base = cudf.Series([3, -3, 8, -8]) - gs_exponent = cudf.Series([1, 1, 7, 7]) - ps_base = gs_base.to_pandas() - ps_exponent = gs_exponent.to_pandas() - expected = ps_base**ps_exponent - got = gs_base**gs_exponent - assert_eq(expected, got) - - -def test_binop_integer_power_series_scalar(): - # GH: #10178 - gs_base = cudf.Series([3, -3, 8, -8]) - exponent = cudf.Scalar(1) - ps_base = gs_base.to_pandas() - expected = ps_base**exponent.value - got = gs_base**exponent - assert_eq(expected, got) - - -def test_binop_integer_power_series_int(): - # GH: #10178 - gs_base = cudf.Series([3, -3, 8, -8]) - exponent = 1 - ps_base = gs_base.to_pandas() - expected = ps_base**exponent - got = gs_base**exponent - assert_eq(expected, got) - - -def test_binop_integer_power_scalar_series(): - # GH: #10178 - base = cudf.Scalar(3) - gs_exponent = cudf.Series([1, 1, 7, 7]) - ps_exponent = gs_exponent.to_pandas() - expected = base.value**ps_exponent - got = base**gs_exponent - assert_eq(expected, got) - - -def test_binop_integer_power_scalar_scalar(): - # GH: #10178 - base = cudf.Scalar(3) - exponent = cudf.Scalar(1) - expected = base.value**exponent.value - got = base**exponent - assert_eq(expected, got) - - -def test_binop_integer_power_scalar_int(): - # GH: #10178 - base = cudf.Scalar(3) - exponent = 1 - expected = 
base.value**exponent - got = base**exponent - assert_eq(expected, got) - - -def test_binop_integer_power_int_series(): - # GH: #10178 - base = 3 - gs_exponent = cudf.Series([1, 1, 7, 7]) - ps_exponent = gs_exponent.to_pandas() - expected = base**ps_exponent - got = base**gs_exponent - assert_eq(expected, got) - - -def test_binop_integer_power_int_scalar(): - # GH: #10178 - base = 3 - exponent = cudf.Scalar(1) - expected = base**exponent.value - got = base**exponent - assert_eq(expected, got) - - -def test_numpy_int_scalar_binop(): - assert (np.float32(1.0) - cudf.Scalar(1)) == 0.0 - - -@pytest.mark.parametrize("op", _binops) -def test_binop_index_series(op): - gi = cudf.Index([10, 11, 12]) - gs = cudf.Series([1, 2, 3]) - - actual = op(gi, gs) - expected = op(gi.to_pandas(), gs.to_pandas()) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("name1", utils.SERIES_OR_INDEX_NAMES) -@pytest.mark.parametrize("name2", utils.SERIES_OR_INDEX_NAMES) -def test_binop_index_dt_td_series_with_names(name1, name2): - gi = cudf.Index([1, 2, 3], dtype="datetime64[ns]", name=name1) - gs = cudf.Series([10, 11, 12], dtype="timedelta64[ns]", name=name2) - with warnings.catch_warnings(): - # Numpy raises a deprecation warning: - # "elementwise comparison failed; this will raise an error " - warnings.simplefilter("ignore", (DeprecationWarning,)) - - expected = gi.to_pandas() + gs.to_pandas() - actual = gi + gs - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data1", [[1, 2, 3], [10, 11, None]]) -@pytest.mark.parametrize("data2", [[1, 2, 3], [10, 11, None]]) -def test_binop_eq_ne_index_series(data1, data2): - gi = cudf.Index(data1, dtype="datetime64[ns]", name=np.nan) - gs = cudf.Series(data2, dtype="timedelta64[ns]", name="abc") - - actual = gi == gs - expected = gi.to_pandas() == gs.to_pandas() - - assert_eq(expected, actual) - - actual = gi != gs - expected = gi.to_pandas() != gs.to_pandas() - - assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_buffer.py b/python/cudf/cudf/tests/test_buffer.py deleted file mode 100644 index 03637e05eae..00000000000 --- a/python/cudf/cudf/tests/test_buffer.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import cupy as cp -import pytest - -from cudf.core.buffer import Buffer, as_buffer - -pytestmark = pytest.mark.spilling - -arr_len = 10 - - -@pytest.mark.parametrize( - "data", - [ - (cp.zeros(arr_len), True), - (cp.zeros((1, arr_len)), True), - (cp.zeros((1, arr_len, 1)), True), - (cp.zeros((arr_len, arr_len)), True), - (cp.zeros((arr_len, arr_len)).reshape(arr_len * arr_len), True), - (cp.zeros((arr_len, arr_len))[:, 0], False), - ], -) -def test_buffer_from_cuda_iface_contiguous(data): - data, expect_success = data - if expect_success: - as_buffer(data.view("|u1")) - else: - with pytest.raises(ValueError): - as_buffer(data.view("|u1")) - - -@pytest.mark.parametrize( - "data", - [ - cp.arange(arr_len), - cp.arange(arr_len).reshape(1, arr_len), - cp.arange(arr_len).reshape(1, arr_len, 1), - cp.arange(arr_len**2).reshape(arr_len, arr_len), - ], -) -@pytest.mark.parametrize("dtype", ["uint8", "int8", "float32", "int32"]) -def test_buffer_from_cuda_iface_dtype(data, dtype): - data = data.astype(dtype) - buf = as_buffer(data) - got = cp.array(buf).reshape(-1).view("uint8") - expect = data.reshape(-1).view("uint8") - assert (expect == got).all() - - -def test_buffer_creation_from_any(): - ary = cp.arange(arr_len) - b = as_buffer(ary, exposed=True) - assert isinstance(b, Buffer) - assert ary.data.ptr == b.get_ptr(mode="read") - assert ary.nbytes == b.size - - with pytest.raises( - ValueError, match="size must be specified when `data` is an integer" - ): - as_buffer(ary.data.ptr) - - b = as_buffer(ary.data.ptr, size=ary.nbytes, owner=ary, exposed=True) - assert isinstance(b, Buffer) - assert ary.data.ptr == b.get_ptr(mode="read") - assert ary.nbytes == b.size - assert b.owner.owner.owner is ary - - -@pytest.mark.parametrize("size", [10, 2**10 + 500, 2**20]) -def test_buffer_str(size): - ary = cp.arange(size, dtype="uint8") - buf = as_buffer(ary) - assert f"size={size}" in repr(buf) - - -@pytest.mark.parametrize( - "size,expect", [(10, "10B"), (2**10 + 500, "1.49KiB"), (2**20, "1MiB")] -) -def test_buffer_repr(size, expect): - ary = cp.arange(size, dtype="uint8") - buf = as_buffer(ary) - assert f"size={expect}" in str(buf) - - -@pytest.mark.parametrize( - "idx", - [ - slice(0, 0), - slice(0, 1), - slice(-2, -1), - slice(0, arr_len), - slice(2, 3), - slice(2, -1), - ], -) -def test_buffer_slice(idx): - ary = cp.arange(arr_len, dtype="uint8") - buf = as_buffer(ary) - expect = ary[idx] - got = cp.array(buf[idx]) - assert (expect == got).all() - - -@pytest.mark.parametrize( - "idx, err_type, err_msg", - [ - (1, TypeError, "Argument 'key' has incorrect type"), - (slice(3, 2), ValueError, "size cannot be negative"), - (slice(1, 2, 2), ValueError, "slice must be C-contiguous"), - (slice(1, 2, -1), ValueError, "slice must be C-contiguous"), - (slice(3, 2, -1), ValueError, "slice must be C-contiguous"), - ], -) -def test_buffer_slice_fail(idx, err_type, err_msg): - ary = cp.arange(arr_len, dtype="uint8") - buf = as_buffer(ary) - - with pytest.raises(err_type, match=err_msg): - buf[idx] diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py deleted file mode 100644 index cd1ad21ae59..00000000000 --- a/python/cudf/cudf/tests/test_categorical.py +++ /dev/null @@ -1,954 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
-
-import operator
-import string
-import warnings
-from contextlib import contextmanager
-from textwrap import dedent
-
-import numpy as np
-import pandas as pd
-import pytest
-
-import cudf
-from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing import assert_eq
-from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal
-
-
-@contextmanager
-def _hide_cudf_safe_casting_warning():
-    with warnings.catch_warnings():
-        warnings.filterwarnings(
-            "ignore",
-            "Can't safely cast column",
-            category=UserWarning,
-        )
-        yield
-
-
-@pytest.fixture
-def pd_str_cat():
-    categories = list("abc")
-    codes = [0, 0, 1, 0, 1, 2, 0, 1, 1, 2]
-    return pd.Categorical.from_codes(codes, categories=categories)
-
-
-def test_categorical_basic():
-    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
-    cudf_cat = cudf.Index(cat)
-
-    pdsr = pd.Series(cat, index=["p", "q", "r", "s", "t"])
-    sr = cudf.Series(cat, index=["p", "q", "r", "s", "t"])
-    assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False)
-
-    # Test attributes
-    assert_eq(pdsr.cat.categories, sr.cat.categories)
-    assert pdsr.cat.ordered == sr.cat.ordered
-
-    np.testing.assert_array_equal(
-        pdsr.cat.codes.values, sr.cat.codes.to_numpy()
-    )
-
-    string = str(sr)
-    expect_str = """
-p a
-q a
-r b
-s c
-t a
-"""
-    assert all(x == y for x, y in zip(string.split(), expect_str.split()))
-    assert_eq(cat.codes, cudf_cat.codes.to_numpy())
-
-
-def test_categorical_integer():
-    cat = pd.Categorical(["a", "_", "_", "c", "a"], categories=["a", "b", "c"])
-    pdsr = pd.Series(cat)
-    sr = cudf.Series(cat)
-    np.testing.assert_array_equal(
-        cat.codes, sr.cat.codes.astype(cat.codes.dtype).fillna(-1).to_numpy()
-    )
-    assert sr.null_count == 2
-
-    np.testing.assert_array_equal(
-        pdsr.cat.codes.values,
-        sr.cat.codes.astype(pdsr.cat.codes.dtype).fillna(-1).to_numpy(),
-    )
-
-    expect_str = dedent(
-        """\
-        0       a
-        1    <NA>
-        2    <NA>
-        3       c
-        4       a
-        dtype: category
-        Categories (3, object): ['a', 'b', 'c']"""
-    )
-    assert str(sr) == expect_str
-
-
-def test_categorical_compare_unordered():
-    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
-    pdsr = pd.Series(cat)
-
-    sr = cudf.Series(cat)
-
-    # test equal
-    out = sr == sr
-    assert out.dtype == np.bool_
-    assert type(out[0]) == np.bool_
-    assert np.all(out.to_numpy())
-    assert np.all(pdsr == pdsr)
-
-    # test inequality
-    out = sr != sr
-    assert not np.any(out.to_numpy())
-    assert not np.any(pdsr != pdsr)
-
-    assert not pdsr.cat.ordered
-    assert not sr.cat.ordered
-
-    # test using ordered operators
-    assert_exceptions_equal(
-        lfunc=operator.lt,
-        rfunc=operator.lt,
-        lfunc_args_and_kwargs=([pdsr, pdsr],),
-        rfunc_args_and_kwargs=([sr, sr],),
-    )
-
-
-def test_categorical_compare_ordered():
-    cat1 = pd.Categorical(
-        ["a", "a", "b", "c", "a"], categories=["a", "b", "c"], ordered=True
-    )
-    pdsr1 = pd.Series(cat1)
-    sr1 = cudf.Series(cat1)
-    cat2 = pd.Categorical(
-        ["a", "b", "a", "c", "b"], categories=["a", "b", "c"], ordered=True
-    )
-    pdsr2 = pd.Series(cat2)
-    sr2 = cudf.Series(cat2)
-
-    # test equal
-    out = sr1 == sr1
-    assert out.dtype == np.bool_
-    assert type(out[0]) == np.bool_
-    assert np.all(out.to_numpy())
-    assert np.all(pdsr1 == pdsr1)
-
-    # test inequality
-    out = sr1 != sr1
-    assert not np.any(out.to_numpy())
-    assert not np.any(pdsr1 != pdsr1)
-
-    assert pdsr1.cat.ordered
-    assert sr1.cat.ordered
-
-    # test using ordered operators
-    np.testing.assert_array_equal(pdsr1 < pdsr2, (sr1 < sr2).to_numpy())
-    
np.testing.assert_array_equal(pdsr1 > pdsr2, (sr1 > sr2).to_numpy()) - - -def test_categorical_binary_add(): - cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - pdsr = pd.Series(cat) - sr = cudf.Series(cat) - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([pdsr, pdsr],), - rfunc_args_and_kwargs=([sr, sr],), - ) - - -def test_categorical_element_indexing(): - """ - Element indexing to a cat column must give the underlying object - not the numerical index. - """ - cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - pdsr = pd.Series(cat) - sr = cudf.Series(cat) - assert_eq(pdsr, sr) - assert_eq(pdsr.cat.codes, sr.cat.codes, check_dtype=False) - - -def test_categorical_masking(): - """ - Test common operation for getting a all rows that matches a certain - category. - """ - cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - pdsr = pd.Series(cat) - sr = cudf.Series(cat) - - # check scalar comparison - expect_matches = pdsr == "a" - got_matches = sr == "a" - - np.testing.assert_array_equal( - expect_matches.values, got_matches.to_numpy() - ) - - # mask series - expect_masked = pdsr[expect_matches] - got_masked = sr[got_matches] - - assert len(expect_masked) == len(got_masked) - assert got_masked.null_count == 0 - assert_eq(got_masked, expect_masked) - - -def test_df_cat_set_index(): - df = cudf.DataFrame() - df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) - df["b"] = np.arange(len(df)) - got = df.set_index("a") - - pddf = df.to_pandas() - expect = pddf.set_index("a") - - assert_eq(got, expect) - - -def test_df_cat_sort_index(): - df = cudf.DataFrame() - df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) - df["b"] = np.arange(len(df)) - - got = df.set_index("a").sort_index() - expect = df.to_pandas().set_index("a").sort_index() - - assert_eq(got, expect) - - -def test_cat_series_binop_error(): - df = cudf.DataFrame() - df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) - df["b"] = np.arange(len(df)) - - pdf = df.to_pandas() - - # lhs is categorical - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([pdf["a"], pdf["b"]],), - rfunc_args_and_kwargs=([df["a"], df["b"]],), - ) - - # lhs is numerical - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([pdf["b"], pdf["a"]],), - rfunc_args_and_kwargs=([df["b"], df["a"]],), - ) - - -@pytest.mark.parametrize("num_elements", [10, 100, 1000]) -def test_categorical_unique(num_elements): - # create categorical series - np.random.seed(12) - pd_cat = pd.Categorical( - pd.Series( - np.random.choice( - list(string.ascii_letters + string.digits), num_elements - ), - dtype="category", - ) - ) - - # gdf - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series.from_categorical(pd_cat) - gdf_unique_sorted = np.sort(gdf["a"].unique().to_pandas()) - - # pandas - pdf = pd.DataFrame() - pdf["a"] = pd_cat - pdf_unique_sorted = np.sort(pdf["a"].unique()) - - # verify - np.testing.assert_array_equal(pdf_unique_sorted, gdf_unique_sorted) - - -@pytest.mark.parametrize("nelem", [20, 50, 100]) -def test_categorical_unique_count(nelem): - # create categorical series - np.random.seed(12) - pd_cat = pd.Categorical( - pd.Series( - np.random.choice( - list(string.ascii_letters + string.digits), nelem - ), - dtype="category", - ) - ) - - # gdf - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series.from_categorical(pd_cat) - 
gdf_unique_count = gdf["a"].nunique() - - # pandas - pdf = pd.DataFrame() - pdf["a"] = pd_cat - pdf_unique = pdf["a"].unique() - - # verify - assert gdf_unique_count == len(pdf_unique) - - -def test_categorical_empty(): - cat = pd.Categorical([]) - pdsr = pd.Series(cat) - sr = cudf.Series(cat) - np.testing.assert_array_equal(cat.codes, sr.cat.codes.to_numpy()) - - # Test attributes - assert_eq(pdsr.cat.categories, sr.cat.categories) - assert pdsr.cat.ordered == sr.cat.ordered - - np.testing.assert_array_equal( - pdsr.cat.codes.values, sr.cat.codes.to_numpy() - ) - - -def test_categorical_set_categories(): - cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"]) - psr = pd.Series(cat) - sr = cudf.Series.from_categorical(cat) - - # adding category - expect = psr.cat.set_categories(["a", "b", "c", "d"]) - got = sr.cat.set_categories(["a", "b", "c", "d"]) - assert_eq(expect, got) - - # removing category - expect = psr.cat.set_categories(["a", "b"]) - got = sr.cat.set_categories(["a", "b"]) - assert_eq(expect, got) - - -def test_categorical_set_categories_preserves_order(): - series = pd.Series([1, 0, 0, 0, 2]).astype("category") - # reassigning categories should preserve element ordering - assert_eq( - series.cat.set_categories([1, 2]), - cudf.Series(series).cat.set_categories([1, 2]), - ) - - -def test_categorical_as_ordered(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(False)) - cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(False)) - - assert cd_sr.cat.ordered is False - assert cd_sr.cat.ordered == pd_sr.cat.ordered - - pd_sr_1 = pd_sr.cat.as_ordered() - cd_sr_1 = cd_sr.cat.as_ordered() - - assert cd_sr_1.cat.ordered is True - assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered - assert str(cd_sr_1) == str(pd_sr_1) - - -def test_categorical_as_unordered(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(True)) - cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(True)) - - assert cd_sr.cat.ordered is True - assert cd_sr.cat.ordered == pd_sr.cat.ordered - - pd_sr_1 = pd_sr.cat.as_unordered() - cd_sr_1 = cd_sr.cat.as_unordered() - - assert cd_sr_1.cat.ordered is False - assert cd_sr_1.cat.ordered == pd_sr_1.cat.ordered - assert str(cd_sr_1) == str(pd_sr_1) - - -@pytest.mark.parametrize("from_ordered", [True, False]) -@pytest.mark.parametrize("to_ordered", [True, False]) -def test_categorical_reorder_categories(pd_str_cat, from_ordered, to_ordered): - pd_sr = pd.Series(pd_str_cat.copy().set_ordered(from_ordered)) - cd_sr = cudf.Series(pd_str_cat.copy().set_ordered(from_ordered)) - - assert_eq(pd_sr, cd_sr) - - assert str(pd_sr) == str(cd_sr) - - kwargs = dict( - ordered=to_ordered, - ) - - pd_sr_1 = pd_sr.cat.reorder_categories(list("cba"), **kwargs) - cd_sr_1 = cd_sr.cat.reorder_categories(list("cba"), **kwargs) - - assert_eq(pd_sr_1, cd_sr_1) - - assert str(cd_sr_1) == str(pd_sr_1) - - -def test_categorical_add_categories(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy()) - cd_sr = cudf.Series(pd_str_cat.copy()) - - assert_eq(pd_sr, cd_sr) - - assert str(pd_sr) == str(cd_sr) - - pd_sr_1 = pd_sr.cat.add_categories(["d"]) - cd_sr_1 = cd_sr.cat.add_categories(["d"]) - - assert "d" in pd_sr_1.cat.categories.to_list() - assert "d" in cd_sr_1.cat.categories.to_pandas().to_list() - - assert_eq(pd_sr_1, cd_sr_1) - - -def test_categorical_remove_categories(pd_str_cat): - pd_sr = pd.Series(pd_str_cat.copy()) - cd_sr = cudf.Series(pd_str_cat.copy()) - - assert_eq(pd_sr, cd_sr) - - assert str(pd_sr) == str(cd_sr) - - pd_sr_1 = 
pd_sr.cat.remove_categories(["a"]) - cd_sr_1 = cd_sr.cat.remove_categories(["a"]) - - assert "a" not in pd_sr_1.cat.categories.to_list() - assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list() - - assert_eq(pd_sr_1, cd_sr_1) - - # test using ordered operators - assert_exceptions_equal( - lfunc=cd_sr.to_pandas().cat.remove_categories, - rfunc=cd_sr.cat.remove_categories, - lfunc_args_and_kwargs=([["a", "d"]], {}), - rfunc_args_and_kwargs=([["a", "d"]], {}), - ) - - -def test_categorical_dataframe_slice_copy(): - pdf = pd.DataFrame({"g": pd.Series(["a", "b", "z"], dtype="category")}) - gdf = cudf.from_pandas(pdf) - - exp = pdf[1:].copy() - gdf = gdf[1:].copy() - - assert_eq(exp, gdf) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([1, 2, 3, 89]), - pd.Series([1, 2, 3, 89, 3, 1, 89], dtype="category"), - pd.Series(["1", "2", "3", "4", "5"], dtype="category"), - pd.Series(["1.0", "2.5", "3.001", "9"], dtype="category"), - pd.Series(["1", "2", "3", None, "4", "5"], dtype="category"), - pd.Series(["1.0", "2.5", "3.001", None, "9"], dtype="category"), - pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]), - pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]), - pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"), - pd.Series([1, 2, 3, 89], dtype="float64"), - pd.Series([1, 2.5, 3.001, 89], dtype="float64"), - pd.Series([None, None, None]), - pd.Series([], dtype="float64"), - ], -) -@pytest.mark.parametrize( - "cat_type", - [ - pd.CategoricalDtype(categories=["aa", "bb", "cc"]), - pd.CategoricalDtype(categories=[2, 4, 10, 100]), - pd.CategoricalDtype(categories=["aa", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "b", "c"]), - pd.CategoricalDtype(categories=["1", "2", "3", "4"]), - pd.CategoricalDtype(categories=["1.0", "2.5", "3.001", "9"]), - pd.CategoricalDtype(categories=[]), - ], -) -def test_categorical_typecast(data, cat_type): - pd_data = data.copy() - gd_data = cudf.from_pandas(data) - - assert_eq(pd_data.astype(cat_type), gd_data.astype(cat_type)) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([1, 2, 3, 89]), - pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]), - pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]), - pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"), - pd.Series([1, 2, 3, 89], dtype="float64"), - pd.Series([1, 2.5, 3.001, 89], dtype="float64"), - pd.Series([None, None, None]), - pd.Series([], dtype="float64"), - ], -) -@pytest.mark.parametrize( - "new_categories", - [ - ["aa", "bb", "cc"], - [2, 4, 10, 100], - ["aa", "bb", "c"], - ["a", "bb", "c"], - ["a", "b", "c"], - [], - pd.Series(["a", "b", "c"]), - pd.Series(["a", "b", "c"], dtype="category"), - pd.Series([-100, 10, 11, 0, 1, 2], dtype="category"), - ], -) -def test_categorical_set_categories_categoricals(data, new_categories): - pd_data = data.copy().astype("category") - gd_data = cudf.from_pandas(pd_data) - - expected = pd_data.cat.set_categories(new_categories=new_categories) - with _hide_cudf_safe_casting_warning(): - actual = gd_data.cat.set_categories(new_categories=new_categories) - - assert_eq(expected, actual) - - expected = pd_data.cat.set_categories( - new_categories=pd.Series(new_categories, dtype="category") - ) - with _hide_cudf_safe_casting_warning(): - actual = gd_data.cat.set_categories( - new_categories=cudf.Series(new_categories, dtype="category") - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4], - ["a", "1", "2", "1", 
"a"], - pd.Series(["a", "1", "22", "1", "aa"]), - pd.Series(["a", "1", "22", "1", "aa"], dtype="category"), - pd.Series([1, 2, 3, -4], dtype="int64"), - pd.Series([1, 2, 3, 4], dtype="uint64"), - pd.Series([1, 2.3, 3, 4], dtype="float"), - np.asarray([0, 2, 1]), - [None, 1, None, 2, None], - [], - ], -) -@pytest.mark.parametrize( - "dtype", - [ - pd.CategoricalDtype(categories=["aa", "bb", "cc"]), - pd.CategoricalDtype(categories=[2, 4, 10, 100]), - pd.CategoricalDtype(categories=["aa", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "b", "c"]), - pd.CategoricalDtype(categories=["22", "b", "c"]), - pd.CategoricalDtype(categories=[]), - ], -) -def test_categorical_creation(data, dtype): - expected = pd.Series(data, dtype=dtype) - got = cudf.Series(data, dtype=dtype) - assert_eq(expected, got) - - got = cudf.Series(data, dtype=cudf.from_pandas(dtype)) - assert_eq(expected, got) - - expected = pd.Series(data, dtype="category") - got = cudf.Series(data, dtype="category") - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "categories", - [ - [], - [1, 2, 3], - pd.Series(["a", "c", "b"], dtype="category"), - pd.Series([1, 2, 3, 4, -100], dtype="category"), - ], -) -@pytest.mark.parametrize("ordered", [True, False]) -def test_categorical_dtype(categories, ordered): - expected = pd.CategoricalDtype(categories=categories, ordered=ordered) - got = cudf.CategoricalDtype(categories=categories, ordered=ordered) - assert_eq(expected, got) - - expected = pd.CategoricalDtype(categories=categories) - got = cudf.CategoricalDtype(categories=categories) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - ("data", "expected"), - [ - (cudf.Series([1]), np.uint8), - (cudf.Series([1, None]), np.uint8), - (cudf.Series(np.arange(np.iinfo(np.int8).max)), np.uint8), - ( - cudf.Series(np.append(np.arange(np.iinfo(np.int8).max), [None])), - np.uint8, - ), - (cudf.Series(np.arange(np.iinfo(np.int16).max)), np.uint16), - ( - cudf.Series(np.append(np.arange(np.iinfo(np.int16).max), [None])), - np.uint16, - ), - (cudf.Series(np.arange(np.iinfo(np.uint8).max)), np.uint8), - ( - cudf.Series(np.append(np.arange(np.iinfo(np.uint8).max), [None])), - np.uint8, - ), - (cudf.Series(np.arange(np.iinfo(np.uint16).max)), np.uint16), - ( - cudf.Series(np.append(np.arange(np.iinfo(np.uint16).max), [None])), - np.uint16, - ), - ], -) -def test_astype_dtype(data, expected): - got = data.astype("category").cat.codes.dtype - np.testing.assert_equal(got, expected) - - -@pytest.mark.parametrize( - "data,add", - [ - ([1, 2, 3], [100, 11, 12]), - ([1, 2, 3], [0.01, 9.7, 15.0]), - ([0.0, 6.7, 10.0], [100, 11, 12]), - ([0.0, 6.7, 10.0], [0.01, 9.7, 15.0]), - (["a", "bd", "ef"], ["asdfsdf", "bddf", "eff"]), - ([1, 2, 3], []), - ([0.0, 6.7, 10.0], []), - (["a", "bd", "ef"], []), - ], -) -def test_add_categories(data, add): - pds = pd.Series(data, dtype="category") - gds = cudf.Series(data, dtype="category") - - expected = pds.cat.add_categories(add) - with _hide_cudf_safe_casting_warning(): - actual = gds.cat.add_categories(add) - - assert_eq( - expected.cat.codes, actual.cat.codes.astype(expected.cat.codes.dtype) - ) - - # Need to type-cast pandas object to str due to mixed-type - # support in "object" - assert_eq( - expected.cat.categories.astype("str") - if (expected.cat.categories.dtype == "object") - else expected.cat.categories, - actual.cat.categories, - ) - - -@pytest.mark.parametrize( - "data,add", - [ - ([1, 2, 3], [1, 3, 11]), - ([0.0, 6.7, 10.0], [1, 2, 0.0]), - 
(["a", "bd", "ef"], ["a", "bd", "a"]), - ], -) -def test_add_categories_error(data, add): - pds = pd.Series(data, dtype="category") - gds = cudf.Series(data, dtype="category") - - assert_exceptions_equal( - pds.cat.add_categories, - gds.cat.add_categories, - ([add],), - ([add],), - ) - - -def test_add_categories_mixed_error(): - gds = cudf.Series(["a", "bd", "ef"], dtype="category") - - with pytest.raises(TypeError): - gds.cat.add_categories([1, 2, 3]) - - gds = cudf.Series([1, 2, 3], dtype="category") - - with pytest.raises(TypeError): - gds.cat.add_categories(["a", "bd", "ef"]) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4], - ["a", "1", "2", "1", "a"], - pd.Series(["a", "1", "22", "1", "aa"]), - pd.Series(["a", "1", "22", "1", "aa"], dtype="category"), - pd.Series([1, 2, 3, 4], dtype="int64"), - pd.Series([1, 2.3, 3, 4], dtype="float"), - [None, 1, None, 2, None], - ["a"], - ], -) -@pytest.mark.parametrize( - "cat_dtype", - [ - pd.CategoricalDtype(categories=["aa", "bb", "cc"]), - pd.CategoricalDtype(categories=[2, 4, 10, 100]), - pd.CategoricalDtype(categories=["aa", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "bb", "c"]), - pd.CategoricalDtype(categories=["a", "b", "c"]), - pd.CategoricalDtype(categories=["22", "b", "c"]), - pd.CategoricalDtype(categories=["a"]), - ], -) -def test_categorical_assignment(data, cat_dtype): - pd_df = pd.DataFrame() - pd_df["a"] = np.ones(len(data)) - cd_df = cudf.from_pandas(pd_df) - - pd_cat_series = pd.Series(data, dtype=cat_dtype) - # assign categorical series - pd_df.assign(cat_col=pd_cat_series) - cd_df.assign(cat_col=pd_cat_series) - assert_eq(pd_df, cd_df) - - # assign categorical array - # needed for dask_cudf support for including file name - # as a categorical column - # see issue: https://github.com/rapidsai/cudf/issues/2269 - pd_df = pd.DataFrame() - pd_df["a"] = np.ones(len(data)) - cd_df = cudf.from_pandas(pd_df) - - pd_categorical = pd.Categorical(data, dtype=cat_dtype) - pd_df.assign(cat_col=pd_categorical) - cd_df.assign(cat_col=pd_categorical) - assert_eq(pd_df, cd_df) - - -def test_categorical_allow_nan(): - gs = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False) - gs = gs.astype("category") - expected_codes = cudf.Series([0, 1, 3, 2, 3, None], dtype="uint8") - assert_eq(expected_codes, gs.cat.codes) - - expected_categories = cudf.Index([1.0, 2.0, 10.0, np.nan], dtype="float64") - assert_eq(expected_categories, gs.cat.categories) - - actual_ps = gs.to_pandas() - expected_ps = pd.Series( - [1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category" - ) - assert_eq(actual_ps, expected_ps) - - -def test_categorical_setitem_with_nan(): - gs = cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") - gs[[1, 3]] = np.nan - - expected_series = cudf.Series( - [1, np.nan, np.nan, np.nan, np.nan, None], nan_as_null=False - ).astype(gs.dtype) - assert_eq(gs, expected_series) - - -@pytest.mark.parametrize("dtype", list(NUMERIC_TYPES) + ["object"]) -@pytest.mark.parametrize("input_obj", [[1, cudf.NA, 3]]) -def test_series_construction_with_nulls(input_obj, dtype): - dtype = cudf.dtype(dtype) - input_obj = [ - dtype.type(v) if v is not cudf.NA else cudf.NA for v in input_obj - ] - - expect = pd.Series(input_obj, dtype="category") - got = cudf.Series(input_obj, dtype="category").to_pandas() - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - {"a": cudf.Series(["a", "b", "c", "a", "c", "b"]).astype("category")}, - { - "a": cudf.Series(["a", "a", "b", 
"b"]).astype("category"), - "b": cudf.Series(["b", "b", "c", "c"]).astype("category"), - "c": cudf.Series(["c", "c", "a", "a"]).astype("category"), - }, - { - "a": cudf.Series(["a", None, "b", "b"]).astype("category"), - "b": cudf.Series(["b", "b", None, "c"]).astype("category"), - "c": cudf.Series(["c", "c", "a", None]).astype("category"), - }, - ], -) -def test_serialize_categorical_columns(data): - df = cudf.DataFrame(data) - recreated = df.__class__.deserialize(*df.serialize()) - assert_eq(recreated, df) - - -@pytest.mark.parametrize( - "data", [["$ 1", "$ 2", "hello"], ["($) 1", "( 2", "hello", "^1$"]] -) -@pytest.mark.parametrize("value", ["$ 1", "hello", "$", "^1$"]) -def test_categorical_string_index_contains(data, value): - idx = cudf.CategoricalIndex(data) - pidx = idx.to_pandas() - - assert_eq(value in idx, value in pidx) - - -def test_categorical_index_with_dtype(): - dtype = cudf.CategoricalDtype(categories=["a", "z", "c"]) - gi = cudf.Index(["z", "c", "a"], dtype=dtype) - pi = pd.Index(["z", "c", "a"], dtype=dtype.to_pandas()) - - assert_eq(gi, pi) - assert_eq(gi.dtype, pi.dtype) - assert_eq(gi.dtype.categories, pi.dtype.categories) - - -def test_cat_iterate_error(): - s = cudf.Series([1, 2, 3], dtype="category") - with pytest.raises(TypeError): - iter(s.cat) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_empty_series_category_cast(ordered): - dtype = cudf.CategoricalDtype(ordered=ordered) - ps = pd.Series([], dtype="str") - gs = cudf.from_pandas(ps) - - expected = ps.astype(dtype.to_pandas()) - actual = gs.astype(dtype) - - assert_eq(expected, actual) - assert_eq(expected.dtype.ordered, actual.dtype.ordered) - - -def test_categorical_dtype_ordered_not_settable(): - with pytest.raises(AttributeError): - cudf.CategoricalDtype().ordered = False - - -@pytest.mark.parametrize("scalar", [1, "a", None, 10.2]) -def test_cat_from_scalar(scalar): - ps = pd.Series(scalar, dtype="category") - gs = cudf.Series(scalar, dtype="category") - - assert_eq(ps, gs) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Does not warn on older versions of pandas", -) -def test_cat_groupby_fillna(): - ps = pd.Series(["a", "b", "c"], dtype="category") - gs = cudf.from_pandas(ps) - - with pytest.warns(FutureWarning): - pg = ps.groupby(ps) - gg = gs.groupby(gs) - - assert_exceptions_equal( - lfunc=pg.fillna, - rfunc=gg.fillna, - lfunc_args_and_kwargs=(("d",), {}), - rfunc_args_and_kwargs=(("d",), {}), - ) - - -@pytest.mark.parametrize("op", ["min", "max"]) -def test_categorical_maxima(op): - ser = cudf.Series( - ["a", "d", "c", "z", "g"], - dtype=cudf.CategoricalDtype(["z", "c", "g", "d", "a"], ordered=False), - ) - assert not ser.cat.ordered - - # Cannot get extrema of unordered Categorical column - with pytest.raises(TypeError, match="Categorical is not ordered"): - getattr(ser, op)() - - # Max/min should work after converting to "ordered" - ser_pd = ser.to_pandas() - result = getattr(ser.cat.as_ordered(), op)() - result_pd = getattr(ser_pd.cat.as_ordered(), op)() - assert_eq(result, result_pd) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_index_ordered(ordered): - pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) - cudf_ci = cudf.from_pandas(pd_ci) - assert pd_ci.ordered == cudf_ci.ordered - - -@pytest.mark.parametrize("method", ["as_ordered", "as_unordered"]) -@pytest.mark.parametrize("ordered", [True, False]) -def test_index_as_ordered(method, ordered): - pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered) - 
cudf_ci = cudf.from_pandas(pd_ci) - - expected = getattr(pd_ci, method)() - result = getattr(cudf_ci, method)() - assert_eq(result, expected) - - -def test_index_add_categories(): - pd_ci = pd.CategoricalIndex([1, 2, 3]) - cudf_ci = cudf.from_pandas(pd_ci) - - expected = pd_ci.add_categories([4]) - result = cudf_ci.add_categories([4]) - assert_eq(result, expected) - - -def test_index_remove_categories(): - pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4]) - cudf_ci = cudf.from_pandas(pd_ci) - - expected = pd_ci.remove_categories([4]) - result = cudf_ci.remove_categories([4]) - assert_eq(result, expected) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_index_reorder_categories(ordered): - pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 3, 2, 4]) - cudf_ci = cudf.from_pandas(pd_ci) - - expected = pd_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) - result = cudf_ci.reorder_categories([1, 2, 3, 4], ordered=ordered) - assert_eq(result, expected) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_index_set_categories(ordered): - pd_ci = pd.CategoricalIndex([1, 2, 3]) - cudf_ci = cudf.from_pandas(pd_ci) - - expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered) - result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered) - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py deleted file mode 100644 index 4aa7fb27c9b..00000000000 --- a/python/cudf/cudf/tests/test_column.py +++ /dev/null @@ -1,558 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import cupy as cp -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf._lib.transform import mask_to_bools -from cudf.core.column.column import as_column -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal -from cudf.utils import dtypes as dtypeutils - -dtypes = sorted( - list( - dtypeutils.ALL_TYPES - - { - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "timedelta64[s]", - "timedelta64[ms]", - "timedelta64[us]", - } - ) -) - - -@pytest.fixture(params=dtypes, ids=dtypes) -def pandas_input(request): - dtype = request.param - rng = np.random.default_rng() - size = 100 - - def random_ints(dtype, size): - dtype_min = np.iinfo(dtype).min - dtype_max = np.iinfo(dtype).max - return rng.integers(dtype_min, dtype_max, size=size, dtype=dtype) - - try: - dtype = np.dtype(dtype) - except TypeError: - if dtype == "category": - data = random_ints(np.int64, size) - else: - raise - else: - if dtype.kind == "b": - data = rng.choice([False, True], size=size) - elif dtype.kind in ("m", "M"): - # datetime or timedelta - data = random_ints(np.int64, size) - elif dtype.kind == "U": - # Unicode strings of integers like "12345" - data = random_ints(np.int64, size).astype(dtype.str) - elif dtype.kind == "f": - # floats in [0.0, 1.0) - data = rng.random(size=size, dtype=dtype) - else: - data = random_ints(dtype, size) - return pd.Series(data, dtype=dtype) - - -def str_host_view(list_of_str, to_dtype): - return np.concatenate( - [np.frombuffer(s.encode("utf-8"), dtype=to_dtype) for s in list_of_str] - ) - - -@pytest.mark.parametrize("offset", [0, 1, 15]) -@pytest.mark.parametrize("size", [50, 10, 0]) -def test_column_offset_and_size(pandas_input, offset, size): - col = cudf.core.column.as_column(pandas_input) - col = cudf.core.column.build_column( - data=col.base_data, - dtype=col.dtype, - mask=col.base_mask, - size=size, - 
offset=offset, - children=col.base_children, - ) - - if isinstance(col.dtype, cudf.CategoricalDtype): - assert col.size == col.codes.size - assert col.size == (col.codes.data.size / col.codes.dtype.itemsize) - elif cudf.api.types.is_string_dtype(col.dtype): - if col.size > 0: - assert col.size == (col.children[0].size - 1) - assert col.size == ( - (col.children[0].data.size / col.children[0].dtype.itemsize) - - 1 - ) - else: - assert col.size == (col.data.size / col.dtype.itemsize) - - got = cudf.Series._from_column(col) - - if offset is None: - offset = 0 - if size is None: - size = 100 - else: - size = size + offset - - slicer = slice(offset, size) - expect = pandas_input.iloc[slicer].reset_index(drop=True) - - assert_eq(expect, got) - - -def column_slicing_test(col, offset, size, cast_to_float=False): - col_slice = col.slice(offset, offset + size) - series = cudf.Series._from_column(col) - sliced_series = cudf.Series._from_column(col_slice) - - if cast_to_float: - pd_series = series.astype(float).to_pandas() - sliced_series = sliced_series.astype(float) - else: - pd_series = series.to_pandas() - - if isinstance(col.dtype, cudf.CategoricalDtype): - # The cudf.Series is constructed from an already sliced column, whereas - # the pandas.Series is constructed from the unsliced series and then - # sliced, so the indexes should be different and we must ignore it. - # However, we must compare these as frames, not raw arrays, because - # numpy comparison of categorical values won't work. - assert_eq( - pd_series[offset : offset + size].reset_index(drop=True), - sliced_series.reset_index(drop=True), - ) - else: - assert_eq( - np.asarray(pd_series[offset : offset + size]), - sliced_series.to_numpy(), - ) - - -@pytest.mark.parametrize("offset", [0, 1, 15]) -@pytest.mark.parametrize("size", [50, 10, 0]) -def test_column_slicing(pandas_input, offset, size): - col = cudf.core.column.as_column(pandas_input) - column_slicing_test(col, offset, size) - - -@pytest.mark.parametrize("offset", [0, 1, 15]) -@pytest.mark.parametrize("size", [50, 10, 0]) -@pytest.mark.parametrize("precision", [2, 3, 5]) -@pytest.mark.parametrize("scale", [0, 1, 2]) -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype], -) -def test_decimal_column_slicing(offset, size, precision, scale, decimal_type): - col = cudf.core.column.as_column(pd.Series(np.random.rand(1000))) - col = col.astype(decimal_type(precision, scale)) - column_slicing_test(col, offset, size, True) - - -@pytest.mark.parametrize( - "data", - [ - np.array([[23, 68, 2, 38, 9, 83, 72, 6, 98, 30]]), - np.array([[1, 2], [7, 6]]), - ], -) -def test_column_series_multi_dim(data): - with pytest.raises(ValueError): - cudf.Series(data) - - with pytest.raises(ValueError): - cudf.core.column.as_column(data) - - -@pytest.mark.parametrize( - ("data", "error"), - [ - ([1, "1.0", "2", -3], cudf.errors.MixedTypeError), - ([np.nan, 0, "null", cp.nan], cudf.errors.MixedTypeError), - ( - [np.int32(4), np.float64(1.5), np.float32(1.290994), np.int8(0)], - None, - ), - ], -) -def test_column_mixed_dtype(data, error): - if error is None: - cudf.Series(data) - else: - with pytest.raises(error): - cudf.Series(data) - - -@pytest.mark.parametrize("nan_as_null", [True, False]) -@pytest.mark.parametrize( - "scalar", - [np.nan, pd.Timedelta(days=1), pd.Timestamp(2020, 1, 1)], - ids=repr, -) -@pytest.mark.parametrize("size", [1, 10]) -def test_as_column_scalar_with_nan(nan_as_null, scalar, size): - expected = ( - cudf.Series([scalar] * 
size, nan_as_null=nan_as_null) - .dropna() - .to_numpy() - ) - - got = ( - cudf.Series._from_column( - as_column(scalar, length=size, nan_as_null=nan_as_null) - ) - .dropna() - .to_numpy() - ) - - np.testing.assert_equal(expected, got) - - -@pytest.mark.parametrize("data", [[1.1, 2.2, 3.3, 4.4], [1, 2, 3, 4]]) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) -def test_column_series_cuda_array_dtype(data, dtype): - psr = pd.Series(np.asarray(data), dtype=dtype) - sr = cudf.Series(cp.asarray(data), dtype=dtype) - - assert_eq(psr, sr) - - psr = pd.Series(data, dtype=dtype) - sr = cudf.Series(data, dtype=dtype) - - assert_eq(psr, sr) - - -def test_column_zero_length_slice(): - # see https://github.com/rapidsai/cudf/pull/4777 - from numba import cuda - - x = cudf.DataFrame({"a": [1]}) - the_column = x[1:]["a"]._column - - expect = np.array([], dtype="int8") - got = cuda.as_cuda_array(the_column.data).copy_to_host() - - np.testing.assert_array_equal(expect, got) - - -def test_column_chunked_array_creation(): - pyarrow_array = pa.array([1, 2, 3] * 1000) - chunked_array = pa.chunked_array(pyarrow_array) - - actual_column = cudf.core.column.as_column(chunked_array, dtype="float") - expected_column = cudf.core.column.as_column(pyarrow_array, dtype="float") - - assert_eq( - cudf.Series._from_column(actual_column), - cudf.Series._from_column(expected_column), - ) - - actual_column = cudf.core.column.as_column(chunked_array) - expected_column = cudf.core.column.as_column(pyarrow_array) - - assert_eq( - cudf.Series._from_column(actual_column), - cudf.Series._from_column(expected_column), - ) - - -@pytest.mark.parametrize( - "data,from_dtype,to_dtype", - [ - # equal size different kind - (np.arange(3), "int64", "float64"), - (np.arange(3), "float32", "int32"), - (np.arange(1), "int64", "datetime64[ns]"), - # size / 2^n should work for all n - (np.arange(3), "int64", "int32"), - (np.arange(3), "int64", "int16"), - (np.arange(3), "int64", "int8"), - (np.arange(3), "float64", "float32"), - # evenly divides into bigger type - (np.arange(8), "int8", "int64"), - (np.arange(16), "int8", "int64"), - (np.arange(128), "int8", "int64"), - (np.arange(2), "float32", "int64"), - (np.arange(8), "int8", "datetime64[ns]"), - (np.arange(16), "int8", "datetime64[ns]"), - ], -) -def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): - cpu_data = np.asarray(data, dtype=from_dtype) - gpu_data = as_column(data, dtype=from_dtype) - - cpu_data_view = cpu_data.view(to_dtype) - gpu_data_view = gpu_data.view(to_dtype) - - expect = pd.Series(cpu_data_view, dtype=cpu_data_view.dtype) - got = cudf.Series._from_column(gpu_data_view).astype(gpu_data_view.dtype) - - gpu_ptr = gpu_data.data.get_ptr(mode="read") - assert gpu_ptr == got._column.data.get_ptr(mode="read") - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data,from_dtype,to_dtype", - [ - (np.arange(9), "int8", "int64"), - (np.arange(3), "int8", "int16"), - (np.arange(6), "int8", "float32"), - (np.arange(1), "int8", "datetime64[ns]"), - ], -) -def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): - cpu_data = np.asarray(data, dtype=from_dtype) - gpu_data = as_column(data, dtype=from_dtype) - - assert_exceptions_equal( - lfunc=cpu_data.view, - rfunc=gpu_data.view, - lfunc_args_and_kwargs=([to_dtype],), - rfunc_args_and_kwargs=([to_dtype],), - ) - - -@pytest.mark.parametrize( - "data,to_dtype", - [ - (["a", "b", "c"], "int8"), - (["ab"], "int8"), - (["ab"], "int16"), - (["a", "ab", "a"], "int8"), - (["abcd", "efgh"], 
"float32"), - (["abcdefgh"], "datetime64[ns]"), - ], -) -def test_column_view_valid_string_to_numeric(data, to_dtype): - expect = cudf.Series._from_column(cudf.Series(data)._column.view(to_dtype)) - got = cudf.Series(str_host_view(data, to_dtype)) - - assert_eq(expect, got) - - -def test_column_view_nulls_widths_even(): - data = [1, 2, None, 4, None] - expect_data = [ - np.int32(val).view("float32") if val is not None else np.nan - for val in data - ] - - sr = cudf.Series(data, dtype="int32") - expect = cudf.Series(expect_data, dtype="float32") - got = cudf.Series._from_column(sr._column.view("float32")) - - assert_eq(expect, got) - - data = [None, 2.1, None, 5.3, 8.8] - expect_data = [ - np.float64(val).view("int64") if val is not None else val - for val in data - ] - - sr = cudf.Series(data, dtype="float64") - expect = cudf.Series(expect_data, dtype="int64") - got = cudf.Series._from_column(sr._column.view("int64")) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("slc", [slice(1, 5), slice(0, 4), slice(2, 4)]) -def test_column_view_numeric_slice(slc): - data = np.array([1, 2, 3, 4, 5], dtype="int32") - sr = cudf.Series(data) - - expect = cudf.Series(data[slc].view("int64")) - got = cudf.Series._from_column( - sr._column.slice(slc.start, slc.stop).view("int64") - ) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "slc", [slice(3, 5), slice(0, 4), slice(2, 5), slice(1, 3)] -) -def test_column_view_string_slice(slc): - data = ["a", "bcde", "cd", "efg", "h"] - - expect = cudf.Series._from_column( - cudf.Series(data)._column.slice(slc.start, slc.stop).view("int8") - ) - got = cudf.Series(str_host_view(data[slc], "int8")) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data,expected", - [ - ( - np.array([1, 2, 3, 4, 5], dtype="uint8"), - cudf.core.column.as_column([1, 2, 3, 4, 5], dtype="uint8"), - ), - ( - cp.array([1, 2, 3, 4, 5], dtype="uint8"), - cudf.core.column.as_column([1, 2, 3, 4, 5], dtype="uint8"), - ), - ( - cp.array([], dtype="uint8"), - cudf.core.column.column_empty(0, dtype="uint8"), - ), - ( - cp.array([255], dtype="uint8"), - cudf.core.column.as_column([255], dtype="uint8"), - ), - ], -) -def test_as_column_buffer(data, expected): - actual_column = cudf.core.column.as_column( - cudf.core.buffer.as_buffer(data), dtype=data.dtype - ) - assert_eq( - cudf.Series._from_column(actual_column), - cudf.Series._from_column(expected), - ) - - -@pytest.mark.parametrize( - "data,pyarrow_kwargs,cudf_kwargs", - [ - ( - [100, 200, 300], - {"type": pa.decimal128(3)}, - {"dtype": cudf.core.dtypes.Decimal128Dtype(3, 0)}, - ), - ( - [{"a": 1, "b": 3}, {"c": 2, "d": 4}], - {}, - {}, - ), - ( - [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], - {}, - {}, - ), - ], -) -def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs): - pyarrow_data = pa.array(data, **pyarrow_kwargs) - cudf_from_pyarrow = as_column(pyarrow_data) - expected = as_column(data, **cudf_kwargs) - assert_eq( - cudf.Series._from_column(cudf_from_pyarrow), - cudf.Series._from_column(expected), - ) - - -@pytest.mark.parametrize( - "pd_dtype,expect_dtype", - [ - # TODO: Nullable float is coming - (pd.StringDtype(), np.dtype("O")), - (pd.UInt8Dtype(), np.dtype("uint8")), - (pd.UInt16Dtype(), np.dtype("uint16")), - (pd.UInt32Dtype(), np.dtype("uint32")), - (pd.UInt64Dtype(), np.dtype("uint64")), - (pd.Int8Dtype(), np.dtype("int8")), - (pd.Int16Dtype(), np.dtype("int16")), - (pd.Int32Dtype(), np.dtype("int32")), - (pd.Int64Dtype(), np.dtype("int64")), - (pd.BooleanDtype(), np.dtype("bool")), - 
], -) -def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype): - if pd_dtype == pd.StringDtype(): - data = ["a", pd.NA, "c", pd.NA, "e"] - elif pd_dtype == pd.BooleanDtype(): - data = [True, pd.NA, False, pd.NA, True] - else: - data = [1, pd.NA, 3, pd.NA, 5] - - pd_data = pd.DataFrame.from_dict({"a": data}, dtype=pd_dtype) - gd_data = cudf.DataFrame.from_pandas(pd_data) - - assert gd_data["a"].dtype == expect_dtype - - # check mask - expect_mask = [x is not pd.NA for x in pd_data["a"]] - got_mask = mask_to_bools( - gd_data["a"]._column.base_mask, 0, len(gd_data) - ).values_host - - np.testing.assert_array_equal(expect_mask, got_mask) - - -@pytest.mark.parametrize( - "pd_dtype,expect_dtype", - [ - # TODO: Nullable float is coming - (pd.StringDtype(), np.dtype("O")), - (pd.UInt8Dtype(), np.dtype("uint8")), - (pd.UInt16Dtype(), np.dtype("uint16")), - (pd.UInt32Dtype(), np.dtype("uint32")), - (pd.UInt64Dtype(), np.dtype("uint64")), - (pd.Int8Dtype(), np.dtype("int8")), - (pd.Int16Dtype(), np.dtype("int16")), - (pd.Int32Dtype(), np.dtype("int32")), - (pd.Int64Dtype(), np.dtype("int64")), - (pd.BooleanDtype(), np.dtype("bool")), - ], -) -def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): - if pd_dtype == pd.StringDtype(): - data = ["a", pd.NA, "c", pd.NA, "e"] - elif pd_dtype == pd.BooleanDtype(): - data = [True, pd.NA, False, pd.NA, True] - else: - data = [1, pd.NA, 3, pd.NA, 5] - - pd_data = pd.Series(data, dtype=pd_dtype) - gd_data = cudf.Series.from_pandas(pd_data) - - assert gd_data.dtype == expect_dtype - - # check mask - expect_mask = [x is not pd.NA for x in pd_data] - got_mask = mask_to_bools( - gd_data._column.base_mask, 0, len(gd_data) - ).values_host - - np.testing.assert_array_equal(expect_mask, got_mask) - - -@pytest.mark.parametrize( - "alias,expect_dtype", - [ - ("UInt8", "uint8"), - ("UInt16", "uint16"), - ("UInt32", "uint32"), - ("UInt64", "uint64"), - ("Int8", "int8"), - ("Int16", "int16"), - ("Int32", "int32"), - ("Int64", "int64"), - ("boolean", "bool"), - ("Float32", "float32"), - ("Float64", "float64"), - ], -) -@pytest.mark.parametrize( - "data", - [[1, 2, 0]], -) -def test_astype_with_aliases(alias, expect_dtype, data): - pd_data = pd.Series(data) - gd_data = cudf.Series.from_pandas(pd_data) - - assert_eq(pd_data.astype(expect_dtype), gd_data.astype(alias)) diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py deleted file mode 100644 index 5cef077c18d..00000000000 --- a/python/cudf/cudf/tests/test_column_accessor.py +++ /dev/null @@ -1,391 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- - -import pandas as pd -import pytest - -import cudf -from cudf.core.column import as_column -from cudf.core.column_accessor import ColumnAccessor -from cudf.testing import assert_eq - -simple_test_data = [ - {}, - {"a": as_column([])}, - {"a": as_column([1])}, - {"a": as_column(["a"])}, - {"a": as_column([1, 2, 3]), "b": as_column(["a", "b", "c"])}, -] - -mi_test_data = [ - {("a", "b"): as_column([1, 2, 4]), ("a", "c"): as_column([2, 3, 4])}, - {("a", "b"): as_column([1, 2, 3]), ("a", ""): as_column([2, 3, 4])}, - {("a", "b"): as_column([1, 2, 4]), ("c", "d"): as_column([2, 3, 4])}, - { - ("a", "b"): as_column([1, 2, 3]), - ("a", "c"): as_column([2, 3, 4]), - ("b", ""): as_column([4, 5, 6]), - }, -] - - -def check_ca_equal(lhs, rhs): - assert lhs.level_names == rhs.level_names - assert lhs.multiindex == rhs.multiindex - assert lhs.rangeindex == rhs.rangeindex - assert lhs.label_dtype == rhs.label_dtype - for l_key, r_key in zip(lhs, rhs): - assert l_key == r_key - assert_eq(lhs[l_key], rhs[r_key]) - - -@pytest.fixture(params=simple_test_data) -def simple_data(request): - return request.param - - -@pytest.fixture(params=mi_test_data) -def mi_data(request): - return request.param - - -@pytest.fixture(params=simple_test_data + mi_test_data) -def all_data(request): - return request.param - - -def test_to_pandas_simple(simple_data): - """ - Test that a ColumnAccessor converts to a correct pd.Index - """ - ca = ColumnAccessor(simple_data) - # We cannot return RangeIndex, while pandas returns RangeIndex. - # Pandas compares `inferred_type` which is `empty` for - # Index([], dtype='object'), and `integer` for RangeIndex() - # to ignore this `inferred_type` comparison, we pass exact=False. - assert_eq( - ca.to_pandas_index(), - pd.DataFrame( - {key: value.values_host for key, value in simple_data.items()} - ).columns, - exact=False, - ) - - -def test_to_pandas_multiindex(mi_data): - ca = ColumnAccessor(mi_data, multiindex=True) - assert_eq( - ca.to_pandas_index(), - pd.DataFrame( - {key: value.values_host for key, value in mi_data.items()} - ).columns, - ) - - -def test_to_pandas_multiindex_names(): - ca = ColumnAccessor( - {("a", "b"): as_column([1, 2, 3]), ("c", "d"): as_column([3, 4, 5])}, - multiindex=True, - level_names=("foo", "bar"), - ) - assert_eq( - ca.to_pandas_index(), - pd.MultiIndex.from_tuples( - (("a", "b"), ("c", "d")), names=("foo", "bar") - ), - ) - - -def test_iter(simple_data): - """ - Test that iterating over the CA - yields column names. - """ - ca = ColumnAccessor(simple_data) - for expect_key, got_key in zip(simple_data, ca): - assert expect_key == got_key - - -def test_all_columns(simple_data): - """ - Test that all values of the CA are - columns. - """ - ca = ColumnAccessor(simple_data) - for col in ca.values(): - assert isinstance(col, cudf.core.column.ColumnBase) - - -def test_column_size_mismatch(): - """ - Test that constructing a CA from columns of - differing sizes throws an error. 
- """ - with pytest.raises(ValueError): - ColumnAccessor({"a": as_column([1]), "b": as_column([1, 2])}) - - -def test_select_by_label_simple(): - """ - Test getting a column by label - """ - ca = ColumnAccessor({"a": as_column([1, 2, 3]), "b": as_column([2, 3, 4])}) - check_ca_equal( - ca.select_by_label("a"), ColumnAccessor({"a": as_column([1, 2, 3])}) - ) - check_ca_equal( - ca.select_by_label("b"), ColumnAccessor({"b": as_column([2, 3, 4])}) - ) - - -def test_select_by_label_multiindex(): - """ - Test getting column(s) by label with MultiIndex - """ - ca = ColumnAccessor( - { - ("a", "b", "c"): as_column([1, 2, 3]), - ("a", "b", "e"): as_column([2, 3, 4]), - ("b", "x", ""): as_column([4, 5, 6]), - ("a", "d", "e"): as_column([3, 4, 5]), - }, - multiindex=True, - ) - - expect = ColumnAccessor( - { - ("b", "c"): as_column([1, 2, 3]), - ("b", "e"): as_column([2, 3, 4]), - ("d", "e"): as_column([3, 4, 5]), - }, - multiindex=True, - ) - got = ca.select_by_label("a") - check_ca_equal(expect, got) - - expect = ColumnAccessor( - {"c": as_column([1, 2, 3]), "e": as_column([2, 3, 4])}, - multiindex=False, - ) - got = ca.select_by_label(("a", "b")) - check_ca_equal(expect, got) - - expect = ColumnAccessor( - { - ("b", "c"): as_column([1, 2, 3]), - ("b", "e"): as_column([2, 3, 4]), - ("d", "e"): as_column([3, 4, 5]), - }, - multiindex=True, - ) - got = ca.select_by_label("a") - check_ca_equal(expect, got) - - expect = ColumnAccessor( - {"c": as_column([1, 2, 3]), "e": as_column([2, 3, 4])}, - multiindex=False, - ) - got = ca.select_by_label(("a", "b")) - check_ca_equal(expect, got) - - -def test_select_by_label_simple_slice(): - ca = ColumnAccessor( - { - "a": as_column([1, 2, 3]), - "b": as_column([2, 3, 4]), - "c": as_column([3, 4, 5]), - } - ) - expect = ColumnAccessor( - {"b": as_column([2, 3, 4]), "c": as_column([3, 4, 5])} - ) - got = ca.select_by_label(slice("b", "c")) - check_ca_equal(expect, got) - - -def test_select_by_label_multiindex_slice(): - ca = ColumnAccessor( - { - ("a", "b", "c"): as_column([1, 2, 3]), - ("a", "b", "e"): as_column([2, 3, 4]), - ("a", "d", "e"): as_column([3, 4, 5]), - ("b", "x", ""): as_column([4, 5, 6]), - }, - multiindex=True, - ) # pandas needs columns to be sorted to do slicing with multiindex - expect = ca - got = ca.select_by_label(slice(None, None)) - check_ca_equal(expect, got) - - expect = ColumnAccessor( - { - ("a", "b", "e"): as_column([2, 3, 4]), - ("a", "d", "e"): as_column([3, 4, 5]), - ("b", "x", ""): as_column([4, 5, 6]), - }, - multiindex=True, - ) - got = ca.select_by_label(slice(("a", "b", "e"), ("b", "x", ""))) - check_ca_equal(expect, got) - - -def test_by_label_list(): - ca = ColumnAccessor( - { - "a": as_column([1, 2, 3]), - "b": as_column([2, 3, 4]), - "c": as_column([3, 4, 5]), - } - ) - expect = ColumnAccessor( - {"b": as_column([2, 3, 4]), "c": as_column([3, 4, 5])} - ) - got = ca.select_by_label(["b", "c"]) - check_ca_equal(expect, got) - - -def test_select_by_index_simple(): - """ - Test getting a column by label - """ - ca = ColumnAccessor({"a": as_column([1, 2, 3]), "b": as_column([2, 3, 4])}) - check_ca_equal( - ca.select_by_index(0), ColumnAccessor({"a": as_column([1, 2, 3])}) - ) - check_ca_equal( - ca.select_by_index(1), ColumnAccessor({"b": as_column([2, 3, 4])}) - ) - check_ca_equal(ca.select_by_index([0, 1]), ca) - check_ca_equal(ca.select_by_index(slice(0, None)), ca) - - -def test_select_by_index_multiindex(): - """ - Test getting column(s) by label with MultiIndex - """ - ca = ColumnAccessor( - { - ("a", "b", "c"): 
as_column([1, 2, 3]), - ("a", "b", "e"): as_column([2, 3, 4]), - ("b", "x", ""): as_column([4, 5, 6]), - ("a", "d", "e"): as_column([3, 4, 5]), - }, - multiindex=True, - ) - - expect = ColumnAccessor( - { - ("a", "b", "c"): as_column([1, 2, 3]), - ("a", "b", "e"): as_column([2, 3, 4]), - ("b", "x", ""): as_column([4, 5, 6]), - }, - multiindex=True, - ) - got = ca.select_by_index(slice(0, 3)) - check_ca_equal(expect, got) - - expect = ColumnAccessor( - { - ("a", "b", "c"): as_column([1, 2, 3]), - ("a", "b", "e"): as_column([2, 3, 4]), - ("a", "d", "e"): as_column([3, 4, 5]), - }, - multiindex=True, - ) - got = ca.select_by_index([0, 1, 3]) - check_ca_equal(expect, got) - - -def test_select_by_index_empty(): - ca = ColumnAccessor( - { - ("a", "b", "c"): as_column([1, 2, 3]), - ("a", "b", "e"): as_column([2, 3, 4]), - ("b", "x", ""): as_column([4, 5, 6]), - ("a", "d", "e"): as_column([3, 4, 5]), - }, - multiindex=True, - ) - expect = ColumnAccessor( - {}, multiindex=True, level_names=((None, None, None)) - ) - got = ca.select_by_index(slice(None, 0)) - check_ca_equal(expect, got) - - got = ca.select_by_index([]) - check_ca_equal(expect, got) - - -def test_replace_level_values_RangeIndex(): - ca = ColumnAccessor( - { - ("a"): as_column([1, 2, 3]), - ("b"): as_column([2, 3, 4]), - ("c"): as_column([3, 4, 5]), - }, - multiindex=False, - ) - - expect = ColumnAccessor( - { - ("f"): as_column([1, 2, 3]), - ("b"): as_column([2, 3, 4]), - ("c"): as_column([3, 4, 5]), - }, - multiindex=False, - ) - - got = ca.rename_levels(mapper={"a": "f"}, level=0) - check_ca_equal(expect, got) - - -def test_replace_level_values_MultiColumn(): - ca = ColumnAccessor( - { - ("a", 1): as_column([1, 2, 3]), - ("a", 2): as_column([2, 3, 4]), - ("b", 1): as_column([3, 4, 5]), - }, - multiindex=True, - ) - - expect = ColumnAccessor( - { - ("f", 1): as_column([1, 2, 3]), - ("f", 2): as_column([2, 3, 4]), - ("b", 1): as_column([3, 4, 5]), - }, - multiindex=True, - ) - - got = ca.rename_levels(mapper={"a": "f"}, level=0) - check_ca_equal(expect, got) - - # passing without level kwarg assumes level=0 - got = ca.rename_levels(mapper={"a": "f"}) - check_ca_equal(expect, got) - - -def test_clear_nrows_empty_before(): - ca = ColumnAccessor({}) - assert ca.nrows == 0 - ca.insert("new", as_column([1])) - assert ca.nrows == 1 - - -def test_clear_nrows_empty_after(): - ca = ColumnAccessor({"new": as_column([1])}) - assert ca.nrows == 1 - del ca["new"] - assert ca.nrows == 0 - - -def test_not_rangeindex_and_multiindex(): - with pytest.raises(ValueError): - ColumnAccessor({}, multiindex=True, rangeindex=True) - - -def test_data_values_not_column_raises(): - with pytest.raises(ValueError): - ColumnAccessor({"a": [1]}) diff --git a/python/cudf/cudf/tests/test_compile_udf.py b/python/cudf/cudf/tests/test_compile_udf.py deleted file mode 100644 index d965f35ccdd..00000000000 --- a/python/cudf/cudf/tests/test_compile_udf.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
- -from numba import types - -from cudf.utils import cudautils - - -def setup_function(): - cudautils._udf_code_cache.clear() - - -def assert_cache_size(size): - assert cudautils._udf_code_cache.currsize == size - - -def test_first_compile_sets_cache_entry(): - # The first compilation should put an entry in the cache - cudautils.compile_udf(lambda x: x + 1, (types.float32,)) - assert_cache_size(1) - - -def test_code_cache_same_code_different_function_hit(): - # Compilation of a distinct function with the same code and signature - # should reuse the cached entry - - cudautils.compile_udf(lambda x: x + 1, (types.float32,)) - assert_cache_size(1) - - cudautils.compile_udf(lambda x: x + 1, (types.float32,)) - assert_cache_size(1) - - -def test_code_cache_different_types_miss(): - # Compilation of a distinct function with the same code but different types - # should create an additional cache entry - - cudautils.compile_udf(lambda x: x + 1, (types.float32,)) - assert_cache_size(1) - - cudautils.compile_udf(lambda x: x + 1, (types.float64,)) - assert_cache_size(2) - - -def test_code_cache_different_cvars_miss(): - # Compilation of a distinct function with the same types and code as an - # existing entry but different closure variables should create an - # additional cache entry - - def gen_closure(y): - return lambda x: x + y - - cudautils.compile_udf(gen_closure(1), (types.float32,)) - assert_cache_size(1) - - cudautils.compile_udf(gen_closure(2), (types.float32,)) - assert_cache_size(2) - - -def test_lambda_in_loop_code_cached(): - # Compiling a UDF defined in a loop should result in the code cache being - # reused for each loop iteration after the first. We check for this by - # ensuring that there is only one entry in the code cache after the loop. - - for i in range(3): - cudautils.compile_udf(lambda x: x + 1, (types.float32,)) - - assert_cache_size(1) diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py deleted file mode 100644 index 8da589ba45b..00000000000 --- a/python/cudf/cudf/tests/test_concat.py +++ /dev/null @@ -1,2027 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import warnings -from contextlib import contextmanager -from decimal import Decimal - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_GE_220 -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - - -@contextmanager -def _hide_concat_empty_dtype_warning(): - with warnings.catch_warnings(): - # Ignoring warnings in this test as warnings are - # being caught and validated in other tests. 
- warnings.filterwarnings( - "ignore", - "The behavior of array concatenation with empty entries " - "is deprecated.", - category=FutureWarning, - ) - yield - - -def make_frames(index=None, nulls="none"): - df = pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": list("abcde") * 2, - } - ) - df.z = df.z.astype("category") - df2 = pd.DataFrame( - { - "x": range(10, 20), - "y": list(map(float, range(10, 20))), - "z": list("edcba") * 2, - } - ) - df2.z = df2.z.astype("category") - if nulls == "all": - df.y = np.full_like(df.y, np.nan) - df2.y = np.full_like(df2.y, np.nan) - if nulls == "some": - mask = np.arange(10) - np.random.shuffle(mask) - mask = mask[:5] - df.loc[mask, "y"] = np.nan - df2.loc[mask, "y"] = np.nan - gdf = cudf.DataFrame.from_pandas(df) - gdf2 = cudf.DataFrame.from_pandas(df2) - if index: - df = df.set_index(index) - df2 = df2.set_index(index) - gdf = gdf.set_index(index) - gdf2 = gdf2.set_index(index) - return df, df2, gdf, gdf2 - - -@pytest.mark.parametrize("nulls", ["none", "some", "all"]) -@pytest.mark.parametrize("index", [False, "z", "y"]) -@pytest.mark.parametrize("axis", [0, "index"]) -def test_concat_dataframe(index, nulls, axis): - if index == "y" and nulls in ("some", "all"): - pytest.skip("nulls in columns, dont index") - df, df2, gdf, gdf2 = make_frames(index, nulls=nulls) - # Make empty frame - gdf_empty1 = gdf2[:0] - assert len(gdf_empty1) == 0 - df_empty1 = gdf_empty1.to_pandas() - - # DataFrame - with _hide_concat_empty_dtype_warning(): - res = cudf.concat([gdf, gdf2, gdf, gdf_empty1], axis=axis).to_pandas() - sol = pd.concat([df, df2, df, df_empty1], axis=axis) - assert_eq( - res, - sol, - check_names=False, - check_categorical=False, - check_index_type=True, - ) - - # Series - for c in [i for i in ("x", "y", "z") if i != index]: - res = cudf.concat([gdf[c], gdf2[c], gdf[c]], axis=axis).to_pandas() - sol = pd.concat([df[c], df2[c], df[c]], axis=axis) - assert_eq( - res, - sol, - check_names=False, - check_categorical=False, - check_index_type=True, - ) - - # Index - res = cudf.concat([gdf.index, gdf2.index], axis=axis).to_pandas() - sol = df.index.append(df2.index) - assert_eq(res, sol, check_names=False, check_categorical=False) - - -@pytest.mark.parametrize( - "values", - [["foo", "bar"], [1.0, 2.0], pd.Series(["one", "two"], dtype="category")], -) -def test_concat_all_nulls(values): - pa = pd.Series(values) - pb = pd.Series([None]) - ps = pd.concat([pa, pb]) - - ga = cudf.Series(values) - gb = cudf.Series([None]) - gs = cudf.concat([ga, gb]) - - assert_eq( - ps, - gs, - check_dtype=False, - check_categorical=False, - check_index_type=True, - ) - - -def test_concat_errors(): - df, df2, gdf, gdf2 = make_frames() - - # No objs - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": []}), - rfunc_args_and_kwargs=([], {"objs": []}), - ) - - # All None - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": [None, None]}), - rfunc_args_and_kwargs=([], {"objs": [None, None]}), - ) - - # Mismatched types - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": [df, df.index, df.x]}), - rfunc_args_and_kwargs=([], {"objs": [gdf, gdf.index, gdf.x]}), - ) - - # Unknown type - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": ["bar", "foo"]}), - rfunc_args_and_kwargs=([], {"objs": ["bar", "foo"]}), - ) - - # Mismatched index dtypes - gdf3 = 
gdf2.copy() - del gdf3["z"] - gdf4 = gdf2.set_index("z") - - with pytest.raises(ValueError, match="All columns must be the same type"): - cudf.concat([gdf3, gdf4]) - - # Bad axis value - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=( - [], - {"objs": [gdf.to_pandas(), gdf2.to_pandas()], "axis": "bad_value"}, - ), - rfunc_args_and_kwargs=([], {"objs": [gdf, gdf2], "axis": "bad_value"}), - ) - - -def test_concat_misordered_columns(): - df, df2, gdf, gdf2 = make_frames(False) - gdf2 = gdf2[["z", "x", "y"]] - df2 = df2[["z", "x", "y"]] - - res = cudf.concat([gdf, gdf2]).to_pandas() - sol = pd.concat([df, df2], sort=False) - - assert_eq( - res, - sol, - check_names=False, - check_categorical=False, - check_index_type=True, - ) - - -@pytest.mark.parametrize("axis", [1, "columns"]) -def test_concat_columns(axis): - pdf1 = pd.DataFrame(np.random.randint(10, size=(5, 3)), columns=[1, 2, 3]) - pdf2 = pd.DataFrame( - np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7] - ) - gdf1 = cudf.from_pandas(pdf1) - gdf2 = cudf.from_pandas(pdf2) - - expect = pd.concat([pdf1, pdf2], axis=axis) - got = cudf.concat([gdf1, gdf2], axis=axis) - - assert_eq(expect, got, check_index_type=True) - - -@pytest.mark.parametrize("axis", [0, 1]) -def test_concat_multiindex_dataframe(axis): - gdf = cudf.DataFrame( - { - "w": np.arange(4), - "x": np.arange(4), - "y": np.arange(4), - "z": np.arange(4), - } - ) - gdg = gdf.groupby(["w", "x"]).min() - pdg = gdg.to_pandas() - pdg1 = pdg.iloc[:, :1] - pdg2 = pdg.iloc[:, 1:] - gdg1 = cudf.from_pandas(pdg1) - gdg2 = cudf.from_pandas(pdg2) - expected = pd.concat([pdg1, pdg2], axis=axis) - result = cudf.concat([gdg1, gdg2], axis=axis) - assert_eq( - expected, - result, - check_index_type=True, - ) - - -def test_concat_multiindex_series(): - gdf = cudf.DataFrame( - { - "w": np.arange(4), - "x": np.arange(4), - "y": np.arange(4), - "z": np.arange(4), - } - ) - gdg = gdf.groupby(["w", "x"]).min() - pdg = gdg.to_pandas() - pdg1 = pdg["y"] - pdg2 = pdg["z"] - gdg1 = cudf.from_pandas(pdg1) - gdg2 = cudf.from_pandas(pdg2) - assert_eq( - cudf.concat([gdg1, gdg2]), - pd.concat([pdg1, pdg2]), - check_index_type=True, - ) - assert_eq( - cudf.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1) - ) - - -def test_concat_multiindex_dataframe_and_series(): - gdf = cudf.DataFrame( - { - "w": np.arange(4), - "x": np.arange(4), - "y": np.arange(4), - "z": np.arange(4), - } - ) - gdg = gdf.groupby(["w", "x"]).min() - pdg = gdg.to_pandas() - pdg1 = pdg[["y", "z"]] - pdg2 = pdg["z"] - pdg2.name = "a" - gdg1 = cudf.from_pandas(pdg1) - gdg2 = cudf.from_pandas(pdg2) - assert_eq( - cudf.concat([gdg1, gdg2], axis=1), - pd.concat([pdg1, pdg2], axis=1), - check_index_type=True, - ) - - -def test_concat_multiindex_series_and_dataframe(): - gdf = cudf.DataFrame( - { - "w": np.arange(4), - "x": np.arange(4), - "y": np.arange(4), - "z": np.arange(4), - } - ) - gdg = gdf.groupby(["w", "x"]).min() - pdg = gdg.to_pandas() - pdg1 = pdg["z"] - pdg2 = pdg[["y", "z"]] - pdg1.name = "a" - gdg1 = cudf.from_pandas(pdg1) - gdg2 = cudf.from_pandas(pdg2) - assert_eq( - cudf.concat([gdg1, gdg2], axis=1), - pd.concat([pdg1, pdg2], axis=1), - check_index_type=True, - ) - - -@pytest.mark.parametrize("myindex", ["a", "b"]) -def test_concat_string_index_name(myindex): - # GH-Issue #3420 - data = {"a": [123, 456], "b": ["s1", "s2"]} - df1 = cudf.DataFrame(data).set_index(myindex) - df2 = df1.copy() - df3 = cudf.concat([df1, df2]) - - assert df3.index.name == myindex - - -def 
test_pandas_concat_compatibility_axis1(): - d1 = cudf.datasets.randomdata( - 3, dtypes={"a": float, "ind": float} - ).set_index("ind") - d2 = cudf.datasets.randomdata( - 3, dtypes={"b": float, "ind": float} - ).set_index("ind") - d3 = cudf.datasets.randomdata( - 3, dtypes={"c": float, "ind": float} - ).set_index("ind") - d4 = cudf.datasets.randomdata( - 3, dtypes={"d": float, "ind": float} - ).set_index("ind") - d5 = cudf.datasets.randomdata( - 3, dtypes={"e": float, "ind": float} - ).set_index("ind") - - pd1 = d1.to_pandas() - pd2 = d2.to_pandas() - pd3 = d3.to_pandas() - pd4 = d4.to_pandas() - pd5 = d5.to_pandas() - - expect = pd.concat([pd1, pd2, pd3, pd4, pd5], axis=1) - got = cudf.concat([d1, d2, d3, d4, d5], axis=1) - - assert_eq( - got.sort_index(), - expect.sort_index(), - check_index_type=True, - ) - - -@pytest.mark.parametrize("index", [[0, 1, 2], [2, 1, 0], [5, 9, 10]]) -@pytest.mark.parametrize("names", [False, (0, 1)]) -@pytest.mark.parametrize( - "data", - [ - (["a", "b", "c"], ["a", "b", "c"]), - (["a", "b", "c"], ["XX", "YY", "ZZ"]), - ], -) -def test_pandas_concat_compatibility_axis1_overlap(index, names, data): - s1 = cudf.Series(data[0], index=[0, 1, 2]) - s2 = cudf.Series(data[1], index=index) - if names: - s1.name = names[0] - s2.name = names[1] - ps1 = s1.to_pandas() - ps2 = s2.to_pandas() - got = cudf.concat([s1, s2], axis=1) - expect = pd.concat([ps1, ps2], axis=1) - assert_eq(got, expect, check_index_type=True) - - -def test_pandas_concat_compatibility_axis1_eq_index(): - s1 = cudf.Series(["a", "b", "c"], index=[0, 1, 2]) - s2 = cudf.Series(["a", "b", "c"], index=[1, 1, 1]) - ps1 = s1.to_pandas() - ps2 = s2.to_pandas() - - assert_exceptions_equal( - lfunc=pd.concat, - rfunc=cudf.concat, - lfunc_args_and_kwargs=([], {"objs": [ps1, ps2], "axis": 1}), - rfunc_args_and_kwargs=([], {"objs": [s1, s2], "axis": 1}), - ) - - -@pytest.mark.parametrize("name", [None, "a"]) -def test_pandas_concat_compatibility_axis1_single_column(name): - # Pandas renames series name `None` to 0 - # and preserves anything else - s = cudf.Series([1, 2, 3], name=name) - got = cudf.concat([s], axis=1) - expected = pd.concat([s.to_pandas()], axis=1) - assert_eq(expected, got) - - -def test_concat_duplicate_columns(): - cdf = cudf.DataFrame( - { - "id4": 4 * list(range(6)), - "id5": 4 * list(reversed(range(6))), - "v3": 6 * list(range(4)), - } - ) - cdf_std = cdf.groupby(["id4", "id5"])[["v3"]].std() - cdf_med = cdf.groupby(["id4", "id5"])[["v3"]].quantile(q=0.5) - with pytest.raises(NotImplementedError): - cudf.concat([cdf_med, cdf_std], axis=1) - - -def test_concat_mixed_input(): - pdf1 = pd.DataFrame({"a": [10, 20, 30]}) - pdf2 = pd.DataFrame({"a": [11, 22, 33]}) - - gdf1 = cudf.from_pandas(pdf1) - gdf2 = cudf.from_pandas(pdf2) - - assert_eq( - pd.concat([pdf1, None, pdf2, None]), - cudf.concat([gdf1, None, gdf2, None]), - check_index_type=True, - ) - assert_eq( - pd.concat([pdf1, None]), - cudf.concat([gdf1, None]), - check_index_type=True, - ) - assert_eq( - pd.concat([None, pdf2]), - cudf.concat([None, gdf2]), - check_index_type=True, - ) - assert_eq( - pd.concat([None, pdf2, pdf1]), - cudf.concat([None, gdf2, gdf1]), - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "objs", - [ - [pd.Series([1, 2, 3]), pd.DataFrame({"a": [1, 2]})], - [pd.Series([1, 2, 3]), pd.DataFrame({"a": []})], - [pd.Series([], dtype="float64"), pd.DataFrame({"a": []})], - [pd.Series([], dtype="float64"), pd.DataFrame({"a": [1, 2]})], - pytest.param( - [ - pd.Series([1, 2, 3.0, 1.2], name="abc"), - 
pd.DataFrame({"a": [1, 2]}), - ], - marks=pytest.mark.skipif( - not PANDAS_GE_220, - reason="https://github.com/pandas-dev/pandas/pull/56365", - ), - ), - pytest.param( - [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130] - ), - pd.DataFrame({"a": [1, 2]}), - ], - marks=pytest.mark.skipif( - not PANDAS_GE_220, - reason="https://github.com/pandas-dev/pandas/pull/56365", - ), - ), - pytest.param( - [ - pd.Series( - [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"] - ), - pd.DataFrame({"a": [1, 2]}, index=["a", "b"]), - ], - marks=pytest.mark.skipif( - not PANDAS_GE_220, - reason="https://github.com/pandas-dev/pandas/pull/56365", - ), - ), - pytest.param( - [ - pd.Series( - [1, 2, 3.0, 1.2, 8, 100], - name="New name", - index=["a", "b", "c", "d", "e", "f"], - ), - pd.DataFrame( - {"a": [1, 2, 4, 10, 11, 12]}, - index=["a", "b", "c", "d", "e", "f"], - ), - ], - marks=pytest.mark.skipif( - not PANDAS_GE_220, - reason="https://github.com/pandas-dev/pandas/pull/56365", - ), - ), - pytest.param( - [ - pd.Series( - [1, 2, 3.0, 1.2, 8, 100], - name="New name", - index=["a", "b", "c", "d", "e", "f"], - ), - pd.DataFrame( - {"a": [1, 2, 4, 10, 11, 12]}, - index=["a", "b", "c", "d", "e", "f"], - ), - ] - * 7, - marks=pytest.mark.skipif( - not PANDAS_GE_220, - reason="https://github.com/pandas-dev/pandas/pull/56365", - ), - ), - ], -) -def test_concat_series_dataframe_input(objs): - pd_objs = objs - gd_objs = [cudf.from_pandas(obj) for obj in objs] - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat(pd_objs) - actual = cudf.concat(gd_objs) - - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "objs", - [ - [ - pd.Series(["a", "b", "c", "d"]), - pd.Series(["1", "2", "3", "4"]), - pd.DataFrame({"first col": ["10", "11", "12", "13"]}), - ], - [ - pd.Series(["a", "b", "c", "d"]), - pd.Series(["1", "2", "3", "4"]), - pd.DataFrame( - { - "first col": ["10", "11", "12", "13"], - "second col": ["a", "b", "c", "d"], - } - ), - ], - [ - pd.Series(["a", "b", "c"]), - pd.Series(["1", "2", "3", "4"]), - pd.DataFrame( - { - "first col": ["10", "11", "12", "13"], - "second col": ["a", "b", "c", "d"], - } - ), - ], - ], -) -def test_concat_series_dataframe_input_str(objs): - pd_objs = objs - gd_objs = [cudf.from_pandas(obj) for obj in objs] - - expected = pd.concat(pd_objs) - actual = cudf.concat(gd_objs) - assert_eq(expected, actual, check_dtype=False, check_index_type=False) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[200]), - pd.DataFrame([], index=[100]), - pd.DataFrame({"cat": pd.Series(["one", "two"], dtype="category")}), - ], -) -@pytest.mark.parametrize( - "other", - [ - [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], - [ - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - ], - [ - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[200]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), - ], - [ - pd.DataFrame([]), - 
pd.DataFrame([], index=[100]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), - ], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_concat_empty_dataframes(df, other, ignore_index): - other_pd = [df] + other - - gdf = cudf.from_pandas(df) - other_gd = [gdf] + [cudf.from_pandas(o) for o in other] - - expected = pd.concat(other_pd, ignore_index=ignore_index) - actual = cudf.concat(other_gd, ignore_index=ignore_index) - if expected.shape != df.shape: - for key, col in actual[actual.columns].items(): - if isinstance(col.dtype, cudf.CategoricalDtype): - if not isinstance(expected[key].dtype, pd.CategoricalDtype): - # TODO: Pandas bug: - # https://github.com/pandas-dev/pandas/issues/42840 - expected[key] = expected[key].fillna("-1").astype("str") - else: - expected[key] = ( - expected[key] - .cat.add_categories(["-1"]) - .fillna("-1") - .astype("str") - ) - actual[key] = col.astype("str").fillna("-1") - else: - expected[key] = expected[key].fillna(-1) - actual[key] = col.fillna(-1) - assert_eq(expected, actual, check_dtype=False, check_index_type=True) - else: - assert_eq( - expected, - actual, - check_index_type=not gdf.empty, - check_column_type=False, - ) - - -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("axis", [0, "index"]) -@pytest.mark.parametrize( - "data", - [ - (["a", "b", "c"], ["a", "b", "c"]), - (["a", "b", "c"], ["XX", "YY", "ZZ"]), - ], -) -def test_concat_empty_and_nonempty_series(ignore_index, data, axis): - s1 = cudf.Series() - s2 = cudf.Series(data[0]) - ps1 = s1.to_pandas() - ps2 = s2.to_pandas() - got = cudf.concat([s1, s2], axis=axis, ignore_index=ignore_index) - expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index) - - assert_eq(got, expect, check_index_type=True) - - -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("axis", [0, "index"]) -def test_concat_two_empty_series(ignore_index, axis): - s1 = cudf.Series() - s2 = cudf.Series() - ps1 = s1.to_pandas() - ps2 = s2.to_pandas() - got = cudf.concat([s1, s2], axis=axis, ignore_index=ignore_index) - expect = pd.concat([ps1, ps2], axis=axis, ignore_index=ignore_index) - - assert_eq(got, expect, check_index_type=True) - - -@pytest.mark.parametrize( - "df1,df2", - [ - ( - cudf.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}), - cudf.DataFrame({"k1": [1, 0], "k2": [3, 2], "v2": [6, 7]}), - ), - ( - cudf.DataFrame({"k1": [0, 1], "k2": [2, 3], "v1": [4, 5]}), - cudf.DataFrame({"k1": [0, 1], "k2": [3, 2], "v2": [6, 7]}), - ), - ], -) -def test_concat_dataframe_with_multiindex(df1, df2): - gdf1 = df1 - gdf1 = gdf1.set_index(["k1", "k2"]) - - gdf2 = df2 - gdf2 = gdf2.set_index(["k1", "k2"]) - - pdf1 = gdf1.to_pandas() - pdf2 = gdf2.to_pandas() - - actual = cudf.concat([gdf1, gdf2], axis=1) - expected = pd.concat([pdf1, pdf2], axis=1) - - # Will need to sort_index before comparing as - # ordering is not deterministic in case of pandas - # multiIndex with concat. 
- assert_eq( - expected.sort_index(), - actual.sort_index(), - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "objs", - [ - [ - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ), - pd.DataFrame( - {"x": range(10, 20), "y": list(map(float, range(10, 20)))} - ), - ], - [ - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - }, - index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], - ), - pd.DataFrame( - {"x": range(10, 20), "y": list(map(float, range(10, 20)))}, - index=["k", "l", "m", "n", "o", "p", "q", "r", "s", "t"], - ), - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - }, - index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], - ), - pd.DataFrame( - {"x": range(10, 20), "y": list(map(float, range(10, 20)))}, - index=["a", "b", "c", "d", "z", "f", "g", "h", "i", "w"], - ), - ], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [0]) -def test_concat_join(objs, ignore_index, sort, join, axis): - gpu_objs = [cudf.from_pandas(o) for o in objs] - - assert_eq( - pd.concat( - objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis - ), - cudf.concat( - gpu_objs, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "objs", - [ - [ - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ), - pd.DataFrame( - {"x": range(10, 20), "y": list(map(float, range(10, 20)))} - ), - ], - ], -) -def test_concat_join_axis_1_dup_error(objs): - gpu_objs = [cudf.from_pandas(o) for o in objs] - # we do not support duplicate columns - with pytest.raises(NotImplementedError): - assert_eq( - pd.concat( - objs, - axis=1, - ), - cudf.concat( - gpu_objs, - axis=1, - ), - ) - - -@pytest.mark.parametrize( - "objs", - [ - [ - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ), - pd.DataFrame( - {"l": range(10, 20), "m": list(map(float, range(10, 20)))} - ), - ], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [1]) -def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): - # no duplicate columns - gpu_objs = [cudf.from_pandas(o) for o in objs] - expected = pd.concat( - objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis - ) - actual = cudf.concat( - gpu_objs, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - - assert_eq(expected, actual, check_index_type=True) - - -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [1, 0]) -def test_concat_join_many_df_and_empty_df(ignore_index, sort, join, axis): - # no duplicate columns - pdf1 = pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ) - pdf2 = pd.DataFrame( - {"l": range(10, 20), "m": list(map(float, range(10, 20)))} - ) - pdf3 = pd.DataFrame({"j": [1, 2], "k": [1, 2], "s": [1, 2], "t": 
[1, 2]}) - pdf_empty1 = pd.DataFrame() - - gdf1 = cudf.from_pandas(pdf1) - gdf2 = cudf.from_pandas(pdf2) - gdf3 = cudf.from_pandas(pdf3) - gdf_empty1 = cudf.from_pandas(pdf_empty1) - - with _hide_concat_empty_dtype_warning(): - assert_eq( - pd.concat( - [pdf1, pdf2, pdf3, pdf_empty1], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - cudf.concat( - [gdf1, gdf2, gdf3, gdf_empty1], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - check_index_type=False, - ) - - -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_concat_join_one_df(ignore_index, sort, join, axis): - pdf1 = pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ) - - gdf1 = cudf.from_pandas(pdf1) - expected = pd.concat( - [pdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis - ) - actual = cudf.concat( - [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis - ) - - assert_eq(expected, actual, check_index_type=True) - - -@pytest.mark.parametrize( - "pdf1,pdf2", - [ - ( - pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), - pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}), - ), - ( - pd.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6]}, index=["p", "q", "r"] - ), - pd.DataFrame( - {"c": [7, 8, 9], "d": [10, 11, 12]}, index=["r", "p", "z"] - ), - ), - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_concat_join_no_overlapping_columns( - pdf1, pdf2, ignore_index, sort, join, axis -): - gdf1 = cudf.from_pandas(pdf1) - gdf2 = cudf.from_pandas(pdf2) - - expected = pd.concat( - [pdf1, pdf2], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = cudf.concat( - [gdf1, gdf2], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - - assert_eq(expected, actual, check_index_type=True) - - -@pytest.mark.parametrize("ignore_index", [False, True]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_concat_join_no_overlapping_columns_many_and_empty( - ignore_index, sort, join, axis -): - pdf4 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - pdf5 = pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}) - pdf6 = pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ) - pdf_empty = pd.DataFrame() - - gdf4 = cudf.from_pandas(pdf4) - gdf5 = cudf.from_pandas(pdf5) - gdf6 = cudf.from_pandas(pdf6) - gdf_empty = cudf.from_pandas(pdf_empty) - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - [pdf4, pdf5, pdf6, pdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = cudf.concat( - [gdf4, gdf5, gdf6, gdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - assert_eq( - expected, - actual, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "objs", - [ - [ - pd.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6]}, index=["z", "t", "k"] - ), - pd.DataFrame( - {"c": [7, 8, 9], "d": [10, 11, 12]}, index=["z", "t", "k"] - ), - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 
2, 3, 4, 5, 6, 7, 8, 9, 10], - }, - index=["z", "t", "k", "a", "b", "c", "d", "e", "f", "g"], - ), - pd.DataFrame(index=pd.Index([], dtype="str")), - ], - [ - pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), - pd.DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]}), - pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ), - pd.DataFrame(index=pd.Index([], dtype="str")), - ], - pytest.param( - [ - pd.DataFrame( - {"a": [1, 2, 3], "nb": [10, 11, 12]}, index=["Q", "W", "R"] - ), - None, - ], - ), - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("join", ["outer", "inner"]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_concat_join_no_overlapping_columns_many_and_empty2( - objs, ignore_index, sort, join, axis -): - objs_gd = [cudf.from_pandas(o) if o is not None else o for o in objs] - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - objs, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = cudf.concat( - objs_gd, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - assert_eq(expected, actual, check_index_type=False) - - -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_concat_join_no_overlapping_columns_empty_df_basic( - ignore_index, sort, join, axis -): - pdf6 = pd.DataFrame( - { - "x": range(10), - "y": list(map(float, range(10))), - "z": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - } - ) - pdf_empty = pd.DataFrame() - - gdf6 = cudf.from_pandas(pdf6) - gdf_empty = cudf.from_pandas(pdf_empty) - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - [pdf6, pdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - actual = cudf.concat( - [gdf6, gdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - assert_eq( - expected, - actual, - check_index_type=True, - check_column_type=False, - ) - - -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_concat_join_series(ignore_index, sort, join, axis): - s1 = cudf.Series(["a", "b", "c"]) - s2 = cudf.Series(["a", "b"]) - s3 = cudf.Series(["a", "b", "c", "d"]) - s4 = cudf.Series(dtype="str") - - ps1 = s1.to_pandas() - ps2 = s2.to_pandas() - ps3 = s3.to_pandas() - ps4 = s4.to_pandas() - - expected = pd.concat( - [ps1, ps2, ps3, ps4], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - with expect_warning_if(axis == 1): - actual = cudf.concat( - [s1, s2, s3, s4], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ) - - assert_eq( - expected, - actual, - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[200]), - pd.DataFrame([], index=[100]), - pd.DataFrame({"cat": pd.Series(["one", "two"], dtype="category")}), - ], -) -@pytest.mark.parametrize( - "other", - [ - [pd.DataFrame(), 
pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], - [ - pd.DataFrame( - {"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - ], - [ - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"k": [10]}, index=[200]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), - ], - [ - pd.DataFrame([]), - pd.DataFrame([], index=[100]), - pd.DataFrame( - {"cat": pd.Series(["two", "three"], dtype="category")} - ), - ], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("join", ["inner", "outer"]) -def test_concat_join_empty_dataframes( - request, df, other, ignore_index, join, sort -): - axis = 0 - other_pd = [df] + other - gdf = cudf.from_pandas(df) - other_gd = [gdf] + [cudf.from_pandas(o) for o in other] - - expected = pd.concat( - other_pd, ignore_index=ignore_index, axis=axis, join=join, sort=sort - ) - actual = cudf.concat( - other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort - ) - if ( - join == "outer" - and any( - isinstance(dtype, pd.CategoricalDtype) - for dtype in df.dtypes.tolist() - ) - and any( - isinstance(dtype, pd.CategoricalDtype) - for other_df in other - for dtype in other_df.dtypes.tolist() - ) - ): - request.applymarker( - pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/42840" - ) - ) - assert_eq( - expected, - actual, - check_dtype=False, - check_column_type=False, - ) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"m": [10]}, index=[200]), - pd.DataFrame([], index=[100]), - pd.DataFrame({"cat": pd.Series(["one", "two"], dtype="category")}), - ], -) -@pytest.mark.parametrize( - "other", - [ - [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], - [ - pd.DataFrame( - {"b": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame([[5, 6], [7, 8]], columns=list("CD")), - ], - [ - pd.DataFrame({"g": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"h": [10]}), - pd.DataFrame({"k": [10]}, index=[200]), - pd.DataFrame( - {"dog": pd.Series(["two", "three"], dtype="category")} - ), - ], - [ - pd.DataFrame([]), - pd.DataFrame([], index=[100]), - pd.DataFrame( - {"bird": pd.Series(["two", "three"], dtype="category")} - ), - ], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("join", ["inner", "outer"]) -@pytest.mark.parametrize("axis", [1]) -def test_concat_join_empty_dataframes_axis_1( - df, other, ignore_index, axis, join, sort -): - # no duplicate columns - other_pd = [df] + other - gdf = cudf.from_pandas(df) - other_gd = [gdf] + [cudf.from_pandas(o) for o in other] - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - other_pd, - ignore_index=ignore_index, - axis=axis, - join=join, - sort=sort, - ) - actual = cudf.concat( - other_gd, - ignore_index=ignore_index, - axis=axis, - join=join, - sort=sort, - ) - if expected.shape != df.shape: - if axis == 0: - for key, col in 
actual[actual.columns].items(): - if isinstance(expected[key].dtype, pd.CategoricalDtype): - expected[key] = expected[key].fillna("-1") - actual[key] = col.astype("str").fillna("-1") - # if not expected.empty: - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_index_type=False - if len(expected) == 0 or actual.empty - else True, - check_column_type=False, - ) - else: - # no need to fill in if axis=1 - assert_eq( - expected, - actual, - check_index_type=False, - check_column_type=False, - ) - assert_eq( - expected, actual, check_index_type=False, check_column_type=False - ) - - -def test_concat_preserve_order(): - """Ensure that order is preserved on 'inner' concatenations.""" - df = pd.DataFrame([["d", 3, 4.0], ["c", 4, 5.0]], columns=["c", "b", "a"]) - dfs = [df, df] - - assert_eq( - pd.concat(dfs, join="inner"), - cudf.concat([cudf.DataFrame(df) for df in dfs], join="inner"), - check_index_type=True, - ) - - -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("typ", [cudf.DataFrame, cudf.Series]) -def test_concat_single_object(ignore_index, typ): - """Ensure that concat on a single object does not change it.""" - obj = typ([1, 2, 3]) - assert_eq( - cudf.concat([obj], ignore_index=ignore_index, axis=0), - obj, - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "ltype", - [Decimal64Dtype(3, 1), Decimal64Dtype(7, 2), Decimal64Dtype(8, 4)], -) -@pytest.mark.parametrize( - "rtype", - [ - Decimal64Dtype(3, 2), - Decimal64Dtype(8, 4), - cudf.Decimal128Dtype(3, 2), - cudf.Decimal32Dtype(8, 4), - ], -) -def test_concat_decimal_dataframe(ltype, rtype): - gdf1 = cudf.DataFrame( - {"id": np.random.randint(0, 10, 3), "val": ["22.3", "59.5", "81.1"]} - ) - gdf2 = cudf.DataFrame( - {"id": np.random.randint(0, 10, 3), "val": ["2.35", "5.59", "8.14"]} - ) - - gdf1["val"] = gdf1["val"].astype(ltype) - gdf2["val"] = gdf2["val"].astype(rtype) - - pdf1 = gdf1.to_pandas() - pdf2 = gdf2.to_pandas() - - got = cudf.concat([gdf1, gdf2]) - expected = pd.concat([pdf1, pdf2]) - - assert_eq(expected, got, check_index_type=True) - - -@pytest.mark.parametrize("ltype", [Decimal64Dtype(4, 1), Decimal64Dtype(8, 2)]) -@pytest.mark.parametrize( - "rtype", - [ - Decimal64Dtype(4, 3), - Decimal64Dtype(10, 4), - Decimal32Dtype(8, 3), - Decimal128Dtype(18, 3), - ], -) -def test_concat_decimal_series(ltype, rtype): - gs1 = cudf.Series(["228.3", "559.5", "281.1"]).astype(ltype) - gs2 = cudf.Series(["2.345", "5.259", "8.154"]).astype(rtype) - - ps1 = gs1.to_pandas() - ps2 = gs2.to_pandas() - - got = cudf.concat([gs1, gs2]) - expected = pd.concat([ps1, ps2]) - - assert_eq(expected, got, check_index_type=True) - - -@pytest.mark.parametrize( - "df1, df2, df3, expected", - [ - ( - cudf.DataFrame( - {"val": [Decimal("42.5"), Decimal("8.7")]}, - dtype=Decimal64Dtype(5, 2), - ), - cudf.DataFrame( - {"val": [Decimal("9.23"), Decimal("-67.49")]}, - dtype=Decimal64Dtype(6, 4), - ), - cudf.DataFrame({"val": [8, -5]}, dtype="int32"), - cudf.DataFrame( - { - "val": [ - Decimal("42.5"), - Decimal("8.7"), - Decimal("9.23"), - Decimal("-67.49"), - Decimal("8"), - Decimal("-5"), - ] - }, - dtype=Decimal32Dtype(7, 4), - index=[0, 1, 0, 1, 0, 1], - ), - ), - ( - cudf.DataFrame( - {"val": [Decimal("95.2"), Decimal("23.4")]}, - dtype=Decimal64Dtype(5, 2), - ), - cudf.DataFrame({"val": [54, 509]}, dtype="uint16"), - cudf.DataFrame({"val": [24, -48]}, dtype="int32"), - cudf.DataFrame( - { - "val": [ - Decimal("95.2"), - Decimal("23.4"), - Decimal("54"), - Decimal("509"), - 
Decimal("24"), - Decimal("-48"), - ] - }, - dtype=Decimal32Dtype(5, 2), - index=[0, 1, 0, 1, 0, 1], - ), - ), - ( - cudf.DataFrame( - {"val": [Decimal("36.56"), Decimal("-59.24")]}, - dtype=Decimal64Dtype(9, 4), - ), - cudf.DataFrame({"val": [403.21, 45.13]}, dtype="float32"), - cudf.DataFrame({"val": [52.262, -49.25]}, dtype="float64"), - cudf.DataFrame( - { - "val": [ - Decimal("36.56"), - Decimal("-59.24"), - Decimal("403.21"), - Decimal("45.13"), - Decimal("52.262"), - Decimal("-49.25"), - ] - }, - dtype=Decimal32Dtype(9, 4), - index=[0, 1, 0, 1, 0, 1], - ), - ), - ( - cudf.DataFrame( - {"val": [Decimal("9563.24"), Decimal("236.633")]}, - dtype=Decimal64Dtype(9, 4), - ), - cudf.DataFrame({"val": [5393, -95832]}, dtype="int64"), - cudf.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), - cudf.DataFrame( - { - "val": [ - Decimal("9563.24"), - Decimal("236.633"), - Decimal("5393"), - Decimal("-95832"), - Decimal("-29.234"), - Decimal("-31.945"), - ] - }, - dtype=Decimal32Dtype(9, 4), - index=[0, 1, 0, 1, 0, 1], - ), - ), - ( - cudf.DataFrame( - {"val": [Decimal("95633.24"), Decimal("236.633")]}, - dtype=Decimal128Dtype(19, 4), - ), - cudf.DataFrame({"val": [5393, -95832]}, dtype="int64"), - cudf.DataFrame({"val": [-29.234, -31.945]}, dtype="float64"), - cudf.DataFrame( - { - "val": [ - Decimal("95633.24"), - Decimal("236.633"), - Decimal("5393"), - Decimal("-95832"), - Decimal("-29.234"), - Decimal("-31.945"), - ] - }, - dtype=Decimal128Dtype(19, 4), - index=[0, 1, 0, 1, 0, 1], - ), - ), - ], -) -def test_concat_decimal_numeric_dataframe(df1, df2, df3, expected): - df = cudf.concat([df1, df2, df3]) - assert_eq(df, expected, check_index_type=True) - assert_eq(df.val.dtype, expected.val.dtype) - - -@pytest.mark.parametrize( - "s1, s2, s3, expected", - [ - ( - cudf.Series( - [Decimal("32.8"), Decimal("-87.7")], dtype=Decimal64Dtype(6, 2) - ), - cudf.Series( - [Decimal("101.243"), Decimal("-92.449")], - dtype=Decimal64Dtype(9, 6), - ), - cudf.Series([94, -22], dtype="int32"), - cudf.Series( - [ - Decimal("32.8"), - Decimal("-87.7"), - Decimal("101.243"), - Decimal("-92.449"), - Decimal("94"), - Decimal("-22"), - ], - dtype=Decimal64Dtype(10, 6), - index=[0, 1, 0, 1, 0, 1], - ), - ), - ( - cudf.Series( - [Decimal("7.2"), Decimal("122.1")], dtype=Decimal64Dtype(5, 2) - ), - cudf.Series([33, 984], dtype="uint32"), - cudf.Series([593, -702], dtype="int32"), - cudf.Series( - [ - Decimal("7.2"), - Decimal("122.1"), - Decimal("33"), - Decimal("984"), - Decimal("593"), - Decimal("-702"), - ], - dtype=Decimal32Dtype(5, 2), - index=[0, 1, 0, 1, 0, 1], - ), - ), - ( - cudf.Series( - [Decimal("982.94"), Decimal("-493.626")], - dtype=Decimal64Dtype(9, 4), - ), - cudf.Series([847.98, 254.442], dtype="float32"), - cudf.Series([5299.262, -2049.25], dtype="float64"), - cudf.Series( - [ - Decimal("982.94"), - Decimal("-493.626"), - Decimal("847.98"), - Decimal("254.442"), - Decimal("5299.262"), - Decimal("-2049.25"), - ], - dtype=Decimal32Dtype(9, 4), - index=[0, 1, 0, 1, 0, 1], - ), - ), - ( - cudf.Series( - [Decimal("492.204"), Decimal("-72824.455")], - dtype=Decimal64Dtype(9, 4), - ), - cudf.Series([8438, -27462], dtype="int64"), - cudf.Series([-40.292, 49202.953], dtype="float64"), - cudf.Series( - [ - Decimal("492.204"), - Decimal("-72824.455"), - Decimal("8438"), - Decimal("-27462"), - Decimal("-40.292"), - Decimal("49202.953"), - ], - dtype=Decimal32Dtype(9, 4), - index=[0, 1, 0, 1, 0, 1], - ), - ), - ( - cudf.Series( - [Decimal("492.204"), Decimal("-72824.455")], - dtype=Decimal64Dtype(10, 4), - 
), - cudf.Series( - [Decimal("8438"), Decimal("-27462")], - dtype=Decimal32Dtype(9, 4), - ), - cudf.Series( - [Decimal("-40.292"), Decimal("49202.953")], - dtype=Decimal128Dtype(19, 4), - ), - cudf.Series( - [ - Decimal("492.204"), - Decimal("-72824.455"), - Decimal("8438"), - Decimal("-27462"), - Decimal("-40.292"), - Decimal("49202.953"), - ], - dtype=Decimal128Dtype(19, 4), - index=[0, 1, 0, 1, 0, 1], - ), - ), - ], -) -def test_concat_decimal_numeric_series(s1, s2, s3, expected): - s = cudf.concat([s1, s2, s3]) - assert_eq(s, expected, check_index_type=True) - - -@pytest.mark.parametrize( - "s1, s2, expected", - [ - ( - cudf.Series( - [Decimal("955.22"), Decimal("8.2")], dtype=Decimal64Dtype(5, 2) - ), - cudf.Series(["2007-06-12", "2006-03-14"], dtype="datetime64[s]"), - cudf.Series( - [ - "955.22", - "8.20", - "2007-06-12 00:00:00", - "2006-03-14 00:00:00", - ], - index=[0, 1, 0, 1], - ), - ), - ( - cudf.Series( - [Decimal("-52.44"), Decimal("365.22")], - dtype=Decimal64Dtype(5, 2), - ), - cudf.Series( - np.arange( - "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]" - ).astype("datetime64[s]"), - dtype="datetime64[s]", - ), - cudf.Series( - [ - "-52.44", - "365.22", - "2005-02-01 12:00:00", - "2005-02-01 13:00:00", - "2005-02-01 14:00:00", - ], - index=[0, 1, 0, 1, 2], - ), - ), - ( - cudf.Series( - [Decimal("753.0"), Decimal("94.22")], - dtype=Decimal64Dtype(5, 2), - ), - cudf.Series([np.timedelta64(111, "s"), np.timedelta64(509, "s")]), - cudf.Series( - ["753.00", "94.22", "0 days 00:01:51", "0 days 00:08:29"], - index=[0, 1, 0, 1], - ), - ), - ( - cudf.Series( - [Decimal("753.0"), Decimal("94.22")], - dtype=Decimal64Dtype(5, 2), - ), - cudf.Series( - [np.timedelta64(940252, "s"), np.timedelta64(758385, "s")] - ), - cudf.Series( - ["753.00", "94.22", "10 days 21:10:52", "8 days 18:39:45"], - index=[0, 1, 0, 1], - ), - ), - ], -) -def test_concat_decimal_non_numeric(s1, s2, expected): - s = cudf.concat([s1, s2]) - assert_eq(s, expected, check_index_type=True) - - -@pytest.mark.parametrize( - "s1, s2, expected", - [ - ( - cudf.Series([{"a": 5}, {"c": "hello"}, {"b": 7}]), - cudf.Series([{"a": 5, "c": "hello", "b": 7}]), - cudf.Series( - [ - {"a": 5, "b": None, "c": None}, - {"a": None, "b": None, "c": "hello"}, - {"a": None, "b": 7, "c": None}, - {"a": 5, "b": 7, "c": "hello"}, - ], - index=[0, 1, 2, 0], - ), - ) - ], -) -def test_concat_struct_column(s1, s2, expected): - s = cudf.concat([s1, s2]) - assert_eq(s, expected, check_index_type=True) - - -@pytest.mark.parametrize( - "frame1, frame2, expected", - [ - ( - cudf.Series([[{"b": 0}], [{"b": 1}], [{"b": 3}]]), - cudf.Series([[{"b": 10}], [{"b": 12}], None]), - cudf.Series( - [ - [{"b": 0}], - [{"b": 1}], - [{"b": 3}], - [{"b": 10}], - [{"b": 12}], - None, - ], - index=[0, 1, 2, 0, 1, 2], - ), - ), - ( - cudf.DataFrame({"a": [[{"b": 0}], [{"b": 1}], [{"b": 3}]]}), - cudf.DataFrame({"a": [[{"b": 10}], [{"b": 12}], None]}), - cudf.DataFrame( - { - "a": [ - [{"b": 0}], - [{"b": 1}], - [{"b": 3}], - [{"b": 10}], - [{"b": 12}], - None, - ] - }, - index=[0, 1, 2, 0, 1, 2], - ), - ), - ], -) -def test_concat_list_column(frame1, frame2, expected): - actual = cudf.concat([frame1, frame2]) - assert_eq(actual, expected, check_index_type=True) - - -def test_concat_categorical_ordering(): - # https://github.com/rapidsai/cudf/issues/11486 - sr = pd.Series( - ["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"], dtype="category" - ) - sr = sr.cat.set_categories(["d", "a", "b", "c", "e"]) - - df = pd.DataFrame({"a": sr}) - gdf = 
cudf.from_pandas(df) - - expect = pd.concat([df, df, df]) - got = cudf.concat([gdf, gdf, gdf]) - - assert_eq(expect, got) - - -@pytest.fixture(params=["rangeindex", "index"]) -def singleton_concat_index(request): - if request.param == "rangeindex": - return pd.RangeIndex(0, 4) - else: - return pd.Index(["a", "h", "g", "f"]) - - -@pytest.fixture(params=["dataframe", "series"]) -def singleton_concat_obj(request, singleton_concat_index): - if request.param == "dataframe": - return pd.DataFrame( - { - "b": [1, 2, 3, 4], - "d": [7, 8, 9, 10], - "a": [4, 5, 6, 7], - "c": [10, 11, 12, 13], - }, - index=singleton_concat_index, - ) - else: - return pd.Series([4, 5, 5, 6], index=singleton_concat_index) - - -@pytest.mark.parametrize("axis", [0, 1, "columns", "index"]) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("ignore_index", [False, True]) -def test_concat_singleton_sorting( - axis, sort, ignore_index, singleton_concat_obj -): - gobj = cudf.from_pandas(singleton_concat_obj) - gconcat = cudf.concat( - [gobj], axis=axis, sort=sort, ignore_index=ignore_index - ) - pconcat = pd.concat( - [singleton_concat_obj], axis=axis, sort=sort, ignore_index=ignore_index - ) - assert_eq(pconcat, gconcat) - - -@pytest.mark.parametrize("axis", [2, "invalid"]) -def test_concat_invalid_axis(axis): - s = cudf.Series([1, 2, 3]) - with pytest.raises(ValueError): - cudf.concat([s], axis=axis) - - -@pytest.mark.parametrize( - "s1,s2", - [ - ([1, 2], [[1, 2], [3, 4]]), - ], -) -def test_concat_mixed_list_types_error(s1, s2): - s1, s2 = cudf.Series(s1), cudf.Series(s2) - - with pytest.raises(NotImplementedError): - cudf.concat([s1, s2], ignore_index=True) - - -@pytest.mark.parametrize( - "axis", - [ - pytest.param( - 0, - marks=pytest.mark.xfail( - reason="concat dictionaries with axis=0 not implemented" - ), - ), - 1, - "columns", - ], -) -@pytest.mark.parametrize( - "d", - [ - {"first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}})}, - { - "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), - "second": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}), - "third": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), - }, - { - "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), - "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), - "third": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}), - }, - { - "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), - "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}), - "third": (cudf.DataFrame, {"data": {"A": [5, 6], "C": [7, 8]}}), - "fourth": (cudf.DataFrame, {"data": {"B": [9, 10]}}), - }, - pytest.param( - { - "first": (cudf.DataFrame, {"data": {2.0: [1, 1]}}), - "second": (cudf.DataFrame, {"data": {"test": ["abc", "def"]}}), - }, - marks=pytest.mark.xfail( - reason=( - "Cannot construct a MultiIndex column with multiple " - "label types in cuDF at this time. You must convert " - "the labels to the same type." - ) - ), - ), - { - "first": (cudf.Series, {"data": [1, 2, 3]}), - "second": (cudf.Series, {"data": [4, 5, 6]}), - }, - { - "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}), - "second": (cudf.Series, {"data": [5, 6], "name": "C"}), - }, - pytest.param( - { - "first": ( - cudf.DataFrame, - {"data": {("A", "B"): [1, 2], "C": [3, 4]}}, - ), - "second": ( - cudf.DataFrame, - {"data": {"D": [5, 6], ("A", "B"): [7, 8]}}, - ), - }, - marks=pytest.mark.xfail( - reason=( - "Cannot construct a MultiIndex column with multiple " - "label types in cuDF at this time. 
You must convert " - "the labels to the same type." - ) - ), - ), - pytest.param( - { - "first": ( - cudf.DataFrame, - {"data": {("A", "B"): [3, 4], 2.0: [1, 1]}}, - ), - "second": ( - cudf.DataFrame, - {"data": {("C", "D"): [3, 4], 3.0: [5, 6]}}, - ), - }, - marks=pytest.mark.xfail( - reason=( - "Cannot construct a MultiIndex column with multiple " - "label types in cuDF at this time. You must convert " - "the labels to the same type." - ) - ), - ), - { - "first": ( - cudf.DataFrame, - {"data": {(1, 2): [1, 2], (3, 4): [3, 4]}}, - ), - "second": ( - cudf.DataFrame, - {"data": {(1, 2): [5, 6], (5, 6): [7, 8]}}, - ), - }, - ], -) -def test_concat_dictionary(d, axis): - _dict = {k: c(**v) for k, (c, v) in d.items()} - result = cudf.concat(_dict, axis=axis) - expected = cudf.from_pandas( - pd.concat({k: df.to_pandas() for k, df in _dict.items()}, axis=axis) - ) - assert_eq(expected, result) - - -@pytest.mark.parametrize( - "d", - [ - {"first": cudf.Index([1, 2, 3])}, - { - "first": cudf.MultiIndex( - levels=[[1, 2], ["blue", "red"]], - codes=[[0, 0, 1, 1], [1, 0, 1, 0]], - ) - }, - {"first": cudf.CategoricalIndex([1, 2, 3])}, - ], -) -def test_concat_dict_incorrect_type_index(d): - with pytest.raises( - TypeError, - match="cannot concatenate a dictionary containing indices", - ): - cudf.concat(d, axis=1) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py deleted file mode 100644 index fe86df99d35..00000000000 --- a/python/cudf/cudf/tests/test_contains.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import datetime - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import Series -from cudf.core.index import Index, RangeIndex -from cudf.testing import assert_eq -from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES - - -def cudf_date_series(start, stop, freq): - return Series(pd.date_range(start, stop, freq=freq, name="times")) - - -def cudf_num_series(start, stop, step=1): - return Series(range(start, stop, step)) - - -def get_categorical_series(): - return Series( - pd.Categorical( - ["ab", "ac", "cd", "ab", "cd"], categories=["ab", "ac", "cd"] - ) - ) - - -def get_string_series(): - return Series(["ab", "ac", "ba", "cc", "ad"]) - - -# If the type being searched is different from type of series, exceptions -# are thrown well within the python code, and needs to be handled. -# Some of the test cases check this scenario. 
Example : String Vs Numerical -testdata_all = [ - ( - cudf_date_series("20010101", "20020215", freq="400h"), - datetime.datetime.strptime("2001-01-01", "%Y-%m-%d"), - True, - ), - ( - cudf_date_series("20010101", "20020215", freq="400h"), - datetime.datetime.strptime("2000-01-01", "%Y-%m-%d"), - False, - ), - (cudf_date_series("20010101", "20020215", freq="400h"), 20000101, False), - (get_categorical_series(), "cd", True), - (get_categorical_series(), "dc", False), - (get_categorical_series(), "c", False), - (get_categorical_series(), "c", False), - (get_categorical_series(), 1, False), - (get_string_series(), "ac", True), - (get_string_series(), "ca", False), - (get_string_series(), "c", False), - (get_string_series(), 97, False), - (cudf_num_series(0, 100, 5), 60, True), - (cudf_num_series(0, 100, 5), 71, False), - (cudf_num_series(0, 100, 5), "a", False), -] - - -@pytest.mark.parametrize("values, item, expected", testdata_all) -def test_series_contains(values, item, expected): - assert_eq(expected, item in Series(index=values)) - - -@pytest.mark.parametrize("values, item, expected", testdata_all) -def test_index_contains(values, item, expected): - index = Index(values) - assert_eq(expected, item in index) - - -def test_rangeindex_contains(): - assert_eq(True, 9 in RangeIndex(start=0, stop=10, name="Index")) - assert_eq(False, 10 in RangeIndex(start=0, stop=10, name="Index")) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -def test_lists_contains(dtype): - dtype = cudf.dtype(dtype) - inner_data = np.array([1, 2, 3], dtype=dtype) - - data = Series([inner_data]) - - contained_scalar = inner_data.dtype.type(2) - not_contained_scalar = inner_data.dtype.type(42) - - assert data.list.contains(contained_scalar)[0] - assert not data.list.contains(not_contained_scalar)[0] - - -@pytest.mark.parametrize("dtype", DATETIME_TYPES + TIMEDELTA_TYPES) -def test_lists_contains_datetime(dtype): - dtype = cudf.dtype(dtype) - inner_data = np.array([1, 2, 3]) - - unit, _ = np.datetime_data(dtype) - - data = Series([inner_data]) - - contained_scalar = inner_data.dtype.type(2) - not_contained_scalar = inner_data.dtype.type(42) - - assert data.list.contains(contained_scalar)[0] - assert not data.list.contains(not_contained_scalar)[0] - - -def test_lists_contains_bool(): - data = Series([[True, True, True]]) - - contained_scalar = True - not_contained_scalar = False - - assert data.list.contains(contained_scalar)[0] - assert not data.list.contains(not_contained_scalar)[0] diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py deleted file mode 100644 index 9b6f82ec705..00000000000 --- a/python/cudf/cudf/tests/test_copying.py +++ /dev/null @@ -1,438 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import Series -from cudf.core.buffer.spill_manager import get_global_manager -from cudf.testing import assert_eq -from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES - -pytestmark = pytest.mark.spilling - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES) -def test_repeat(dtype): - arr = np.random.rand(10) * 10 - repeats = np.random.randint(10, size=10) - psr = pd.Series(arr).astype(dtype) - gsr = cudf.from_pandas(psr) - - assert_eq(psr.repeat(repeats), gsr.repeat(repeats)) - - -def test_repeat_index(): - arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]] - psr = pd.MultiIndex.from_arrays(arrays, names=("number", "color")) - gsr = cudf.from_pandas(psr) - repeats = np.random.randint(10, size=4) - - assert_eq(psr.repeat(repeats), gsr.repeat(repeats)) - - -def test_repeat_dataframe(): - psr = pd.DataFrame({"a": [1, 1, 2, 2]}) - gsr = cudf.from_pandas(psr) - repeats = np.random.randint(10, size=4) - - # pd.DataFrame doesn't have repeat() so as a workaround, we are - # comparing pd.Series.repeat() with cudf.DataFrame.repeat()['a'] - assert_eq(psr["a"].repeat(repeats), gsr.repeat(repeats)["a"]) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -def test_repeat_scalar(dtype): - arr = np.random.rand(10) * 10 - repeats = 10 - psr = pd.Series(arr).astype(dtype) - gsr = cudf.from_pandas(psr) - - assert_eq(psr.repeat(repeats), gsr.repeat(repeats)) - - -def test_null_copy(): - col = Series(np.arange(2049)) - col[:] = None - assert len(col) == 2049 - - -def test_series_setitem_cow_on(): - with cudf.option_context("copy_on_write", True): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - actual[1] = 100 - assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) - - -def test_series_setitem_cow_off(): - with cudf.option_context("copy_on_write", False): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - actual[1] = 100 - assert_eq(actual, cudf.Series([1, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 100, 3, 4, 5])) - - -def test_series_setitem_both_slice_cow_on(): - with cudf.option_context("copy_on_write", True): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - actual[slice(0, 2, 1)] = 100 - assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 3, 4, 5])) - - new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) - - -def test_series_setitem_both_slice_cow_off(): - with cudf.option_context("copy_on_write", False): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - actual[slice(0, 2, 1)] = 100 - assert_eq(actual, cudf.Series([100, 100, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([100, 100, 3, 4, 5])) - - new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([100, 100, 300, 300, 5])) - assert_eq(new_copy, cudf.Series([100, 100, 300, 300, 5])) - - -def test_series_setitem_partial_slice_cow_on(): - with cudf.option_context("copy_on_write", True): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) - - new_slice = actual[2:] - assert ( - new_slice._column.base_data.owner == actual._column.base_data.owner - ) - 
new_slice[0:2] = 10 - assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) - assert_eq(actual, cudf.Series([1, 2, 3, 4, 5])) - - -def test_series_setitem_partial_slice_cow_off(): - with cudf.option_context("copy_on_write", False): - actual = cudf.Series([1, 2, 3, 4, 5]) - new_copy = actual.copy(deep=False) - - new_copy[slice(2, 4, 1)] = 300 - assert_eq(actual, cudf.Series([1, 2, 300, 300, 5])) - assert_eq(new_copy, cudf.Series([1, 2, 300, 300, 5])) - - new_slice = actual[2:] - # Since COW is off, a slice should point to the same memory - ptr1 = new_slice._column.base_data.get_ptr(mode="read") - ptr2 = actual._column.base_data.get_ptr(mode="read") - assert ptr1 == ptr2 - - new_slice[0:2] = 10 - assert_eq(new_slice, cudf.Series([10, 10, 5], index=[2, 3, 4])) - assert_eq(actual, cudf.Series([1, 2, 10, 10, 5])) - - -def test_multiple_series_cow(): - with cudf.option_context("copy_on_write", True): - # Verify constructing, modifying, deleting - # multiple copies of a series preserves - # the data appropriately when COW is enabled. - s = cudf.Series([10, 20, 30, 40, 50]) - s1 = s.copy(deep=False) - s2 = s.copy(deep=False) - s3 = s.copy(deep=False) - s4 = s2.copy(deep=False) - s5 = s4.copy(deep=False) - s6 = s3.copy(deep=False) - - s1[0:3] = 10000 - # s1 will be unlinked from actual data in s, - # and then modified. Rest all should - # contain the original data. - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - for ser in [s, s2, s3, s4, s5, s6]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - s6[0:3] = 3000 - # s6 will be unlinked from actual data in s, - # and then modified. Rest all should - # contain the original data. - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - for ser in [s2, s3, s4, s5]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - s2[1:4] = 4000 - # s2 will be unlinked from actual data in s, - # and then modified. Rest all should - # contain the original data. - assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - for ser in [s3, s4, s5]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - s4[2:4] = 5000 - # s4 will be unlinked from actual data in s, - # and then modified. Rest all should - # contain the original data. - assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) - assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - for ser in [s3, s5]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - s5[2:4] = 6000 - # s5 will be unlinked from actual data in s, - # and then modified. Rest all should - # contain the original data. - assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) - assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) - assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - for ser in [s3]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - s7 = s5.copy(deep=False) - assert_eq(s7, cudf.Series([10, 20, 6000, 6000, 50])) - s7[1:3] = 55 - # Making a copy of s5, i.e., s7 and modifying shouldn't - # be touching/modifying data in other series. 
- assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) - assert_eq(s2, cudf.Series([10, 4000, 4000, 4000, 50])) - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - for ser in [s3]: - assert_eq(ser, cudf.Series([10, 20, 30, 40, 50])) - - # Deleting any of the following series objects - # shouldn't delete rest of the weekly referenced data - # elsewhere. - - del s2 - - assert_eq(s1, cudf.Series([10000, 10000, 10000, 40, 50])) - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) - assert_eq(s4, cudf.Series([10, 20, 5000, 5000, 50])) - assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - del s4 - del s1 - - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) - assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) - assert_eq(s6, cudf.Series([3000, 3000, 3000, 40, 50])) - assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - del s - del s6 - - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) - assert_eq(s5, cudf.Series([10, 20, 6000, 6000, 50])) - assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - del s5 - - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50])) - assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - del s3 - assert_eq(s7, cudf.Series([10, 55, 55, 6000, 50])) - - -def test_series_zero_copy_cow_on(): - with cudf.option_context("copy_on_write", True): - s = cudf.Series([1, 2, 3, 4, 5]) - s1 = s.copy(deep=False) - cp_array = cp.asarray(s) - - # Ensure all original data & zero-copied - # data is same. - assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) - - cp_array[0:3] = 10 - # Modifying a zero-copied array should only - # modify `s` and will leave rest of the copies - # untouched. - - assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) - - s2 = cudf.Series(cp_array) - assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) - - s3 = s2.copy(deep=False) - cp_array[0] = 20 - # Modifying a zero-copied array should modify - # `s2` and `s` only. Because `cp_array` - # is zero-copy shared with `s` & `s2`. - - assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) - assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(s3, cudf.Series([10, 10, 10, 4, 5])) - - s4 = cudf.Series([10, 20, 30, 40, 50]) - s5 = cudf.Series(s4) - assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) - s5[0:2] = 1 - # Modifying `s5` should also modify `s4` - # because they are zero-copied. - assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) - assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) - - -def test_series_zero_copy_cow_off(): - is_spill_enabled = get_global_manager() is not None - - with cudf.option_context("copy_on_write", False): - s = cudf.Series([1, 2, 3, 4, 5]) - s1 = s.copy(deep=False) - cp_array = cp.asarray(s) - - # Ensure all original data & zero-copied - # data is same. - assert_eq(s, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(s1, cudf.Series([1, 2, 3, 4, 5])) - assert_eq(cp_array, cp.array([1, 2, 3, 4, 5])) - - cp_array[0:3] = 10 - # When COW is off, modifying a zero-copied array - # will need to modify `s` & `s1` since they are - # shallow copied. 
- - assert_eq(s, cudf.Series([10, 10, 10, 4, 5])) - assert_eq(s1, cudf.Series([10, 10, 10, 4, 5])) - assert_eq(cp_array, cp.array([10, 10, 10, 4, 5])) - - s2 = cudf.Series(cp_array) - assert_eq(s2, cudf.Series([10, 10, 10, 4, 5])) - s3 = s2.copy(deep=False) - cp_array[0] = 20 - - # Modifying `cp_array`, will propagate the changes - # across all Series objects, because they are - # either shallow copied or zero-copied. - - assert_eq(s, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(s1, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(cp_array, cp.array([20, 10, 10, 4, 5])) - if not is_spill_enabled: - # Since spilling might make a copy of the data, we cannot - # expect the two series to be a zero-copy of the cupy array - # when spilling is enabled globally. - assert_eq(s2, cudf.Series([20, 10, 10, 4, 5])) - assert_eq(s3, cudf.Series([20, 10, 10, 4, 5])) - - s4 = cudf.Series([10, 20, 30, 40, 50]) - s5 = cudf.Series(s4) - assert_eq(s5, cudf.Series([10, 20, 30, 40, 50])) - s5[0:2] = 1 - - # Modifying `s5` should also modify `s4` - # because they are zero-copied. - assert_eq(s5, cudf.Series([1, 1, 30, 40, 50])) - assert_eq(s4, cudf.Series([1, 1, 30, 40, 50])) - - -@pytest.mark.parametrize("copy_on_write", [True, False]) -def test_series_str_copy(copy_on_write): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", copy_on_write) - s = cudf.Series(["a", "b", "c", "d", "e"]) - s1 = s.copy(deep=True) - s2 = s.copy(deep=True) - - assert_eq(s, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) - - s[0:3] = "abc" - - assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) - assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s2, cudf.Series(["a", "b", "c", "d", "e"])) - - s2[1:4] = "xyz" - - assert_eq(s, cudf.Series(["abc", "abc", "abc", "d", "e"])) - assert_eq(s1, cudf.Series(["a", "b", "c", "d", "e"])) - assert_eq(s2, cudf.Series(["a", "xyz", "xyz", "xyz", "e"])) - cudf.set_option("copy_on_write", original_cow_setting) - - -@pytest.mark.parametrize("copy_on_write", [True, False]) -def test_series_cat_copy(copy_on_write): - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", copy_on_write) - s = cudf.Series([10, 20, 30, 40, 50], dtype="category") - s1 = s.copy(deep=True) - s2 = s1.copy(deep=True) - s3 = s1.copy(deep=True) - - s[0] = 50 - assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) - assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype="category")) - assert_eq(s2, cudf.Series([10, 20, 30, 40, 50], dtype="category")) - assert_eq(s3, cudf.Series([10, 20, 30, 40, 50], dtype="category")) - - s2[3] = 10 - s3[2:5] = 20 - assert_eq(s, cudf.Series([50, 20, 30, 40, 50], dtype=s.dtype)) - assert_eq(s1, cudf.Series([10, 20, 30, 40, 50], dtype=s.dtype)) - assert_eq(s2, cudf.Series([10, 20, 30, 10, 50], dtype=s.dtype)) - assert_eq(s3, cudf.Series([10, 20, 20, 20, 20], dtype=s.dtype)) - cudf.set_option("copy_on_write", original_cow_setting) - - -def test_dataframe_cow_slice_setitem(): - with cudf.option_context("copy_on_write", True): - df = cudf.DataFrame( - {"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]} - ) - slice_df = df[1:4] - - assert_eq( - slice_df, - cudf.DataFrame( - {"a": [11, 12, 13], "b": [30, 40, 50]}, index=[1, 2, 3] - ), - ) - - slice_df["a"][2] = 1111 - - assert_eq( - slice_df, - cudf.DataFrame( - {"a": [11, 1111, 13], "b": [30, 40, 50]}, index=[1, 2, 3] - ), - ) - assert_eq( - df, - 
cudf.DataFrame( - {"a": [10, 11, 12, 13, 14], "b": [20, 30, 40, 50, 60]} - ), - ) diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py deleted file mode 100644 index b6efc8ebd88..00000000000 --- a/python/cudf/cudf/tests/test_csv.py +++ /dev/null @@ -1,2279 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import codecs -import gzip -import os -import re -import shutil -from collections import OrderedDict -from io import BytesIO, StringIO -from pathlib import Path - -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import read_csv -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - - -def make_numeric_dataframe(nrows, dtype): - df = pd.DataFrame() - df["col1"] = np.arange(nrows, dtype=dtype) - df["col2"] = np.arange(1, 1 + nrows, dtype=dtype) - return df - - -def make_datetime_dataframe(include_non_standard=False): - df = pd.DataFrame() - df["col1"] = np.array( - [ - "31/10/2010", - "05/03/2001", - "20/10/1994", - "18/10/1990", - "1/1/1970", - "2016-04-30T01:02:03.000", - "2038-01-19 03:14:07", - ] - ) - df["col2"] = np.array( - [ - "18/04/1995", - "14 / 07 / 1994", - "07/06/2006", - "16/09/2005", - "2/2/1970", - "2007-4-30 1:6:40.000PM", - "2038-01-19 03:14:08", - ] - ) - if include_non_standard: - # Last column contains non-standard date formats - df["col3"] = np.array( - [ - "1 Jan", - "2 January 1994", - "Feb 2002", - "31-01-2000", - "1-1-1996", - "15-May-2009", - "21-Dec-3262", - ] - ) - return df - - -def make_numpy_mixed_dataframe(): - df = pd.DataFrame() - df["Integer"] = np.array([2345, 11987, 9027, 9027]) - df["Date"] = np.array( - ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"] - ) - df["Float"] = np.array([9.001, 8.343, 6, 2.781]) - df["Integer2"] = np.array([2345, 106, 2088, 789277]) - df["Category"] = np.array(["M", "F", "F", "F"]) - df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - df["Boolean"] = np.array([True, False, True, False]) - return df - - -@pytest.fixture -def pd_mixed_dataframe(): - return make_numpy_mixed_dataframe() - - -@pytest.fixture -def cudf_mixed_dataframe(): - return cudf.from_pandas(make_numpy_mixed_dataframe()) - - -def make_all_numeric_dataframe(): - df = pd.DataFrame() - - gdf_dtypes = [ - "float", - "float32", - "double", - "float64", - "int8", - "short", - "int16", - "int", - "int32", - "long", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - ] - - np_dtypes = [ - np.float32, - np.float32, - np.float64, - np.float64, - np.int8, - np.int16, - np.int16, - np.int32, - np.int32, - np.int64, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - ] - - for i in range(len(gdf_dtypes)): - df[gdf_dtypes[i]] = np.arange(10, dtype=np_dtypes[i]) - - return ( - df, - OrderedDict(zip(gdf_dtypes, gdf_dtypes)), - OrderedDict(zip(gdf_dtypes, np_dtypes)), - ) - - -def make_all_numeric_extremes_dataframe(): - # integers 0,+1,-1,min,max - # float 0.0, -0.0,+1,-1,min,max, nan, esp, espneg, tiny, [-ve values] - df, gdf_dtypes, pdf_dtypes = make_all_numeric_dataframe() - df = pd.DataFrame() - - for gdf_dtype in gdf_dtypes: - np_type = pdf_dtypes[gdf_dtype] - if np.issubdtype(np_type, np.integer): - itype = np.iinfo(np_type) - extremes = [0, +1, -1, itype.min, itype.max] - df[gdf_dtype] = np.array(extremes * 4).astype(np_type)[:20] - else: - ftype = np.finfo(np_type) - 
extremes = [ - 0.0, - -0.0, - +1, - -1, - np.nan, - -np.nan, - # ftype.min, # TODO enable after fixing truncation issue #6235 - # ftype.max, # TODO enable after fixing truncation issue #6235 - np_type(np.inf), - -np_type(np.inf), - ftype.eps, - ftype.epsneg, - ftype.tiny, - -ftype.eps, - -ftype.epsneg, - -ftype.tiny, - ] - df[gdf_dtype] = np.array(extremes * 4, dtype=np_type)[:20] - return ( - df, - gdf_dtypes, - pdf_dtypes, - ) - - -@pytest.fixture -def pandas_extreme_numeric_dataframe(): - return make_all_numeric_extremes_dataframe()[0] - - -@pytest.fixture -def cudf_extreme_numeric_dataframe(pandas_extreme_numeric_dataframe): - return cudf.from_pandas(pandas_extreme_numeric_dataframe) - - -@pytest.fixture -def path_or_buf(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_path_or_buf.csv") - df = make_numeric_dataframe(10, np.int32) - - df.to_csv(fname, index=False, header=False) - buffer = df.to_csv(index=False, header=False) - - def _make_path_or_buf(src): - if src == "filepath": - return str(fname) - if src == "pathobj": - return fname - if src == "bytes_io": - return BytesIO(buffer.encode()) - if src == "string_io": - return StringIO(buffer) - if src == "url": - return Path(fname).as_uri() - - raise ValueError("Invalid source type") - - yield _make_path_or_buf - - -dtypes = [np.float64, np.float32, np.int64, np.int32, np.uint64, np.uint32] -dtypes_dict = {"1": np.float64, "2": np.float32, "3": np.int64, "4": np.int32} -nelem = [5, 25, 100] - - -@pytest.mark.parametrize("dtype", dtypes) -@pytest.mark.parametrize("nelem", nelem) -def test_csv_reader_numeric_data(dtype, nelem, tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file1.csv") - - df = make_numeric_dataframe(nelem, dtype) - df.to_csv(fname, index=False, header=False) - - dtypes = [df[k].dtype for k in df.columns] - out = read_csv(str(fname), names=list(df.columns.values), dtype=dtypes) - - assert len(out.columns) == len(df.columns) - assert_eq(df, out) - - -@pytest.mark.parametrize("parse_dates", [["date2"], [0], ["date1", 1, "bad"]]) -def test_csv_reader_datetime(parse_dates): - df = make_datetime_dataframe(include_non_standard=True) - buffer = df.to_csv(index=False, header=False) - - gdf = read_csv( - StringIO(buffer), - names=["date1", "date2", "bad"], - parse_dates=parse_dates, - dayfirst=True, - ) - # Need to used `date_format='mixed'`, - # https://github.com/pandas-dev/pandas/issues/53355 - pdf = pd.read_csv( - StringIO(buffer), - names=["date1", "date2", "bad"], - parse_dates=parse_dates, - dayfirst=True, - date_format="mixed", - ) - - assert_eq(gdf, pdf) - - -@pytest.mark.parametrize("pandas_arg", [{"delimiter": "|"}, {"sep": "|"}]) -@pytest.mark.parametrize("cudf_arg", [{"sep": "|"}, {"delimiter": "|"}]) -def test_csv_reader_mixed_data_delimiter_sep( - tmpdir, pandas_arg, cudf_arg, pd_mixed_dataframe -): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file3.csv") - - pd_mixed_dataframe.to_csv(fname, sep="|", index=False, header=False) - - gdf1 = read_csv( - str(fname), - names=["1", "2", "3", "4", "5", "6", "7"], - dtype=[ - "int64", - "datetime64[ns]", - "float64", - "int64", - "category", - "str", - "bool", - ], - dayfirst=True, - **cudf_arg, - ) - gdf2 = read_csv( - str(fname), - names=["1", "2", "3", "4", "5", "6", "7"], - dtype=[ - "int64", - "datetime64[ns]", - "float64", - "int64", - "category", - "str", - "bool", - ], - dayfirst=True, - **pandas_arg, - ) - - pdf = pd.read_csv( - fname, - names=["1", "2", "3", "4", "5", "6", "7"], - parse_dates=[1], - dayfirst=True, - 
**pandas_arg, - ) - - assert len(gdf1.columns) == len(pdf.columns) - assert len(gdf2.columns) == len(pdf.columns) - assert_eq(gdf1, gdf2) - - -@pytest.mark.parametrize("use_list", [False, True]) -def test_csv_reader_dtype_list(use_list): - df = make_numeric_dataframe(10, dtype=np.float32) - buffer = df.to_csv(index=False, header=False) - - # PANDAS doesn't list but cudf does (treated as implied ordered dict) - # Select first column's dtype if non-list; expect the same dtype for all - if use_list: - dtypes = [df[k].dtype for k in df.columns] - else: - dtypes = df[df.columns[0]].dtype - - gdf = read_csv(StringIO(buffer), dtype=dtypes, names=df.columns) - - assert_eq(gdf, df) - - -@pytest.mark.parametrize("use_names", [False, True]) -def test_csv_reader_dtype_dict(use_names): - # Save with the column header if not explicitly specifying a list of names - df, gdf_dtypes, pdf_dtypes = make_all_numeric_dataframe() - buffer = df.to_csv(index=False, header=(not use_names)) - dtypes = df.dtypes.to_dict() - gdf_names = list(gdf_dtypes.keys()) if use_names else None - pdf_names = list(pdf_dtypes.keys()) if use_names else None - gdf = read_csv(StringIO(buffer), dtype=dtypes, names=gdf_names) - pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=pdf_names) - - assert_eq(gdf, pdf) - - -@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") -@pytest.mark.parametrize("use_names", [True, False]) -def test_csv_reader_dtype_extremes(use_names): - # Save with the column header if not explicitly specifying a list of names - df, gdf_dtypes, pdf_dtypes = make_all_numeric_extremes_dataframe() - buffer = df.to_csv(index=False, header=(not use_names)) - dtypes = df.dtypes.to_dict() - gdf_names = list(gdf_dtypes.keys()) if use_names else None - pdf_names = list(pdf_dtypes.keys()) if use_names else None - - gdf = read_csv(StringIO(buffer), dtype=dtypes, names=gdf_names) - pdf = pd.read_csv(StringIO(buffer), dtype=dtypes, names=pdf_names) - - assert_eq(gdf, pdf) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/52449", -) -def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file5.csv") - - pd_mixed_dataframe.to_csv( - fname, columns=["Integer", "Date", "Float"], index=False, header=False - ) - - # Using engine='python' to eliminate pandas warning of using python engine. 
- df_out = pd.read_csv( - fname, - names=["1", "2", "3"], - parse_dates=[1], - dayfirst=True, - skiprows=1, - skipfooter=1, - engine="python", - ) - out = read_csv( - str(fname), - names=["1", "2", "3"], - dtype=["int64", "datetime64[ns]", "float64"], - skiprows=1, - skipfooter=1, - dayfirst=True, - ) - - assert len(out.columns) == len(df_out.columns) - assert len(out) == len(df_out) - - assert_eq(df_out, out, check_dtype=False) - - -def test_csv_reader_negative_vals(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file6.csv") - - names = ["0", "1", "2"] - dtypes = ["float32", "float32", "float32"] - lines = [ - ",".join(names), - "-181.5060,-185.37000,-3", - "-127.6300,-230.54600,-9", - ] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - zero = [-181.5060, -127.6300] - one = [-185.370, -230.54600] - two = [-3, -9] - - df = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) - - np.testing.assert_allclose(zero, df["0"].to_numpy()) - np.testing.assert_allclose(one, df["1"].to_numpy()) - np.testing.assert_allclose(two, df["2"].to_numpy()) - - -def test_csv_reader_strings(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file7.csv") - - names = ["text", "int"] - dtypes = ["str", "int"] - lines = [",".join(names), "a,0", "b,0", "c,0", "d,0"] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - df = read_csv( - str(fname), - names=names, - dtype=dtypes, - skiprows=1, - decimal=".", - thousands="'", - ) - - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - assert df["text"][0] == "a" - assert df["text"][1] == "b" - assert df["text"][2] == "c" - assert df["text"][3] == "d" - - -def test_csv_reader_strings_quotechars(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file8.csv") - - names = ["text", "int"] - dtypes = ["str", "int"] - lines = [",".join(names), '"a,\n",0', '"b ""c"" d",0', "e,0", '"f,,!.,",0'] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - df = read_csv( - str(fname), - names=names, - dtype=dtypes, - skiprows=1, - quotechar='"', - quoting=1, - ) - - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - assert df["text"][0] == "a,\n" - assert df["text"][1] == 'b "c" d' - assert df["text"][2] == "e" - assert df["text"][3] == "f,,!.," - - -def test_csv_reader_usecols_int_char(tmpdir, pd_mixed_dataframe): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file10.csv") - pd_mixed_dataframe.to_csv( - fname, - columns=["Integer", "Date", "Float", "Integer2"], - index=False, - header=False, - ) - - df_out = pd.read_csv(fname, usecols=[0, 1, 3]) - out = read_csv(fname, usecols=[0, 1, 3]) - - assert len(out.columns) == len(df_out.columns) - assert len(out) == len(df_out) - assert_eq(df_out, out, check_names=False) - - -@pytest.mark.parametrize( - "buffer", - [ - "abc,ABC,abc,abcd,abc\n1,2,3,4,5\n", - "A,A,A.1,A,A.2,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a", - "A,A,A.1,,Unnamed: 4,A,A.4,A,A\n1,2,3.1,4,a.2,a,a.4,a,a", - ], -) -@pytest.mark.parametrize("mangle_dupe_cols", [True, False]) -def test_csv_reader_mangle_dupe_cols(tmpdir, buffer, mangle_dupe_cols): - # Default: mangle_dupe_cols=True - cu_df = read_csv(StringIO(buffer), mangle_dupe_cols=mangle_dupe_cols) - if mangle_dupe_cols: - pd_df = pd.read_csv(StringIO(buffer)) - else: - # Pandas does not support mangle_dupe_cols=False - head = buffer.split("\n")[0].split(",") - first_cols = 
np.unique(head, return_index=True)[1] - pd_df = pd.read_csv(StringIO(buffer), usecols=first_cols) - assert_eq(cu_df, pd_df) - - -def test_csv_reader_float_decimal(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv") - - names = ["basic_32", "basic_64", "round", "decimal_only", "precision"] - dtypes = ["float32", "float64", "float64", "float32", "float64"] - lines = [ - ";".join(names), - "1,2;1234,5678;12345;0,123;-73,98007199999998", - "3,4;3456,7890;67890;,456;1,7976931348623157e+307", - "5,6e0;0,5679e2;1,2e10;0,07e-001;0,0", - ] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - basic_32_ref = [1.2, 3.4, 5.6] - basic_64_ref = [1234.5678, 3456.7890, 56.79] - round_ref = [12345, 67890, 12000000000] - decimal_only_ref = [0.123, 0.456, 0.007] - precision_ref = [-73.98007199999998, 1.7976931348623157e307, 0.0] - - df = read_csv( - str(fname), - names=names, - dtype=dtypes, - skiprows=1, - delimiter=";", - decimal=",", - ) - - np.testing.assert_allclose(basic_32_ref, df["basic_32"].to_numpy()) - np.testing.assert_allclose(basic_64_ref, df["basic_64"].to_numpy()) - np.testing.assert_allclose(round_ref, df["round"].to_numpy()) - np.testing.assert_allclose(decimal_only_ref, df["decimal_only"].to_numpy()) - np.testing.assert_allclose(precision_ref, df["precision"].to_numpy()) - - -def test_csv_reader_NaN_values(): - names = dtypes = ["float32"] - empty_cells = '\n""\n' - default_na_cells = ( - "#N/A\n#N/A N/A\n#NA\n-1.#IND\n" - "-1.#QNAN\n-NaN\n-nan\n1.#IND\n" - "1.#QNAN\nN/A\n\nNA\nNULL\n" - "NaN\nn/a\nnan\nnull\n" - ) - custom_na_cells = "NV_NAN\nNotANumber\n" - all_cells = empty_cells + default_na_cells + custom_na_cells - custom_na_values = ["NV_NAN", "NotANumber"] - - # test default NA values. empty cells should also yield NaNs - gdf = read_csv( - StringIO(default_na_cells + empty_cells), names=names, dtype=dtypes - ) - pdf = pd.read_csv( - StringIO(default_na_cells + empty_cells), names=names, dtype=np.float32 - ) - assert_eq(pdf, gdf) - - # custom NA values - gdf = read_csv( - StringIO(all_cells), - names=names, - dtype=dtypes, - na_values=custom_na_values, - ) - pdf = pd.read_csv( - StringIO(all_cells), - names=names, - dtype=np.float32, - na_values=custom_na_values, - ) - assert_eq(pdf, gdf) - - # custom NA values - gdf = read_csv( - StringIO(empty_cells + default_na_cells + "_NAA_\n"), - names=names, - dtype=dtypes, - na_values="_NAA_", - ) - pdf = pd.read_csv( - StringIO(empty_cells + default_na_cells + "_NAA_\n"), - names=names, - dtype=np.float32, - na_values="_NAA_", - ) - assert_eq(pdf, gdf) - - # data type detection should evaluate the column to int8 (all nulls) - gdf = read_csv( - StringIO(all_cells), - header=None, - na_values=custom_na_values, - ) - assert gdf.dtypes.iloc[0] == "int8" - assert all(gdf["0"][idx] is cudf.NA for idx in range(len(gdf["0"]))) - - # data type detection should evaluate the column to object if some nulls - gdf = read_csv(StringIO(all_cells), header=None) - assert gdf.dtypes.iloc[0] == np.dtype("object") - - -def test_csv_reader_thousands(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file13.csv") - - names = dtypes = [ - "float32", - "float64", - "int32", - "int64", - "uint32", - "uint64", - ] - lines = [ - ",".join(names), - "1'234.5, 1'234.567, 1'234'567, 1'234'567'890,\ - 1'234'567, 1'234'567'890", - "12'345.6, 123'456.7, 12'345, 123'456'789, 12'345, 123'456'789", - ] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - f32_ref = [1234.5, 12345.6] - f64_ref = [1234.567, 
123456.7] - int32_ref = [1234567, 12345] - int64_ref = [1234567890, 123456789] - uint32_ref = [1234567, 12345] - uint64_ref = [1234567890, 123456789] - - df = read_csv( - str(fname), names=names, dtype=dtypes, skiprows=1, thousands="'" - ) - - np.testing.assert_allclose(f32_ref, df["float32"].to_numpy()) - np.testing.assert_allclose(f64_ref, df["float64"].to_numpy()) - np.testing.assert_allclose(int32_ref, df["int32"].to_numpy()) - np.testing.assert_allclose(int64_ref, df["int64"].to_numpy()) - np.testing.assert_allclose(uint32_ref, df["uint32"].to_numpy()) - np.testing.assert_allclose(uint64_ref, df["uint64"].to_numpy()) - - -def test_csv_reader_buffer_strings(): - names = ["text", "int"] - dtypes = ["str", "int"] - lines = [",".join(names), "a,0", "b,0", "c,0", "d,0"] - - buffer = "\n".join(lines) - - df = read_csv(StringIO(buffer), names=names, dtype=dtypes, skiprows=1) - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - assert df["text"][0] == "a" - assert df["text"][1] == "b" - assert df["text"][2] == "c" - assert df["text"][3] == "d" - - df2 = read_csv( - BytesIO(str.encode(buffer)), names=names, dtype=dtypes, skiprows=1 - ) - assert len(df2.columns) == 2 - assert df2["text"].dtype == np.dtype("object") - assert df2["int"].dtype == np.dtype("int64") - assert df2["text"][0] == "a" - assert df2["text"][1] == "b" - assert df2["text"][2] == "c" - assert df2["text"][3] == "d" - - -@pytest.mark.parametrize( - "ext, out_comp, in_comp", - [ - (".geez", "gzip", "gzip"), - (".beez", "bz2", "bz2"), - (".gz", "gzip", "infer"), - (".bz2", "bz2", "infer"), - (".beez", "bz2", np.str_("bz2")), - (".data", None, "infer"), - (".txt", None, None), - ("", None, None), - ], -) -def test_csv_reader_compression( - tmpdir, ext, out_comp, in_comp, pd_mixed_dataframe -): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_compression" + ext) - - df = pd_mixed_dataframe - df.to_csv(fname, index=False, header=False, compression=out_comp) - - gdf = read_csv(fname, names=list(df.columns.values), compression=in_comp) - pdf = pd.read_csv( - fname, names=list(df.columns.values), compression=in_comp - ) - - assert_eq(gdf, pdf) - - -@pytest.mark.parametrize( - "names, dtypes, data, trues, falses", - [ - ( - ["A", "B"], - ["bool", "bool"], - "True,True\nFalse,False\nTrue,False", - None, - None, - ), - ( - ["A", "B"], - ["int32", "int32"], - "True,1\nFalse,2\nTrue,3", - None, - None, - ), - ( - ["A", "B"], - ["int32", "int32"], - "YES,1\nno,2\nyes,3\nNo,4\nYes,5", - ["yes", "Yes", "YES"], - ["no", "NO", "No"], - ), - (["A", "B"], ["int32", "int32"], "foo,bar\nbar,foo", ["foo"], ["bar"]), - (["x", "y"], None, "True,1\nFalse,0", None, None), - ], -) -def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file11.csv") - - lines = [",".join(names), data] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - # Usage of true_values and false_values makes that column into bool type - df_out = pd.read_csv( - fname, - names=names, - skiprows=1, - dtype=(dtypes[0] if dtypes else None), - true_values=trues, - false_values=falses, - ) - - out = read_csv( - fname, - names=names, - dtype=dtypes, - skiprows=1, - true_values=trues, - false_values=falses, - ) - - assert_eq(df_out, out) - - -def test_csv_reader_bools_custom(): - names = ["text", "bool"] - dtypes = {"text": "str", "bool": "bool"} - trues = ["foo", "1"] - falses = ["bar", "0"] - lines = [ - ",".join(names), - 
"true,true", - "false,false", - "foo,foo", - "bar,bar", - "0,0", - "1,1", - ] - buffer = "\n".join(lines) - - df = read_csv( - StringIO(buffer), - names=names, - dtype=dtypes, - skiprows=1, - true_values=trues, - false_values=falses, - ) - - # Note: bool literals give parsing errors as int - # "0" and "1" give parsing errors as bool in pandas - expected = pd.read_csv( - StringIO(buffer), - names=names, - dtype=dtypes, - skiprows=1, - true_values=trues, - false_values=falses, - ) - assert_eq(df, expected, check_dtype=True) - - -def test_csv_reader_bools_NA(): - names = ["text", "int"] - dtypes = ["str", "int"] - trues = ["foo"] - falses = ["bar"] - lines = [ - ",".join(names), - "true,true", - "false,false", - "foo,foo", - "bar,bar", - "qux,qux", - ] - - buffer = "\n".join(lines) - - df = read_csv( - StringIO(buffer), - names=names, - dtype=dtypes, - skiprows=1, - true_values=trues, - false_values=falses, - ) - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - expected = pd.DataFrame( - { - "text": ["true", "false", "foo", "bar", "qux"], - "int": [1.0, 0.0, 1.0, 0.0, np.nan], - } - ) - assert_eq(df, expected) - - -def test_csv_quotednumbers(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv") - - names = ["integer", "decimal"] - dtypes = ["int32", "float32"] - lines = [ - ",".join(names), - '1,"3.14"', - '"2","300"', - '"3",10101.0101', - '4,"6.28318"', - ] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - integer_ref = [1, 2, 3, 4] - decimal_ref = [3.14, 300, 10101.0101, 6.28318] - - df1 = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) - df2 = read_csv(str(fname), names=names, dtype=dtypes, skiprows=1) - - assert len(df2.columns) == 2 - np.testing.assert_allclose(integer_ref, df1["integer"].to_numpy()) - np.testing.assert_allclose(decimal_ref, df1["decimal"].to_numpy()) - np.testing.assert_allclose(integer_ref, df2["integer"].to_numpy()) - np.testing.assert_allclose(decimal_ref, df2["decimal"].to_numpy()) - - -def test_csv_reader_nrows(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file14.csv") - - names = ["int1", "int2"] - dtypes = ["int32", "int32"] - - rows = 4000000 - read_rows = (rows * 3) // 4 - skip_rows = (rows - read_rows) // 2 - sample_skip = 1000 - - with open(str(fname), "w") as fp: - fp.write(",".join(names) + "\n") - for i in range(rows): - fp.write(str(i) + ", " + str(2 * i) + " \n") - - # with specified names - df = read_csv( - str(fname), - names=names, - dtype=dtypes, - skiprows=skip_rows + 1, - nrows=read_rows, - ) - assert df.shape == (read_rows, 2) - for row in range(0, read_rows // sample_skip, sample_skip): - assert df["int1"][row] == row + skip_rows - assert df["int2"][row] == 2 * (row + skip_rows) - assert df["int2"][read_rows - 1] == 2 * (read_rows - 1 + skip_rows) - - # with column name inference - df = read_csv( - str(fname), dtype=dtypes, skiprows=skip_rows + 1, nrows=read_rows - ) - assert df.shape == (read_rows, 2) - assert str(skip_rows) in list(df)[0] - assert str(2 * skip_rows) in list(df)[1] - for row in range(0, read_rows // sample_skip, sample_skip): - assert df[list(df)[0]][row] == row + skip_rows + 1 - assert df[list(df)[1]][row] == 2 * (row + skip_rows + 1) - assert df[list(df)[1]][read_rows - 1] == 2 * (read_rows + skip_rows) - - # nrows larger than the file - df = read_csv(str(fname), dtype=dtypes, nrows=rows * 2) - assert df.shape == (rows, 2) - for row in range(0, rows // sample_skip, sample_skip): - 
assert df["int1"][row] == row - assert df["int2"][row] == 2 * row - assert df["int2"][rows - 1] == 2 * (rows - 1) - - # nrows + skiprows larger than the file - df = read_csv( - str(fname), dtype=dtypes, nrows=read_rows, skiprows=read_rows - ) - assert df.shape == (rows - read_rows, 2) - - # nrows equal to zero - df = read_csv(str(fname), dtype=dtypes, nrows=0) - assert df.shape == (0, 2) - - # with both skipfooter and nrows - should throw - with pytest.raises(ValueError): - read_csv(str(fname), nrows=read_rows, skipfooter=1) - - -def test_csv_reader_gzip_compression_strings(tmpdir): - fnamebase = tmpdir.mkdir("gdf_csv") - fname = fnamebase.join("tmp_csvreader_file15.csv") - fnamez = fnamebase.join("tmp_csvreader_file15.csv.gz") - - names = ["text", "int"] - dtypes = ["str", "int"] - lines = [",".join(names), "a,0", "b,0", "c,0", "d,0"] - - with open(str(fname), "w") as fp: - fp.write("\n".join(lines)) - - with open(str(fname), "rb") as f_in, gzip.open(str(fnamez), "wb") as f_out: - shutil.copyfileobj(f_in, f_out) - - df = read_csv( - str(fnamez), - names=names, - dtype=dtypes, - skiprows=1, - decimal=".", - thousands="'", - compression="gzip", - ) - - assert len(df.columns) == 2 - assert df["text"].dtype == np.dtype("object") - assert df["int"].dtype == np.dtype("int64") - assert df["text"][0] == "a" - assert df["text"][1] == "b" - assert df["text"][2] == "c" - assert df["text"][3] == "d" - - -@pytest.mark.parametrize("skip_rows", [0, 2, 4]) -@pytest.mark.parametrize("header_row", [0, 2]) -def test_csv_reader_skiprows_header(skip_rows, header_row): - names = ["float_point", "integer"] - dtypes = ["float64", "int64"] - lines = [ - ",".join(names), - "1.2, 1", - "2.3, 2", - "3.4, 3", - "4.5, 4", - "5.6, 5", - "6.7, 6", - ] - buffer = "\n".join(lines) - - cu_df = read_csv( - StringIO(buffer), dtype=dtypes, skiprows=skip_rows, header=header_row - ) - pd_df = pd.read_csv( - StringIO(buffer), skiprows=skip_rows, header=header_row - ) - - assert cu_df.shape == pd_df.shape - assert list(cu_df.columns.values) == list(pd_df.columns.values) - - -def test_csv_reader_dtype_inference(): - names = ["float_point", "integer"] - lines = [ - ",".join(names), - "1.2,1", - "2.3,2", - "3.4,3", - "4.5,4", - "5.6,5", - "6.7,6", - ] - buffer = "\n".join(lines) - cu_df = read_csv(StringIO(buffer)) - pd_df = pd.read_csv(StringIO(buffer)) - - assert cu_df.shape == pd_df.shape - assert list(cu_df.columns.values) == list(pd_df.columns.values) - - -def test_csv_reader_dtype_inference_whitespace(): - names = ["float_point", "integer"] - lines = [ - ",".join(names), - " 1.2, 1", - "2.3,2 ", - " 3.4, 3", - " 4.5,4", - "5.6, 5", - " 6.7,6 ", - ] - buffer = "\n".join(lines) - cu_df = read_csv(StringIO(buffer)) - pd_df = pd.read_csv(StringIO(buffer)) - - assert cu_df.shape == pd_df.shape - assert list(cu_df.columns.values) == list(pd_df.columns.values) - - -def test_csv_reader_empty_dataframe(): - dtypes = ["float64", "int64"] - buffer = "float_point, integer" - - # should work fine with dtypes - df = read_csv(StringIO(buffer), dtype=dtypes) - assert df.shape == (0, 2) - assert all(df.dtypes == ["float64", "int64"]) - - # should default to string columns without dtypes - df = read_csv(StringIO(buffer)) - assert df.shape == (0, 2) - assert all(df.dtypes == ["object", "object"]) - - -def test_csv_reader_filenotfound(tmpdir): - fname = "non-existing-filename.csv" - - # should raise an error - with pytest.raises(FileNotFoundError): - read_csv(str(fname)) - - # should raise an error - dname = tmpdir.mkdir("gdf_csv") - with 
pytest.raises(FileNotFoundError): - read_csv(str(dname)) - - -@pytest.mark.parametrize( - "src", ["filepath", "pathobj", "bytes_io", "string_io", "url"] -) -def test_csv_reader_filepath_or_buffer(tmpdir, path_or_buf, src): - expect = pd.read_csv(path_or_buf("filepath")) - got = cudf.read_csv(path_or_buf(src)) - - assert_eq(expect, got) - - -def test_small_zip(tmpdir): - df = pd.DataFrame( - { - "a": [1997] * 2, - "b": ["Ford"] * 2, - "c": ["Super, luxurious truck"] * 2, - } - ) - - fname = tmpdir.join("small_zip_file.zip") - df.to_csv(fname, index=False) - - got = cudf.read_csv(fname) - assert_eq(df, got) - - -def test_csv_reader_carriage_return(tmpdir): - rows = 1000 - names = ["int_row", "int_double_row"] - buffer = ",".join(names) + "\r\n" - for row in range(rows): - buffer += str(row) + ", " + str(2 * row) + "\r\n" - - df = read_csv(StringIO(buffer)) - expect = cudf.DataFrame( - {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2} - ) - - assert len(df) == rows - assert_eq(expect, df) - - -def test_csv_reader_tabs(): - names = ["float_point", "integer", "date"] - lines = [ - ",".join(names), - "1.2,\t12, \t11/22/1995", - "3.4\t,\t34\t,\t 01/01/2001", - "\t 5.6,56 \t, 12/12/1970", - "\t7.8 , 78\t,06/15/2018 \t", - ] - buffer = "\n".join(lines) - - df = read_csv(StringIO(buffer), parse_dates=["date"]) - - assert df.shape == (4, 3) - - floats = [1.2, 3.4, 5.6, 7.8] - ints = [12, 34, 56, 78] - dates = [ - "1995-11-22T00:00:00.000000000", - "2001-01-01T00:00:00.000000000", - "1970-12-12T00:00:00.000000000", - "2018-06-15T00:00:00.000000000", - ] - np.testing.assert_allclose(floats, df["float_point"].to_numpy()) - np.testing.assert_allclose(ints, df["integer"].to_numpy()) - for row in range(4): - assert str(df["date"][row]) == dates[row] - - -@pytest.mark.parametrize("segment_bytes", [10000, 19999, 30001, 36000]) -def test_csv_reader_byte_range(tmpdir, segment_bytes): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file16.csv") - - names = ["int1", "int2"] - - rows = 10000 - with open(str(fname), "w") as fp: - for i in range(rows): - fp.write(str(i) + ", " + str(2 * i) + " \n") - file_size = os.stat(str(fname)).st_size - - ref_df = read_csv(str(fname), names=names).to_pandas() - - dfs = [] - for segment in range((file_size + segment_bytes - 1) // segment_bytes): - dfs.append( - read_csv( - str(fname), - names=names, - byte_range=(segment * segment_bytes, segment_bytes), - ) - ) - df = cudf.concat(dfs).to_pandas() - - assert list(df["int1"]) == list(ref_df["int1"]) - assert list(df["int2"]) == list(ref_df["int2"]) - - -def test_csv_reader_byte_range_type_corner_case(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file17.csv") - - cudf.datasets.timeseries( - start="2000-01-01", - end="2000-01-02", - dtypes={"name": str, "id": int, "x": float, "y": float}, - ).to_csv(fname, chunksize=100000) - - byte_range = (2_147_483_648, 0) - with pytest.raises(OverflowError, match="Offset is past end of file"): - cudf.read_csv(fname, byte_range=byte_range, header=None) - - -@pytest.mark.parametrize("segment_bytes", [10, 19, 31, 36]) -def test_csv_reader_byte_range_strings(segment_bytes): - names = ["strings"] - buffer = "\n".join('"' + str(x) + '"' for x in range(1, 100)) - file_size = len(buffer) - - ref_df = read_csv(StringIO(buffer), names=names).to_pandas() - - dfs = [] - for segment in range((file_size + segment_bytes - 1) // segment_bytes): - dfs.append( - read_csv( - StringIO(buffer), - names=names, - byte_range=(segment * segment_bytes, segment_bytes), - ) - ) - 
df = cudf.concat(dfs).to_pandas() - - assert list(df["strings"]) == list(ref_df["strings"]) - - -@pytest.mark.parametrize( - "header_row, skip_rows, skip_blanks", - [ - (1, 0, True), - ("infer", 2, True), - (1, 4, True), - (3, 0, False), - ("infer", 5, False), - ], -) -@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) -def test_csv_reader_blanks_and_comments( - skip_rows, header_row, skip_blanks, lineterminator -): - lines = [ - "# first comment line", - lineterminator, - "# third comment line", - "1,2,3", - "4,5,6", - "7,8,9", - lineterminator, - "# last comment line", - lineterminator, - "1,1,1", - ] - buffer = lineterminator.join(lines) - - cu_df = read_csv( - StringIO(buffer), - comment="#", - header=header_row, - skiprows=skip_rows, - skip_blank_lines=skip_blanks, - ) - pd_df = pd.read_csv( - StringIO(buffer), - comment="#", - header=header_row, - skiprows=skip_rows, - skip_blank_lines=skip_blanks, - ) - - assert cu_df.shape == pd_df.shape - assert list(cu_df.columns.values) == list(pd_df.columns.values) - - -def test_csv_reader_prefix(): - lines = ["1, 1, 1, 1"] - buffer = "\n".join(lines) - - prefix_str = "a_prefix" - df = read_csv(StringIO(buffer), header=None, prefix=prefix_str) - - column_names = list(df.columns.values) - for col in range(len(column_names)): - assert column_names[col] == prefix_str + str(col) - - -def test_csv_reader_delim_whitespace(): - buffer = "1 2 3\n4 5 6" - - # with header row - with pytest.warns(FutureWarning): - cu_df = read_csv(StringIO(buffer), delim_whitespace=True) - with expect_warning_if(PANDAS_GE_220): - pd_df = pd.read_csv(StringIO(buffer), delim_whitespace=True) - assert_eq(pd_df, cu_df) - - # without header row - with pytest.warns(FutureWarning): - cu_df = read_csv(StringIO(buffer), delim_whitespace=True, header=None) - with expect_warning_if(PANDAS_GE_220): - pd_df = pd.read_csv( - StringIO(buffer), delim_whitespace=True, header=None - ) - assert pd_df.shape == cu_df.shape - - # should raise an error if used with delimiter or sep - with pytest.raises(ValueError): - with pytest.warns(FutureWarning): - read_csv(StringIO(buffer), delim_whitespace=True, delimiter=" ") - with pytest.raises(ValueError): - with pytest.warns(FutureWarning): - read_csv(StringIO(buffer), delim_whitespace=True, sep=" ") - - -def test_csv_reader_unnamed_cols(): - # first and last columns are unnamed - buffer = ",1,2,3,\n4,5,6,7,8" - - cu_df = read_csv(StringIO(buffer)) - pd_df = pd.read_csv(StringIO(buffer)) - - assert all(pd_df.columns == cu_df.columns) - assert pd_df.shape == cu_df.shape - - -def test_csv_reader_header_quotation(): - buffer = '"1,,1","2,\n,2",3\n+4,+5,+6' - - cu_df = read_csv(StringIO(buffer)) - pd_df = pd.read_csv(StringIO(buffer)) - assert cu_df.shape == (1, 3) - assert_eq(pd_df, cu_df) - - # test cases that fail with pandas - buffer_pd_fail = '"1,one," , ",2,two" ,3\n4,5,6' - cu_df = read_csv(StringIO(buffer_pd_fail)) - assert cu_df.shape == (1, 3) - - -def test_csv_reader_oversized_byte_range(): - buffer = "a,b,c,d,e\n4,5,6,7,8" - - cu_df = read_csv(StringIO(buffer), byte_range=(0, 1024)) - pd_df = pd.read_csv(StringIO(buffer)) - - assert all(pd_df.columns == cu_df.columns) - assert pd_df.shape == cu_df.shape - - -def test_csv_reader_index_col(): - buffer = "0,1,2\n3,4,5\n6,7,8" - names = ["int1", "int2", "int3"] - - # using a column name - cu_df = read_csv(StringIO(buffer), names=names, index_col="int1") - pd_df = pd.read_csv(StringIO(buffer), names=names, index_col="int1") - assert_eq(pd_df, cu_df) - - # using a column index - cu_df = 
read_csv(StringIO(buffer), header=None, index_col=0) - pd_df = pd.read_csv(StringIO(buffer), header=None, index_col=0) - assert_eq(cu_df.index, pd_df.index) - - # using a column index with names - cu_df = read_csv(StringIO(buffer), header=None, index_col=0, names=names) - pd_df = pd.read_csv( - StringIO(buffer), header=None, index_col=0, names=names - ) - assert_eq(cu_df.index, pd_df.index) - - # passing False to avoid using a column as index (no-op in cuDF) - cu_df = read_csv(StringIO(buffer), header=None, index_col=False) - pd_df = pd.read_csv(StringIO(buffer), header=None, index_col=False) - assert_eq(cu_df.index, pd_df.index) - - -@pytest.mark.parametrize("index_name", [None, "custom name", 124]) -@pytest.mark.parametrize("index_col", [None, 0, "a"]) -def test_csv_reader_index_names(index_name, index_col): - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": [10, 11, 12]}, index=["AB", "CD", "EF"] - ) - pdf.index.name = index_name - - buffer = pdf.to_csv() - actual = cudf.read_csv(StringIO(buffer), index_col=index_col) - expected = pd.read_csv(StringIO(buffer), index_col=index_col) - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "names", [["a", "b", "c"], [416, 905, 647], range(3), None] -) -def test_csv_reader_column_names(names): - buffer = "0,1,2\n3,4,5\n6,7,8" - - df = read_csv(StringIO(buffer), names=names) - if names is None: - assert list(df) == ["0", "1", "2"] - else: - assert list(df) == list(names) - - -def test_csv_reader_repeated_column_name(): - buffer = """A,A,A.1,A,A.2,A,A.4,A,A - 1,2,3.1,4,a.2,a,a.4,a,a - 2,4,6.1,8,b.2,b,b.4,b,b""" - - # pandas and cudf to have same repeated column names - pdf = pd.read_csv(StringIO(buffer)) - gdf = cudf.read_csv(StringIO(buffer)) - assert_eq(pdf.columns, gdf.columns) - - -def test_csv_reader_bools_false_positives(tmpdir): - # values that are equal to ["True", "TRUE", "False", "FALSE"] - # when using ints to detect bool values - items = [3977, 4329, 24015, 27567] - - buffer = "\n".join(str(i) for i in items) - - df = read_csv(StringIO(buffer), header=None, dtype=["int32"]) - - np.testing.assert_array_equal(items, df["0"].to_numpy()) - - -def test_csv_reader_aligned_byte_range(tmpdir): - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file19.csv") - nelem = 1000 - - input_df = pd.DataFrame( - {"key": np.arange(0, nelem), "zeros": np.zeros(nelem)} - ) - input_df.to_csv(fname) - - df = cudf.read_csv(str(fname), byte_range=(0, 4096)) - # read_csv call above used to crash; the assert below is not crucial - assert np.count_nonzero(df["zeros"].to_pandas().values) == 0 - - -@pytest.mark.parametrize( - "pdf_dtype, gdf_dtype", - [(None, None), ("int", "hex"), ("int32", "hex32"), ("int64", "hex64")], -) -def test_csv_reader_hexadecimals(pdf_dtype, gdf_dtype): - lines = ["0x0", "-0x1000", "0xfedcba", "0xABCDEF", "0xaBcDeF"] - values = [int(hex_int, 16) for hex_int in lines] - - buffer = "\n".join(lines) - - if gdf_dtype is not None: - # require explicit `hex` dtype to parse hexadecimals - pdf = pd.DataFrame(data=values, dtype=pdf_dtype, columns=["hex_int"]) - gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) - np.testing.assert_array_equal( - pdf["hex_int"], gdf["hex_int"].to_numpy() - ) - else: - # otherwise, dtype inference returns as object (string) - pdf = pd.read_csv(StringIO(buffer), names=["hex_int"]) - gdf = read_csv(StringIO(buffer), names=["hex_int"]) - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "np_dtype, gdf_dtype", - [("int", "hex"), ("int32", "hex32"), ("int64", "hex64")], -) -def 
test_csv_reader_hexadecimal_overflow(np_dtype, gdf_dtype): - # This tests values which cause an overflow warning that will become an - # error in pandas. NumPy wraps the overflow silently up to the bounds of a - # signed int64. - lines = [ - "0x0", - "-0x1000", - "0xfedcba", - "0xABCDEF", - "0xaBcDeF", - "0x9512c20b", - "0x7fffffff", - "0x7fffffffffffffff", - "-0x8000000000000000", - ] - values = [int(hex_int, 16) for hex_int in lines] - buffer = "\n".join(lines) - - gdf = read_csv(StringIO(buffer), dtype=[gdf_dtype], names=["hex_int"]) - - expected = np.array(values).astype(np_dtype) - actual = gdf["hex_int"].to_numpy() - np.testing.assert_array_equal(expected, actual) - - -@pytest.mark.parametrize("quoting", [0, 1, 2, 3]) -def test_csv_reader_pd_consistent_quotes(quoting): - names = ["text"] - dtypes = ["str"] - lines = ['"a"', '"b ""c"" d"', '"f!\n."'] - - buffer = "\n".join(lines) - - gd_df = read_csv( - StringIO(buffer), names=names, dtype=dtypes, quoting=quoting - ) - pd_df = pd.read_csv(StringIO(buffer), names=names, quoting=quoting) - - assert_eq(pd_df, gd_df) - - -def test_read_csv_names_header_combination(): - pdf = pd.DataFrame( - { - "firstname": ["Emma", "Ava", "Sophia"], - "lastname": ["Olivia", "Isabella", "Charlotte"], - "gender": ["F", "F", "F"], - } - ) - buffer = pdf.to_csv(header=True, index=False) - names = pdf.columns - - gdf = read_csv(StringIO(buffer), names=names, header=0) - assert_eq(pdf, gdf) - - gdf = read_csv(StringIO(buffer), header=0) - assert_eq(pdf, gdf) - - gdf = read_csv(StringIO(buffer)) - assert_eq(pdf, gdf) - - -def test_csv_reader_scientific_type_detection(): - buffer = """1.,1.1,-1.1,1E1,1e1,-1e1,-1e-1,1e-1,1.1e1,1.1e-1,-1.1e-1,-1.1e1 - +1.1,1E+1,1e+1,+1e1,+1e-1,1e-1,+1.1e1,1.1e+1,+1.1e+1,+1.1e1""" - expected = [ - 1.0, - 1.1, - -1.1, - 10.0, - 10.0, - -10, - -0.1, - 0.1, - 11, - 0.11, - -0.11, - -11, - 1.1, - 10.0, - 10.0, - 10, - 0.1, - 0.1, - 11, - 11, - 11, - 11, - ] - - df = read_csv(StringIO(buffer), header=None) - - for dt in df.dtypes: - assert dt == "float64" - for col in df: - assert np.isclose(df[col][0], expected[int(col)]) - - -@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"]) -def test_csv_blank_first_row(lineterminator): - lines = ["colA,colB", "", "1, 1.1", "2, 2.2"] - buffer = lineterminator.join(lines) - - cu_df = read_csv(StringIO(buffer)) - - assert cu_df.shape == (2, 2) - assert all(cu_df.columns == ["colA", "colB"]) - - -@pytest.mark.parametrize("contents", ["", "\n"]) -def test_csv_empty_file(tmpdir, contents): - fname = tmpdir.mkdir("gdf_csv").join("test_csv_empty_file.csv") - with open(fname, "w") as f: - f.write(contents) - - col_names = ["col1", "col2", "col3", "col4"] - in_dtypes = ["int", "str", "float", "short"] - out_dtypes = ["int64", "object", "float64", "int16"] - - # Empty dataframe if no columns names specified or inferred - df = read_csv(str(fname)) - assert len(df.columns) == 0 - - # No row dataframe if columns names are specified or inferred - df = read_csv(str(fname), dtype=in_dtypes, names=col_names) - assert all(df.columns == col_names) - assert list(df.dtypes) == out_dtypes - - -@pytest.mark.parametrize("contents", ["", "\n"]) -def test_csv_empty_buffer(tmpdir, contents): - col_names = ["col1", "col2", "col3", "col4"] - in_dtypes = ["int", "str", "float", "short"] - out_dtypes = ["int64", "object", "float64", "int16"] - - # Empty dataframe if no columns names specified or inferred - df = read_csv(StringIO(contents)) - assert len(df.columns) == 0 - - # No row dataframe if columns names are specified 
or inferred - df = read_csv(StringIO(contents), dtype=in_dtypes, names=col_names) - assert all(df.columns == col_names) - assert list(df.dtypes) == out_dtypes - - -@pytest.mark.parametrize( - "dtype", [["short", "float", "int"], {"A": "short", "C": "int"}] -) -def test_csv_reader_partial_dtype(dtype): - names_df = read_csv( - StringIO("0,1,2"), - names=["A", "B", "C"], - dtype=dtype, - usecols=["A", "C"], - ) - header_df = read_csv( - StringIO('"A","B","C"\n0,1,2'), dtype=dtype, usecols=["A", "C"] - ) - - assert_eq(names_df, header_df) - assert all(names_df.dtypes == ["int16", "int64"]) - - -def test_csv_writer_file_handle(tmpdir): - df = pd.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) - gdf = cudf.from_pandas(df) - - gdf_df_fname = tmpdir.join("gdf_df_1.csv") - with open(gdf_df_fname, "w") as f: - gdf.to_csv(path_or_buf=f, index=False) - assert os.path.exists(gdf_df_fname) - - gdf2 = pd.read_csv(gdf_df_fname) - assert_eq(gdf, gdf2) - - -def test_csv_writer_file_append(tmpdir): - gdf1 = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) - gdf2 = cudf.DataFrame({"a": [4, 5, 6], "b": ["foo", "bar", "baz"]}) - - gdf_df_fname = tmpdir.join("gdf_df_append.csv") - with open(gdf_df_fname, "w") as f: - gdf1.to_csv(f, index=False) - with open(gdf_df_fname, "a") as f: - gdf2.to_csv(f, header=False, index=False) - - result = cudf.read_csv(gdf_df_fname) - expected = cudf.concat([gdf1, gdf2], ignore_index=True) - assert_eq(result, expected, check_index_type=True) - - -def test_csv_writer_buffer(tmpdir): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]}) - - buffer = BytesIO() - gdf.to_csv(buffer, index=False) - - result = cudf.read_csv(buffer) - assert_eq(result, gdf) - - -@pytest.mark.parametrize("dtype", dtypes) -@pytest.mark.parametrize("nelem", nelem) -def test_csv_writer_numeric_data(dtype, nelem, tmpdir): - pdf_df_fname = tmpdir.join("pdf_df_1.csv") - gdf_df_fname = tmpdir.join("gdf_df_1.csv") - - df = make_numeric_dataframe(nelem, dtype) - gdf = cudf.from_pandas(df) - df.to_csv(path_or_buf=pdf_df_fname, index=False, lineterminator="\n") - gdf.to_csv(path_or_buf=gdf_df_fname, index=False) - - assert os.path.exists(pdf_df_fname) - assert os.path.exists(gdf_df_fname) - - expect = pd.read_csv(pdf_df_fname) - got = pd.read_csv(gdf_df_fname) - assert_eq(expect, got) - - -def test_csv_writer_datetime_data(tmpdir): - pdf_df_fname = tmpdir.join("pdf_df_2.csv") - gdf_df_fname = tmpdir.join("gdf_df_2.csv") - - df = make_datetime_dataframe() - gdf = cudf.from_pandas(df) - df.to_csv(path_or_buf=pdf_df_fname, index=False, lineterminator="\n") - gdf.to_csv(path_or_buf=gdf_df_fname, index=False) - - assert os.path.exists(pdf_df_fname) - assert os.path.exists(gdf_df_fname) - - expect = pd.read_csv(pdf_df_fname) - got = pd.read_csv(gdf_df_fname) - assert_eq(expect, got) - - -@pytest.mark.parametrize("lineterminator", ["\r", "\n", "\t", np.str_("\n")]) -@pytest.mark.parametrize("sep", [",", "/", np.str_(",")]) -def test_csv_writer_terminator_sep(lineterminator, sep, cudf_mixed_dataframe): - df = cudf_mixed_dataframe - - buffer = BytesIO() - df.to_csv(buffer, lineterminator=lineterminator, sep=sep, index=False) - - got = read_csv(buffer, lineterminator=lineterminator, sep=sep) - assert_eq(df, got) - - -@pytest.mark.parametrize( - "lineterminator", ["\r\n", "ABC", "\t\t", np.str_("\r\n")] -) -def test_csv_writer_multichar_terminator(lineterminator, cudf_mixed_dataframe): - df = cudf_mixed_dataframe - - default_terminator_csv = StringIO() - 
df.to_csv(default_terminator_csv) - - # Need to check manually since readers don't support - # multicharacter line terminators - expected = default_terminator_csv.getvalue().replace("\n", lineterminator) - - buffer = StringIO() - df.to_csv(buffer, lineterminator=lineterminator) - got = buffer.getvalue() - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "columns", - [ - ["Date", "Float"], - ["Integer2", "Float", "Date", "Integer", "String", "Boolean"], - None, - ], -) -@pytest.mark.parametrize( - "header", [True, False, np.bool_(True), np.bool_(False)] -) -@pytest.mark.parametrize( - "index", [True, False, np.bool_(True), np.bool_(False)] -) -def test_csv_writer_column_and_header_options( - columns, header, index, pd_mixed_dataframe -): - pdf = pd_mixed_dataframe - df = cudf.from_pandas(pdf) - - cudf_buffer = BytesIO() - df.to_csv(cudf_buffer, columns=columns, header=header, index=index) - pd_buffer = BytesIO() - pdf.to_csv(pd_buffer, columns=columns, header=header, index=index) - - expected = cudf.read_csv(pd_buffer, header=0 if header else None) - got = cudf.read_csv(cudf_buffer, header=0 if header else None) - - expected_column_cnt = (1 if index else 0) + ( - len(columns) if columns else pdf.shape[1] - ) - assert_eq(expected_column_cnt, got.shape[1]) - assert_eq(expected, got) - - -def test_csv_writer_empty_columns_parameter(cudf_mixed_dataframe): - df = cudf_mixed_dataframe - write_str = df.to_csv(columns=[], index=False) - assert_eq(write_str, "\n") - - -def test_csv_writer_multiindex(tmpdir): - pdf_df_fname = tmpdir.join("pdf_df_3.csv") - gdf_df_fname = tmpdir.join("gdf_df_3.csv") - - np.random.seed(0) - gdf = cudf.DataFrame( - { - "a": np.random.randint(0, 5, 20), - "b": np.random.randint(0, 5, 20), - "c": range(20), - "d": np.random.random(20), - } - ) - gdg = gdf.groupby(["a", "b"]).mean() - pdg = gdg.to_pandas() - pdg.to_csv(pdf_df_fname) - gdg.to_csv(gdf_df_fname) - - assert os.path.exists(pdf_df_fname) - assert os.path.exists(gdf_df_fname) - - expect = pd.read_csv(pdf_df_fname) - got = pd.read_csv(gdf_df_fname) - assert_eq(expect, got) - - -@pytest.mark.parametrize("chunksize", [None, 9, 1000]) -@pytest.mark.parametrize("dtype", dtypes) -def test_csv_writer_chunksize(chunksize, dtype): - cu_df = cudf.from_pandas(make_numeric_dataframe(100, dtype)) - - buffer = BytesIO() - cu_df.to_csv(buffer, chunksize=chunksize, index=False) - - got = cudf.read_csv(buffer, dtype=[dtype]) - assert_eq(cu_df, got) - - -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame({"vals": [1, 2, 3]}), - cudf.DataFrame( - {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]} - ), - cudf.DataFrame( - {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]} - ), - ], -) -def test_to_csv_empty_filename(df): - pdf = df.to_pandas() - - actual = df.to_csv() - expected = pdf.to_csv() - - assert actual == expected - - -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame({"vals": [1, 2, 3]}), - cudf.DataFrame( - {"vals1": [1, 2, 3], "vals2": ["hello", "rapids", "cudf"]} - ), - cudf.DataFrame( - {"vals1": [None, 2.0, 3.0], "vals2": ["hello", "rapids", None]} - ), - ], -) -def test_to_csv_StringIO(df): - cudf_io = StringIO() - pandas_io = StringIO() - - pdf = df.to_pandas() - - df.to_csv(cudf_io) - pdf.to_csv(pandas_io) - - cudf_io.seek(0) - pandas_io.seek(0) - - assert cudf_io.read() == pandas_io.read() - - -def test_csv_writer_empty_dataframe(tmpdir): - df_fname = tmpdir.join("gdf_df_5.csv") - gdf = cudf.DataFrame({"float_point": [], "integer": []}) - gdf["float_point"] = 
gdf["float_point"].astype("float") - gdf["integer"] = gdf["integer"].astype("int") - - gdf.to_csv(df_fname, index=False) - - df = cudf.read_csv(df_fname) - - assert df.shape == (0, 2) - assert all(df.dtypes == ["object", "object"]) - - -def test_csv_write_chunksize_corner_case(tmpdir): - # With this num of rows and chunksize - # libcudf splits table such a way that it - # will end up creating an empty table slice - # which caused the issue 5588. - df_fname = tmpdir.join("gdf_df_17.csv") - df = cudf.DataFrame({"a": np.arange(10_000)}) - df.to_csv(df_fname, chunksize=1000, index=False) - got = cudf.read_csv(df_fname) - - assert_eq(df, got) - - -def test_csv_write_no_caller_manipulation(): - df = cudf.DataFrame({"a": [1, 2, 3]}) - df_copy = df.copy(deep=True) - _ = df.to_csv(index=True) - assert_eq(df, df_copy) - - -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame({"a": [1, 2, 3], "": [10, 20, 40]}), - cudf.DataFrame({"": [10, 20, 40], "a": [1, 2, 3]}), - cudf.DataFrame( - {"a": [1, 2, 3], "": [10, 20, 40]}, - index=cudf.Index(["a", "z", "v"], name="custom name"), - ), - ], -) -@pytest.mark.parametrize("index", [True, False]) -@pytest.mark.parametrize("columns", [["a"], [""], None]) -def test_csv_write_empty_column_name(df, index, columns): - pdf = df.to_pandas() - expected = pdf.to_csv(index=index, columns=columns) - actual = df.to_csv(index=index, columns=columns) - - assert expected == actual - - -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame(), - cudf.DataFrame(index=cudf.Index([], name="index name")), - ], -) -@pytest.mark.parametrize("index", [True, False]) -def test_csv_write_empty_dataframe(df, index): - pdf = df.to_pandas() - - expected = pdf.to_csv(index=index) - actual = df.to_csv(index=index) - - assert expected == actual - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame( - { - "a": [1, 2, 3, None], - "": ["a", "v", None, None], - None: [12, 12, 32, 44], - } - ), - pd.DataFrame( - { - np.nan: [1, 2, 3, None], - "": ["a", "v", None, None], - None: [12, 12, 32, 44], - } - ), - pd.DataFrame({"": [1, None, 3, 4]}), - pd.DataFrame({None: [1, None, 3, 4]}), - pd.DataFrame(columns=[None, "", "a", "b"]), - pd.DataFrame(columns=[None]), - pd.DataFrame(columns=[""]), - ], -) -@pytest.mark.parametrize( - "na_rep", ["", "_NA_", "---", "_____CUSTOM_NA_REP______"] -) -def test_csv_write_dataframe_na_rep(df, na_rep): - gdf = cudf.from_pandas(df) - - expected = df.to_csv(na_rep=na_rep) - actual = gdf.to_csv(na_rep=na_rep) - - assert expected == actual - - -@pytest.mark.parametrize( - "dtype", - [ - "int", - "str", - "float", - np.int32, - np.dtype("float32"), - {"a": "int32", "b": "float64", "c": "uint8"}, - int, - str, - object, - ], -) -def test_csv_reader_dtypes(dtype): - buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n4,13,114\n" - - expected = pd.read_csv(StringIO(buf), dtype=dtype) - actual = cudf.read_csv(StringIO(buf), dtype=dtype) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "dtype", ["Int64", "UInt32", {"a": "UInt64", "b": "Float64", "c": "Int32"}] -) -def test_csv_reader_nullable_dtypes(dtype): - buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n4,13,114\n" - - expected = pd.read_csv(StringIO(buf), dtype=dtype) - actual = cudf.read_csv(StringIO(buf), dtype=dtype) - - assert_eq(expected, actual.to_pandas(nullable=True)) - - -@pytest.mark.parametrize( - "dtype", sorted(list(cudf.utils.dtypes.TIMEDELTA_TYPES)) -) -def test_csv_reader_timedetla_dtypes(dtype): - buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n43432423,13342,13243214\n" - - expected = 
pd.read_csv(StringIO(buf)).astype(dtype) - actual = cudf.read_csv(StringIO(buf), dtype=dtype) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "dtype", sorted(list(cudf.utils.dtypes.DATETIME_TYPES)) -) -def test_csv_reader_datetime_dtypes(dtype): - buf = "a,b,c\n1,10,111\n2,11,112\n3,12,113\n43432423,13342,13243214\n" - - expected = pd.read_csv(StringIO(buf)).astype(dtype) - actual = cudf.read_csv(StringIO(buf), dtype=dtype) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame( - { - "a": cudf.Series([1, 2, 3, 1, 2], dtype="category"), - "b": cudf.Series(["a", "c", "a", "b", "a"], dtype="category"), - } - ), - cudf.DataFrame( - { - "a": cudf.Series([1.1, 2, 3, 1.1, 2], dtype="category"), - "b": cudf.Series( - [None, "c", None, "b", "a"], dtype="category" - ), - } - ), - cudf.DataFrame( - { - "b": cudf.Series( - [1.1, 2, 3, 1.1, 2], - dtype="category", - index=cudf.CategoricalIndex( - ["abc", "def", "ghi", "jkl", "xyz"] - ), - ) - } - ), - ], -) -def test_csv_writer_category(df): - pdf = df.to_pandas() - - expected = pdf.to_csv() - actual = df.to_csv() - - assert expected == actual - - -@pytest.mark.parametrize( - "dtype", - [ - "category", - {"a": "category", "b": "str"}, - {"b": "category"}, - {"a": "category"}, - {"a": pd.CategoricalDtype([1, 2])}, - {"b": pd.CategoricalDtype([1, 2, 3])}, - {"b": pd.CategoricalDtype(["b", "a"]), "a": "str"}, - pd.CategoricalDtype(["a", "b"]), - ], -) -def test_csv_reader_category(dtype): - df = cudf.DataFrame({"a": [1, 2, 3, None], "b": ["a", "b", None, "c"]}) - csv_buf = df.to_csv() - - actual = cudf.read_csv(StringIO(csv_buf), dtype=dtype) - expected = pd.read_csv(StringIO(csv_buf), dtype=dtype) - - assert_eq(expected, actual, check_dtype=True) - - -def test_csv_writer_datetime_sep(): - df = cudf.DataFrame( - {"a": cudf.Series([22343, 2323423, 234324234], dtype="datetime64[ns]")} - ) - df["a"] = df["a"].astype("datetime64[s]") - expected = df.to_pandas().to_csv(date_format="%Y-%m-%dT%H:%M:%SZ", sep="-") - actual = df.to_csv(sep="-") - assert expected == actual - - -def test_na_filter_empty_fields(): - test_na = "TEST_NAN" - df = pd.DataFrame({"col0": ["valid", None, "also_valid", "", test_na]}) - buffer = df.to_csv(index=False) - - pdf = pd.read_csv(StringIO(buffer), na_filter=False) - gdf = cudf.read_csv(StringIO(buffer), na_filter=False) - assert_eq(pdf, gdf) - - pdf = pd.read_csv(StringIO(buffer), keep_default_na=False) - gdf = cudf.read_csv(StringIO(buffer), keep_default_na=False) - assert_eq(pdf, gdf) - - pdf = pd.read_csv( - StringIO(buffer), keep_default_na=False, na_values=test_na - ) - gdf = cudf.read_csv( - StringIO(buffer), keep_default_na=False, na_values=test_na - ) - assert_eq(pdf, gdf) - - -def test_csv_sep_error(): - pdf = pd.DataFrame({"a": [1, 2, 3]}) - gdf = cudf.DataFrame({"a": [1, 2, 3]}) - assert_exceptions_equal( - lfunc=pdf.to_csv, - rfunc=gdf.to_csv, - lfunc_args_and_kwargs=([], {"sep": "abc"}), - rfunc_args_and_kwargs=([], {"sep": "abc"}), - ) - - assert_exceptions_equal( - lfunc=pdf.to_csv, - rfunc=gdf.to_csv, - lfunc_args_and_kwargs=([], {"sep": 1}), - rfunc_args_and_kwargs=([], {"sep": 1}), - ) - - -def test_to_csv_encoding_error(): - # TODO: Remove this test once following - # issue is fixed: https://github.com/rapidsai/cudf/issues/2957 - df = cudf.DataFrame({"a": ["你好", "test"]}) - encoding = "utf-8-sig" - error_message = ( - f"Encoding {encoding} is not supported. " - + "Currently, only utf-8 encoding is supported." 
- ) - with pytest.raises(NotImplementedError, match=re.escape(error_message)): - df.to_csv("test.csv", encoding=encoding) - - -def test_to_csv_compression_error(): - df = cudf.DataFrame({"a": ["test"]}) - compression = "snappy" - error_message = "Writing compressed csv is not currently supported in cudf" - with pytest.raises(NotImplementedError, match=re.escape(error_message)): - df.to_csv("test.csv", compression=compression) - - -def test_empty_df_no_index(): - actual = cudf.DataFrame({}) - buffer = BytesIO() - actual.to_csv(buffer, index=False) - - result = cudf.read_csv(buffer) - - assert_eq(actual, result) - - -def test_default_integer_bitwidth( - cudf_mixed_dataframe, default_integer_bitwidth -): - # Test that integer columns in csv are _inferred_ as user specified - # bitwidth - buf = BytesIO() - cudf_mixed_dataframe.to_csv(buf) - buf.seek(0) - read = cudf.read_csv(buf) - assert read["Integer"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") - assert read["Integer2"].dtype == np.dtype( - f"i{default_integer_bitwidth//8}" - ) - - -def test_default_integer_bitwidth_partial( - cudf_mixed_dataframe, default_integer_bitwidth -): - # Test that integer columns in csv are _inferred_ as user specified - # bitwidth - buf = BytesIO() - cudf_mixed_dataframe.to_csv(buf) - buf.seek(0) - read = cudf.read_csv(buf, dtype={"Integer": "int64"}) - assert read["Integer"].dtype == np.dtype("i8") - assert read["Integer2"].dtype == np.dtype( - f"i{default_integer_bitwidth//8}" - ) - - -@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") -def test_default_integer_bitwidth_extremes( - cudf_extreme_numeric_dataframe, default_integer_bitwidth -): - # Test that integer columns in csv are _inferred_ as user specified - # bitwidth - buf = BytesIO() - cudf_extreme_numeric_dataframe.to_csv(buf) - buf.seek(0) - read = cudf.read_csv(buf) - - assert read["int64"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") - assert read["long"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") - assert read["uint64"].dtype == np.dtype(f"u{default_integer_bitwidth//8}") - - -def test_default_float_bitwidth(cudf_mixed_dataframe, default_float_bitwidth): - # Test that float columns in csv are _inferred_ as user specified - # bitwidth - buf = BytesIO() - cudf_mixed_dataframe.to_csv(buf) - buf.seek(0) - read = cudf.read_csv(buf) - assert read["Float"].dtype == np.dtype(f"f{default_float_bitwidth//8}") - - -def test_default_float_bitwidth_partial(default_float_bitwidth): - # Test that float columns in csv are _inferred_ as user specified - # bitwidth - read = cudf.read_csv( - StringIO("float1,float2\n1.0,2.0\n3.0,4.0"), - dtype={"float2": "float64"}, - ) - assert read["float1"].dtype == np.dtype(f"f{default_float_bitwidth//8}") - assert read["float2"].dtype == np.dtype("f8") - - -@pytest.mark.parametrize( - "usecols,names", - [ - # selection using indices; only names of selected columns are specified - ([1, 2], ["b", "c"]), - # selection using indices; names of all columns are specified - ([1, 2], ["a", "b", "c"]), - # selection using indices; duplicates - ([2, 2], ["a", "b", "c"]), - # selection using indices; out of order - ([2, 1], ["a", "b", "c"]), - # selection using names - (["b"], ["a", "b", "c"]), - # selection using names; multiple columns - (["b", "c"], ["a", "b", "c"]), - # selection using names; duplicates - (["c", "c"], ["a", "b", "c"]), - # selection using names; out of order - (["c", "b"], ["a", "b", "c"]), - ], -) -def test_column_selection_plus_column_names(usecols, names): - lines = [ - 
"num,datetime,text", - "123,2018-11-13T12:00:00,abc", - "456,2018-11-14T12:35:01,def", - "789,2018-11-15T18:02:59,ghi", - ] - - buffer = "\n".join(lines) + "\n" - - assert_eq( - pd.read_csv(StringIO(buffer), usecols=usecols, names=names), - cudf.read_csv(StringIO(buffer), usecols=usecols, names=names), - ) - - -def test_read_compressed_BOM(tmpdir): - buffer = 'int, string\n1, "a"\n2, "b"\n3, "c"\n' - - fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file20.gz") - with gzip.open(fname, "wt", encoding="utf-8") as f: - f.write(codecs.BOM_UTF8.decode("utf-8")) - f.write(buffer) - - assert_eq(pd.read_csv(fname), cudf.read_csv(fname)) - - -def test_read_header_none_pandas_compat_column_type(): - data = "1\n2\n" - with cudf.option_context("mode.pandas_compatible", True): - result = cudf.read_csv(StringIO(data), header=None).columns - expected = pd.read_csv(StringIO(data), header=None).columns - pd.testing.assert_index_equal(result, expected, exact=True) diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py deleted file mode 100644 index dc892caba3b..00000000000 --- a/python/cudf/cudf/tests/test_cuda_apply.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -""" -Test method that apply GPU kernel to a frame. -""" - -import numpy as np -import pytest -from numba import cuda - -from cudf import DataFrame -from cudf.testing import assert_eq - - -@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) -def test_df_apply_rows(nelem): - def kernel(in1, in2, in3, out1, out2, extra1, extra2): - for i, (x, y, z) in enumerate(zip(in1, in2, in3)): - out1[i] = extra2 * x - extra1 * y - out2[i] = y - extra1 * z - - df = DataFrame() - df["in1"] = in1 = np.arange(nelem) - df["in2"] = in2 = np.arange(nelem) - df["in3"] = in3 = np.arange(nelem) - - extra1 = 2.3 - extra2 = 3.4 - - expect_out1 = extra2 * in1 - extra1 * in2 - expect_out2 = in2 - extra1 * in3 - - outdf = df.apply_rows( - kernel, - incols=["in1", "in2", "in3"], - outcols=dict(out1=np.float64, out2=np.float64), - kwargs=dict(extra1=extra1, extra2=extra2), - ) - - got_out1 = outdf["out1"].to_numpy() - got_out2 = outdf["out2"].to_numpy() - - np.testing.assert_array_almost_equal(got_out1, expect_out1) - np.testing.assert_array_almost_equal(got_out2, expect_out2) - - -@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) -@pytest.mark.parametrize("chunksize", [1, 2, 3, 4, 23]) -def test_df_apply_chunks(nelem, chunksize): - def kernel(in1, in2, in3, out1, out2, extra1, extra2): - for i, (x, y, z) in enumerate(zip(in1, in2, in3)): - out1[i] = extra2 * x - extra1 * y + z - out2[i] = i - - df = DataFrame() - df["in1"] = in1 = np.arange(nelem) - df["in2"] = in2 = np.arange(nelem) - df["in3"] = in3 = np.arange(nelem) - - extra1 = 2.3 - extra2 = 3.4 - - expect_out1 = extra2 * in1 - extra1 * in2 + in3 - expect_out2 = np.arange(len(df)) % chunksize - - outdf = df.apply_chunks( - kernel, - incols=["in1", "in2", "in3"], - outcols=dict(out1=np.float64, out2=np.int32), - kwargs=dict(extra1=extra1, extra2=extra2), - chunks=chunksize, - ) - - got_out1 = outdf["out1"] - got_out2 = outdf["out2"] - - np.testing.assert_array_almost_equal(got_out1.to_numpy(), expect_out1) - np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) - - -@pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 129]) -def test_df_apply_custom_chunks(nelem): - def kernel(in1, in2, in3, out1, out2, extra1, extra2): - for i, (x, y, z) in enumerate(zip(in1, in2, in3)): - out1[i] = extra2 * x - extra1 * y + 
z - out2[i] = i - - df = DataFrame() - df["in1"] = in1 = np.arange(nelem) - df["in2"] = in2 = np.arange(nelem) - df["in3"] = in3 = np.arange(nelem) - - chunks = [0, 7, 11, 29, 101, 777] - chunks = [c for c in chunks if c < nelem] - - extra1 = 2.3 - extra2 = 3.4 - - expect_out1 = extra2 * in1 - extra1 * in2 + in3 - expect_out2 = np.hstack( - [np.arange(e - s) for s, e in zip(chunks, chunks[1:] + [len(df)])] - ) - - outdf = df.apply_chunks( - kernel, - incols=["in1", "in2", "in3"], - outcols=dict(out1=np.float64, out2=np.int32), - kwargs=dict(extra1=extra1, extra2=extra2), - chunks=chunks, - ) - - got_out1 = outdf["out1"] - got_out2 = outdf["out2"] - - np.testing.assert_array_almost_equal(got_out1.to_numpy(), expect_out1) - np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) - - -@pytest.mark.parametrize("nelem", [1, 15, 30, 64, 128, 129]) -@pytest.mark.parametrize("blkct", [None, 1, 8]) -@pytest.mark.parametrize("tpb", [1, 8, 64]) -def test_df_apply_custom_chunks_blkct_tpb(nelem, blkct, tpb): - def kernel(in1, in2, in3, out1, out2, extra1, extra2): - for i in range(cuda.threadIdx.x, in1.size, cuda.blockDim.x): - x = in1[i] - y = in2[i] - z = in3[i] - out1[i] = extra2 * x - extra1 * y + z - out2[i] = i * cuda.blockDim.x - - df = DataFrame() - df["in1"] = in1 = np.arange(nelem) - df["in2"] = in2 = np.arange(nelem) - df["in3"] = in3 = np.arange(nelem) - - chunks = [0, 7, 11, 29, 101, 777] - chunks = [c for c in chunks if c < nelem] - - extra1 = 2.3 - extra2 = 3.4 - - expect_out1 = extra2 * in1 - extra1 * in2 + in3 - expect_out2 = np.hstack( - [ - tpb * np.arange(e - s) - for s, e in zip(chunks, chunks[1:] + [len(df)]) - ] - ) - - outdf = df.apply_chunks( - kernel, - incols=["in1", "in2", "in3"], - outcols=dict(out1=np.float64, out2=np.int32), - kwargs=dict(extra1=extra1, extra2=extra2), - chunks=chunks, - blkct=blkct, - tpb=tpb, - ) - - got_out1 = outdf["out1"] - got_out2 = outdf["out2"] - - np.testing.assert_array_almost_equal(got_out1.to_numpy(), expect_out1) - np.testing.assert_array_almost_equal(got_out2.to_numpy(), expect_out2) - - -@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 1000, 5000]) -def test_df_apply_rows_incols_mapping(nelem): - def kernel(x, y, z, out1, out2, extra1, extra2): - for i, (a, b, c) in enumerate(zip(x, y, z)): - out1[i] = extra2 * a - extra1 * b - out2[i] = b - extra1 * c - - df = DataFrame() - df["in1"] = in1 = np.arange(nelem) - df["in2"] = in2 = np.arange(nelem) - df["in3"] = in3 = np.arange(nelem) - - extra1 = 2.3 - extra2 = 3.4 - - expected_out = DataFrame() - expected_out["out1"] = extra2 * in1 - extra1 * in2 - expected_out["out2"] = in2 - extra1 * in3 - - outdf = df.apply_rows( - kernel, - incols={"in1": "x", "in2": "y", "in3": "z"}, - outcols=dict(out1=np.float64, out2=np.float64), - kwargs=dict(extra1=extra1, extra2=extra2), - ) - - assert_eq(outdf[["out1", "out2"]], expected_out) - - -@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129]) -@pytest.mark.parametrize("chunksize", [1, 2, 3, 4, 23]) -def test_df_apply_chunks_incols_mapping(nelem, chunksize): - def kernel(q, p, r, out1, out2, extra1, extra2): - for i, (a, b, c) in enumerate(zip(q, p, r)): - out1[i] = extra2 * a - extra1 * b + c - out2[i] = i - - df = DataFrame() - df["in1"] = in1 = np.arange(nelem) - df["in2"] = in2 = np.arange(nelem) - df["in3"] = in3 = np.arange(nelem) - - extra1 = 2.3 - extra2 = 3.4 - - expected_out = DataFrame() - expected_out["out1"] = extra2 * in1 - extra1 * in2 + in3 - expected_out["out2"] = np.arange(len(df)) % chunksize - - outdf = 
df.apply_chunks( - kernel, - incols={"in1": "q", "in2": "p", "in3": "r"}, - outcols=dict(out1=np.float64, out2=np.int64), - kwargs=dict(extra1=extra1, extra2=extra2), - chunks=chunksize, - ) - - assert_eq(outdf[["out1", "out2"]], expected_out) diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py deleted file mode 100644 index 29f2f46e3c7..00000000000 --- a/python/cudf/cudf/tests/test_cuda_array_interface.py +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import types -from contextlib import ExitStack as does_not_raise - -import cupy -import numba.cuda -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core.buffer.spill_manager import get_global_manager -from cudf.testing import assert_eq -from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) -@pytest.mark.parametrize("module", ["cupy", "numba"]) -def test_cuda_array_interface_interop_in(dtype, module): - np_data = np.arange(10).astype(dtype) - - expectation = does_not_raise() - if module == "cupy": - module_constructor = cupy.array - if dtype in DATETIME_TYPES: - expectation = pytest.raises(ValueError) - elif module == "numba": - module_constructor = numba.cuda.to_device - - with expectation: - module_data = module_constructor(np_data) - - pd_data = pd.Series(np_data) - # Test using a specific function for __cuda_array_interface__ here - cudf_data = cudf.Series(module_data) - - assert_eq(pd_data, cudf_data) - - gdf = cudf.DataFrame() - gdf["test"] = module_data - pd_data.name = "test" - assert_eq(pd_data, gdf["test"]) - - -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["str"] -) -@pytest.mark.parametrize("module", ["cupy", "numba"]) -def test_cuda_array_interface_interop_out(dtype, module): - expectation = does_not_raise() - if dtype == "str": - expectation = pytest.raises(AttributeError) - if module == "cupy": - module_constructor = cupy.asarray - - def to_host_function(x): - return cupy.asnumpy(x) - - elif module == "numba": - module_constructor = numba.cuda.as_cuda_array - - def to_host_function(x): - return x.copy_to_host() - - with expectation: - np_data = np.arange(10).astype(dtype) - cudf_data = cudf.Series(np_data) - assert isinstance(cudf_data.__cuda_array_interface__, dict) - - module_data = module_constructor(cudf_data) - got = to_host_function(module_data) - - expect = np_data - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES -) -@pytest.mark.parametrize("module", ["cupy", "numba"]) -def test_cuda_array_interface_interop_out_masked(dtype, module): - expectation = does_not_raise() - if module == "cupy": - pytest.skip( - "cupy doesn't support version 1 of " - "`__cuda_array_interface__` yet" - ) - module_constructor = cupy.asarray - - def to_host_function(x): - return cupy.asnumpy(x) - - elif module == "numba": - expectation = pytest.raises(NotImplementedError) - module_constructor = numba.cuda.as_cuda_array - - def to_host_function(x): - return x.copy_to_host() - - np_data = np.arange(10).astype("float64") - np_data[[0, 2, 4, 6, 8]] = np.nan - - with expectation: - cudf_data = cudf.Series(np_data).astype(dtype) - assert isinstance(cudf_data.__cuda_array_interface__, dict) - - module_data = module_constructor(cudf_data) # noqa: F841 - - -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + 
DATETIME_TYPES + TIMEDELTA_TYPES -) -@pytest.mark.parametrize("nulls", ["all", "some", "bools", "none"]) -@pytest.mark.parametrize("mask_type", ["bits", "bools"]) -def test_cuda_array_interface_as_column(dtype, nulls, mask_type): - sr = cudf.Series(np.arange(10)) - - if nulls == "some": - mask = [ - True, - False, - True, - False, - False, - True, - True, - False, - True, - True, - ] - sr[sr[~np.asarray(mask)]] = None - elif nulls == "all": - sr[:] = None - - sr = sr.astype(dtype) - - obj = types.SimpleNamespace( - __cuda_array_interface__=sr.__cuda_array_interface__ - ) - - if mask_type == "bools": - if nulls == "some": - obj.__cuda_array_interface__["mask"] = numba.cuda.to_device(mask) - elif nulls == "all": - obj.__cuda_array_interface__["mask"] = numba.cuda.to_device( - [False] * 10 - ) - - expect = sr - got = cudf.Series(obj) - - assert_eq(expect, got) - - -def test_column_from_ephemeral_cupy(): - # Test that we keep a reference to the ephemeral - # CuPy array. If we didn't, then `a` would end - # up referring to the same memory as `b` due to - # CuPy's caching allocator - a = cudf.Series(cupy.asarray([1, 2, 3])) - b = cudf.Series(cupy.asarray([1, 1, 1])) - assert_eq(pd.Series([1, 2, 3]), a) - assert_eq(pd.Series([1, 1, 1]), b) - - -def test_column_from_ephemeral_cupy_try_lose_reference(): - # Try to lose the reference we keep to the ephemeral - # CuPy array - a = cudf.Series(cupy.asarray([1, 2, 3]))._column - a = cudf.core.column.as_column(a) - b = cupy.asarray([1, 1, 1]) # noqa: F841 - assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) - - a = cudf.Series(cupy.asarray([1, 2, 3]))._column - a.name = "b" - b = cupy.asarray([1, 1, 1]) # noqa: F841 - assert_eq(pd.Index([1, 2, 3]), a.to_pandas()) - - -@pytest.mark.xfail( - get_global_manager() is not None, - reason=( - "spilling doesn't support PyTorch, see " - "`cudf.core.buffer.spillable_buffer.DelayedPointerTuple`" - ), -) -def test_cuda_array_interface_pytorch(): - torch = pytest.importorskip("torch", minversion="1.6.0") - if not torch.cuda.is_available(): - pytest.skip("need gpu version of pytorch to be installed") - - series = cudf.Series([1, -1, 10, -56]) - tensor = torch.tensor(series) - got = cudf.Series(tensor) - - assert_eq(got, series) - buffer = cudf.core.buffer.as_buffer(cupy.ones(10, dtype=np.bool_)) - tensor = torch.tensor(buffer) - got = cudf.Series(tensor, dtype=np.bool_) - - assert_eq(got, cudf.Series(buffer, dtype=np.bool_)) - - # TODO: This test fails with PyTorch 2. It appears that PyTorch - # checks that the pointer is device-accessible even when the - # size is zero. 
See - # https://github.com/pytorch/pytorch/issues/98133 - # - # index = cudf.Index([], dtype="float64") - # tensor = torch.tensor(index) - # got = cudf.Index(tensor) - # assert_eq(got, index) - - index = cudf.core.index.RangeIndex(start=0, stop=100) - tensor = torch.tensor(index) - got = cudf.Series(tensor) - - assert_eq(got, cudf.Series(index)) - - index = cudf.Index([1, 2, 8, 6]) - tensor = torch.tensor(index) - got = cudf.Index(tensor) - - assert_eq(got, index) - - str_series = cudf.Series(["a", "g"]) - - with pytest.raises(AttributeError): - str_series.__cuda_array_interface__ - - cat_series = str_series.astype("category") - - with pytest.raises(TypeError): - cat_series.__cuda_array_interface__ - - -def test_cai_after_indexing(): - df = cudf.DataFrame({"a": [1, 2, 3]}) - cai1 = df["a"].__cuda_array_interface__ - df[["a"]] - cai2 = df["a"].__cuda_array_interface__ - assert cai1 == cai2 diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py deleted file mode 100644 index 278e63f3e8b..00000000000 --- a/python/cudf/cudf/tests/test_custom_accessor.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - - -@cudf.api.extensions.register_dataframe_accessor("point") -@pd.api.extensions.register_dataframe_accessor("point") -class PointsAccessor: - def __init__(self, obj): - self._validate(obj) - self._obj = obj - - @staticmethod - def _validate(obj): - cols = obj.columns - if not all(vertex in cols for vertex in ["x", "y"]): - raise AttributeError("Must have vertices 'x', 'y'.") - - @property - def bounding_box(self): - xs, ys = self._obj["x"], self._obj["y"] - min_x, min_y, max_x, max_y = xs.min(), ys.min(), xs.max(), ys.max() - - return (min_x, min_y, max_x, max_y) - - -@pytest.mark.parametrize( - "gdf", [cudf.datasets.randomdata(nrows=6, dtypes={"x": int, "y": int})] -) -def test_dataframe_accessor(gdf): - pdf = gdf.to_pandas() - - assert_eq(gdf.point.bounding_box, pdf.point.bounding_box) - - -@pytest.mark.parametrize( - "gdf1", [cudf.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] -) -@pytest.mark.parametrize( - "gdf2", [cudf.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] -) -def test_dataframe_accessor_idendity(gdf1, gdf2): - """Test for accessor identities - - An object should hold persistent reference to the same accessor - - Different objects should hold difference instances of the accessor - """ - - assert gdf1.point is gdf1.point - assert gdf1.point is not gdf2.point - - -@pd.api.extensions.register_index_accessor("odd") -@pd.api.extensions.register_series_accessor("odd") -@cudf.api.extensions.register_index_accessor("odd") -@cudf.api.extensions.register_series_accessor("odd") -class OddRowAccessor: - def __init__(self, obj): - self._obj = obj - - def __getitem__(self, i): - return self._obj[2 * i - 1] - - -@pytest.mark.parametrize("gidx", [cudf.Index(list(range(0, 50)))]) -def test_index_accessor(gidx): - pidx = gidx.to_pandas() - - for i in range(1, 10): - assert_eq(gidx.odd[i], pidx.odd[i]) - - -@pytest.mark.parametrize("gs", [cudf.Series(list(range(1, 50)))]) -def test_series_accessor(gs): - ps = gs.to_pandas() - - for i in range(1, 10): - assert_eq(gs.odd[i], ps.odd[i]) - - -@pytest.mark.parametrize( - "gdf", [cudf.datasets.randomdata(nrows=6, dtypes={"x": int, "y": int})] -) -@pytest.mark.parametrize("gidx", [cudf.Index(list(range(1, 50)))]) -@pytest.mark.parametrize("gs", 
[cudf.Series(list(range(1, 50)))]) -def test_accessor_space_separate(gdf, gidx, gs): - assert not id(gdf._accessors) == id(gidx._accessors) - assert not id(gidx._accessors) == id(gs._accessors) - assert not id(gdf._accessors) == id(gs._accessors) diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py deleted file mode 100644 index 3f31da035aa..00000000000 --- a/python/cudf/cudf/tests/test_cut.py +++ /dev/null @@ -1,310 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -""" -Test related to Cut -""" - -import numpy as np -import pandas as pd -import pytest - -from cudf.core.cut import cut -from cudf.testing import assert_eq - - -@pytest.mark.parametrize( - "x", [[1, 7, 5, 4, 6, 3], [1, 7], np.array([1, 7, 5, 4, 6, 3])] -) -@pytest.mark.parametrize("bins", [1, 2, 3]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize( - "ordered", [True] -) # if ordered is False we need labels -@pytest.mark.parametrize("precision", [1, 2, 3]) -def test_cut_basic(x, bins, right, include_lowest, ordered, precision): - # will test optional labels, retbins and duplicates separately - # they need more specific parameters to work - pcat = pd.cut( - x=x, - bins=bins, - right=right, - precision=precision, - include_lowest=include_lowest, - ordered=ordered, - ) - pindex = pd.CategoricalIndex(pcat) - gindex = cut( - x=x, - bins=bins, - right=right, - precision=precision, - include_lowest=include_lowest, - ordered=ordered, - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("x", [[1, 7, 5, 4, 6, 3]]) -@pytest.mark.parametrize("bins", [3]) # labels must be the same len as bins -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize( - "ordered", [True, False] -) # labels must be unique if ordered=True -@pytest.mark.parametrize("precision", [1, 2, 3]) -@pytest.mark.parametrize( - "labels", [["bad", "medium", "good"], ["A", "B", "C"], [1, 2, 3], False] -) -def test_cut_labels( - x, bins, right, include_lowest, ordered, precision, labels -): - pcat = pd.cut( - x=x, - bins=bins, - right=right, - labels=labels, - precision=precision, - include_lowest=include_lowest, - ordered=ordered, - ) - pindex = pd.CategoricalIndex(pcat) if labels else pcat - gindex = cut( - x=x, - bins=bins, - right=right, - labels=labels, - precision=precision, - include_lowest=include_lowest, - ordered=ordered, - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize("x", [[1, 7, 5, 4, 6, 3]]) -@pytest.mark.parametrize("bins", [3]) # labels must be the same len as bins -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize( - "ordered", [False] -) # labels must be unique if ordered=True -@pytest.mark.parametrize("precision", [1, 2, 3]) -@pytest.mark.parametrize( - "labels", [["bad", "good", "good"], ["B", "A", "B"], [1, 2, 2], False] -) -def test_cut_labels_non_unique( - x, bins, right, include_lowest, ordered, precision, labels -): - pcat = pd.cut( - x=x, - bins=bins, - right=right, - labels=labels, - precision=precision, - include_lowest=include_lowest, - ordered=ordered, - ) - pindex = pd.CategoricalIndex(pcat) if labels else pcat - gindex = cut( - x=x, - bins=bins, - right=right, - labels=labels, - precision=precision, - include_lowest=include_lowest, - ordered=ordered, - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize( - "x", - [ - [1, 7, 
5, 4, 6, 3], - [1, 7], - np.array([1, 7, 5, 4, 6, 3]), - np.array([2, 4, 6, 8, 10]), - ], -) -@pytest.mark.parametrize( - "bins", - [1, 2, 3, [1, 2, 3], [0, 2, 4, 6, 10]], -) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("precision", [3]) -def test_cut_right(x, bins, right, precision): - pcat = pd.cut( - x=x, - bins=bins, - right=right, - precision=precision, - ) - pindex = pd.CategoricalIndex(pcat) - gindex = cut( - x=x, - bins=bins, - right=right, - precision=precision, - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize( - "x", - [ - [1, 7, 5, 4, 6, 3], - [1, 7], - np.array([1, 7, 5, 4, 6, 3]), - np.array([2, 4, 6, 8, 10]), - ], -) -@pytest.mark.parametrize( - "bins", - [[0, 2, 4, 6, 10, 10], [1, 2, 2, 3, 3]], -) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize("ordered", [True]) -@pytest.mark.parametrize("precision", [1, 2, 3]) -@pytest.mark.parametrize("duplicates", ["drop"]) -def test_cut_drop_duplicates( - x, bins, right, precision, duplicates, ordered, include_lowest -): - pcat = pd.cut( - x=x, - bins=bins, - right=right, - precision=precision, - duplicates=duplicates, - include_lowest=include_lowest, - ordered=ordered, - ) - pindex = pd.CategoricalIndex(pcat) - gindex = cut( - x=x, - bins=bins, - right=right, - precision=precision, - duplicates=duplicates, - include_lowest=include_lowest, - ordered=ordered, - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize( - "x", - [ - [1, 7, 5, 4, 6, 3], - [1, 7], - np.array([1, 7, 5, 4, 6, 3]), - np.array([2, 4, 6, 8, 10]), - ], -) -@pytest.mark.parametrize( - "bins", - [[0, 2, 4, 6, 10, 10], [1, 2, 2, 3, 3]], -) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", [True, False]) -@pytest.mark.parametrize("ordered", [True]) -@pytest.mark.parametrize("precision", [1, 2, 3]) -@pytest.mark.parametrize("duplicates", ["raises"]) -def test_cut_drop_duplicates_raises( - x, bins, right, precision, duplicates, ordered, include_lowest -): - with pytest.raises(ValueError) as excgd: - cut( - x=x, - bins=bins, - right=right, - precision=precision, - duplicates=duplicates, - include_lowest=include_lowest, - ordered=ordered, - ) - with pytest.raises(ValueError) as excpd: - pd.cut( - x=x, - bins=bins, - right=right, - precision=precision, - duplicates=duplicates, - include_lowest=include_lowest, - ordered=ordered, - ) - - assert_eq(str(excgd.value), str(excpd.value)) - - -@pytest.mark.parametrize( - "x", - [ - [0, 0.5, 1.5, 2.5, 4.5], - [1, 7, 5, 4, 6, 3], - [1, 7], - np.array([1, 7, 5, 4, 6, 3]), - np.array([2, 4, 6, 8, 10]), - ], -) -@pytest.mark.parametrize( - "bins", - [pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])], -) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("precision", [1, 2, 3]) -@pytest.mark.parametrize("duplicates", ["drop", "raise"]) -def test_cut_intervalindex_bin(x, bins, right, precision, duplicates): - pcat = pd.cut( - x=x, - bins=bins, - right=right, - precision=precision, - duplicates=duplicates, - ) - pindex = pd.CategoricalIndex(pcat) - gindex = cut( - x=x, - bins=bins, - right=right, - precision=precision, - duplicates=duplicates, - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize( - "x", - [pd.Series(np.array([2, 4, 6, 8, 10]), index=["a", "b", "c", "d", "e"])], -) -@pytest.mark.parametrize("bins", [1, 2, 3]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("include_lowest", 
[True, False]) -@pytest.mark.parametrize("ordered", [True]) -@pytest.mark.parametrize("precision", [3]) -def test_cut_series(x, bins, right, include_lowest, ordered, precision): - pcat = pd.cut( - x=x, - bins=bins, - right=right, - precision=precision, - include_lowest=include_lowest, - ordered=ordered, - ) - - gcat = cut( - x=x, - bins=bins, - right=right, - precision=precision, - include_lowest=include_lowest, - ordered=ordered, - ) - - assert_eq(pcat, gcat) diff --git a/python/cudf/cudf/tests/test_dask.py b/python/cudf/cudf/tests/test_dask.py deleted file mode 100644 index 3af21b4a7ff..00000000000 --- a/python/cudf/cudf/tests/test_dask.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. - -import pytest - -import cudf - -is_dataframe_like = pytest.importorskip( - "dask.dataframe.utils" -).is_dataframe_like -is_index_like = pytest.importorskip("dask.dataframe.utils").is_index_like -is_series_like = pytest.importorskip("dask.dataframe.utils").is_series_like - - -def test_is_dataframe_like(): - df = cudf.DataFrame({"x": [1, 2, 3]}) - assert is_dataframe_like(df) - assert is_series_like(df.x) - assert is_index_like(df.index) - assert not is_dataframe_like(df.x) - assert not is_series_like(df) - assert not is_index_like(df) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py deleted file mode 100644 index 6f88d942746..00000000000 --- a/python/cudf/cudf/tests/test_dataframe.py +++ /dev/null @@ -1,11148 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import array as arr -import contextlib -import datetime -import decimal -import functools -import io -import operator -import random -import re -import string -import textwrap -import warnings -from collections import OrderedDict, defaultdict, namedtuple -from contextlib import contextmanager -from copy import copy - -import cupy -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest -from numba import cuda -from packaging import version - -import cudf -from cudf.api.extensions import no_default -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.core.buffer.spill_manager import get_global_manager -from cudf.core.column import column -from cudf.errors import MixedTypeError -from cudf.testing import _utils as utils, assert_eq, assert_neq -from cudf.testing._utils import ( - ALL_TYPES, - DATETIME_TYPES, - NUMERIC_TYPES, - assert_exceptions_equal, - does_not_raise, - expect_warning_if, - gen_rand, -) - -pytest_xfail = pytest.mark.xfail -pytestmark = pytest.mark.spilling - -# Use this to "unmark" the module level spilling mark -pytest_unmark_spilling = pytest.mark.skipif( - get_global_manager() is not None, reason="unmarked spilling" -) - -# If spilling is enabled globally, we skip many test permutations -# to reduce running time. 
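A hedged aside on the module-level setup deleted just above: the pattern of marking every test in a module and then selectively skipping expensive permutations when spilling is enabled can be sketched with plain pytest primitives. The names below (spilling_enabled, skip_under_spilling, test_expensive_permutations) are illustrative stand-ins, not identifiers from the deleted file.

    import pytest

    spilling_enabled = True  # stand-in for `get_global_manager() is not None`

    # Apply a custom mark to every test collected from this module.
    pytestmark = pytest.mark.spilling

    # Reusable decorator that skips a test when spilling is active,
    # mirroring the "unmark" helper defined in the deleted module.
    skip_under_spilling = pytest.mark.skipif(
        spilling_enabled, reason="skip expensive permutations under spilling"
    )

    @skip_under_spilling
    @pytest.mark.parametrize("dtype", ["int64", "float32"])
    def test_expensive_permutations(dtype):
        assert dtype in {"int64", "float32"}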
-if get_global_manager() is not None: - ALL_TYPES = ["float32"] # noqa: F811 - DATETIME_TYPES = ["datetime64[ms]"] # noqa: F811 - NUMERIC_TYPES = ["float32"] # noqa: F811 - # To save time, we skip tests marked "xfail" - pytest_xfail = pytest.mark.skipif - - -@contextmanager -def _hide_ufunc_warnings(eval_str): - # pandas raises warnings for some inputs to the following ufuncs: - if any( - x in eval_str - for x in { - "arctanh", - "log", - } - ): - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "invalid value encountered in", - category=RuntimeWarning, - ) - warnings.filterwarnings( - "ignore", - "divide by zero encountered in", - category=RuntimeWarning, - ) - yield - else: - yield - - -@contextmanager -def _hide_concat_empty_dtype_warning(): - with warnings.catch_warnings(): - # Ignoring warnings in this test as warnings are - # being caught and validated in other tests. - warnings.filterwarnings( - "ignore", - "The behavior of array concatenation with empty " - "entries is deprecated.", - category=FutureWarning, - ) - yield - - -def test_init_via_list_of_tuples(): - data = [ - (5, "cats", "jump", np.nan), - (2, "dogs", "dig", 7.5), - (3, "cows", "moo", -2.1, "occasionally"), - ] - - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("columns", [["a", "b"], pd.Series(["a", "b"])]) -def test_init_via_list_of_series(columns): - data = [pd.Series([1, 2]), pd.Series([3, 4])] - - pdf = cudf.DataFrame(data, columns=columns) - gdf = cudf.DataFrame(data, columns=columns) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("index", [None, [0, 1, 2]]) -def test_init_with_missing_columns(index): - """Test initialization when columns and data keys are disjoint.""" - data = {"a": [1, 2, 3], "b": [2, 3, 4]} - columns = ["c", "d"] - - pdf = cudf.DataFrame(data, columns=columns, index=index) - gdf = cudf.DataFrame(data, columns=columns, index=index) - - assert_eq(pdf, gdf) - - -def _dataframe_na_data(): - return [ - pd.DataFrame( - { - "a": [0, 1, 2, np.nan, 4, None, 6], - "b": [np.nan, None, "u", "h", "d", "a", "m"], - }, - index=["q", "w", "e", "r", "t", "y", "u"], - ), - pd.DataFrame({"a": [0, 1, 2, 3, 4], "b": ["a", "b", "u", "h", "d"]}), - pd.DataFrame( - { - "a": [None, None, np.nan, None], - "b": [np.nan, None, np.nan, None], - } - ), - pd.DataFrame({"a": []}), - pd.DataFrame({"a": [np.nan], "b": [None]}), - pd.DataFrame({"a": ["a", "b", "c", None, "e"]}), - pd.DataFrame({"a": ["a", "b", "c", "d", "e"]}), - ] - - -@pytest.mark.parametrize( - "rows", - [ - 0, - 1, - 2, - 100, - ], -) -def test_init_via_list_of_empty_tuples(rows): - data = [()] * rows - - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - assert_eq( - pdf, - gdf, - check_like=True, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "dict_of_series", - [ - {"a": pd.Series([1.0, 2.0, 3.0])}, - {"a": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6])}, - { - "a": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), - "b": pd.Series([1.0, 2.0, 4.0], index=[1, 2, 3]), - }, - {"a": [1, 2, 3], "b": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6])}, - { - "a": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]), - "b": pd.Series([1.0, 2.0, 4.0], index=["c", "d", "e"]), - }, - { - "a": pd.Series( - ["a", "b", "c"], - index=pd.MultiIndex.from_tuples([(1, 2), (1, 3), (2, 3)]), - ), - "b": pd.Series( - ["a", " b", "d"], - index=pd.MultiIndex.from_tuples([(1, 2), (1, 3), (2, 3)]), - ), - }, - ], -) -def test_init_from_series_align(dict_of_series): - pdf = 
pd.DataFrame(dict_of_series) - gdf = cudf.DataFrame(dict_of_series) - - assert_eq(pdf, gdf) - - for key in dict_of_series: - if isinstance(dict_of_series[key], pd.Series): - dict_of_series[key] = cudf.Series(dict_of_series[key]) - - gdf = cudf.DataFrame(dict_of_series) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - ("dict_of_series", "expectation"), - [ - ( - { - "a": pd.Series(["a", "b", "c"], index=[4, 4, 5]), - "b": pd.Series(["a", "b", "c"], index=[4, 5, 6]), - }, - pytest.raises( - ValueError, match="Cannot align indices with non-unique values" - ), - ), - ( - { - "a": pd.Series(["a", "b", "c"], index=[4, 4, 5]), - "b": pd.Series(["a", "b", "c"], index=[4, 4, 5]), - }, - does_not_raise(), - ), - ], -) -def test_init_from_series_align_nonunique(dict_of_series, expectation): - with expectation: - gdf = cudf.DataFrame(dict_of_series) - - if expectation == does_not_raise(): - pdf = pd.DataFrame(dict_of_series) - assert_eq(pdf, gdf) - - -def test_init_unaligned_with_index(): - pdf = pd.DataFrame( - { - "a": pd.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), - "b": pd.Series([1.0, 2.0, 3.0], index=[1, 2, 3]), - }, - index=[7, 8, 9], - ) - gdf = cudf.DataFrame( - { - "a": cudf.Series([1.0, 2.0, 3.0], index=[4, 5, 6]), - "b": cudf.Series([1.0, 2.0, 3.0], index=[1, 2, 3]), - }, - index=[7, 8, 9], - ) - - assert_eq(pdf, gdf, check_dtype=False) - - -def test_init_series_list_columns_unsort(): - pseries = [ - pd.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) - ] - gseries = [ - cudf.Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3) - ] - pdf = pd.DataFrame(pseries) - gdf = cudf.DataFrame(gseries) - assert_eq(pdf, gdf) - - -def test_series_basic(): - # Make series from buffer - a1 = np.arange(10, dtype=np.float64) - series = cudf.Series(a1) - assert len(series) == 10 - np.testing.assert_equal(series.to_numpy(), np.hstack([a1])) - - -def test_series_from_cupy_scalars(): - data = [0.1, 0.2, 0.3] - data_np = np.array(data) - data_cp = cupy.array(data) - s_np = cudf.Series([data_np[0], data_np[2]]) - s_cp = cudf.Series([data_cp[0], data_cp[2]]) - assert_eq(s_np, s_cp) - - -@pytest.mark.parametrize("a", [[1, 2, 3], [1, 10, 30]]) -@pytest.mark.parametrize("b", [[4, 5, 6], [-11, -100, 30]]) -def test_concat_index(a, b): - df = pd.DataFrame() - df["a"] = a - df["b"] = b - - gdf = cudf.DataFrame() - gdf["a"] = a - gdf["b"] = b - - expected = pd.concat([df.a, df.b]) - actual = cudf.concat([gdf.a, gdf.b]) - - assert len(expected) == len(actual) - assert_eq(expected.index, actual.index) - - expected = pd.concat([df.a, df.b], ignore_index=True) - actual = cudf.concat([gdf.a, gdf.b], ignore_index=True) - - assert len(expected) == len(actual) - assert_eq(expected.index, actual.index) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2]}, - {"a": [1, 2, 3], "b": [3, 4, 5]}, - {"a": [1, 2, 3, 4], "b": [3, 4, 5, 6], "c": [1, 3, 5, 7]}, - {"a": [np.nan, 2, 3, 4], "b": [3, 4, np.nan, 6], "c": [1, 3, 5, 7]}, - {1: [1, 2, 3], 2: [3, 4, 5]}, - {"a": [1, None, None], "b": [3, np.nan, np.nan]}, - {1: ["a", "b", "c"], 2: ["q", "w", "u"]}, - {1: ["a", np.nan, "c"], 2: ["q", None, "u"]}, - {}, - {1: [], 2: [], 3: []}, - [1, 2, 3], - ], -) -def test_axes(data): - csr = cudf.DataFrame(data) - psr = pd.DataFrame(data) - - expected = psr.axes - actual = csr.axes - - for e, a in zip(expected, actual): - assert_eq(e, a, exact=False) - - -def test_dataframe_truncate_axis_0(): - df = cudf.DataFrame( - { - "A": ["a", "b", "c", "d", "e"], - "B": ["f", "g", "h", "i", "j"], - "C": ["k", "l", 
"m", "n", "o"], - }, - index=[1, 2, 3, 4, 5], - ) - pdf = df.to_pandas() - - expected = pdf.truncate(before=2, after=4, axis="index") - actual = df.truncate(before=2, after=4, axis="index") - assert_eq(actual, expected) - - expected = pdf.truncate(before=1, after=4, axis=0) - actual = df.truncate(before=1, after=4, axis=0) - assert_eq(expected, actual) - - -def test_dataframe_truncate_axis_1(): - df = cudf.DataFrame( - { - "A": ["a", "b", "c", "d", "e"], - "B": ["f", "g", "h", "i", "j"], - "C": ["k", "l", "m", "n", "o"], - }, - index=[1, 2, 3, 4, 5], - ) - pdf = df.to_pandas() - - expected = pdf.truncate(before="A", after="B", axis="columns") - actual = df.truncate(before="A", after="B", axis="columns") - assert_eq(actual, expected) - - expected = pdf.truncate(before="A", after="B", axis=1) - actual = df.truncate(before="A", after="B", axis=1) - assert_eq(actual, expected) - - -def test_dataframe_truncate_datetimeindex(): - dates = cudf.date_range( - "2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s" - ) - df = cudf.DataFrame(data={"A": 1, "B": 2}, index=dates) - pdf = df.to_pandas() - expected = pdf.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ) - actual = df.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ) - - assert_eq(actual, expected) - - -def test_series_init_none(): - # test for creating empty series - # 1: without initializing - sr1 = cudf.Series() - got = sr1.to_string() - - expect = repr(sr1.to_pandas()) - assert got == expect - - # 2: Using `None` as an initializer - sr2 = cudf.Series(None) - got = sr2.to_string() - - expect = repr(sr2.to_pandas()) - assert got == expect - - -def test_dataframe_basic(): - np.random.seed(0) - df = cudf.DataFrame() - - # Populate with cuda memory - df["keys"] = np.arange(10, dtype=np.float64) - np.testing.assert_equal(df["keys"].to_numpy(), np.arange(10)) - assert len(df) == 10 - - # Populate with numpy array - rnd_vals = np.random.random(10) - df["vals"] = rnd_vals - np.testing.assert_equal(df["vals"].to_numpy(), rnd_vals) - assert len(df) == 10 - assert tuple(df.columns) == ("keys", "vals") - - # Make another dataframe - df2 = cudf.DataFrame() - df2["keys"] = np.array([123], dtype=np.float64) - df2["vals"] = np.array([321], dtype=np.float64) - - # Concat - df = cudf.concat([df, df2]) - assert len(df) == 11 - - hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123]) - hvals = np.asarray(rnd_vals.tolist() + [321]) - - np.testing.assert_equal(df["keys"].to_numpy(), hkeys) - np.testing.assert_equal(df["vals"].to_numpy(), hvals) - - # As matrix - mat = df.values_host - - expect = np.vstack([hkeys, hvals]).T - - np.testing.assert_equal(mat, expect) - - # test dataframe with tuple name - df_tup = cudf.DataFrame() - data = np.arange(10) - df_tup[(1, "foobar")] = data - np.testing.assert_equal(data, df_tup[(1, "foobar")].to_numpy()) - - df = cudf.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) - pdf = pd.DataFrame(pd.DataFrame({"a": [1, 2, 3], "c": ["a", "b", "c"]})) - assert_eq(df, pdf) - - gdf = cudf.DataFrame({"id": [0, 1], "val": [None, None]}) - gdf["val"] = gdf["val"].astype("int") - - assert gdf["val"].isnull().all() - - -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "c": range(1, 11)}, - index=pd.Index( - ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], - name="custom_name", - ), - ), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), - ], -) -@pytest.mark.parametrize( - "columns", - 
[["a"], ["b"], "a", "b", ["a", "b"]], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_columns(pdf, columns, inplace): - pdf = pdf.copy() - gdf = cudf.from_pandas(pdf) - - expected = pdf.drop(columns=columns, inplace=inplace) - actual = gdf.drop(columns=columns, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("obj", ["Index", "Series"]) -def test_drop_cudf_obj_columns(obj): - pdf = pd.DataFrame({"A": [1], "B": [1]}) - gdf = cudf.from_pandas(pdf) - - columns = ["B"] - expected = pdf.drop(labels=getattr(pd, obj)(columns), axis=1) - actual = gdf.drop(columns=getattr(cudf, obj)(columns), axis=1) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "c": range(1, 11)}, - index=pd.Index(list(range(10)), name="custom_name"), - ), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), - ], -) -@pytest.mark.parametrize( - "labels", - [ - [1], - [0], - 1, - 5, - [5, 9], - pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - pd.Index([0, 1, 8, 9], name="new name"), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_labels_axis_0(pdf, labels, inplace): - pdf = pdf.copy() - gdf = cudf.from_pandas(pdf) - - expected = pdf.drop(labels=labels, axis=0, inplace=inplace) - actual = gdf.drop(labels=labels, axis=0, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame({"a": range(10), "b": range(10, 20), "c": range(1, 11)}), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), - pd.DataFrame( - { - "a": range(10), - "b": range(10, 20), - }, - index=pd.Index(list(range(10)), dtype="uint64"), - ), - ], -) -@pytest.mark.parametrize( - "index", - [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_index(pdf, index, inplace): - pdf = pdf.copy() - gdf = cudf.from_pandas(pdf) - - expected = pdf.drop(index=index, inplace=inplace) - actual = gdf.drop(index=index, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5}, - index=pd.MultiIndex( - levels=[ - ["lama", "cow", "falcon"], - ["speed", "weight", "length"], - ], - codes=[ - [0, 0, 0, 1, 1, 1, 2, 2, 2, 1], - [0, 1, 2, 0, 1, 2, 0, 1, 2, 1], - ], - ), - ) - ], -) -@pytest.mark.parametrize( - "index,level", - [ - ("cow", 0), - ("lama", 0), - ("falcon", 0), - ("speed", 1), - ("weight", 1), - ("length", 1), - ("cow", None), - ( - "lama", - None, - ), - ( - "falcon", - None, - ), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_multiindex(pdf, index, level, inplace): - pdf = pdf.copy() - gdf = cudf.from_pandas(pdf) - - expected = pdf.drop(index=index, inplace=inplace, level=level) - actual = gdf.drop(index=index, inplace=inplace, level=level) - - if inplace: - expected = pdf - actual = gdf - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame({"a": range(10), "b": range(10, 20), "c": range(1, 11)}), - pd.DataFrame( - {"a": range(10), "b": range(10, 20), "d": ["a", "v"] * 5} - ), - ], -) -@pytest.mark.parametrize( - "labels", - [["a"], ["b"], "a", "b", ["a", "b"]], -) 
-@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_drop_labels_axis_1(pdf, labels, inplace): - pdf = pdf.copy() - gdf = cudf.from_pandas(pdf) - - expected = pdf.drop(labels=labels, axis=1, inplace=inplace) - actual = gdf.drop(labels=labels, axis=1, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq(expected, actual) - - -def test_dataframe_drop_error(): - df = cudf.DataFrame({"a": [1], "b": [2], "c": [3]}) - pdf = df.to_pandas() - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([], {"columns": "d"}), - rfunc_args_and_kwargs=([], {"columns": "d"}), - ) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([], {"columns": ["a", "d", "b"]}), - rfunc_args_and_kwargs=([], {"columns": ["a", "d", "b"]}), - ) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), - rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), - ) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([], {"axis": 1}), - rfunc_args_and_kwargs=([], {"axis": 1}), - ) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([[2, 0]],), - rfunc_args_and_kwargs=([[2, 0]],), - ) - - -def test_dataframe_swaplevel_axis_0(): - midx = cudf.MultiIndex( - levels=[ - ["Work"], - ["Final exam", "Coursework"], - ["History", "Geography"], - ["January", "February", "March", "April"], - ], - codes=[[0, 0, 0, 0], [0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 2, 3]], - names=["a", "b", "c", "d"], - ) - cdf = cudf.DataFrame( - { - "Grade": ["A", "B", "A", "C"], - "Percentage": ["95", "85", "95", "75"], - }, - index=midx, - ) - pdf = cdf.to_pandas() - - assert_eq(pdf.swaplevel(), cdf.swaplevel()) - assert_eq(pdf.swaplevel(), cdf.swaplevel(-2, -1, 0)) - assert_eq(pdf.swaplevel(1, 2), cdf.swaplevel(1, 2)) - assert_eq(cdf.swaplevel(2, 1), cdf.swaplevel(1, 2)) - assert_eq(pdf.swaplevel(-1, -3), cdf.swaplevel(-1, -3)) - assert_eq(pdf.swaplevel("a", "b", 0), cdf.swaplevel("a", "b", 0)) - assert_eq(cdf.swaplevel("a", "b"), cdf.swaplevel("b", "a")) - - -def test_dataframe_swaplevel_TypeError(): - cdf = cudf.DataFrame( - {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] - ) - - with pytest.raises(TypeError): - cdf.swaplevel() - - -def test_dataframe_swaplevel_axis_1(): - midx = cudf.MultiIndex( - levels=[ - ["b", "a"], - ["bb", "aa"], - ["bbb", "aaa"], - ], - codes=[[0, 0, 1, 1], [0, 1, 0, 1], [0, 1, 0, 1]], - names=[None, "a", "b"], - ) - cdf = cudf.DataFrame( - data=[[45, 30, 100, 90], [200, 100, 50, 80]], - columns=midx, - ) - pdf = cdf.to_pandas() - - assert_eq(pdf.swaplevel(1, 2, 1), cdf.swaplevel(1, 2, 1)) - assert_eq(pdf.swaplevel("a", "b", 1), cdf.swaplevel("a", "b", 1)) - assert_eq(cdf.swaplevel(2, 1, 1), cdf.swaplevel(1, 2, 1)) - assert_eq(pdf.swaplevel(0, 2, 1), cdf.swaplevel(0, 2, 1)) - assert_eq(pdf.swaplevel(2, 0, 1), cdf.swaplevel(2, 0, 1)) - assert_eq(cdf.swaplevel("a", "a", 1), cdf.swaplevel("b", "b", 1)) - - -def test_dataframe_drop_raises(): - df = cudf.DataFrame( - {"a": [1, 2, 3], "c": [10, 20, 30]}, index=["x", "y", "z"] - ) - pdf = df.to_pandas() - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=(["p"],), - rfunc_args_and_kwargs=(["p"],), - ) - - # label dtype mismatch - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([3],), - rfunc_args_and_kwargs=([3],), - ) - - expect = pdf.drop("p", 
errors="ignore") - actual = df.drop("p", errors="ignore") - - assert_eq(actual, expect) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([], {"columns": "p"}), - rfunc_args_and_kwargs=([], {"columns": "p"}), - ) - - expect = pdf.drop(columns="p", errors="ignore") - actual = df.drop(columns="p", errors="ignore") - - assert_eq(actual, expect) - - assert_exceptions_equal( - lfunc=pdf.drop, - rfunc=df.drop, - lfunc_args_and_kwargs=([], {"labels": "p", "axis": 1}), - rfunc_args_and_kwargs=([], {"labels": "p", "axis": 1}), - ) - - expect = pdf.drop(labels="p", axis=1, errors="ignore") - actual = df.drop(labels="p", axis=1, errors="ignore") - - assert_eq(actual, expect) - - -def test_dataframe_column_add_drop_via_setitem(): - df = cudf.DataFrame() - data = np.asarray(range(10)) - df["a"] = data - df["b"] = data - assert tuple(df.columns) == ("a", "b") - del df["a"] - assert tuple(df.columns) == ("b",) - df["c"] = data - assert tuple(df.columns) == ("b", "c") - df["a"] = data - assert tuple(df.columns) == ("b", "c", "a") - - -def test_dataframe_column_set_via_attr(): - data_0 = np.asarray([0, 2, 4, 5]) - data_1 = np.asarray([1, 4, 2, 3]) - data_2 = np.asarray([2, 0, 3, 0]) - df = cudf.DataFrame({"a": data_0, "b": data_1, "c": data_2}) - - for i in range(10): - df.c = df.a - assert assert_eq(df.c, df.a, check_names=False) - assert tuple(df.columns) == ("a", "b", "c") - - df.c = df.b - assert assert_eq(df.c, df.b, check_names=False) - assert tuple(df.columns) == ("a", "b", "c") - - -def test_dataframe_column_drop_via_attr(): - df = cudf.DataFrame({"a": []}) - - with pytest.raises(AttributeError): - del df.a - - assert tuple(df.columns) == tuple("a") - - -@pytest.mark.parametrize("axis", [0, "index"]) -def test_dataframe_index_rename(axis): - pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf = cudf.DataFrame.from_pandas(pdf) - - expect = pdf.rename(mapper={1: 5, 2: 6}, axis=axis) - got = gdf.rename(mapper={1: 5, 2: 6}, axis=axis) - - assert_eq(expect, got) - - expect = pdf.rename(index={1: 5, 2: 6}) - got = gdf.rename(index={1: 5, 2: 6}) - - assert_eq(expect, got) - - expect = pdf.rename({1: 5, 2: 6}) - got = gdf.rename({1: 5, 2: 6}) - - assert_eq(expect, got) - - # `pandas` can support indexes with mixed values. We throw a - # `NotImplementedError`. 
- with pytest.raises(NotImplementedError): - gdf.rename(mapper={1: "x", 2: "y"}, axis=axis) - - -def test_dataframe_MI_rename(): - gdf = cudf.DataFrame( - {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} - ) - gdg = gdf.groupby(["a", "b"]).count() - pdg = gdg.to_pandas() - - expect = pdg.rename(mapper={1: 5, 2: 6}, axis=0) - got = gdg.rename(mapper={1: 5, 2: 6}, axis=0) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("axis", [1, "columns"]) -def test_dataframe_column_rename(axis): - pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf = cudf.DataFrame.from_pandas(pdf) - - expect = pdf.rename(mapper=lambda name: 2 * name, axis=axis) - got = gdf.rename(mapper=lambda name: 2 * name, axis=axis) - - assert_eq(expect, got) - - expect = pdf.rename(columns=lambda name: 2 * name) - got = gdf.rename(columns=lambda name: 2 * name) - - assert_eq(expect, got) - - rename_mapper = {"a": "z", "b": "y", "c": "x"} - expect = pdf.rename(columns=rename_mapper) - got = gdf.rename(columns=rename_mapper) - - assert_eq(expect, got) - - -def test_dataframe_pop(): - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [7.0, 8.0, 9.0]} - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - # Test non-existing column error - with pytest.raises(KeyError) as raises: - gdf.pop("fake_colname") - raises.match("fake_colname") - - # check pop numeric column - pdf_pop = pdf.pop("a") - gdf_pop = gdf.pop("a") - assert_eq(pdf_pop, gdf_pop) - assert_eq(pdf, gdf) - - # check string column - pdf_pop = pdf.pop("b") - gdf_pop = gdf.pop("b") - assert_eq(pdf_pop, gdf_pop) - assert_eq(pdf, gdf) - - # check float column and empty dataframe - pdf_pop = pdf.pop("c") - gdf_pop = gdf.pop("c") - assert_eq(pdf_pop, gdf_pop) - assert_eq(pdf, gdf) - - # check empty dataframe edge case - empty_pdf = pd.DataFrame(columns=["a", "b"]) - empty_gdf = cudf.DataFrame(columns=["a", "b"]) - pb = empty_pdf.pop("b") - gb = empty_gdf.pop("b") - assert len(pb) == len(gb) - assert empty_pdf.empty and empty_gdf.empty - - -@pytest.mark.parametrize("nelem", [0, 3, 100, 1000]) -def test_dataframe_astype(nelem): - df = cudf.DataFrame() - data = np.asarray(range(nelem), dtype=np.int32) - df["a"] = data - assert df["a"].dtype is np.dtype(np.int32) - df["b"] = df["a"].astype(np.float32) - assert df["b"].dtype is np.dtype(np.float32) - np.testing.assert_equal(df["a"].to_numpy(), df["b"].to_numpy()) - - -def test_astype_dict(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["1", "2", "3"]}) - pdf = gdf.to_pandas() - - assert_eq(pdf.astype({"a": "str"}), gdf.astype({"a": "str"})) - assert_eq( - pdf.astype({"a": "str", "b": np.int64}), - gdf.astype({"a": "str", "b": np.int64}), - ) - - -@pytest.mark.parametrize("nelem", [0, 100]) -def test_index_astype(nelem): - df = cudf.DataFrame() - data = np.asarray(range(nelem), dtype=np.int32) - df["a"] = data - assert df.index.dtype is np.dtype(np.int64) - df.index = df.index.astype(np.float32) - assert df.index.dtype is np.dtype(np.float32) - df["a"] = df["a"].astype(np.float32) - np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) - df["b"] = df["a"] - df = df.set_index("b") - df["a"] = df["a"].astype(np.int16) - df.index = df.index.astype(np.int16) - np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) - - -def test_dataframe_to_string_with_skipped_rows(): - # Test skipped rows - df = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} - ) - - with pd.option_context("display.max_rows", 5): - got = df.to_string() - - expect = 
textwrap.dedent( - """\ - a b - 0 1 11 - 1 2 12 - .. .. .. - 4 5 15 - 5 6 16 - - [6 rows x 2 columns]""" - ) - assert got == expect - - -def test_dataframe_to_string_with_skipped_rows_and_columns(): - # Test skipped rows and skipped columns - df = cudf.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [11, 12, 13, 14, 15, 16], - "c": [11, 12, 13, 14, 15, 16], - "d": [11, 12, 13, 14, 15, 16], - } - ) - - with pd.option_context("display.max_rows", 5, "display.max_columns", 3): - got = df.to_string() - - expect = textwrap.dedent( - """\ - a ... d - 0 1 ... 11 - 1 2 ... 12 - .. .. ... .. - 4 5 ... 15 - 5 6 ... 16 - - [6 rows x 4 columns]""" - ) - assert got == expect - - -def test_dataframe_to_string_with_masked_data(): - # Test masked data - df = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} - ) - - data = np.arange(6) - mask = np.zeros(1, dtype=cudf.utils.utils.mask_dtype) - mask[0] = 0b00101101 - - masked = cudf.Series.from_masked_array(data, mask) - assert masked.null_count == 2 - df["c"] = masked - - # Check data - values = masked.copy() - validids = [0, 2, 3, 5] - densearray = masked.dropna().to_numpy() - np.testing.assert_equal(data[validids], densearray) - # Valid position is correct - for i in validids: - assert data[i] == values[i] - # Null position is correct - for i in range(len(values)): - if i not in validids: - assert values[i] is cudf.NA - - with pd.option_context("display.max_rows", 10): - got = df.to_string() - - expect = textwrap.dedent( - """\ - a b c - 0 1 11 0 - 1 2 12 - 2 3 13 2 - 3 4 14 3 - 4 5 15 - 5 6 16 5""" - ) - assert got == expect - - -def test_dataframe_to_string_wide(monkeypatch): - monkeypatch.setenv("COLUMNS", "79") - # Test basic - df = cudf.DataFrame({f"a{i}": [0, 1, 2] for i in range(100)}) - with pd.option_context("display.max_columns", 0): - got = df.to_string() - - expect = textwrap.dedent( - """\ - a0 a1 a2 a3 a4 a5 a6 a7 ... a92 a93 a94 a95 a96 a97 a98 a99 - 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 - 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 - 2 2 2 2 2 2 2 2 2 ... 
2 2 2 2 2 2 2 2 - - [3 rows x 100 columns]""" # noqa: E501 - ) - assert got == expect - - -def test_dataframe_empty_to_string(): - # Test for printing empty dataframe - df = cudf.DataFrame() - got = df.to_string() - - expect = "Empty DataFrame\nColumns: []\nIndex: []" - assert got == expect - - -def test_dataframe_emptycolumns_to_string(): - # Test for printing dataframe having empty columns - df = cudf.DataFrame() - df["a"] = [] - df["b"] = [] - got = df.to_string() - - expect = "Empty DataFrame\nColumns: [a, b]\nIndex: []" - assert got == expect - - -def test_dataframe_copy(): - # Test for copying the dataframe using python copy pkg - df = cudf.DataFrame() - df["a"] = [1, 2, 3] - df2 = copy(df) - df2["b"] = [4, 5, 6] - got = df.to_string() - - expect = textwrap.dedent( - """\ - a - 0 1 - 1 2 - 2 3""" - ) - assert got == expect - - -def test_dataframe_copy_shallow(): - # Test for copy dataframe using class method - df = cudf.DataFrame() - df["a"] = [1, 2, 3] - df2 = df.copy() - df2["b"] = [4, 2, 3] - got = df.to_string() - - expect = textwrap.dedent( - """\ - a - 0 1 - 1 2 - 2 3""" - ) - assert got == expect - - -def test_dataframe_dtypes(): - dtypes = pd.Series( - [np.int32, np.float32, np.float64], index=["c", "a", "b"] - ) - df = cudf.DataFrame({k: np.ones(10, dtype=v) for k, v in dtypes.items()}) - assert df.dtypes.equals(dtypes) - - -def test_dataframe_add_col_to_object_dataframe(): - # Test for adding column to an empty object dataframe - cols = ["a", "b", "c"] - df = pd.DataFrame(columns=cols, dtype="str") - - data = {k: v for (k, v) in zip(cols, [["a"] for _ in cols])} - - gdf = cudf.DataFrame(data) - gdf = gdf[:0] - - assert gdf.dtypes.equals(df.dtypes) - gdf["a"] = [1] - df["a"] = [10] - assert gdf.dtypes.equals(df.dtypes) - gdf["b"] = [1.0] - df["b"] = [10.0] - assert gdf.dtypes.equals(df.dtypes) - - -def test_dataframe_dir_and_getattr(): - df = cudf.DataFrame( - { - "a": np.ones(10), - "b": np.ones(10), - "not an id": np.ones(10), - "oop$": np.ones(10), - } - ) - o = dir(df) - assert {"a", "b"}.issubset(o) - assert "not an id" not in o - assert "oop$" not in o - - # Getattr works - assert df.a.equals(df["a"]) - assert df.b.equals(df["b"]) - with pytest.raises(AttributeError): - df.not_a_column - - -def test_empty_dataframe_to_cupy(): - df = cudf.DataFrame() - - # Check fully empty dataframe. - mat = df.to_cupy() - assert mat.shape == (0, 0) - mat = df.to_numpy() - assert mat.shape == (0, 0) - - df = cudf.DataFrame() - nelem = 123 - for k in "abc": - df[k] = np.random.random(nelem) - - # Check all columns in empty dataframe. 
- mat = df.head(0).to_cupy() - assert mat.shape == (0, 3) - - -def test_dataframe_to_cupy(): - df = cudf.DataFrame() - - nelem = 123 - for k in "abcd": - df[k] = np.random.random(nelem) - - # Check all columns - mat = df.to_cupy() - assert mat.shape == (nelem, 4) - assert mat.strides == (8, 984) - - mat = df.to_numpy() - assert mat.shape == (nelem, 4) - assert mat.strides == (8, 984) - for i, k in enumerate(df.columns): - np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) - - # Check column subset - mat = df[["a", "c"]].to_cupy().get() - assert mat.shape == (nelem, 2) - - for i, k in enumerate("ac"): - np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) - - -def test_dataframe_to_cupy_null_values(): - df = cudf.DataFrame() - - nelem = 123 - na = -10000 - - refvalues = {} - for k in "abcd": - df[k] = data = np.random.random(nelem) - bitmask = utils.random_bitmask(nelem) - df[k] = df[k]._column.set_mask(bitmask) - boolmask = np.asarray( - utils.expand_bits_to_bytes(bitmask)[:nelem], dtype=np.bool_ - ) - data[~boolmask] = na - refvalues[k] = data - - # Check null value causes error - with pytest.raises(ValueError): - df.to_cupy() - with pytest.raises(ValueError): - df.to_numpy() - - for k in df.columns: - df[k] = df[k].fillna(na) - - mat = df.to_numpy() - for i, k in enumerate(df.columns): - np.testing.assert_array_equal(refvalues[k], mat[:, i]) - - -def test_dataframe_append_empty(): - pdf = pd.DataFrame( - { - "key": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], - "value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], - } - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - gdf["newcol"] = 100 - pdf["newcol"] = 100 - - assert len(gdf["newcol"]) == len(pdf) - assert len(pdf["newcol"]) == len(pdf) - assert_eq(gdf, pdf) - - -def test_dataframe_setitem_from_masked_object(): - ary = np.random.randn(100) - mask = np.zeros(100, dtype=bool) - mask[:20] = True - np.random.shuffle(mask) - ary[mask] = np.nan - - test1_null = cudf.Series(ary, nan_as_null=True) - assert test1_null.nullable - assert test1_null.null_count == 20 - test1_nan = cudf.Series(ary, nan_as_null=False) - assert test1_nan.null_count == 0 - - test2_null = cudf.DataFrame.from_pandas( - pd.DataFrame({"a": ary}), nan_as_null=True - ) - assert test2_null["a"].nullable - assert test2_null["a"].null_count == 20 - test2_nan = cudf.DataFrame.from_pandas( - pd.DataFrame({"a": ary}), nan_as_null=False - ) - assert test2_nan["a"].null_count == 0 - - gpu_ary = cupy.asarray(ary) - test3_null = cudf.Series(gpu_ary, nan_as_null=True) - assert test3_null.nullable - assert test3_null.null_count == 20 - test3_nan = cudf.Series(gpu_ary, nan_as_null=False) - assert test3_nan.null_count == 0 - - test4 = cudf.DataFrame() - lst = [1, 2, None, 4, 5, 6, None, 8, 9] - test4["lst"] = lst - assert test4["lst"].nullable - assert test4["lst"].null_count == 2 - - -def test_dataframe_append_to_empty(): - pdf = pd.DataFrame() - pdf["a"] = [] - pdf["a"] = pdf["a"].astype("str") - pdf["b"] = [1, 2, 3] - - gdf = cudf.DataFrame() - gdf["a"] = [] - gdf["b"] = [1, 2, 3] - - assert_eq(gdf, pdf) - - -def test_dataframe_setitem_index_len1(): - gdf = cudf.DataFrame() - gdf["a"] = [1] - gdf["b"] = gdf.index._values - - np.testing.assert_equal(gdf.b.to_numpy(), [0]) - - -def test_empty_dataframe_setitem_df(): - gdf1 = cudf.DataFrame() - gdf2 = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}) - gdf1["a"] = gdf2["a"] - assert_eq(gdf1, gdf2) - - -def test_assign(): - gdf = cudf.DataFrame({"x": [1, 2, 3]}) - gdf2 = gdf.assign(y=gdf.x + 1) - assert list(gdf.columns) == ["x"] - assert 
list(gdf2.columns) == ["x", "y"] - - np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4]) - - -@pytest.mark.parametrize( - "mapping", - [ - {"y": 1, "z": lambda df: df["x"] + df["y"]}, - { - "x": lambda df: df["x"] * 2, - "y": lambda df: 2, - "z": lambda df: df["x"] / df["y"], - }, - ], -) -def test_assign_callable(mapping): - df = pd.DataFrame({"x": [1, 2, 3]}) - cdf = cudf.from_pandas(df) - expect = df.assign(**mapping) - actual = cdf.assign(**mapping) - assert_eq(expect, actual) - - -@pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) -@pytest.mark.parametrize( - "method", - [ - "murmur3", - "md5", - "sha1", - "sha224", - "sha256", - "sha384", - "sha512", - "xxhash64", - ], -) -@pytest.mark.parametrize("seed", [None, 42]) -def test_dataframe_hash_values(nrows, method, seed): - warning_expected = seed is not None and method not in { - "murmur3", - "xxhash64", - } - potential_warning = ( - pytest.warns(UserWarning, match="Provided seed value has no effect*") - if warning_expected - else contextlib.nullcontext() - ) - - gdf = cudf.DataFrame() - data = np.arange(nrows) - data[0] = data[-1] # make first and last the same - gdf["a"] = data - gdf["b"] = gdf.a + 100 - with potential_warning: - out = gdf.hash_values(method=method, seed=seed) - assert isinstance(out, cudf.Series) - assert len(out) == nrows - expected_dtypes = { - "murmur3": np.uint32, - "md5": object, - "sha1": object, - "sha224": object, - "sha256": object, - "sha384": object, - "sha512": object, - "xxhash64": np.uint64, - } - assert out.dtype == expected_dtypes[method] - - # Check single column - with potential_warning: - out_one = gdf[["a"]].hash_values(method=method, seed=seed) - # First matches last - assert out_one.iloc[0] == out_one.iloc[-1] - # Equivalent to the cudf.Series.hash_values() - with potential_warning: - assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) - - -@pytest.mark.parametrize("method", ["murmur3", "xxhash64"]) -def test_dataframe_hash_values_seed(method): - gdf = cudf.DataFrame() - data = np.arange(10) - data[0] = data[-1] # make first and last the same - gdf["a"] = data - gdf["b"] = gdf.a + 100 - out_one = gdf.hash_values(method=method, seed=0) - out_two = gdf.hash_values(method=method, seed=1) - assert out_one.iloc[0] == out_one.iloc[-1] - assert out_two.iloc[0] == out_two.iloc[-1] - assert_neq(out_one, out_two) - - -def test_dataframe_hash_values_xxhash64(): - # xxhash64 has no built-in implementation in Python and we don't want to - # add a testing dependency, so we use regression tests against known good - # values. 
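A minimal usage sketch of the hash_values API exercised by the surrounding tests; the data below is illustrative and not taken from the deleted file. Each row hashes to one value, identical rows hash identically, and (as the tests note) the seed argument only has an effect for the murmur3 and xxhash64 methods.

    import cudf

    df = cudf.DataFrame({"a": [1, 2, 1], "b": [10, 20, 10]})
    row_hashes = df.hash_values(method="xxhash64", seed=0)

    # Rows 0 and 2 are identical, so their hashes must match.
    assert row_hashes.iloc[0] == row_hashes.iloc[2]
    # A different seed yields a different (but internally consistent) result.
    assert not row_hashes.equals(df.hash_values(method="xxhash64", seed=1))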
- gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) - gdf["b"] = -gdf["a"] - out_a = gdf["a"].hash_values(method="xxhash64", seed=0) - expected_a = cudf.Series( - [ - 3803688792395291579, - 10706502109028787093, - 9835943264235290955, - 18031741628920313605, - 18446744073709551615, - ], - dtype=np.uint64, - ) - assert_eq(out_a, expected_a) - - out_b = gdf["b"].hash_values(method="xxhash64", seed=42) - expected_b = cudf.Series( - [ - 9826995235083043316, - 10150515573749944095, - 5005707091092326006, - 5326262080505358431, - 18446744073709551615, - ], - dtype=np.uint64, - ) - assert_eq(out_b, expected_b) - - out_df = gdf.hash_values(method="xxhash64", seed=0) - expected_df = cudf.Series( - [ - 10208049663714815266, - 4949201786888768834, - 18122173653994477335, - 11133539368563441730, - 18446744073709551615, - ], - dtype=np.uint64, - ) - assert_eq(out_df, expected_df) - - -@pytest.mark.parametrize("nrows", [3, 10, 100, 1000]) -@pytest.mark.parametrize("nparts", [1, 2, 8, 13]) -@pytest.mark.parametrize("nkeys", [1, 2]) -def test_dataframe_hash_partition(nrows, nparts, nkeys): - np.random.seed(123) - gdf = cudf.DataFrame() - keycols = [] - for i in range(nkeys): - keyname = f"key{i}" - gdf[keyname] = np.random.randint(0, 7 - i, nrows) - keycols.append(keyname) - gdf["val1"] = np.random.randint(0, nrows * 2, nrows) - - got = gdf.partition_by_hash(keycols, nparts=nparts) - # Must return a list - assert isinstance(got, list) - # Must have correct number of partitions - assert len(got) == nparts - # All partitions must be DataFrame type - assert all(isinstance(p, cudf.DataFrame) for p in got) - # Check that all partitions have unique keys - part_unique_keys = set() - for p in got: - if len(p): - # Take rows of the keycolumns and build a set of the key-values - unique_keys = set(map(tuple, p[keycols].values_host)) - # Ensure that none of the key-values have occurred in other groups - assert not (unique_keys & part_unique_keys) - part_unique_keys |= unique_keys - assert len(part_unique_keys) - - -@pytest.mark.parametrize("nrows", [3, 10, 50]) -def test_dataframe_hash_partition_masked_value(nrows): - gdf = cudf.DataFrame() - gdf["key"] = np.arange(nrows) - gdf["val"] = np.arange(nrows) + 100 - bitmask = utils.random_bitmask(nrows) - bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["val"] = gdf["val"]._column.set_mask(bitmask) - parted = gdf.partition_by_hash(["key"], nparts=3) - # Verify that the valid mask is correct - for p in parted: - df = p.to_pandas() - for row in df.itertuples(): - valid = bool(bytemask[row.key]) - expected_value = row.key + 100 if valid else np.nan - got_value = row.val - assert (expected_value == got_value) or ( - np.isnan(expected_value) and np.isnan(got_value) - ) - - -@pytest.mark.parametrize("nrows", [3, 10, 50]) -def test_dataframe_hash_partition_masked_keys(nrows): - gdf = cudf.DataFrame() - gdf["key"] = np.arange(nrows) - gdf["val"] = np.arange(nrows) + 100 - bitmask = utils.random_bitmask(nrows) - bytemask = utils.expand_bits_to_bytes(bitmask) - gdf["key"] = gdf["key"]._column.set_mask(bitmask) - parted = gdf.partition_by_hash(["key"], nparts=3, keep_index=False) - # Verify that the valid mask is correct - for p in parted: - df = p.to_pandas() - for row in df.itertuples(): - valid = bool(bytemask[row.val - 100]) - # val is key + 100 - expected_value = row.val - 100 if valid else np.nan - got_value = row.key - assert (expected_value == got_value) or ( - np.isnan(expected_value) and np.isnan(got_value) - ) - - -@pytest.mark.parametrize("keep_index", 
[True, False]) -def test_dataframe_hash_partition_keep_index(keep_index): - gdf = cudf.DataFrame( - {"val": [1, 2, 3, 4, 5], "key": [3, 2, 1, 4, 5]}, index=[5, 4, 3, 2, 1] - ) - - expected_df1 = cudf.DataFrame( - {"val": [1, 5], "key": [3, 5]}, index=[5, 1] if keep_index else None - ) - expected_df2 = cudf.DataFrame( - {"val": [2, 3, 4], "key": [2, 1, 4]}, - index=[4, 3, 2] if keep_index else None, - ) - expected = [expected_df1, expected_df2] - - parts = gdf.partition_by_hash(["key"], nparts=2, keep_index=keep_index) - - for exp, got in zip(expected, parts): - assert_eq(exp, got) - - -def test_dataframe_hash_partition_empty(): - gdf = cudf.DataFrame({"val": [1, 2], "key": [3, 2]}, index=["a", "b"]) - parts = gdf.iloc[:0].partition_by_hash(["key"], nparts=3) - assert len(parts) == 3 - for part in parts: - assert_eq(gdf.iloc[:0], part) - - -@pytest.mark.parametrize("dtype1", utils.supported_numpy_dtypes) -@pytest.mark.parametrize("dtype2", utils.supported_numpy_dtypes) -def test_dataframe_concat_different_numerical_columns(dtype1, dtype2): - df1 = pd.DataFrame(dict(x=pd.Series(np.arange(5)).astype(dtype1))) - df2 = pd.DataFrame(dict(x=pd.Series(np.arange(5)).astype(dtype2))) - if dtype1 != dtype2 and "datetime" in dtype1 or "datetime" in dtype2: - with pytest.raises(TypeError): - cudf.concat([df1, df2]) - else: - pres = pd.concat([df1, df2]) - gres = cudf.concat([cudf.from_pandas(df1), cudf.from_pandas(df2)]) - assert_eq(pres, gres, check_dtype=False, check_index_type=True) - - -def test_dataframe_concat_different_column_types(): - df1 = cudf.Series([42], dtype=np.float64) - df2 = cudf.Series(["a"], dtype="category") - with pytest.raises(ValueError): - cudf.concat([df1, df2]) - - df2 = cudf.Series(["a string"]) - with pytest.raises(TypeError): - cudf.concat([df1, df2]) - - -@pytest.mark.parametrize( - "df_1", [cudf.DataFrame({"a": [1, 2], "b": [1, 3]}), cudf.DataFrame({})] -) -@pytest.mark.parametrize( - "df_2", [cudf.DataFrame({"a": [], "b": []}), cudf.DataFrame({})] -) -def test_concat_empty_dataframe(df_1, df_2): - with _hide_concat_empty_dtype_warning(): - got = cudf.concat([df_1, df_2]) - expect = pd.concat([df_1.to_pandas(), df_2.to_pandas()], sort=False) - - # ignoring dtypes as pandas upcasts int to float - # on concatenation with empty dataframes - - assert_eq(got, expect, check_dtype=False, check_index_type=True) - - -@pytest.mark.parametrize( - "df1_d", - [ - {"a": [1, 2], "b": [1, 2], "c": ["s1", "s2"], "d": [1.0, 2.0]}, - {"b": [1.9, 10.9], "c": ["s1", "s2"]}, - {"c": ["s1"], "b": pd.Series([None], dtype="float"), "a": [False]}, - ], -) -@pytest.mark.parametrize( - "df2_d", - [ - {"a": [1, 2, 3]}, - {"a": [1, None, 3], "b": [True, True, False], "c": ["s3", None, "s4"]}, - {"a": [], "b": []}, - {}, - ], -) -def test_concat_different_column_dataframe(df1_d, df2_d): - with _hide_concat_empty_dtype_warning(): - got = cudf.concat( - [ - cudf.DataFrame(df1_d), - cudf.DataFrame(df2_d), - cudf.DataFrame(df1_d), - ], - sort=False, - ) - - pdf1 = pd.DataFrame(df1_d) - pdf2 = pd.DataFrame(df2_d) - - expect = pd.concat([pdf1, pdf2, pdf1], sort=False) - - # numerical columns are upcasted to float in cudf.DataFrame.to_pandas() - # casts nan to 0 in non-float numerical columns - - numeric_cols = got.dtypes[got.dtypes != "object"].index - for col in numeric_cols: - got[col] = got[col].astype(np.float64).fillna(np.nan) - - assert_eq(got, expect, check_dtype=False, check_index_type=True) - - -@pytest.mark.parametrize( - "ser_1", [pd.Series([1, 2, 3]), pd.Series([], dtype="float64")] -) 
-@pytest.mark.parametrize("ser_2", [pd.Series([], dtype="float64")]) -def test_concat_empty_series(ser_1, ser_2): - with _hide_concat_empty_dtype_warning(): - got = cudf.concat([cudf.Series(ser_1), cudf.Series(ser_2)]) - expect = pd.concat([ser_1, ser_2]) - - assert_eq(got, expect, check_index_type=True) - - -def test_concat_with_axis(): - df1 = pd.DataFrame(dict(x=np.arange(5), y=np.arange(5))) - df2 = pd.DataFrame(dict(a=np.arange(5), b=np.arange(5))) - - concat_df = pd.concat([df1, df2], axis=1) - cdf1 = cudf.from_pandas(df1) - cdf2 = cudf.from_pandas(df2) - - # concat only dataframes - concat_cdf = cudf.concat([cdf1, cdf2], axis=1) - assert_eq(concat_cdf, concat_df, check_index_type=True) - - # concat only series - concat_s = pd.concat([df1.x, df1.y], axis=1) - cs1 = cudf.Series.from_pandas(df1.x) - cs2 = cudf.Series.from_pandas(df1.y) - concat_cdf_s = cudf.concat([cs1, cs2], axis=1) - - assert_eq(concat_cdf_s, concat_s, check_index_type=True) - - # concat series and dataframes - s3 = pd.Series(np.random.random(5)) - cs3 = cudf.Series.from_pandas(s3) - - concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1) - concat_df_all = pd.concat([df1, s3, df2], axis=1) - assert_eq(concat_cdf_all, concat_df_all, check_index_type=True) - - # concat manual multi index - midf1 = cudf.from_pandas(df1) - midf1.index = cudf.MultiIndex( - levels=[[0, 1, 2, 3], [0, 1]], codes=[[0, 1, 2, 3, 2], [0, 1, 0, 1, 0]] - ) - midf2 = midf1[2:] - midf2.index = cudf.MultiIndex( - levels=[[3, 4, 5], [2, 0]], codes=[[0, 1, 2], [1, 0, 1]] - ) - mipdf1 = midf1.to_pandas() - mipdf2 = midf2.to_pandas() - - assert_eq( - cudf.concat([midf1, midf2]), - pd.concat([mipdf1, mipdf2]), - check_index_type=True, - ) - assert_eq( - cudf.concat([midf2, midf1]), - pd.concat([mipdf2, mipdf1]), - check_index_type=True, - ) - assert_eq( - cudf.concat([midf1, midf2, midf1]), - pd.concat([mipdf1, mipdf2, mipdf1]), - check_index_type=True, - ) - - # concat groupby multi index - gdf1 = cudf.DataFrame( - { - "x": np.random.randint(0, 10, 10), - "y": np.random.randint(0, 10, 10), - "z": np.random.randint(0, 10, 10), - "v": np.random.randint(0, 10, 10), - } - ) - gdf2 = gdf1[5:] - gdg1 = gdf1.groupby(["x", "y"]).min() - gdg2 = gdf2.groupby(["x", "y"]).min() - pdg1 = gdg1.to_pandas() - pdg2 = gdg2.to_pandas() - - assert_eq( - cudf.concat([gdg1, gdg2]), - pd.concat([pdg1, pdg2]), - check_index_type=True, - ) - assert_eq( - cudf.concat([gdg2, gdg1]), - pd.concat([pdg2, pdg1]), - check_index_type=True, - ) - - # series multi index concat - gdgz1 = gdg1.z - gdgz2 = gdg2.z - pdgz1 = gdgz1.to_pandas() - pdgz2 = gdgz2.to_pandas() - - assert_eq( - cudf.concat([gdgz1, gdgz2]), - pd.concat([pdgz1, pdgz2]), - check_index_type=True, - ) - assert_eq( - cudf.concat([gdgz2, gdgz1]), - pd.concat([pdgz2, pdgz1]), - check_index_type=True, - ) - - -@pytest.mark.parametrize("nrows", [0, 3, 10, 100, 1000]) -def test_nonmatching_index_setitem(nrows): - np.random.seed(0) - - gdf = cudf.DataFrame() - gdf["a"] = np.random.randint(2147483647, size=nrows) - gdf["b"] = np.random.randint(2147483647, size=nrows) - gdf = gdf.set_index("b") - - test_values = np.random.randint(2147483647, size=nrows) - gdf["c"] = test_values - assert len(test_values) == len(gdf["c"]) - gdf_series = cudf.Series(test_values, index=gdf.index, name="c") - assert_eq(gdf["c"].to_pandas(), gdf_series.to_pandas()) - - -@pytest.mark.parametrize("dtype", ["int", "int64[pyarrow]"]) -def test_from_pandas(dtype): - df = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0], dtype=dtype) - df.columns.name = 
"custom_column_name" - gdf = cudf.DataFrame.from_pandas(df) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf, check_dtype="pyarrow" not in dtype) - - s = df.x - gs = cudf.Series.from_pandas(s) - assert isinstance(gs, cudf.Series) - - assert_eq(s, gs, check_dtype="pyarrow" not in dtype) - - -@pytest.mark.parametrize("dtypes", [int, float]) -def test_from_records(dtypes): - h_ary = np.ndarray(shape=(10, 4), dtype=dtypes) - rec_ary = h_ary.view(np.recarray) - - gdf = cudf.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) - df = pd.DataFrame.from_records(rec_ary, columns=["a", "b", "c", "d"]) - assert isinstance(gdf, cudf.DataFrame) - assert_eq(df, gdf) - - gdf = cudf.DataFrame.from_records(rec_ary) - df = pd.DataFrame.from_records(rec_ary) - assert isinstance(gdf, cudf.DataFrame) - assert_eq(df, gdf) - - -@pytest.mark.parametrize("columns", [None, ["first", "second", "third"]]) -@pytest.mark.parametrize( - "index", - [ - None, - ["first", "second"], - "name", - "age", - "weight", - [10, 11], - ["abc", "xyz"], - ], -) -def test_from_records_index(columns, index): - rec_ary = np.array( - [("Rex", 9, 81.0), ("Fido", 3, 27.0)], - dtype=[("name", "U10"), ("age", "i4"), ("weight", "f4")], - ) - gdf = cudf.DataFrame.from_records(rec_ary, columns=columns, index=index) - df = pd.DataFrame.from_records(rec_ary, columns=columns, index=index) - assert isinstance(gdf, cudf.DataFrame) - assert_eq(df, gdf) - - -def test_dataframe_construction_from_cupy_arrays(): - h_ary = np.array([[1, 2, 3], [4, 5, 6]], np.int32) - d_ary = cupy.asarray(h_ary) - - gdf = cudf.DataFrame(d_ary, columns=["a", "b", "c"]) - df = pd.DataFrame(h_ary, columns=["a", "b", "c"]) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - gdf = cudf.DataFrame(d_ary) - df = pd.DataFrame(h_ary) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - gdf = cudf.DataFrame(d_ary, index=["a", "b"]) - df = pd.DataFrame(h_ary, index=["a", "b"]) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - gdf = cudf.DataFrame(d_ary) - gdf = gdf.set_index(keys=0, drop=False) - df = pd.DataFrame(h_ary) - df = df.set_index(keys=0, drop=False) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - gdf = cudf.DataFrame(d_ary) - gdf = gdf.set_index(keys=1, drop=False) - df = pd.DataFrame(h_ary) - df = df.set_index(keys=1, drop=False) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - -def test_dataframe_cupy_wrong_dimensions(): - d_ary = cupy.empty((2, 3, 4), dtype=np.int32) - with pytest.raises( - ValueError, match="records dimension expected 1 or 2 but found: 3" - ): - cudf.DataFrame(d_ary) - - -def test_dataframe_cupy_array_wrong_index(): - d_ary = cupy.empty((2, 3), dtype=np.int32) - - with pytest.raises(ValueError): - cudf.DataFrame(d_ary, index=["a"]) - - with pytest.raises(ValueError): - cudf.DataFrame(d_ary, index="a") - - -def test_index_in_dataframe_constructor(): - a = pd.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) - b = cudf.DataFrame({"x": [1, 2, 3]}, index=[4.0, 5.0, 6.0]) - - assert_eq(a, b) - assert_eq(a.loc[4:], b.loc[4:]) - - -dtypes = NUMERIC_TYPES + DATETIME_TYPES + ["bool"] - - -@pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000]) -@pytest.mark.parametrize("data_type", dtypes) -def test_from_arrow(nelem, data_type): - df = pd.DataFrame( - { - "a": np.random.randint(0, 1000, nelem).astype(data_type), - "b": np.random.randint(0, 1000, nelem).astype(data_type), - } - ) - padf = pa.Table.from_pandas( - df, preserve_index=False - 
).replace_schema_metadata(None) - gdf = cudf.DataFrame.from_arrow(padf) - assert isinstance(gdf, cudf.DataFrame) - - assert_eq(df, gdf) - - s = pa.Array.from_pandas(df.a) - gs = cudf.Series.from_arrow(s) - assert isinstance(gs, cudf.Series) - - # For some reason PyArrow to_pandas() converts to numpy array and has - # better type compatibility - np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy()) - - -def test_from_arrow_chunked_categories(): - # Verify that categories are properly deduplicated across chunked arrays. - indices = pa.array([0, 1, 0, 1, 2, 0, None, 2]) - dictionary = pa.array(["foo", "bar", "baz"]) - dict_array = pa.DictionaryArray.from_arrays(indices, dictionary) - chunked_array = pa.chunked_array([dict_array, dict_array]) - table = pa.table({"a": chunked_array}) - df = cudf.DataFrame.from_arrow(table) - final_dictionary = df["a"].dtype.categories.to_arrow().to_pylist() - assert sorted(final_dictionary) == sorted(dictionary.to_pylist()) - - -@pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000]) -@pytest.mark.parametrize("data_type", dtypes) -def test_to_arrow(nelem, data_type): - df = pd.DataFrame( - { - "a": np.random.randint(0, 1000, nelem).astype(data_type), - "b": np.random.randint(0, 1000, nelem).astype(data_type), - } - ) - gdf = cudf.DataFrame.from_pandas(df) - - pa_df = pa.Table.from_pandas( - df, preserve_index=False - ).replace_schema_metadata(None) - - pa_gdf = gdf.to_arrow(preserve_index=False).replace_schema_metadata(None) - - assert isinstance(pa_gdf, pa.Table) - assert pa.Table.equals(pa_df, pa_gdf) - - pa_s = pa.Array.from_pandas(df.a) - pa_gs = gdf["a"].to_arrow() - - assert isinstance(pa_gs, pa.Array) - assert pa.Array.equals(pa_s, pa_gs) - - pa_i = pa.Array.from_pandas(df.index) - pa_gi = gdf.index.to_arrow() - - assert isinstance(pa_gi, pa.Array) - assert pa.Array.equals(pa_i, pa_gi) - - -@pytest.mark.parametrize("data_type", dtypes) -def test_to_from_arrow_nulls(data_type): - if data_type == "longlong": - data_type = "int64" - if data_type == "bool": - s1 = pa.array([True, None, False, None, True], type=data_type) - else: - dtype = np.dtype(data_type) - if dtype.type == np.datetime64: - time_unit, _ = np.datetime_data(dtype) - data_type = pa.timestamp(unit=time_unit) - s1 = pa.array([1, None, 3, None, 5], type=data_type) - gs1 = cudf.Series.from_arrow(s1) - assert isinstance(gs1, cudf.Series) - # We have 64B padded buffers for nulls whereas Arrow returns a minimal - # number of bytes, so only check the first byte in this case - np.testing.assert_array_equal( - np.asarray(s1.buffers()[0]).view("u1")[0], - gs1._column.mask_array_view(mode="read").copy_to_host().view("u1")[0], - ) - assert pa.Array.equals(s1, gs1.to_arrow()) - - s2 = pa.array([None, None, None, None, None], type=data_type) - gs2 = cudf.Series.from_arrow(s2) - assert isinstance(gs2, cudf.Series) - # We have 64B padded buffers for nulls whereas Arrow returns a minimal - # number of bytes, so only check the first byte in this case - np.testing.assert_array_equal( - np.asarray(s2.buffers()[0]).view("u1")[0], - gs2._column.mask_array_view(mode="read").copy_to_host().view("u1")[0], - ) - assert pa.Array.equals(s2, gs2.to_arrow()) - - -def test_to_arrow_categorical(): - df = pd.DataFrame() - df["a"] = pd.Series(["a", "b", "c"], dtype="category") - gdf = cudf.DataFrame.from_pandas(df) - - pa_df = pa.Table.from_pandas( - df, preserve_index=False - ).replace_schema_metadata(None) - pa_gdf = gdf.to_arrow(preserve_index=False).replace_schema_metadata(None) - - assert isinstance(pa_gdf, 
pa.Table) - assert pa.Table.equals(pa_df, pa_gdf) - - pa_s = pa.Array.from_pandas(df.a) - pa_gs = gdf["a"].to_arrow() - - assert isinstance(pa_gs, pa.Array) - assert pa.Array.equals(pa_s, pa_gs) - - -def test_from_arrow_missing_categorical(): - pd_cat = pd.Categorical(["a", "b", "c"], categories=["a", "b"]) - pa_cat = pa.array(pd_cat, from_pandas=True) - gd_cat = cudf.Series(pa_cat) - - assert isinstance(gd_cat, cudf.Series) - assert_eq( - pd.Series(pa_cat.to_pandas()), # PyArrow returns a pd.Categorical - gd_cat.to_pandas(), - ) - - -def test_to_arrow_missing_categorical(): - pd_cat = pd.Categorical(["a", "b", "c"], categories=["a", "b"]) - pa_cat = pa.array(pd_cat, from_pandas=True) - gd_cat = cudf.Series(pa_cat) - - assert isinstance(gd_cat, cudf.Series) - assert pa.Array.equals(pa_cat, gd_cat.to_arrow()) - - -@pytest.mark.parametrize("data_type", dtypes) -def test_from_scalar_typing(data_type): - if data_type == "datetime64[ms]": - scalar = ( - np.dtype("int64") - .type(np.random.randint(0, 5)) - .astype("datetime64[ms]") - ) - elif data_type.startswith("datetime64"): - scalar = np.datetime64(datetime.date.today()).astype("datetime64[ms]") - data_type = "datetime64[ms]" - else: - scalar = np.dtype(data_type).type(np.random.randint(0, 5)) - - gdf = cudf.DataFrame() - gdf["a"] = [1, 2, 3, 4, 5] - gdf["b"] = scalar - assert gdf["b"].dtype == np.dtype(data_type) - assert len(gdf["b"]) == len(gdf["a"]) - - -@pytest.mark.parametrize("data_type", NUMERIC_TYPES) -def test_from_python_array(data_type): - np_arr = np.random.randint(0, 100, 10).astype(data_type) - data = memoryview(np_arr) - data = arr.array(data.format, data) - - gs = cudf.Series(data) - - np.testing.assert_equal(gs.to_numpy(), np_arr) - - -def test_series_shape(): - ps = pd.Series([1, 2, 3, 4]) - cs = cudf.Series([1, 2, 3, 4]) - - assert ps.shape == cs.shape - - -def test_series_shape_empty(): - ps = pd.Series([], dtype="float64") - cs = cudf.Series([], dtype="float64") - - assert ps.shape == cs.shape - - -def test_dataframe_shape(): - pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - gdf = cudf.DataFrame.from_pandas(pdf) - - assert pdf.shape == gdf.shape - - -def test_dataframe_shape_empty(): - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - - assert pdf.shape == gdf.shape - - -@pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 20]) -@pytest.mark.parametrize("dtype", dtypes + ["object"]) -@pytest.mark.parametrize("nulls", ["none", "some", "all"]) -def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): - # In case of `bool` dtype: pandas <= 1.2.5 type-casts - # a boolean series to `float64` series if a `np.nan` is assigned to it: - # >>> s = pd.Series([True, False, True]) - # >>> s - # 0 True - # 1 False - # 2 True - # dtype: bool - # >>> s[[2]] = np.nan - # >>> s - # 0 1.0 - # 1 0.0 - # 2 NaN - # dtype: float64 - # In pandas >= 1.3.2 this behavior is fixed: - # >>> s = pd.Series([True, False, True]) - # >>> s - # 0 - # True - # 1 - # False - # 2 - # True - # dtype: bool - # >>> s[[2]] = np.nan - # >>> s - # 0 - # True - # 1 - # False - # 2 - # NaN - # dtype: object - # In cudf we change `object` dtype to `str` type - for which there - # is no transpose implemented yet. Hence we need to test transpose - # against pandas nullable types as they are the ones that closely - # resemble `cudf` dtypes behavior. 
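# Illustrative sketch, not part of the removed test: the reason the loop below
# builds its columns with pandas *nullable* dtypes is that inserting a null
# into such a column does not silently upcast it, unlike a plain numpy-backed
# bool/int column. `s_demo` is a hypothetical series used only here.
import pandas as pd

s_demo = pd.Series([True, False, True], dtype=pd.BooleanDtype())
s_demo[[2]] = pd.NA
assert s_demo.dtype == pd.BooleanDtype()  # stays boolean rather than float/object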
- pdf = pd.DataFrame() - - null_rep = np.nan if dtype in ["float32", "float64"] else None - np_dtype = dtype - dtype = np.dtype(dtype) - dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.get(dtype, dtype) - for i in range(num_cols): - colname = string.ascii_lowercase[i] - data = pd.Series( - np.random.randint(0, 26, num_rows).astype(np_dtype), - dtype=dtype, - ) - if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) - if len(idx): - data[idx] = null_rep - elif nulls == "all": - data[:] = null_rep - pdf[colname] = data - - gdf = cudf.DataFrame.from_pandas(pdf) - - got_function = gdf.transpose() - got_property = gdf.T - - expect = pdf.transpose() - nullable = dtype not in DATETIME_TYPES - assert_eq(expect, got_function.to_pandas(nullable=nullable)) - assert_eq(expect, got_property.to_pandas(nullable=nullable)) - - -@pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 20]) -def test_dataframe_transpose_category(num_cols, num_rows): - pdf = pd.DataFrame() - - for i in range(num_cols): - colname = string.ascii_lowercase[i] - data = pd.Series(list(string.ascii_lowercase), dtype="category") - data = data.sample(num_rows, replace=True).reset_index(drop=True) - pdf[colname] = data - - gdf = cudf.DataFrame.from_pandas(pdf) - - got_function = gdf.transpose() - got_property = gdf.T - - expect = pdf.transpose() - - assert_eq(expect, got_function.to_pandas()) - assert_eq(expect, got_property.to_pandas()) - - -def test_generated_column(): - gdf = cudf.DataFrame({"a": (i for i in range(5))}) - assert len(gdf) == 5 - - -@pytest.fixture -def pdf(): - return pd.DataFrame({"x": range(10), "y": range(10)}) - - -@pytest.fixture -def gdf(pdf): - return cudf.DataFrame.from_pandas(pdf) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data", - [ - { - "x": [np.nan, 2, 3, 4, 100, np.nan], - "y": [4, 5, 6, 88, 99, np.nan], - "z": [7, 8, 9, 66, np.nan, 77], - }, - {"x": [1, 2, 3], "y": [4, 5, 6], "z": [7, 8, 9]}, - { - "x": [np.nan, np.nan, np.nan], - "y": [np.nan, np.nan, np.nan], - "z": [np.nan, np.nan, np.nan], - }, - pytest.param( - {"x": [], "y": [], "z": []}, - marks=pytest_xfail( - condition=version.parse("11") - <= version.parse(cupy.__version__) - < version.parse("11.1"), - reason="Zero-sized array passed to cupy reduction, " - "https://github.com/cupy/cupy/issues/6937", - ), - ), - pytest.param( - {"x": []}, - marks=pytest_xfail( - condition=version.parse("11") - <= version.parse(cupy.__version__) - < version.parse("11.1"), - reason="Zero-sized array passed to cupy reduction, " - "https://github.com/cupy/cupy/issues/6937", - ), - ), - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.parametrize( - "func", - [ - "min", - "max", - "sum", - "prod", - "product", - "cummin", - "cummax", - "cumsum", - "cumprod", - "mean", - "median", - "sum", - "std", - "var", - "kurt", - "skew", - "all", - "any", - ], -) -@pytest.mark.parametrize("skipna", [True, False]) -def test_dataframe_reductions(data, axis, func, skipna): - pdf = pd.DataFrame(data=data) - gdf = cudf.DataFrame.from_pandas(pdf) - - # Reductions can fail in numerous possible ways when attempting row-wise - # reductions, which are only partially supported. Catching the appropriate - # exception here allows us to detect API breakage in the form of changing - # exceptions. 
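# Illustrative sketch, not part of the removed test: the expectations encoded
# below imply that an unsupported row-wise reduction fails with a specific
# exception type (e.g. kurt/skew along axis=1 raise NotImplementedError), so a
# change in the raised type surfaces as a test failure. `gdf_demo` is a
# hypothetical frame used only for this illustration.
import cudf
import pytest

gdf_demo = cudf.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]})
with pytest.raises(NotImplementedError):
    gdf_demo.kurt(axis=1)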
- expected_exception = None - if axis == 1: - if func in ("kurt", "skew"): - expected_exception = NotImplementedError - elif func not in cudf.core.dataframe._cupy_nan_methods_map: - if skipna is False: - expected_exception = NotImplementedError - elif any(col.nullable for name, col in gdf.items()): - expected_exception = ValueError - elif func in ("cummin", "cummax"): - expected_exception = AttributeError - - # Test different degrees of freedom for var and std. - all_kwargs = [{"ddof": 1}, {"ddof": 2}] if func in ("var", "std") else [{}] - for kwargs in all_kwargs: - if expected_exception is not None: - with pytest.raises(expected_exception): - (getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs),) - else: - expect = getattr(pdf, func)(axis=axis, skipna=skipna, **kwargs) - with expect_warning_if( - skipna - and func in {"min", "max"} - and axis == 1 - and any(gdf.T[col].isna().all() for col in gdf.T), - RuntimeWarning, - ): - got = getattr(gdf, func)(axis=axis, skipna=skipna, **kwargs) - assert_eq(got, expect, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - {"x": [np.nan, 2, 3, 4, 100, np.nan], "y": [4, 5, 6, 88, 99, np.nan]}, - {"x": [1, 2, 3], "y": [4, 5, 6]}, - {"x": [np.nan, np.nan, np.nan], "y": [np.nan, np.nan, np.nan]}, - {"x": [], "y": []}, - {"x": []}, - ], -) -@pytest.mark.parametrize("func", [lambda df: df.count()]) -def test_dataframe_count_reduction(data, func): - pdf = pd.DataFrame(data=data) - gdf = cudf.DataFrame.from_pandas(pdf) - - assert_eq(func(pdf), func(gdf)) - - -@pytest.mark.parametrize( - "data", - [ - {"x": [np.nan, 2, 3, 4, 100, np.nan], "y": [4, 5, 6, 88, 99, np.nan]}, - {"x": [1, 2, 3], "y": [4, 5, 6]}, - {"x": [np.nan, np.nan, np.nan], "y": [np.nan, np.nan, np.nan]}, - {"x": pd.Series([], dtype="float"), "y": pd.Series([], dtype="float")}, - {"x": pd.Series([], dtype="int")}, - ], -) -@pytest.mark.parametrize("ops", ["sum", "product", "prod"]) -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 10]) -def test_dataframe_min_count_ops(data, ops, skipna, min_count): - psr = pd.DataFrame(data) - gsr = cudf.from_pandas(psr) - - assert_eq( - getattr(psr, ops)(skipna=skipna, min_count=min_count), - getattr(gsr, ops)(skipna=skipna, min_count=min_count), - check_dtype=False, - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "binop", - [ - operator.add, - operator.mul, - operator.floordiv, - operator.truediv, - operator.mod, - operator.pow, - ], -) -@pytest.mark.parametrize( - "other", - [ - 1.0, - pd.Series([1.0]), - pd.Series([1.0, 2.0]), - pd.Series([1.0, 2.0, 3.0]), - pd.Series([1.0], index=["x"]), - pd.Series([1.0, 2.0], index=["x", "y"]), - pd.Series([1.0, 2.0, 3.0], index=["x", "y", "z"]), - pd.DataFrame({"x": [1.0]}), - pd.DataFrame({"x": [1.0], "y": [2.0]}), - pd.DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}), - ], -) -def test_arithmetic_binops_df(pdf, gdf, binop, other): - # Avoid 1**NA cases: https://github.com/pandas-dev/pandas/issues/29997 - pdf[pdf == 1.0] = 2 - gdf[gdf == 1.0] = 2 - try: - d = binop(pdf, other) - except Exception: - if isinstance(other, (pd.Series, pd.DataFrame)): - cudf_other = cudf.from_pandas(other) - - # that returns before we enter this try-except. 
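# Illustrative sketch, not part of the removed test: conceptually, the helper
# used below runs the pandas and cudf callables side by side and checks that
# both raise matching exceptions. A simplified, hypothetical re-implementation:
import pytest

def _conceptual_assert_exceptions_equal(lfunc, rfunc, largs, rargs):
    with pytest.raises(Exception) as lexc:  # the pandas callable must raise
        lfunc(*largs)
    with pytest.raises(Exception) as rexc:  # the cudf callable must also raise
        rfunc(*rargs)
    assert type(lexc.value) is type(rexc.value)  # and with the same exception type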
- assert_exceptions_equal( - lfunc=binop, - rfunc=binop, - lfunc_args_and_kwargs=([pdf, other], {}), - rfunc_args_and_kwargs=([gdf, cudf_other], {}), - ) - else: - if isinstance(other, (pd.Series, pd.DataFrame)): - other = cudf.from_pandas(other) - g = binop(gdf, other) - assert_eq(d, g) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "binop", - [ - operator.eq, - operator.lt, - operator.le, - operator.gt, - operator.ge, - operator.ne, - ], -) -@pytest.mark.parametrize( - "other", - [ - 1.0, - pd.Series([1.0, 2.0], index=["x", "y"]), - pd.DataFrame({"x": [1.0]}), - pd.DataFrame({"x": [1.0], "y": [2.0]}), - pd.DataFrame({"x": [1.0], "y": [2.0], "z": [3.0]}), - ], -) -def test_comparison_binops_df(pdf, gdf, binop, other): - # Avoid 1**NA cases: https://github.com/pandas-dev/pandas/issues/29997 - pdf[pdf == 1.0] = 2 - gdf[gdf == 1.0] = 2 - try: - d = binop(pdf, other) - except Exception: - if isinstance(other, (pd.Series, pd.DataFrame)): - cudf_other = cudf.from_pandas(other) - - # that returns before we enter this try-except. - assert_exceptions_equal( - lfunc=binop, - rfunc=binop, - lfunc_args_and_kwargs=([pdf, other], {}), - rfunc_args_and_kwargs=([gdf, cudf_other], {}), - ) - else: - if isinstance(other, (pd.Series, pd.DataFrame)): - other = cudf.from_pandas(other) - g = binop(gdf, other) - assert_eq(d, g) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "binop", - [ - operator.eq, - operator.lt, - operator.le, - operator.gt, - operator.ge, - operator.ne, - ], -) -@pytest.mark.parametrize( - "other", - [ - pd.Series([1.0]), - pd.Series([1.0, 2.0]), - pd.Series([1.0, 2.0, 3.0]), - pd.Series([1.0], index=["x"]), - pd.Series([1.0, 2.0, 3.0], index=["x", "y", "z"]), - ], -) -def test_comparison_binops_df_reindexing(request, pdf, gdf, binop, other): - # Avoid 1**NA cases: https://github.com/pandas-dev/pandas/issues/29997 - pdf[pdf == 1.0] = 2 - gdf[gdf == 1.0] = 2 - try: - with pytest.warns(FutureWarning): - d = binop(pdf, other) - except Exception: - if isinstance(other, (pd.Series, pd.DataFrame)): - cudf_other = cudf.from_pandas(other) - - # that returns before we enter this try-except. - assert_exceptions_equal( - lfunc=binop, - rfunc=binop, - lfunc_args_and_kwargs=([pdf, other], {}), - rfunc_args_and_kwargs=([gdf, cudf_other], {}), - ) - else: - request.applymarker( - pytest.mark.xfail( - condition=pdf.columns.difference(other.index).size > 0, - reason=""" - Currently we will not match pandas for equality/inequality - operators when there are columns that exist in a Series but not - the DataFrame because pandas returns True/False values whereas - we return NA. However, this reindexing is deprecated in pandas - so we opt not to add support. This test should start passing - once pandas removes the deprecated behavior in 2.0. When that - happens, this test can be merged with the two tests above into - a single test with common parameters. 
- """, - ) - ) - - if isinstance(other, (pd.Series, pd.DataFrame)): - other = cudf.from_pandas(other) - g = binop(gdf, other) - assert_eq(d, g) - - -def test_binops_df_invalid(gdf): - with pytest.raises(TypeError): - gdf + np.array([1, 2]) - - -@pytest.mark.parametrize("binop", [operator.and_, operator.or_, operator.xor]) -def test_bitwise_binops_df(pdf, gdf, binop): - d = binop(pdf, pdf + 1) - g = binop(gdf, gdf + 1) - assert_eq(d, g) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "binop", - [ - operator.add, - operator.mul, - operator.floordiv, - operator.truediv, - operator.mod, - operator.pow, - operator.eq, - operator.lt, - operator.le, - operator.gt, - operator.ge, - operator.ne, - ], -) -def test_binops_series(pdf, gdf, binop): - pdf = pdf + 1.0 - gdf = gdf + 1.0 - d = binop(pdf.x, pdf.y) - g = binop(gdf.x, gdf.y) - assert_eq(d, g) - - -@pytest.mark.parametrize("binop", [operator.and_, operator.or_, operator.xor]) -def test_bitwise_binops_series(pdf, gdf, binop): - d = binop(pdf.x, pdf.y + 1) - g = binop(gdf.x, gdf.y + 1) - assert_eq(d, g) - - -@pytest.mark.parametrize("unaryop", [operator.neg, operator.inv, operator.abs]) -@pytest.mark.parametrize( - "col_name,assign_col_name", [(None, False), (None, True), ("abc", True)] -) -def test_unaryops_df(pdf, unaryop, col_name, assign_col_name): - pd_df = pdf.copy() - if assign_col_name: - pd_df.columns.name = col_name - gdf = cudf.from_pandas(pd_df) - d = unaryop(pd_df - 5) - g = unaryop(gdf - 5) - assert_eq(d, g) - - -def test_df_abs(pdf): - np.random.seed(0) - disturbance = pd.Series(np.random.rand(10)) - pdf = pdf - 5 + disturbance - d = pdf.apply(np.abs) - g = cudf.from_pandas(pdf).abs() - assert_eq(d, g) - - -def test_scale_df(gdf): - got = (gdf - 5).scale() - expect = cudf.DataFrame( - {"x": np.linspace(0.0, 1.0, 10), "y": np.linspace(0.0, 1.0, 10)} - ) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "func", - [ - lambda df: df.empty, - lambda df: df.x.empty, - lambda df: df.x.fillna(123, limit=None, method=None, axis=None), - lambda df: df.drop("x", axis=1, errors="raise"), - ], -) -def test_unary_operators(func, pdf, gdf): - p = func(pdf) - g = func(gdf) - assert_eq(p, g) - - -def test_is_monotonic(gdf): - pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[3, 1, 2]) - gdf = cudf.DataFrame.from_pandas(pdf) - assert not gdf.index.is_monotonic_increasing - assert not gdf.index.is_monotonic_decreasing - - -def test_iter(pdf, gdf): - assert list(pdf) == list(gdf) - - -def test_iteritems(gdf): - for k, v in gdf.items(): - assert k in gdf.columns - assert isinstance(v, cudf.Series) - assert_eq(v, gdf[k]) - - -@pytest.mark.parametrize("q", [0.5, 1, 0.001, [0.5], [], [0.005, 0.5, 1]]) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_quantile(q, numeric_only): - ts = pd.date_range("2018-08-24", periods=5, freq="D") - td = pd.to_timedelta(np.arange(5), unit="h") - pdf = pd.DataFrame( - {"date": ts, "delta": td, "val": np.random.randn(len(ts))} - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - assert_eq(pdf["date"].quantile(q), gdf["date"].quantile(q)) - assert_eq(pdf["delta"].quantile(q), gdf["delta"].quantile(q)) - assert_eq(pdf["val"].quantile(q), gdf["val"].quantile(q)) - - q = q if isinstance(q, list) else [q] - assert_eq( - pdf.quantile(q, numeric_only=numeric_only), - gdf.quantile(q, numeric_only=numeric_only), - ) - - -@pytest.mark.parametrize("q", [0.2, 1, 0.001, [0.5], [], [0.005, 0.8, 0.03]]) -@pytest.mark.parametrize("interpolation", ["higher", "lower", "nearest"]) -@pytest.mark.parametrize( - 
"decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -def test_decimal_quantile(q, interpolation, decimal_type): - data = ["244.8", "32.24", "2.22", "98.14", "453.23", "5.45"] - gdf = cudf.DataFrame( - {"id": np.random.randint(0, 10, size=len(data)), "val": data} - ) - gdf["id"] = gdf["id"].astype("float64") - gdf["val"] = gdf["val"].astype(decimal_type(7, 2)) - pdf = gdf.to_pandas() - - got = gdf.quantile(q, numeric_only=False, interpolation=interpolation) - expected = pdf.quantile( - q if isinstance(q, list) else [q], - numeric_only=False, - interpolation=interpolation, - ) - - assert_eq(got, expected) - - -def test_empty_quantile(): - pdf = pd.DataFrame({"x": []}, dtype="float64") - df = cudf.DataFrame({"x": []}, dtype="float64") - - actual = df.quantile() - expected = pdf.quantile() - - assert_eq(actual, expected) - - -def test_from_pandas_function(pdf): - gdf = cudf.from_pandas(pdf) - assert isinstance(gdf, cudf.DataFrame) - assert_eq(pdf, gdf) - - gdf = cudf.from_pandas(pdf.x) - assert isinstance(gdf, cudf.Series) - assert_eq(pdf.x, gdf) - - with pytest.raises(TypeError): - cudf.from_pandas(123) - - -@pytest.mark.parametrize("preserve_index", [True, False]) -def test_arrow_pandas_compat(pdf, gdf, preserve_index): - pdf["z"] = range(10) - pdf = pdf.set_index("z") - gdf["z"] = range(10) - gdf = gdf.set_index("z") - - pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index) - gdf_arrow_table = gdf.to_arrow(preserve_index=preserve_index) - - assert pa.Table.equals(pdf_arrow_table, gdf_arrow_table) - - gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table) - pdf2 = pdf_arrow_table.to_pandas() - - assert_eq(pdf2, gdf2) - pdf.columns.name = "abc" - pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index) - - gdf2 = cudf.DataFrame.from_arrow(pdf_arrow_table) - pdf2 = pdf_arrow_table.to_pandas() - assert_eq(pdf2, gdf2) - - -@pytest.mark.parametrize( - "index", - [ - None, - cudf.RangeIndex(3, name="a"), - "a", - "b", - ["a", "b"], - cudf.RangeIndex(0, 5, 2, name="a"), - ], -) -@pytest.mark.parametrize("preserve_index", [True, False, None]) -def test_arrow_round_trip(preserve_index, index): - data = {"a": [4, 5, 6], "b": ["cat", "dog", "bird"]} - if isinstance(index, (list, str)): - gdf = cudf.DataFrame(data).set_index(index) - else: - gdf = cudf.DataFrame(data, index=index) - - table = gdf.to_arrow(preserve_index=preserve_index) - table_pd = pa.Table.from_pandas( - gdf.to_pandas(), preserve_index=preserve_index - ) - - gdf_out = cudf.DataFrame.from_arrow(table) - pdf_out = table_pd.to_pandas() - - assert_eq(gdf_out, pdf_out) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) -def test_cuda_array_interface(dtype): - np_data = np.arange(10).astype(dtype) - cupy_data = cupy.array(np_data) - pd_data = pd.Series(np_data) - - cudf_data = cudf.Series(cupy_data) - assert_eq(pd_data, cudf_data) - - gdf = cudf.DataFrame() - gdf["test"] = cupy_data - pd_data.name = "test" - assert_eq(pd_data, gdf["test"]) - - -@pytest.mark.parametrize("nelem", [0, 2, 3, 100]) -@pytest.mark.parametrize("nchunks", [1, 2, 5, 10]) -@pytest.mark.parametrize("data_type", dtypes) -def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): - np_list_data = [ - np.random.randint(0, 100, nelem).astype(data_type) - for i in range(nchunks) - ] - pa_chunk_array = pa.chunked_array(np_list_data) - - expect = pa_chunk_array.to_pandas() - got = cudf.Series(pa_chunk_array) - - assert_eq(expect, got) - - np_list_data2 = [ - np.random.randint(0, 100, 
nelem).astype(data_type) - for i in range(nchunks) - ] - pa_chunk_array2 = pa.chunked_array(np_list_data2) - pa_table = pa.Table.from_arrays( - [pa_chunk_array, pa_chunk_array2], names=["a", "b"] - ) - - expect = pa_table.to_pandas() - got = cudf.DataFrame.from_arrow(pa_table) - - assert_eq(expect, got) - - -@pytest.mark.skip(reason="Test was designed to be run in isolation") -def test_gpu_memory_usage_with_boolmask(): - ctx = cuda.current_context() - - def query_GPU_memory(note=""): - memInfo = ctx.get_memory_info() - usedMemoryGB = (memInfo.total - memInfo.free) / 1e9 - return usedMemoryGB - - cuda.current_context().deallocations.clear() - nRows = int(1e8) - nCols = 2 - dataNumpy = np.asfortranarray(np.random.rand(nRows, nCols)) - colNames = ["col" + str(iCol) for iCol in range(nCols)] - pandasDF = pd.DataFrame(data=dataNumpy, columns=colNames, dtype=np.float32) - cudaDF = cudf.core.DataFrame.from_pandas(pandasDF) - boolmask = cudf.Series(np.random.randint(1, 2, len(cudaDF)).astype("bool")) - - memory_used = query_GPU_memory() - cudaDF = cudaDF[boolmask] - - assert ( - cudaDF.index._values.data_array_view(mode="read").device_ctypes_pointer - == cudaDF["col0"].index._values.data_array_view.device_ctypes_pointer - ) - assert ( - cudaDF.index._values.data_array_view(mode="read").device_ctypes_pointer - == cudaDF["col1"].index._values.data_array_view.device_ctypes_pointer - ) - - assert memory_used == query_GPU_memory() - - -def test_boolmask(pdf, gdf): - boolmask = np.random.randint(0, 2, len(pdf)) > 0 - gdf = gdf[boolmask] - pdf = pdf[boolmask] - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "mask_shape", - [ - (2, "ab"), - (2, "abc"), - (3, "ab"), - (3, "abc"), - (3, "abcd"), - (4, "abc"), - (4, "abcd"), - ], -) -def test_dataframe_boolmask(mask_shape): - pdf = pd.DataFrame() - for col in "abc": - pdf[col] = np.random.randint(0, 10, 3) - pdf_mask = pd.DataFrame() - for col in mask_shape[1]: - pdf_mask[col] = np.random.randint(0, 2, mask_shape[0]) > 0 - gdf = cudf.DataFrame.from_pandas(pdf) - gdf_mask = cudf.DataFrame.from_pandas(pdf_mask) - gdf = gdf[gdf_mask] - pdf = pdf[pdf_mask] - - assert np.array_equal(gdf.columns, pdf.columns) - for col in gdf.columns: - assert np.array_equal( - gdf[col].fillna(-1).to_pandas().values, pdf[col].fillna(-1).values - ) - - -@pytest.mark.parametrize( - "mask", - [ - [True, False, True], - pytest.param( - cudf.Series([True, False, True]), - marks=pytest_xfail( - reason="Pandas can't index a multiindex with a Series" - ), - ), - ], -) -def test_dataframe_multiindex_boolmask(mask): - gdf = cudf.DataFrame( - {"w": [3, 2, 1], "x": [1, 2, 3], "y": [0, 1, 0], "z": [1, 1, 1]} - ) - gdg = gdf.groupby(["w", "x"]).count() - pdg = gdg.to_pandas() - assert_eq(gdg[mask], pdg[mask]) - - -def test_dataframe_assignment(): - pdf = pd.DataFrame() - for col in "abc": - pdf[col] = np.array([0, 1, 1, -2, 10]) - gdf = cudf.DataFrame.from_pandas(pdf) - gdf[gdf < 0] = 999 - pdf[pdf < 0] = 999 - assert_eq(gdf, pdf) - - -def test_1row_arrow_table(): - data = [pa.array([0]), pa.array([1])] - batch = pa.RecordBatch.from_arrays(data, ["f0", "f1"]) - table = pa.Table.from_batches([batch]) - - expect = table.to_pandas() - got = cudf.DataFrame.from_arrow(table) - assert_eq(expect, got) - - -def test_arrow_handle_no_index_name(pdf, gdf): - gdf_arrow = gdf.to_arrow() - pdf_arrow = pa.Table.from_pandas(pdf) - assert pa.Table.equals(pdf_arrow, gdf_arrow) - - got = cudf.DataFrame.from_arrow(gdf_arrow) - expect = pdf_arrow.to_pandas() - assert_eq(expect, got) - - -def 
test_pandas_non_contiguious(): - arr1 = np.random.sample([5000, 10]) - assert arr1.flags["C_CONTIGUOUS"] is True - df = pd.DataFrame(arr1) - for col in df.columns: - assert df[col].values.flags["C_CONTIGUOUS"] is False - - gdf = cudf.DataFrame.from_pandas(df) - assert_eq(gdf.to_pandas(), df) - - -@pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) -@pytest.mark.parametrize("null_type", [np.nan, None, "mixed"]) -def test_series_all_null(num_elements, null_type): - if null_type == "mixed": - data = [] - data1 = [np.nan] * int(num_elements / 2) - data2 = [None] * int(num_elements / 2) - for idx in range(len(data1)): - data.append(data1[idx]) - data.append(data2[idx]) - else: - data = [null_type] * num_elements - - # Typecast Pandas because None will return `object` dtype - expect = pd.Series(data, dtype="float64") - got = cudf.Series(data, dtype="float64") - - assert_eq(expect, got) - - -@pytest.mark.parametrize("num_elements", [0, 2, 10, 100]) -def test_series_all_valid_nan(num_elements): - data = [np.nan] * num_elements - sr = cudf.Series(data, nan_as_null=False) - np.testing.assert_equal(sr.null_count, 0) - - -def test_series_rename(): - pds = pd.Series([1, 2, 3], name="asdf") - gds = cudf.Series([1, 2, 3], name="asdf") - - expect = pds.rename("new_name") - got = gds.rename("new_name") - - assert_eq(expect, got) - - pds = pd.Series(expect) - gds = cudf.Series(got) - - assert_eq(pds, gds) - - pds = pd.Series(expect, name="name name") - gds = cudf.Series(got, name="name name") - - assert_eq(pds, gds) - - -@pytest.mark.parametrize("data_type", dtypes) -@pytest.mark.parametrize("nelem", [0, 100]) -def test_head_tail(nelem, data_type): - pdf = pd.DataFrame( - { - "a": np.random.randint(0, 1000, nelem).astype(data_type), - "b": np.random.randint(0, 1000, nelem).astype(data_type), - } - ) - gdf = cudf.from_pandas(pdf) - - assert_eq(gdf.head(), pdf.head()) - assert_eq(gdf.head(3), pdf.head(3)) - assert_eq(gdf.head(-2), pdf.head(-2)) - assert_eq(gdf.head(0), pdf.head(0)) - - assert_eq(gdf["a"].head(), pdf["a"].head()) - assert_eq(gdf["a"].head(3), pdf["a"].head(3)) - assert_eq(gdf["a"].head(-2), pdf["a"].head(-2)) - - assert_eq(gdf.tail(), pdf.tail()) - assert_eq(gdf.tail(3), pdf.tail(3)) - assert_eq(gdf.tail(-2), pdf.tail(-2)) - assert_eq(gdf.tail(0), pdf.tail(0)) - - assert_eq(gdf["a"].tail(), pdf["a"].tail()) - assert_eq(gdf["a"].tail(3), pdf["a"].tail(3)) - assert_eq(gdf["a"].tail(-2), pdf["a"].tail(-2)) - - -def test_tail_for_string(): - gdf = cudf.DataFrame() - gdf["id"] = cudf.Series(["a", "b"], dtype=np.object_) - gdf["v"] = cudf.Series([1, 2]) - assert_eq(gdf.tail(3), gdf.to_pandas().tail(3)) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize( - "column_names", - [ - ["v0", "v1"], - ["v0", "index"], - pd.MultiIndex.from_tuples([("x0", "x1"), ("y0", "y1")]), - pd.MultiIndex.from_tuples([(1, 2), (10, 11)], names=["ABC", "DEF"]), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("col_level", [0, 1]) -@pytest.mark.parametrize("col_fill", ["", "some_lv"]) -def test_reset_index(level, drop, column_names, inplace, col_level, col_fill): - midx = pd.MultiIndex.from_tuples( - [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] - ) - pdf = pd.DataFrame( - [[1, 2], [3, 4], [5, 6], [7, 8]], index=midx, columns=column_names - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.reset_index( - level=level, - drop=drop, - inplace=inplace, - 
col_level=col_level, - col_fill=col_fill, - ) - got = gdf.reset_index( - level=level, - drop=drop, - inplace=inplace, - col_level=col_level, - col_fill=col_fill, - ) - if inplace: - expect = pdf - got = gdf - - assert_eq(expect, got) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("level", [None, 0, 1, [None]]) -@pytest.mark.parametrize("drop", [False, True]) -@pytest.mark.parametrize("inplace", [False, True]) -@pytest.mark.parametrize("col_level", [0, 1]) -@pytest.mark.parametrize("col_fill", ["", "some_lv"]) -def test_reset_index_dup_level_name(level, drop, inplace, col_level, col_fill): - # midx levels are named [None, None] - midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) - pdf = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]], index=midx) - gdf = cudf.from_pandas(pdf) - if level == [None]: - assert_exceptions_equal( - lfunc=pdf.reset_index, - rfunc=gdf.reset_index, - lfunc_args_and_kwargs=( - [], - {"level": level, "drop": drop, "inplace": inplace}, - ), - rfunc_args_and_kwargs=( - [], - {"level": level, "drop": drop, "inplace": inplace}, - ), - ) - return - - expect = pdf.reset_index( - level=level, - drop=drop, - inplace=inplace, - col_level=col_level, - col_fill=col_fill, - ) - got = gdf.reset_index( - level=level, - drop=drop, - inplace=inplace, - col_level=col_level, - col_fill=col_fill, - ) - if inplace: - expect = pdf - got = gdf - - assert_eq(expect, got) - - -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("inplace", [False, True]) -@pytest.mark.parametrize("col_level", [0, 1]) -@pytest.mark.parametrize("col_fill", ["", "some_lv"]) -def test_reset_index_named(pdf, gdf, drop, inplace, col_level, col_fill): - pdf.index.name = "cudf" - gdf.index.name = "cudf" - - expect = pdf.reset_index( - drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill - ) - got = gdf.reset_index( - drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill - ) - if inplace: - expect = pdf - got = gdf - assert_eq(expect, got) - - -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("inplace", [False, True]) -@pytest.mark.parametrize("column_names", [["x", "y"], ["index", "y"]]) -@pytest.mark.parametrize("col_level", [0, 1]) -@pytest.mark.parametrize("col_fill", ["", "some_lv"]) -def test_reset_index_unnamed( - pdf, gdf, drop, inplace, column_names, col_level, col_fill -): - pdf.columns = column_names - gdf.columns = column_names - - expect = pdf.reset_index( - drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill - ) - got = gdf.reset_index( - drop=drop, inplace=inplace, col_level=col_level, col_fill=col_fill - ) - if inplace: - expect = pdf - got = gdf - assert_eq(expect, got) - - -def test_reset_index_invalid_level(): - with pytest.raises(IndexError): - cudf.DataFrame([1]).reset_index(level=2) - - with pytest.raises(IndexError): - pd.DataFrame([1]).reset_index(level=2) - - -@pytest.mark.parametrize( - "data", - [ - { - "a": [1, 2, 3, 4, 5], - "b": ["a", "b", "c", "d", "e"], - "c": [1.0, 2.0, 3.0, 4.0, 5.0], - } - ], -) -@pytest.mark.parametrize( - "index", - [ - "a", - ["a", "b"], - pd.CategoricalIndex(["I", "II", "III", "IV", "V"]), - pd.Series(["h", "i", "k", "l", "m"]), - ["b", pd.Index(["I", "II", "III", "IV", "V"])], - ["c", [11, 12, 13, 14, 15]], - pd.MultiIndex( - levels=[ - ["I", "II", "III", "IV", "V"], - ["one", "two", "three", "four", "five"], - ], - codes=[[0, 1, 2, 3, 4], [4, 3, 2, 1, 0]], - names=["col1", "col2"], - ), - pd.RangeIndex(0, 5), # corner case - [pd.Series(["h", 
"i", "k", "l", "m"]), pd.RangeIndex(0, 5)], - [ - pd.MultiIndex( - levels=[ - ["I", "II", "III", "IV", "V"], - ["one", "two", "three", "four", "five"], - ], - codes=[[0, 1, 2, 3, 4], [4, 3, 2, 1, 0]], - names=["col1", "col2"], - ), - pd.RangeIndex(0, 5), - ], - ], -) -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("append", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_set_index(data, index, drop, append, inplace): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expected = pdf.set_index(index, inplace=inplace, drop=drop, append=append) - actual = gdf.set_index(index, inplace=inplace, drop=drop, append=append) - - if inplace: - expected = pdf - actual = gdf - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - { - "a": [1, 1, 2, 2, 5], - "b": ["a", "b", "c", "d", "e"], - "c": [1.0, 2.0, 3.0, 4.0, 5.0], - } - ], -) -@pytest.mark.parametrize("index", ["a", pd.Index([1, 1, 2, 2, 3])]) -@pytest.mark.parametrize("verify_integrity", [True]) -@pytest_xfail -def test_set_index_verify_integrity(data, index, verify_integrity): - gdf = cudf.DataFrame(data) - gdf.set_index(index, verify_integrity=verify_integrity) - - -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("nelem", [10, 200, 1333]) -def test_set_index_multi(drop, nelem): - np.random.seed(0) - a = np.arange(nelem) - np.random.shuffle(a) - df = pd.DataFrame( - { - "a": a, - "b": np.random.randint(0, 4, size=nelem), - "c": np.random.uniform(low=0, high=4, size=nelem), - "d": np.random.choice(["green", "black", "white"], nelem), - } - ) - df["e"] = df["d"].astype("category") - gdf = cudf.DataFrame.from_pandas(df) - - assert_eq(gdf.set_index("a", drop=drop), gdf.set_index(["a"], drop=drop)) - assert_eq( - df.set_index(["b", "c"], drop=drop), - gdf.set_index(["b", "c"], drop=drop), - ) - assert_eq( - df.set_index(["d", "b"], drop=drop), - gdf.set_index(["d", "b"], drop=drop), - ) - assert_eq( - df.set_index(["b", "d", "e"], drop=drop), - gdf.set_index(["b", "d", "e"], drop=drop), - ) - - -@pytest.fixture() -def reindex_data(): - return cudf.datasets.randomdata( - nrows=6, - dtypes={ - "a": "category", - "c": float, - "d": str, - }, - ) - - -@pytest.fixture() -def reindex_data_numeric(): - return cudf.datasets.randomdata( - nrows=6, - dtypes={"a": float, "b": float, "c": float}, - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("copy", [True, False]) -@pytest.mark.parametrize( - "args,gd_kwargs", - [ - ([], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {"axis": 0}), - ([["a", "b", "c", "d", "e"]], {"axis": 1}), - ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": 0}), - ([], {"labels": ["a", "b", "c", "d", "e"], "axis": 1}), - ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": "index"}), - ([], {"labels": ["a", "b", "c", "d", "e"], "axis": "columns"}), - ([], {"index": [-3, 0, 3, 0, -2, 1, 3, 4, 6]}), - ([], {"columns": ["a", "b", "c", "d", "e"]}), - ( - [], - { - "index": [-3, 0, 3, 0, -2, 1, 3, 4, 6], - "columns": ["a", "b", "c", "d", "e"], - }, - ), - ], -) -def test_dataframe_reindex(copy, reindex_data, args, gd_kwargs): - pdf, gdf = reindex_data.to_pandas(), reindex_data - - gd_kwargs["copy"] = copy - pd_kwargs = gd_kwargs.copy() - pd_kwargs["copy"] = True - assert_eq(pdf.reindex(*args, **pd_kwargs), gdf.reindex(*args, **gd_kwargs)) - - -@pytest.mark.parametrize("fill_value", [-1.0, 0.0, 1.5]) -@pytest.mark.parametrize( - "args,kwargs", - 
[ - ([], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {}), - ([[-3, 0, 3, 0, -2, 1, 3, 4, 6]], {"axis": 0}), - ([["a", "b", "c", "d", "e"]], {"axis": 1}), - ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": 0}), - ([], {"labels": ["a", "b", "c", "d", "e"], "axis": 1}), - ([], {"labels": [-3, 0, 3, 0, -2, 1, 3, 4, 6], "axis": "index"}), - ([], {"labels": ["a", "b", "c", "d", "e"], "axis": "columns"}), - ([], {"index": [-3, 0, 3, 0, -2, 1, 3, 4, 6]}), - ([], {"columns": ["a", "b", "c", "d", "e"]}), - ( - [], - { - "index": [-3, 0, 3, 0, -2, 1, 3, 4, 6], - "columns": ["a", "b", "c", "d", "e"], - }, - ), - ], -) -def test_dataframe_reindex_fill_value( - reindex_data_numeric, args, kwargs, fill_value -): - pdf, gdf = reindex_data_numeric.to_pandas(), reindex_data_numeric - kwargs["fill_value"] = fill_value - assert_eq(pdf.reindex(*args, **kwargs), gdf.reindex(*args, **kwargs)) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_dataframe_reindex_change_dtype(copy): - index = pd.date_range("12/29/2009", periods=10, freq="D") - columns = ["a", "b", "c", "d", "e"] - gdf = cudf.datasets.randomdata( - nrows=6, dtypes={"a": "category", "c": float, "d": str} - ) - pdf = gdf.to_pandas() - # Validate reindexes both labels and column names when - # index=index_labels and columns=column_labels - assert_eq( - pdf.reindex(index=index, columns=columns, copy=True), - gdf.reindex(index=index, columns=columns, copy=copy), - check_freq=False, - ) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_series_categorical_reindex(copy): - index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = cudf.datasets.randomdata(nrows=6, dtypes={"a": "category"}) - pdf = gdf.to_pandas() - assert_eq(pdf["a"].reindex(copy=True), gdf["a"].reindex(copy=copy)) - assert_eq( - pdf["a"].reindex(index, copy=True), gdf["a"].reindex(index, copy=copy) - ) - assert_eq( - pdf["a"].reindex(index=index, copy=True), - gdf["a"].reindex(index=index, copy=copy), - ) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_series_float_reindex(copy): - index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = cudf.datasets.randomdata(nrows=6, dtypes={"c": float}) - pdf = gdf.to_pandas() - assert_eq(pdf["c"].reindex(copy=True), gdf["c"].reindex(copy=copy)) - assert_eq( - pdf["c"].reindex(index, copy=True), gdf["c"].reindex(index, copy=copy) - ) - assert_eq( - pdf["c"].reindex(index=index, copy=True), - gdf["c"].reindex(index=index, copy=copy), - ) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_series_string_reindex(copy): - index = [-3, 0, 3, 0, -2, 1, 3, 4, 6] - gdf = cudf.datasets.randomdata(nrows=6, dtypes={"d": str}) - pdf = gdf.to_pandas() - assert_eq(pdf["d"].reindex(copy=True), gdf["d"].reindex(copy=copy)) - assert_eq( - pdf["d"].reindex(index, copy=True), gdf["d"].reindex(index, copy=copy) - ) - assert_eq( - pdf["d"].reindex(index=index, copy=True), - gdf["d"].reindex(index=index, copy=copy), - ) - - -@pytest.mark.parametrize("names", [None, ["a", "b"]]) -@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) -def test_reindex_multiindex_col_to_multiindex(names, klass): - idx = pd.Index( - [("A", "one"), ("A", "two")], - dtype="object", - ) - df = pd.DataFrame([[1, 2]], columns=idx) - gdf = cudf.from_pandas(df) - midx = klass.from_tuples([("A", "one"), ("A", "three")], names=names) - result = gdf.reindex(columns=midx) - expected = cudf.DataFrame([[1, None]], columns=midx) - # (pandas2.0): check_dtype=False won't be needed - # as None col will return object instead of 
float - assert_eq(result, expected, check_dtype=False) - - -@pytest.mark.parametrize("names", [None, ["a", "b"]]) -@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) -def test_reindex_tuple_col_to_multiindex(names, klass): - idx = pd.Index( - [("A", "one"), ("A", "two")], dtype="object", tupleize_cols=False - ) - df = pd.DataFrame([[1, 2]], columns=idx) - gdf = cudf.from_pandas(df) - midx = klass.from_tuples([("A", "one"), ("A", "two")], names=names) - result = gdf.reindex(columns=midx) - expected = cudf.DataFrame([[1, 2]], columns=midx) - assert_eq(result, expected) - - -@pytest.mark.parametrize("name", [None, "foo"]) -@pytest.mark.parametrize("klass", [range, cudf.RangeIndex, pd.RangeIndex]) -def test_reindex_columns_rangeindex_keeps_rangeindex(name, klass): - new_columns = klass(3) - exp_name = None - if klass is not range: - new_columns.name = name - exp_name = name - df = cudf.DataFrame([[1, 2]]) - result = df.reindex(columns=new_columns).columns - expected = pd.RangeIndex(3, name=exp_name) - assert_eq(result, expected) - - -def test_to_frame(pdf, gdf): - assert_eq(pdf.x.to_frame(), gdf.x.to_frame()) - - name = "foo" - gdf_new_name = gdf.x.to_frame(name=name) - pdf_new_name = pdf.x.to_frame(name=name) - assert_eq(pdf.x.to_frame(), gdf.x.to_frame()) - - name = False - gdf_new_name = gdf.x.to_frame(name=name) - pdf_new_name = pdf.x.to_frame(name=name) - assert_eq(gdf_new_name, pdf_new_name) - assert gdf_new_name.columns[0] == name - - -def test_dataframe_empty_sort_index(): - pdf = pd.DataFrame({"x": []}) - gdf = cudf.DataFrame.from_pandas(pdf) - - expect = pdf.sort_index() - got = gdf.sort_index() - - assert_eq(expect, got, check_index_type=True) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "index", - [ - pd.RangeIndex(0, 3, 1), - [3.0, 1.0, np.nan], - # Test for single column MultiIndex - pd.MultiIndex.from_arrays( - [ - [2, 0, 1], - ] - ), - pd.RangeIndex(2, -1, -1), - ], -) -@pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_sort_index( - request, index, axis, ascending, inplace, ignore_index, na_position -): - if not PANDAS_GE_220 and axis in (1, "columns") and ignore_index: - pytest.skip(reason="Bug fixed in pandas-2.2") - - pdf = pd.DataFrame( - {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]}, - index=index, - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - expected = pdf.sort_index( - axis=axis, - ascending=ascending, - ignore_index=ignore_index, - inplace=inplace, - na_position=na_position, - ) - got = gdf.sort_index( - axis=axis, - ascending=ascending, - ignore_index=ignore_index, - inplace=inplace, - na_position=na_position, - ) - - if inplace is True: - assert_eq(pdf, gdf, check_index_type=True) - else: - assert_eq(expected, got, check_index_type=True) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("axis", [0, 1, "index", "columns"]) -@pytest.mark.parametrize( - "level", - [ - 0, - "b", - 1, - ["b"], - "a", - ["a", "b"], - ["b", "a"], - [0, 1], - [1, 0], - [0, 2], - None, - ], -) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in 
older versions of pandas", -) -def test_dataframe_mulitindex_sort_index( - request, axis, level, ascending, inplace, ignore_index, na_position -): - request.applymarker( - pytest.mark.xfail( - condition=axis in (1, "columns") - and level is None - and not ascending - and ignore_index, - reason="https://github.com/pandas-dev/pandas/issues/57293", - ) - ) - pdf = pd.DataFrame( - { - "b": [1.0, 3.0, np.nan], - "a": [1, 4, 3], - 1: ["a", "b", "c"], - "e": [3, 1, 4], - "d": [1, 2, 8], - } - ).set_index(["b", "a", 1]) - gdf = cudf.DataFrame.from_pandas(pdf) - - expected = pdf.sort_index( - axis=axis, - level=level, - ascending=ascending, - inplace=inplace, - na_position=na_position, - ignore_index=ignore_index, - ) - got = gdf.sort_index( - axis=axis, - level=level, - ascending=ascending, - ignore_index=ignore_index, - inplace=inplace, - na_position=na_position, - ) - - if inplace is True: - assert_eq(pdf, gdf) - else: - assert_eq(expected, got) - - -def test_sort_index_axis_1_ignore_index_true_columnaccessor_state_names(): - gdf = cudf.DataFrame([[1, 2, 3]], columns=["b", "a", "c"]) - result = gdf.sort_index(axis=1, ignore_index=True) - assert result._data.names == tuple(result._data.keys()) - - -@pytest.mark.parametrize("dtype", dtypes + ["category"]) -def test_dataframe_0_row_dtype(dtype): - if dtype == "category": - data = pd.Series(["a", "b", "c", "d", "e"], dtype="category") - else: - data = np.array([1, 2, 3, 4, 5], dtype=dtype) - - expect = cudf.DataFrame() - expect["x"] = data - expect["y"] = data - got = expect.head(0) - - for col_name in got.columns: - assert expect[col_name].dtype == got[col_name].dtype - - expect = cudf.Series(data) - got = expect.head(0) - - assert expect.dtype == got.dtype - - -@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_series_list_nanasnull(nan_as_null): - data = [1.0, 2.0, 3.0, np.nan, None] - - expect = pa.array(data, from_pandas=nan_as_null) - got = cudf.Series(data, nan_as_null=nan_as_null).to_arrow() - - # Bug in Arrow 0.14.1 where NaNs aren't handled - expect = expect.cast("int64", safe=False) - got = got.cast("int64", safe=False) - - assert pa.Array.equals(expect, got) - - -def test_column_assignment(): - gdf = cudf.datasets.randomdata( - nrows=20, dtypes={"a": "category", "b": int, "c": float} - ) - new_cols = ["q", "r", "s"] - gdf.columns = new_cols - assert list(gdf.columns) == new_cols - - -def test_select_dtype(): - gdf = cudf.datasets.randomdata( - nrows=20, dtypes={"a": "category", "b": int, "c": float, "d": str} - ) - pdf = gdf.to_pandas() - - assert_eq(pdf.select_dtypes("float64"), gdf.select_dtypes("float64")) - assert_eq(pdf.select_dtypes(np.float64), gdf.select_dtypes(np.float64)) - assert_eq( - pdf.select_dtypes(include=["float64"]), - gdf.select_dtypes(include=["float64"]), - ) - assert_eq( - pdf.select_dtypes(include=["object", "int", "category"]), - gdf.select_dtypes(include=["object", "int", "category"]), - ) - - assert_eq( - pdf.select_dtypes(include=["int64", "float64"]), - gdf.select_dtypes(include=["int64", "float64"]), - ) - assert_eq( - pdf.select_dtypes(include=np.number), - gdf.select_dtypes(include=np.number), - ) - assert_eq( - pdf.select_dtypes(include=[np.int64, np.float64]), - gdf.select_dtypes(include=[np.int64, np.float64]), - ) - - assert_eq( - pdf.select_dtypes(include=["category"]), - gdf.select_dtypes(include=["category"]), - ) - assert_eq( - pdf.select_dtypes(exclude=np.number), - gdf.select_dtypes(exclude=np.number), - ) - - assert_exceptions_equal( - lfunc=pdf.select_dtypes, - 
rfunc=gdf.select_dtypes, - lfunc_args_and_kwargs=([], {"includes": ["Foo"]}), - rfunc_args_and_kwargs=([], {"includes": ["Foo"]}), - ) - - assert_exceptions_equal( - lfunc=pdf.select_dtypes, - rfunc=gdf.select_dtypes, - lfunc_args_and_kwargs=( - [], - {"exclude": np.number, "include": np.number}, - ), - rfunc_args_and_kwargs=( - [], - {"exclude": np.number, "include": np.number}, - ), - ) - - gdf = cudf.DataFrame( - {"A": [3, 4, 5], "C": [1, 2, 3], "D": ["a", "b", "c"]} - ) - pdf = gdf.to_pandas() - assert_eq( - pdf.select_dtypes(include=["object", "int", "category"]), - gdf.select_dtypes(include=["object", "int", "category"]), - ) - assert_eq( - pdf.select_dtypes(include=["object"], exclude=["category"]), - gdf.select_dtypes(include=["object"], exclude=["category"]), - ) - - gdf = cudf.DataFrame({"a": range(10), "b": range(10, 20)}) - pdf = gdf.to_pandas() - assert_eq( - pdf.select_dtypes(include=["category"]), - gdf.select_dtypes(include=["category"]), - ) - assert_eq( - pdf.select_dtypes(include=["float"]), - gdf.select_dtypes(include=["float"]), - ) - assert_eq( - pdf.select_dtypes(include=["object"]), - gdf.select_dtypes(include=["object"]), - ) - assert_eq( - pdf.select_dtypes(include=["int"]), gdf.select_dtypes(include=["int"]) - ) - assert_eq( - pdf.select_dtypes(exclude=["float"]), - gdf.select_dtypes(exclude=["float"]), - ) - assert_eq( - pdf.select_dtypes(exclude=["object"]), - gdf.select_dtypes(exclude=["object"]), - ) - assert_eq( - pdf.select_dtypes(include=["int"], exclude=["object"]), - gdf.select_dtypes(include=["int"], exclude=["object"]), - ) - - assert_exceptions_equal( - lfunc=pdf.select_dtypes, - rfunc=gdf.select_dtypes, - ) - - gdf = cudf.DataFrame( - {"a": cudf.Series([], dtype="int"), "b": cudf.Series([], dtype="str")} - ) - pdf = gdf.to_pandas() - assert_eq( - pdf.select_dtypes(exclude=["object"]), - gdf.select_dtypes(exclude=["object"]), - ) - assert_eq( - pdf.select_dtypes(include=["int"], exclude=["object"]), - gdf.select_dtypes(include=["int"], exclude=["object"]), - ) - - gdf = cudf.DataFrame( - {"int_col": [0, 1, 2], "list_col": [[1, 2], [3, 4], [5, 6]]} - ) - pdf = gdf.to_pandas() - assert_eq( - pdf.select_dtypes("int64"), - gdf.select_dtypes("int64"), - ) - - -def test_select_dtype_datetime(): - gdf = cudf.datasets.timeseries( - start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={"x": int} - ) - gdf = gdf.reset_index() - pdf = gdf.to_pandas() - - assert_eq(pdf.select_dtypes("datetime64"), gdf.select_dtypes("datetime64")) - assert_eq( - pdf.select_dtypes(np.dtype("datetime64")), - gdf.select_dtypes(np.dtype("datetime64")), - ) - assert_eq( - pdf.select_dtypes(include="datetime64"), - gdf.select_dtypes(include="datetime64"), - ) - - -def test_select_dtype_datetime_with_frequency(): - gdf = cudf.datasets.timeseries( - start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={"x": int} - ) - gdf = gdf.reset_index() - pdf = gdf.to_pandas() - - assert_exceptions_equal( - pdf.select_dtypes, - gdf.select_dtypes, - (["datetime64[ms]"],), - (["datetime64[ms]"],), - ) - - -def test_dataframe_describe_exclude(): - np.random.seed(12) - data_length = 10000 - - df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) - pdf = df.to_pandas() - - gdf_results = df.describe(exclude=["float"]) - pdf_results = pdf.describe(exclude=["float"]) - - assert_eq(gdf_results, pdf_results) - - -def test_dataframe_describe_include(): - np.random.seed(12) - data_length = 10000 - - df 
= cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) - pdf = df.to_pandas() - gdf_results = df.describe(include=["int"]) - pdf_results = pdf.describe(include=["int"]) - - assert_eq(gdf_results, pdf_results) - - -def test_dataframe_describe_default(): - np.random.seed(12) - data_length = 10000 - - df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["y"] = np.random.normal(10, 1, data_length) - pdf = df.to_pandas() - gdf_results = df.describe() - pdf_results = pdf.describe() - - assert_eq(pdf_results, gdf_results) - - -def test_series_describe_include_all(): - np.random.seed(12) - data_length = 10000 - - df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) - df["animal"] = np.random.choice(["dog", "cat", "bird"], data_length) - - pdf = df.to_pandas() - gdf_results = df.describe(include="all") - pdf_results = pdf.describe(include="all") - - assert_eq(gdf_results[["x", "y"]], pdf_results[["x", "y"]]) - assert_eq(gdf_results.index, pdf_results.index) - assert_eq(gdf_results.columns, pdf_results.columns) - assert_eq( - gdf_results[["animal"]].fillna(-1).astype("str"), - pdf_results[["animal"]].fillna(-1).astype("str"), - ) - - -def test_dataframe_describe_percentiles(): - np.random.seed(12) - data_length = 10000 - sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99] - - df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["y"] = np.random.normal(10, 1, data_length) - pdf = df.to_pandas() - gdf_results = df.describe(percentiles=sample_percentiles) - pdf_results = pdf.describe(percentiles=sample_percentiles) - - assert_eq(pdf_results, gdf_results) - - -def test_get_numeric_data(): - pdf = pd.DataFrame( - {"x": [1, 2, 3], "y": [1.0, 2.0, 3.0], "z": ["a", "b", "c"]} - ) - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf._get_numeric_data(), gdf._get_numeric_data()) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-15, -1, 0, 1, 15]) -@pytest.mark.parametrize("data_empty", [False, True]) -def test_shift(dtype, period, data_empty): - # TODO : this function currently tests for series.shift() - # but should instead test for dataframe.shift() - if data_empty: - data = None - else: - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, 10, low=-2, high=2) - else: - data = gen_rand(dtype, 10) - - gs = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) - ps = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) - - shifted_outcome = gs.a.shift(period) - expected_outcome = ps.a.shift(period) - - # pandas uses NaNs to signal missing value and force converts the - # results columns to float types - if data_empty: - assert_eq( - shifted_outcome, - expected_outcome, - check_index_type=False, - check_dtype=False, - ) - else: - assert_eq(shifted_outcome, expected_outcome, check_dtype=False) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) -@pytest.mark.parametrize("data_empty", [False, True]) -def test_diff(dtype, period, data_empty): - if data_empty: - data = None - else: - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, 100000, low=-2, high=2) - else: - data = gen_rand(dtype, 100000) - - gdf = cudf.DataFrame({"a": cudf.Series(data, dtype=dtype)}) - pdf = pd.DataFrame({"a": pd.Series(data, dtype=dtype)}) - - 
expected_outcome = pdf.a.diff(period) - diffed_outcome = gdf.a.diff(period).astype(expected_outcome.dtype) - - if data_empty: - assert_eq(diffed_outcome, expected_outcome, check_index_type=False) - else: - assert_eq(diffed_outcome, expected_outcome) - - -@pytest.mark.parametrize("df", _dataframe_na_data()) -@pytest.mark.parametrize("nan_as_null", [True, False, None]) -@pytest.mark.parametrize("api_call", ["isnull", "isna", "notna", "notnull"]) -def test_dataframe_isnull_isna_and_reverse(df, nan_as_null, api_call): - def detect_nan(x): - # Check if the input is a float and if it is nan - return x.apply(lambda v: isinstance(v, float) and np.isnan(v)) - - nan_contains = df.select_dtypes(object).apply(detect_nan) - if nan_as_null is False and ( - nan_contains.any().any() and not nan_contains.all().all() - ): - with pytest.raises(MixedTypeError): - cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) - else: - gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null) - - assert_eq(getattr(df, api_call)(), getattr(gdf, api_call)()) - - # Test individual columns - for col in df: - assert_eq( - getattr(df[col], api_call)(), getattr(gdf[col], api_call)() - ) - - -def test_ndim(): - pdf = pd.DataFrame({"x": range(5), "y": range(5, 10)}) - gdf = cudf.DataFrame.from_pandas(pdf) - assert pdf.ndim == gdf.ndim - assert pdf.x.ndim == gdf.x.ndim - - s = pd.Series(dtype="float64") - gs = cudf.Series() - assert s.ndim == gs.ndim - - -@pytest.mark.parametrize( - "decimals", - [ - -3, - 0, - 5, - pd.Series( - [1, 4, 3, -6], - index=["floats", "ints", "floats_with_nan", "floats_same"], - ), - cudf.Series( - [-4, -2, 12], index=["ints", "floats_with_nan", "floats_same"] - ), - {"floats": -1, "ints": 15, "floats_will_nan": 2}, - ], -) -def test_dataframe_round(decimals): - gdf = cudf.DataFrame( - { - "floats": np.arange(0.5, 10.5, 1), - "ints": np.random.normal(-100, 100, 10), - "floats_with_na": np.array( - [ - 14.123, - 2.343, - np.nan, - 0.0, - -8.302, - np.nan, - 94.313, - None, - -8.029, - np.nan, - ] - ), - "floats_same": np.repeat([-0.6459412758761901], 10), - "bools": np.random.choice([True, None, False], 10), - "strings": np.random.choice(["abc", "xyz", None], 10), - "struct": np.random.choice([{"abc": 1}, {"xyz": 2}, None], 10), - "list": [[1], [2], None, [4], [3]] * 2, - } - ) - pdf = gdf.to_pandas() - - if isinstance(decimals, cudf.Series): - pdecimals = decimals.to_pandas() - else: - pdecimals = decimals - - result = gdf.round(decimals) - expected = pdf.round(pdecimals) - - assert_eq(result, expected) - - -def test_dataframe_round_dict_decimal_validation(): - df = cudf.DataFrame({"A": [0.12], "B": [0.13]}) - with pytest.raises(TypeError): - df.round({"A": 1, "B": 0.5}) - - -@pytest.mark.parametrize( - "data", - [ - [0, 1, 2, 3], - [-2, -1, 2, 3, 5], - [-2, -1, 0, 3, 5], - [True, False, False], - [True], - [False], - [], - [True, None, False], - [True, True, None], - [None, None], - [[0, 5], [1, 6], [2, 7], [3, 8], [4, 9]], - [[1, True], [2, False], [3, False]], - [["a", True], ["b", False], ["c", False]], - ], -) -def test_all(data): - # Provide a dtype when data is empty to avoid future pandas changes. 
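# Illustrative sketch, not part of the removed test: an empty list gives pandas
# nothing to infer a dtype from, so the branch below pins float explicitly
# instead of relying on the historical float64 default for empty Series, which
# pandas has since deprecated and changed.
import pandas as pd

assert pd.Series([], dtype=float).dtype == "float64"  # explicit and warning-free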
- dtype = None if data else float - if np.array(data).ndim <= 1: - pdata = pd.Series(data=data, dtype=dtype) - gdata = cudf.Series.from_pandas(pdata) - got = gdata.all() - expected = pdata.all() - assert_eq(got, expected) - else: - pdata = pd.DataFrame(data, columns=["a", "b"], dtype=dtype).replace( - [None], False - ) - gdata = cudf.DataFrame.from_pandas(pdata) - - # test bool_only - if pdata["b"].dtype == "bool": - got = gdata.all(bool_only=True) - expected = pdata.all(bool_only=True) - assert_eq(got, expected) - else: - got = gdata.all() - expected = pdata.all() - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [0, 1, 2, 3], - [-2, -1, 2, 3, 5], - [-2, -1, 0, 3, 5], - [0, 0, 0, 0, 0], - [0, 0, None, 0], - [True, False, False], - [True], - [False], - [], - [True, None, False], - [True, True, None], - [None, None], - [[0, 5], [1, 6], [2, 7], [3, 8], [4, 9]], - [[1, True], [2, False], [3, False]], - [["a", True], ["b", False], ["c", False]], - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -def test_any(data, axis): - # Provide a dtype when data is empty to avoid future pandas changes. - dtype = float if all(x is None for x in data) or len(data) < 1 else None - if np.array(data).ndim <= 1: - pdata = pd.Series(data=data, dtype=dtype) - gdata = cudf.Series(data=data, dtype=dtype) - - if axis == 1: - with pytest.raises(NotImplementedError): - gdata.any(axis=axis) - else: - got = gdata.any(axis=axis) - expected = pdata.any(axis=axis) - assert_eq(got, expected) - else: - pdata = pd.DataFrame(data, columns=["a", "b"]) - gdata = cudf.DataFrame.from_pandas(pdata) - - # test bool_only - if pdata["b"].dtype == "bool": - got = gdata.any(bool_only=True) - expected = pdata.any(bool_only=True) - assert_eq(got, expected) - else: - got = gdata.any(axis=axis) - expected = pdata.any(axis=axis) - assert_eq(got, expected) - - -@pytest.mark.parametrize("axis", [0, 1]) -def test_empty_dataframe_any(axis): - pdf = pd.DataFrame({}, columns=["a", "b"], dtype=float) - gdf = cudf.DataFrame.from_pandas(pdf) - got = gdf.any(axis=axis) - expected = pdf.any(axis=axis) - assert_eq(got, expected, check_index_type=False) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("a", [[], ["123"]]) -@pytest.mark.parametrize("b", ["123", ["123"]]) -@pytest.mark.parametrize( - "misc_data", - ["123", ["123"] * 20, 123, [1, 2, 0.8, 0.9] * 50, 0.9, 0.00001], -) -@pytest.mark.parametrize("non_list_data", [123, "abc", "zyx", "rapids", 0.8]) -def test_create_dataframe_cols_empty_data(a, b, misc_data, non_list_data): - expected = pd.DataFrame({"a": a}) - actual = cudf.DataFrame.from_pandas(expected) - expected["b"] = b - actual["b"] = b - assert_eq(actual, expected) - - expected = pd.DataFrame({"a": []}) - actual = cudf.DataFrame.from_pandas(expected) - expected["b"] = misc_data - actual["b"] = misc_data - assert_eq(actual, expected) - - expected = pd.DataFrame({"a": a}) - actual = cudf.DataFrame.from_pandas(expected) - expected["b"] = non_list_data - actual["b"] = non_list_data - assert_eq(actual, expected) - - -def test_empty_dataframe_describe(): - pdf = pd.DataFrame({"a": [], "b": []}) - gdf = cudf.from_pandas(pdf) - - expected = pdf.describe() - actual = gdf.describe() - - assert_eq(expected, actual) - - -def test_as_column_types(): - col = column.as_column(cudf.Series([], dtype="float64")) - assert_eq(col.dtype, np.dtype("float64")) - gds = cudf.Series._from_column(col) - pds = pd.Series(pd.Series([], dtype="float64")) - - assert_eq(pds, gds) - - col = column.as_column(cudf.Series([], dtype="float64"), 
dtype="float32") - assert_eq(col.dtype, np.dtype("float32")) - gds = cudf.Series._from_column(col) - pds = pd.Series(pd.Series([], dtype="float32")) - - assert_eq(pds, gds) - - col = column.as_column(cudf.Series([], dtype="float64"), dtype="str") - assert_eq(col.dtype, np.dtype("object")) - gds = cudf.Series._from_column(col) - pds = pd.Series(pd.Series([], dtype="str")) - - assert_eq(pds, gds) - - col = column.as_column(cudf.Series([], dtype="float64"), dtype="object") - assert_eq(col.dtype, np.dtype("object")) - gds = cudf.Series._from_column(col) - pds = pd.Series(pd.Series([], dtype="object")) - - assert_eq(pds, gds) - - pds = pd.Series(np.array([1, 2, 3]), dtype="float32") - gds = cudf.Series._from_column( - column.as_column(np.array([1, 2, 3]), dtype="float32") - ) - - assert_eq(pds, gds) - - pds = pd.Series([1, 2, 3], dtype="float32") - gds = cudf.Series([1, 2, 3], dtype="float32") - - assert_eq(pds, gds) - - pds = pd.Series([], dtype="float64") - gds = cudf.Series._from_column(column.as_column(pds)) - assert_eq(pds, gds) - - pds = pd.Series([1, 2, 4], dtype="int64") - gds = cudf.Series._from_column( - column.as_column(cudf.Series([1, 2, 4]), dtype="int64") - ) - - assert_eq(pds, gds) - - pds = pd.Series([1.2, 18.0, 9.0], dtype="float32") - gds = cudf.Series._from_column( - column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32") - ) - - assert_eq(pds, gds) - - pds = pd.Series([1.2, 18.0, 9.0], dtype="str") - gds = cudf.Series._from_column( - column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str") - ) - - assert_eq(pds, gds) - - pds = pd.Series(pd.Index(["1", "18", "9"]), dtype="int") - gds = cudf.Series(cudf.Index(["1", "18", "9"]), dtype="int") - - assert_eq(pds, gds) - - -def test_one_row_head(): - gdf = cudf.DataFrame({"name": ["carl"], "score": [100]}, index=[123]) - pdf = gdf.to_pandas() - - head_gdf = gdf.head() - head_pdf = pdf.head() - - assert_eq(head_pdf, head_gdf) - - -@pytest.mark.parametrize("index", [None, [123], ["a", "b"]]) -def test_no_cols_head(index): - pdf = pd.DataFrame(index=index) - gdf = cudf.from_pandas(pdf) - - head_gdf = gdf.head() - head_pdf = pdf.head() - - assert_eq(head_pdf, head_gdf) - - -@pytest.mark.parametrize("dtype", ALL_TYPES) -@pytest.mark.parametrize( - "np_dtype,pd_dtype", - [ - tuple(item) - for item in cudf.utils.dtypes.np_dtypes_to_pandas_dtypes.items() - ], -) -def test_series_astype_pandas_nullable(dtype, np_dtype, pd_dtype): - source = cudf.Series([0, 1, None], dtype=dtype) - - expect = source.astype(np_dtype) - got = source.astype(pd_dtype) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("as_dtype", NUMERIC_TYPES) -def test_series_astype_numeric_to_numeric(dtype, as_dtype): - psr = pd.Series([1, 2, 4, 3], dtype=dtype) - gsr = cudf.from_pandas(psr) - assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("as_dtype", NUMERIC_TYPES) -def test_series_astype_numeric_to_numeric_nulls(dtype, as_dtype): - data = [1, 2, None, 3] - sr = cudf.Series(data, dtype=dtype) - got = sr.astype(as_dtype) - expect = cudf.Series([1, 2, None, 3], dtype=as_dtype) - assert_eq(expect, got) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize( - "as_dtype", - [ - "str", - "category", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - ], -) -def test_series_astype_numeric_to_other(dtype, as_dtype): - psr = pd.Series([1, 2, 3], dtype=dtype) - gsr = 
cudf.from_pandas(psr) - assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) - - -@pytest.mark.parametrize( - "as_dtype", - [ - "str", - "int32", - "uint32", - "float32", - "category", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - ], -) -def test_series_astype_string_to_other(as_dtype): - if "datetime64" in as_dtype: - data = ["2001-01-01", "2002-02-02", "2000-01-05"] - else: - data = ["1", "2", "3"] - psr = pd.Series(data) - gsr = cudf.from_pandas(psr) - assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) - - -@pytest.mark.parametrize( - "as_dtype", - [ - "category", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - ], -) -def test_series_astype_datetime_to_other(as_dtype): - data = ["2001-01-01", "2002-02-02", "2001-01-05"] - psr = pd.Series(data) - gsr = cudf.from_pandas(psr) - assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) - - -@pytest.mark.parametrize( - "inp", - [ - ("datetime64[ns]", "2011-01-01 00:00:00.000000000"), - ("datetime64[us]", "2011-01-01 00:00:00.000000"), - ("datetime64[ms]", "2011-01-01 00:00:00.000"), - ("datetime64[s]", "2011-01-01 00:00:00"), - ], -) -def test_series_astype_datetime_to_string(inp): - dtype, expect = inp - base_date = "2011-01-01" - sr = cudf.Series([base_date], dtype=dtype) - got = sr.astype(str)[0] - assert expect == got - - -@pytest.mark.parametrize( - "as_dtype", - [ - "int32", - "uint32", - "float32", - "category", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - "str", - ], -) -def test_series_astype_categorical_to_other(as_dtype): - if "datetime64" in as_dtype: - data = ["2001-01-01", "2002-02-02", "2000-01-05", "2001-01-01"] - else: - data = [1, 2, 3, 1] - psr = pd.Series(data, dtype="category") - gsr = cudf.from_pandas(psr) - assert_eq(psr.astype(as_dtype), gsr.astype(as_dtype)) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_series_astype_to_categorical_ordered(ordered): - psr = pd.Series([1, 2, 3, 1], dtype="category") - gsr = cudf.from_pandas(psr) - - ordered_dtype_pd = pd.CategoricalDtype( - categories=[1, 2, 3], ordered=ordered - ) - ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) - assert_eq( - psr.astype("int32").astype(ordered_dtype_pd).astype("int32"), - gsr.astype("int32").astype(ordered_dtype_gd).astype("int32"), - ) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_series_astype_cat_ordered_to_unordered(ordered): - pd_dtype = pd.CategoricalDtype(categories=[1, 2, 3], ordered=ordered) - pd_to_dtype = pd.CategoricalDtype( - categories=[1, 2, 3], ordered=not ordered - ) - gd_dtype = cudf.CategoricalDtype.from_pandas(pd_dtype) - gd_to_dtype = cudf.CategoricalDtype.from_pandas(pd_to_dtype) - - psr = pd.Series([1, 2, 3], dtype=pd_dtype) - gsr = cudf.Series([1, 2, 3], dtype=gd_dtype) - - expect = psr.astype(pd_to_dtype) - got = gsr.astype(gd_to_dtype) - - assert_eq(expect, got) - - -def test_series_astype_null_cases(): - data = [1, 2, None, 3] - - # numerical to other - assert_eq(cudf.Series(data, dtype="str"), cudf.Series(data).astype("str")) - - assert_eq( - cudf.Series(data, dtype="category"), - cudf.Series(data).astype("category"), - ) - - assert_eq( - cudf.Series(data, dtype="float32"), - cudf.Series(data, dtype="int32").astype("float32"), - ) - - assert_eq( - cudf.Series(data, dtype="float32"), - cudf.Series(data, dtype="uint32").astype("float32"), - ) - - assert_eq( - cudf.Series(data, dtype="datetime64[ms]"), - cudf.Series(data).astype("datetime64[ms]"), - ) - - # 
categorical to other - assert_eq( - cudf.Series(data, dtype="str"), - cudf.Series(data, dtype="category").astype("str"), - ) - - assert_eq( - cudf.Series(data, dtype="float32"), - cudf.Series(data, dtype="category").astype("float32"), - ) - - assert_eq( - cudf.Series(data, dtype="datetime64[ms]"), - cudf.Series(data, dtype="category").astype("datetime64[ms]"), - ) - - # string to other - assert_eq( - cudf.Series([1, 2, None, 3], dtype="int32"), - cudf.Series(["1", "2", None, "3"]).astype("int32"), - ) - - assert_eq( - cudf.Series( - ["2001-01-01", "2001-02-01", None, "2001-03-01"], - dtype="datetime64[ms]", - ), - cudf.Series(["2001-01-01", "2001-02-01", None, "2001-03-01"]).astype( - "datetime64[ms]" - ), - ) - - assert_eq( - cudf.Series(["a", "b", "c", None], dtype="category").to_pandas(), - cudf.Series(["a", "b", "c", None]).astype("category").to_pandas(), - ) - - # datetime to other - data = [ - "2001-01-01 00:00:00.000000", - "2001-02-01 00:00:00.000000", - None, - "2001-03-01 00:00:00.000000", - ] - assert_eq( - cudf.Series(data), - cudf.Series(data, dtype="datetime64[us]").astype("str"), - ) - - assert_eq( - pd.Series(data, dtype="datetime64[ns]").astype("category"), - cudf.from_pandas(pd.Series(data, dtype="datetime64[ns]")).astype( - "category" - ), - ) - - -def test_series_astype_null_categorical(): - sr = cudf.Series([None, None, None], dtype="category") - expect = cudf.Series([None, None, None], dtype="int32") - got = sr.astype("int32") - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ( - pd.Series([3, 3.0]), - pd.Series([2.3, 3.9]), - pd.Series([1.5, 3.9]), - pd.Series([1.0, 2]), - ), - [ - pd.Series([3, 3.0]), - pd.Series([2.3, 3.9]), - pd.Series([1.5, 3.9]), - pd.Series([1.0, 2]), - ], - ], -) -def test_create_dataframe_from_list_like(data): - pdf = pd.DataFrame(data, index=["count", "mean", "std", "min"]) - gdf = cudf.DataFrame(data, index=["count", "mean", "std", "min"]) - - assert_eq(pdf, gdf) - - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - assert_eq(pdf, gdf) - - -def test_create_dataframe_column(): - pdf = pd.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"]) - gdf = cudf.DataFrame(columns=["a", "b", "c"], index=["A", "Z", "X"]) - - assert_eq(pdf, gdf) - - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": [2, 3, 5]}, - columns=["a", "b", "c"], - index=["A", "Z", "X"], - ) - gdf = cudf.DataFrame( - {"a": [1, 2, 3], "b": [2, 3, 5]}, - columns=["a", "b", "c"], - index=["A", "Z", "X"], - ) - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "data", - [ - pd.DataFrame(np.eye(2)), - cudf.DataFrame(np.eye(2)), - np.eye(2), - cupy.eye(2), - None, - [[1, 0], [0, 1]], - [cudf.Series([0, 1]), cudf.Series([1, 0])], - ], -) -@pytest.mark.parametrize( - "columns", - [None, range(2), pd.RangeIndex(2), cudf.RangeIndex(2)], -) -def test_dataframe_columns_returns_rangeindex(data, columns): - if data is None and columns is None: - pytest.skip(f"{data=} and {columns=} not relevant.") - result = cudf.DataFrame(data=data, columns=columns).columns - expected = pd.RangeIndex(range(2)) - assert_eq(result, expected) - - -def test_dataframe_columns_returns_rangeindex_single_col(): - result = cudf.DataFrame([1, 2, 3]).columns - expected = pd.RangeIndex(range(1)) - assert_eq(result, expected) - - -@pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"]) -@pytest.mark.parametrize("idx_data", [[], [1, 2]]) -@pytest.mark.parametrize("data", [None, [], {}]) -def test_dataframe_columns_empty_data_preserves_dtype(dtype, idx_data, data): - result = 
cudf.DataFrame( - data, columns=cudf.Index(idx_data, dtype=dtype) - ).columns - expected = pd.Index(idx_data, dtype=dtype) - assert_eq(result, expected) - - -@pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"]) -def test_dataframe_astype_preserves_column_dtype(dtype): - result = cudf.DataFrame([1], columns=cudf.Index([1], dtype=dtype)) - result = result.astype(np.int32).columns - expected = pd.Index([1], dtype=dtype) - assert_eq(result, expected) - - -def test_dataframe_astype_preserves_column_rangeindex(): - result = cudf.DataFrame([1], columns=range(1)) - result = result.astype(np.int32).columns - expected = pd.RangeIndex(1) - assert_eq(result, expected) - - -@pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"]) -def test_dataframe_fillna_preserves_column_dtype(dtype): - result = cudf.DataFrame([1, None], columns=cudf.Index([1], dtype=dtype)) - result = result.fillna(2).columns - expected = pd.Index([1], dtype=dtype) - assert_eq(result, expected) - - -def test_dataframe_fillna_preserves_column_rangeindex(): - result = cudf.DataFrame([1, None], columns=range(1)) - result = result.fillna(2).columns - expected = pd.RangeIndex(1) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 4], - [], - [5.0, 7.0, 8.0], - pd.Categorical(["a", "b", "c"]), - ["m", "a", "d", "v"], - ], -) -def test_series_values_host_property(data): - pds = pd.Series(data=data, dtype=None if data else float) - gds = cudf.Series(data=data, dtype=None if data else float) - - np.testing.assert_array_equal(pds.values, gds.values_host) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 4], - [], - [5.0, 7.0, 8.0], - pytest.param( - pd.Categorical(["a", "b", "c"]), - marks=pytest_xfail(raises=NotImplementedError), - ), - pytest.param( - ["m", "a", "d", "v"], - marks=pytest_xfail(raises=TypeError), - ), - ], -) -def test_series_values_property(data): - pds = pd.Series(data=data, dtype=None if data else float) - gds = cudf.from_pandas(pds) - gds_vals = gds.values - assert isinstance(gds_vals, cupy.ndarray) - np.testing.assert_array_equal(gds_vals.get(), pds.values) - - -@pytest.mark.parametrize( - "data", - [ - {"A": [1, 2, 3], "B": [4, 5, 6]}, - {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]}, - {"A": [1, 2, 3], "B": [1.0, 2.0, 3.0]}, - {"A": np.float32(np.arange(3)), "B": np.float64(np.arange(3))}, - pytest.param( - {"A": [1, None, 3], "B": [1, 2, None]}, - marks=pytest_xfail( - reason="Nulls not supported by values accessor" - ), - ), - pytest.param( - {"A": [None, None, None], "B": [None, None, None]}, - marks=pytest_xfail( - reason="Nulls not supported by values accessor" - ), - ), - {"A": [], "B": []}, - pytest.param( - {"A": [1, 2, 3], "B": ["a", "b", "c"]}, - marks=pytest_xfail( - reason="str or categorical not supported by values accessor" - ), - ), - pytest.param( - {"A": pd.Categorical(["a", "b", "c"]), "B": ["d", "e", "f"]}, - marks=pytest_xfail( - reason="str or categorical not supported by values accessor" - ), - ), - ], -) -def test_df_values_property(data): - pdf = pd.DataFrame.from_dict(data) - gdf = cudf.DataFrame.from_pandas(pdf) - - pmtr = pdf.values - gmtr = gdf.values.get() - - np.testing.assert_array_equal(pmtr, gmtr) - - -def test_numeric_alpha_value_counts(): - pdf = pd.DataFrame( - { - "numeric": [1, 2, 3, 4, 5, 6, 1, 2, 4] * 10, - "alpha": ["u", "h", "d", "a", "m", "u", "h", "d", "a"] * 10, - } - ) - - gdf = cudf.DataFrame( - { - "numeric": [1, 2, 3, 4, 5, 6, 1, 2, 4] * 10, - "alpha": ["u", "h", "d", "a", "m", "u", "h", "d", "a"] * 10, - 
} - ) - - assert_eq( - pdf.numeric.value_counts().sort_index(), - gdf.numeric.value_counts().sort_index(), - check_dtype=False, - ) - assert_eq( - pdf.alpha.value_counts().sort_index(), - gdf.alpha.value_counts().sort_index(), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "data", - [ - pd.DataFrame( - { - "num_legs": [2, 4], - "num_wings": [2, 0], - "bird_cats": pd.Series( - ["sparrow", "pigeon"], - dtype="category", - index=["falcon", "dog"], - ), - }, - index=["falcon", "dog"], - ), - pd.DataFrame( - {"num_legs": [8, 2], "num_wings": [0, 2]}, - index=["spider", "falcon"], - ), - pd.DataFrame( - { - "num_legs": [8, 2, 1, 0, 2, 4, 5], - "num_wings": [2, 0, 2, 1, 2, 4, -1], - } - ), - pd.DataFrame({"a": ["a", "b", "c"]}, dtype="category"), - pd.DataFrame({"a": ["a", "b", "c"]}), - ], -) -@pytest.mark.parametrize( - "values", - [ - [0, 2], - {"num_wings": [0, 3]}, - pd.DataFrame( - {"num_legs": [8, 2], "num_wings": [0, 2]}, - index=["spider", "falcon"], - ), - pd.DataFrame( - { - "num_legs": [2, 4], - "num_wings": [2, 0], - "bird_cats": pd.Series( - ["sparrow", "pigeon"], - dtype="category", - index=["falcon", "dog"], - ), - }, - index=["falcon", "dog"], - ), - ["sparrow", "pigeon"], - pd.Series(["sparrow", "pigeon"], dtype="category"), - pd.Series([1, 2, 3, 4, 5]), - "abc", - 123, - pd.Series(["a", "b", "c"]), - pd.Series(["a", "b", "c"], dtype="category"), - pd.DataFrame({"a": ["a", "b", "c"]}, dtype="category"), - ], -) -def test_isin_dataframe(data, values): - pdf = data - gdf = cudf.from_pandas(pdf) - - if cudf.api.types.is_scalar(values): - assert_exceptions_equal( - lfunc=pdf.isin, - rfunc=gdf.isin, - lfunc_args_and_kwargs=([values],), - rfunc_args_and_kwargs=([values],), - ) - else: - try: - expected = pdf.isin(values) - except TypeError as e: - # Can't do isin with different categories - if str(e) == ( - "Categoricals can only be compared if 'categories' " - "are the same." 
- ): - return - - if isinstance(values, (pd.DataFrame, pd.Series)): - values = cudf.from_pandas(values) - - got = gdf.isin(values) - assert_eq(got, expected) - - -def test_isin_axis_duplicated_error(): - df = cudf.DataFrame(range(2)) - with pytest.raises(ValueError): - df.isin(cudf.Series(range(2), index=[1, 1])) - - with pytest.raises(ValueError): - df.isin(cudf.DataFrame(range(2), index=[1, 1])) - - with pytest.raises(ValueError): - df.isin(cudf.DataFrame([[1, 2]], columns=[1, 1])) - - -def test_constructor_properties(): - df = cudf.DataFrame() - key1 = "a" - key2 = "b" - val1 = np.array([123], dtype=np.float64) - val2 = np.array([321], dtype=np.float64) - df[key1] = val1 - df[key2] = val2 - - # Correct use of _constructor_sliced (for DataFrame) - assert_eq(df[key1], df._constructor_sliced(val1, name=key1)) - - # Correct use of _constructor_expanddim (for cudf.Series) - assert_eq(df, df[key2]._constructor_expanddim({key1: val1, key2: val2})) - - # Incorrect use of _constructor_sliced (Raises for cudf.Series) - with pytest.raises(NotImplementedError): - df[key1]._constructor_sliced - - # Incorrect use of _constructor_expanddim (Raises for DataFrame) - with pytest.raises(NotImplementedError): - df._constructor_expanddim - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("as_dtype", ALL_TYPES) -def test_df_astype_numeric_to_all(dtype, as_dtype): - if "uint" in dtype: - data = [1, 2, None, 4, 7] - elif "int" in dtype or "longlong" in dtype: - data = [1, 2, None, 4, -7] - elif "float" in dtype: - data = [1.0, 2.0, None, 4.0, np.nan, -7.0] - - gdf = cudf.DataFrame() - - gdf["foo"] = cudf.Series(data, dtype=dtype) - gdf["bar"] = cudf.Series(data, dtype=dtype) - - insert_data = cudf.Series(data, dtype=dtype) - - expect = cudf.DataFrame() - expect["foo"] = insert_data.astype(as_dtype) - expect["bar"] = insert_data.astype(as_dtype) - - got = gdf.astype(as_dtype) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "as_dtype", - [ - "int32", - "float32", - "category", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - ], -) -def test_df_astype_string_to_other(as_dtype): - if "datetime64" in as_dtype: - # change None to "NaT" after this issue is fixed: - # https://github.com/rapidsai/cudf/issues/5117 - data = ["2001-01-01", "2002-02-02", "2000-01-05", None] - elif as_dtype == "int32": - data = [1, 2, 3] - elif as_dtype == "category": - data = ["1", "2", "3", None] - elif "float" in as_dtype: - data = [1.0, 2.0, 3.0, np.nan] - - insert_data = cudf.Series.from_pandas(pd.Series(data, dtype="str")) - expect_data = cudf.Series(data, dtype=as_dtype) - - gdf = cudf.DataFrame() - expect = cudf.DataFrame() - - gdf["foo"] = insert_data - gdf["bar"] = insert_data - - expect["foo"] = expect_data - expect["bar"] = expect_data - - got = gdf.astype(as_dtype) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "as_dtype", - [ - "int64", - "datetime64[s]", - "datetime64[us]", - "datetime64[ns]", - "str", - "category", - ], -) -def test_df_astype_datetime_to_other(as_dtype): - data = [ - "1991-11-20 00:00:00.000", - "2004-12-04 00:00:00.000", - "2016-09-13 00:00:00.000", - None, - ] - - gdf = cudf.DataFrame() - expect = cudf.DataFrame() - - gdf["foo"] = cudf.Series(data, dtype="datetime64[ms]") - gdf["bar"] = cudf.Series(data, dtype="datetime64[ms]") - - if as_dtype == "int64": - expect["foo"] = cudf.Series( - [690595200000, 1102118400000, 1473724800000, None], dtype="int64" - ) - expect["bar"] = cudf.Series( - [690595200000, 1102118400000, 
1473724800000, None], dtype="int64" - ) - elif as_dtype == "str": - expect["foo"] = cudf.Series(data, dtype="str") - expect["bar"] = cudf.Series(data, dtype="str") - elif as_dtype == "category": - expect["foo"] = cudf.Series(gdf["foo"], dtype="category") - expect["bar"] = cudf.Series(gdf["bar"], dtype="category") - else: - expect["foo"] = cudf.Series(data, dtype=as_dtype) - expect["bar"] = cudf.Series(data, dtype=as_dtype) - - got = gdf.astype(as_dtype) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "as_dtype", - [ - "int32", - "float32", - "category", - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", - "str", - ], -) -def test_df_astype_categorical_to_other(as_dtype): - if "datetime64" in as_dtype: - data = ["2001-01-01", "2002-02-02", "2000-01-05", "2001-01-01"] - else: - data = [1, 2, 3, 1] - psr = pd.Series(data, dtype="category") - pdf = pd.DataFrame() - pdf["foo"] = psr - pdf["bar"] = psr - gdf = cudf.DataFrame.from_pandas(pdf) - assert_eq(pdf.astype(as_dtype), gdf.astype(as_dtype)) - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_df_astype_to_categorical_ordered(ordered): - psr = pd.Series([1, 2, 3, 1], dtype="category") - pdf = pd.DataFrame() - pdf["foo"] = psr - pdf["bar"] = psr - gdf = cudf.DataFrame.from_pandas(pdf) - - ordered_dtype_pd = pd.CategoricalDtype( - categories=[1, 2, 3], ordered=ordered - ) - ordered_dtype_gd = cudf.CategoricalDtype.from_pandas(ordered_dtype_pd) - - assert_eq( - pdf.astype(ordered_dtype_pd).astype("int32"), - gdf.astype(ordered_dtype_gd).astype("int32"), - ) - - -@pytest.mark.parametrize( - "dtype", - [dtype for dtype in ALL_TYPES] - + [ - cudf.CategoricalDtype(ordered=True), - cudf.CategoricalDtype(ordered=False), - ], -) -def test_empty_df_astype(dtype): - df = cudf.DataFrame() - result = df.astype(dtype=dtype) - assert_eq(df, result) - assert_eq(df.to_pandas().astype(dtype=dtype), result) - - -@pytest.mark.parametrize( - "errors", - [ - pytest.param( - "raise", marks=pytest_xfail(reason="should raise error here") - ), - pytest.param("other", marks=pytest_xfail(raises=ValueError)), - "ignore", - ], -) -def test_series_astype_error_handling(errors): - sr = cudf.Series(["random", "words"]) - got = sr.astype("datetime64", errors=errors) - assert_eq(sr, got) - - -@pytest.mark.parametrize("dtype", ALL_TYPES) -def test_df_constructor_dtype(dtype): - if "datetime" in dtype: - data = ["1991-11-20", "2004-12-04", "2016-09-13", None] - elif dtype == "str": - data = ["a", "b", "c", None] - elif "float" in dtype: - data = [1.0, 0.5, -1.1, np.nan, None] - elif "bool" in dtype: - data = [True, False, None] - else: - data = [1, 2, 3, None] - - sr = cudf.Series(data, dtype=dtype) - - expect = cudf.DataFrame() - expect["foo"] = sr - expect["bar"] = sr - got = cudf.DataFrame({"foo": data, "bar": data}, dtype=dtype) - - assert_eq(expect, got) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data", - [ - cudf.datasets.randomdata( - nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": int} - ), - cudf.datasets.randomdata( - nrows=10, dtypes={"a": "category", "b": int, "c": float, "d": str} - ), - cudf.datasets.randomdata( - nrows=10, dtypes={"a": bool, "b": int, "c": float, "d": str} - ), - cudf.DataFrame(), - cudf.DataFrame({"a": [0, 1, 2], "b": [1, None, 3]}), - cudf.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [7, np.nan, 9, 10], - "c": cudf.Series( - [np.nan, np.nan, np.nan, np.nan], nan_as_null=False - ), - "d": cudf.Series([None, None, None, None], dtype="int64"), - "e": [100, None, 200, 
None], - "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False), - } - ), - cudf.DataFrame( - { - "a": [10, 11, 12, 13, 14, 15], - "b": cudf.Series( - [10, None, np.nan, 2234, None, np.nan], nan_as_null=False - ), - } - ), - ], -) -@pytest.mark.parametrize( - "op", ["max", "min", "sum", "product", "mean", "var", "std"] -) -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_rowwise_ops(data, op, skipna, numeric_only): - gdf = data - pdf = gdf.to_pandas() - - kwargs = {"axis": 1, "skipna": skipna, "numeric_only": numeric_only} - if op in ("var", "std"): - kwargs["ddof"] = 0 - - if not numeric_only and not all( - ( - (pdf[column].count() == 0) - if skipna - else (pdf[column].notna().count() == 0) - ) - or cudf.api.types.is_numeric_dtype(pdf[column].dtype) - or pdf[column].dtype.kind == "b" - for column in pdf - ): - with pytest.raises(TypeError): - expected = getattr(pdf, op)(**kwargs) - with pytest.raises(TypeError): - got = getattr(gdf, op)(**kwargs) - else: - expected = getattr(pdf, op)(**kwargs) - got = getattr(gdf, op)(**kwargs) - - assert_eq( - expected, - got, - check_dtype=False, - check_index_type=False if len(got.index) == 0 else True, - ) - - -@pytest.mark.parametrize( - "op", ["max", "min", "sum", "product", "mean", "var", "std"] -) -def test_rowwise_ops_nullable_dtypes_all_null(op): - gdf = cudf.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [7, np.nan, 9, 10], - "c": cudf.Series([np.nan, np.nan, np.nan, np.nan], dtype=float), - "d": cudf.Series([None, None, None, None], dtype="int64"), - "e": [100, None, 200, None], - "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False), - } - ) - - expected = cudf.Series([None, None, None, None], dtype="float64") - - if op in ("var", "std"): - got = getattr(gdf, op)(axis=1, ddof=0, skipna=False) - else: - got = getattr(gdf, op)(axis=1, skipna=False) - - assert_eq(got.null_count, expected.null_count) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "op", - [ - "max", - "min", - "sum", - "product", - "mean", - "var", - "std", - ], -) -def test_rowwise_ops_nullable_dtypes_partial_null(op): - gdf = cudf.DataFrame( - { - "a": [10, 11, 12, 13, 14, 15], - "b": cudf.Series( - [10, None, np.nan, 2234, None, np.nan], - nan_as_null=False, - ), - } - ) - - if op in ("var", "std"): - got = getattr(gdf, op)(axis=1, ddof=0, skipna=False) - expected = getattr(gdf.to_pandas(), op)(axis=1, ddof=0, skipna=False) - else: - got = getattr(gdf, op)(axis=1, skipna=False) - expected = getattr(gdf.to_pandas(), op)(axis=1, skipna=False) - - assert_eq(got.null_count, 2) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "op,expected", - [ - ( - "max", - cudf.Series( - [10, None, None, 2234, None, 453], - dtype="int64", - ), - ), - ( - "min", - cudf.Series( - [10, None, None, 13, None, 15], - dtype="int64", - ), - ), - ( - "sum", - cudf.Series( - [20, None, None, 2247, None, 468], - dtype="int64", - ), - ), - ( - "product", - cudf.Series( - [100, None, None, 29042, None, 6795], - dtype="int64", - ), - ), - ( - "mean", - cudf.Series( - [10.0, None, None, 1123.5, None, 234.0], - dtype="float32", - ), - ), - ( - "var", - cudf.Series( - [0.0, None, None, 1233210.25, None, 47961.0], - dtype="float32", - ), - ), - ( - "std", - cudf.Series( - [0.0, None, None, 1110.5, None, 219.0], - dtype="float32", - ), - ), - ], -) -def test_rowwise_ops_nullable_int_dtypes(op, expected): - gdf = cudf.DataFrame( - { - "a": [10, 11, None, 13, None, 15], - "b": cudf.Series( - [10, None, 323, 2234, None, 
453], - nan_as_null=False, - ), - } - ) - - if op in ("var", "std"): - got = getattr(gdf, op)(axis=1, ddof=0, skipna=False) - else: - got = getattr(gdf, op)(axis=1, skipna=False) - - assert_eq(got.null_count, expected.null_count) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - { - "t1": cudf.Series( - ["2020-08-01 09:00:00", "1920-05-01 10:30:00"], dtype=" 0, None, None), - (pd.Series(range(5)), pd.Series(range(5)) > 1, None, None), - (pd.Series(range(5)), pd.Series(range(5)) > 1, 10, None), - ( - pd.Series(range(5)), - pd.Series(range(5)) > 1, - pd.Series(range(5, 10)), - None, - ), - ( - pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]), - ( - pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]) - % 3 - ) - == 0, - -pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"]), - None, - ), - ( - pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}), - pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}) == 4, - None, - None, - ), - ( - pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}), - pd.DataFrame({"a": [1, 2, np.nan], "b": [4, np.nan, 6]}) != 4, - None, - None, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [True, True, True], - None, - ValueError, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [True, True, True, False], - None, - ValueError, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [[True, True, True, False], [True, True, True, False]], - None, - ValueError, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [[True, True], [False, True], [True, False], [False, True]], - None, - None, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - cuda.to_device( - np.array( - [[True, True], [False, True], [True, False], [False, True]] - ) - ), - None, - None, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - cupy.array( - [[True, True], [False, True], [True, False], [False, True]] - ), - 17, - None, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [[True, True], [False, True], [True, False], [False, True]], - 17, - None, - ), - ( - pd.DataFrame({"p": [-2, 3, -4, -79], "k": [9, 10, 11, 12]}), - [ - [True, True, False, True], - [True, True, False, True], - [True, True, False, True], - [True, True, False, True], - ], - None, - ValueError, - ), - ( - pd.Series([1, 2, np.nan]), - pd.Series([1, 2, np.nan]) == 4, - None, - None, - ), - ( - pd.Series([1, 2, np.nan]), - pd.Series([1, 2, np.nan]) != 4, - None, - None, - ), - ( - pd.Series([4, np.nan, 6]), - pd.Series([4, np.nan, 6]) == 4, - None, - None, - ), - ( - pd.Series([4, np.nan, 6]), - pd.Series([4, np.nan, 6]) != 4, - None, - None, - ), - ( - pd.Series([4, np.nan, 6], dtype="category"), - pd.Series([4, np.nan, 6], dtype="category") != 4, - None, - None, - ), - ( - pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category"), - pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category") == "b", - None, - None, - ), - ( - pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category"), - pd.Series(["a", "b", "b", "d", "c", "s"], dtype="category") == "b", - "s", - None, - ), - ( - pd.Series([1, 2, 3, 2, 5]), - pd.Series([1, 2, 3, 2, 5]) == 2, - pd.DataFrame( - { - "a": pd.Series([1, 2, 3, 2, 5]), - "b": pd.Series([1, 2, 3, 2, 5]), - } - ), - NotImplementedError, - ), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_df_sr_mask_where(data, condition, other, error, inplace): - ps_where = data - 
gs_where = cudf.from_pandas(data) - - ps_mask = ps_where.copy(deep=True) - gs_mask = gs_where.copy(deep=True) - - if hasattr(condition, "__cuda_array_interface__"): - if type(condition).__module__.split(".")[0] == "cupy": - ps_condition = cupy.asnumpy(condition) - else: - ps_condition = np.array(condition).astype("bool") - else: - ps_condition = condition - - if type(condition).__module__.split(".")[0] == "pandas": - gs_condition = cudf.from_pandas(condition) - else: - gs_condition = condition - - ps_other = other - if type(other).__module__.split(".")[0] == "pandas": - gs_other = cudf.from_pandas(other) - else: - gs_other = other - - if error is None: - expect_where = ps_where.where( - ps_condition, other=ps_other, inplace=inplace - ) - got_where = gs_where.where( - gs_condition, other=gs_other, inplace=inplace - ) - - expect_mask = ps_mask.mask( - ps_condition, other=ps_other, inplace=inplace - ) - got_mask = gs_mask.mask(gs_condition, other=gs_other, inplace=inplace) - - if inplace: - expect_where = ps_where - got_where = gs_where - - expect_mask = ps_mask - got_mask = gs_mask - - if isinstance(expect_where, pd.Series) and isinstance( - expect_where.dtype, pd.CategoricalDtype - ): - np.testing.assert_array_equal( - expect_where.cat.codes, - got_where.cat.codes.astype(expect_where.cat.codes.dtype) - .fillna(-1) - .to_numpy(), - ) - assert_eq(expect_where.cat.categories, got_where.cat.categories) - - np.testing.assert_array_equal( - expect_mask.cat.codes, - got_mask.cat.codes.astype(expect_mask.cat.codes.dtype) - .fillna(-1) - .to_numpy(), - ) - assert_eq(expect_mask.cat.categories, got_mask.cat.categories) - else: - assert_eq( - expect_where.fillna(-1), - got_where.fillna(-1), - check_dtype=False, - ) - assert_eq( - expect_mask.fillna(-1), got_mask.fillna(-1), check_dtype=False - ) - else: - assert_exceptions_equal( - lfunc=ps_where.where, - rfunc=gs_where.where, - lfunc_args_and_kwargs=( - [ps_condition], - {"other": ps_other, "inplace": inplace}, - ), - rfunc_args_and_kwargs=( - [gs_condition], - {"other": gs_other, "inplace": inplace}, - ), - ) - - assert_exceptions_equal( - lfunc=ps_mask.mask, - rfunc=gs_mask.mask, - lfunc_args_and_kwargs=( - [ps_condition], - {"other": ps_other, "inplace": inplace}, - ), - rfunc_args_and_kwargs=( - [gs_condition], - {"other": gs_other, "inplace": inplace}, - ), - ) - - -@pytest.mark.parametrize( - "data,condition,other,has_cat", - [ - ( - pd.DataFrame( - { - "a": pd.Series(["a", "a", "b", "c", "a", "d", "d", "a"]), - "b": pd.Series(["o", "p", "q", "e", "p", "p", "a", "a"]), - } - ), - pd.DataFrame( - { - "a": pd.Series(["a", "a", "b", "c", "a", "d", "d", "a"]), - "b": pd.Series(["o", "p", "q", "e", "p", "p", "a", "a"]), - } - ) - != "a", - None, - None, - ), - ( - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ) - != "a", - None, - True, - ), - ( - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", 
"a", "a"], - dtype="category", - ), - } - ) - == "a", - None, - True, - ), - ( - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ) - != "a", - "a", - True, - ), - ( - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - ["a", "a", "b", "c", "a", "d", "d", "a"], - dtype="category", - ), - "b": pd.Series( - ["o", "p", "q", "e", "p", "p", "a", "a"], - dtype="category", - ), - } - ) - == "a", - "a", - True, - ), - ], -) -def test_df_string_cat_types_mask_where(data, condition, other, has_cat): - ps = data - gs = cudf.from_pandas(data) - - ps_condition = condition - if type(condition).__module__.split(".")[0] == "pandas": - gs_condition = cudf.from_pandas(condition) - else: - gs_condition = condition - - ps_other = other - if type(other).__module__.split(".")[0] == "pandas": - gs_other = cudf.from_pandas(other) - else: - gs_other = other - - expect_where = ps.where(ps_condition, other=ps_other) - got_where = gs.where(gs_condition, other=gs_other) - - expect_mask = ps.mask(ps_condition, other=ps_other) - got_mask = gs.mask(gs_condition, other=gs_other) - - if has_cat is None: - assert_eq( - expect_where.fillna(-1).astype("str"), - got_where.fillna(-1), - check_dtype=False, - ) - assert_eq( - expect_mask.fillna(-1).astype("str"), - got_mask.fillna(-1), - check_dtype=False, - ) - else: - assert_eq(expect_where, got_where, check_dtype=False) - assert_eq(expect_mask, got_mask, check_dtype=False) - - -@pytest.mark.parametrize( - "data,expected_upcast_type,error", - [ - ( - pd.Series([random.random() for _ in range(10)], dtype="float32"), - np.dtype("float32"), - None, - ), - ( - pd.Series([random.random() for _ in range(10)], dtype="float16"), - None, - TypeError, - ), - ( - pd.Series([random.random() for _ in range(10)], dtype="float64"), - np.dtype("float64"), - None, - ), - ( - pd.Series([random.random() for _ in range(10)], dtype="float128"), - None, - ValueError, - ), - ], -) -def test_from_pandas_unsupported_types(data, expected_upcast_type, error): - pdf = pd.DataFrame({"one_col": data}) - if error is not None: - with pytest.raises(error): - cudf.from_pandas(data) - - with pytest.raises(error): - cudf.Series(data) - - with pytest.raises(error): - cudf.from_pandas(pdf) - - with pytest.raises(error): - cudf.DataFrame(pdf) - else: - df = cudf.from_pandas(data) - - assert_eq(data, df, check_dtype=False) - assert df.dtype == expected_upcast_type - - df = cudf.Series(data) - assert_eq(data, df, check_dtype=False) - assert df.dtype == expected_upcast_type - - df = cudf.from_pandas(pdf) - assert_eq(pdf, df, check_dtype=False) - assert df["one_col"].dtype == expected_upcast_type - - df = cudf.DataFrame(pdf) - assert_eq(pdf, df, check_dtype=False) - assert df["one_col"].dtype == expected_upcast_type - - -@pytest.mark.parametrize("nan_as_null", [True, False]) -@pytest.mark.parametrize("index", [None, "a", ["a", "b"]]) -def test_from_pandas_nan_as_null(nan_as_null, index): - data = [np.nan, 2.0, 3.0] - - if index is None: - pdf = pd.DataFrame({"a": data, "b": data}) - 
expected = cudf.DataFrame( - { - "a": column.as_column(data, nan_as_null=nan_as_null), - "b": column.as_column(data, nan_as_null=nan_as_null), - } - ) - else: - pdf = pd.DataFrame({"a": data, "b": data}).set_index(index) - expected = cudf.DataFrame( - { - "a": column.as_column(data, nan_as_null=nan_as_null), - "b": column.as_column(data, nan_as_null=nan_as_null), - } - ) - expected = cudf.DataFrame( - { - "a": column.as_column(data, nan_as_null=nan_as_null), - "b": column.as_column(data, nan_as_null=nan_as_null), - } - ) - expected = expected.set_index(index) - - got = cudf.from_pandas(pdf, nan_as_null=nan_as_null) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_from_pandas_for_series_nan_as_null(nan_as_null): - data = [np.nan, 2.0, 3.0] - psr = pd.Series(data) - - expected = cudf.Series._from_column( - column.as_column(data, nan_as_null=nan_as_null) - ) - got = cudf.from_pandas(psr, nan_as_null=nan_as_null) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_df_series_dataframe_astype_copy(copy): - gdf = cudf.DataFrame({"col1": [1, 2], "col2": [3, 4]}) - pdf = gdf.to_pandas() - - assert_eq( - gdf.astype(dtype="float", copy=copy), - pdf.astype(dtype="float", copy=copy), - ) - assert_eq(gdf, pdf) - - gsr = cudf.Series([1, 2]) - psr = gsr.to_pandas() - - assert_eq( - gsr.astype(dtype="float", copy=copy), - psr.astype(dtype="float", copy=copy), - ) - assert_eq(gsr, psr) - - gsr = cudf.Series([1, 2]) - psr = gsr.to_pandas() - - actual = gsr.astype(dtype="int64", copy=copy) - expected = psr.astype(dtype="int64", copy=copy) - assert_eq(expected, actual) - assert_eq(gsr, psr) - actual[0] = 3 - expected[0] = 3 - assert_eq(gsr, psr) - - -@pytest.mark.parametrize("copy", [True, False]) -def test_df_series_dataframe_astype_dtype_dict(copy): - gdf = cudf.DataFrame({"col1": [1, 2], "col2": [3, 4]}) - pdf = gdf.to_pandas() - - assert_eq( - gdf.astype(dtype={"col1": "float"}, copy=copy), - pdf.astype(dtype={"col1": "float"}, copy=copy), - ) - assert_eq(gdf, pdf) - - gsr = cudf.Series([1, 2]) - psr = gsr.to_pandas() - - assert_eq( - gsr.astype(dtype={None: "float"}, copy=copy), - psr.astype(dtype={None: "float"}, copy=copy), - ) - assert_eq(gsr, psr) - - assert_exceptions_equal( - lfunc=psr.astype, - rfunc=gsr.astype, - lfunc_args_and_kwargs=([], {"dtype": {"a": "float"}, "copy": copy}), - rfunc_args_and_kwargs=([], {"dtype": {"a": "float"}, "copy": copy}), - ) - - gsr = cudf.Series([1, 2]) - psr = gsr.to_pandas() - - actual = gsr.astype({None: "int64"}, copy=copy) - expected = psr.astype({None: "int64"}, copy=copy) - assert_eq(expected, actual) - assert_eq(gsr, psr) - - actual[0] = 3 - expected[0] = 3 - assert_eq(gsr, psr) - - -@pytest.mark.parametrize( - "data,columns", - [ - ([1, 2, 3, 100, 112, 35464], ["a"]), - (range(100), None), - ( - [], - None, - ), - ((-10, 21, 32, 32, 1, 2, 3), ["p"]), - ( - (), - None, - ), - ([[1, 2, 3], [1, 2, 3]], ["col1", "col2", "col3"]), - ([range(100), range(100)], ["range" + str(i) for i in range(100)]), - (((1, 2, 3), (1, 2, 3)), ["tuple0", "tuple1", "tuple2"]), - ([[1, 2, 3]], ["list col1", "list col2", "list col3"]), - ([[1, 2, 3]], pd.Index(["col1", "col2", "col3"], name="rapids")), - ([range(100)], ["range" + str(i) for i in range(100)]), - (((1, 2, 3),), ["k1", "k2", "k3"]), - ], -) -def test_dataframe_init_1d_list(data, columns): - expect = pd.DataFrame(data, columns=columns) - actual = cudf.DataFrame(data, columns=columns) - - assert_eq( - expect, - actual, - 
check_index_type=len(data) != 0, - ) - - expect = pd.DataFrame(data, columns=None) - actual = cudf.DataFrame(data, columns=None) - - assert_eq( - expect, - actual, - check_index_type=len(data) != 0, - ) - - -@pytest.mark.parametrize( - "data,cols,index", - [ - ( - np.ndarray(shape=(4, 2), dtype=float, order="F"), - ["a", "b"], - ["a", "b", "c", "d"], - ), - ( - np.ndarray(shape=(4, 2), dtype=float, order="F"), - ["a", "b"], - [0, 20, 30, 10], - ), - ( - np.ndarray(shape=(4, 2), dtype=float, order="F"), - ["a", "b"], - [0, 1, 2, 3], - ), - (np.array([11, 123, -2342, 232]), ["a"], [1, 2, 11, 12]), - (np.array([11, 123, -2342, 232]), ["a"], ["khsdjk", "a", "z", "kk"]), - ( - cupy.ndarray(shape=(4, 2), dtype=float, order="F"), - ["a", "z"], - ["a", "z", "a", "z"], - ), - (cupy.array([11, 123, -2342, 232]), ["z"], [0, 1, 1, 0]), - (cupy.array([11, 123, -2342, 232]), ["z"], [1, 2, 3, 4]), - (cupy.array([11, 123, -2342, 232]), ["z"], ["a", "z", "d", "e"]), - (np.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]), - (np.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]), - (cupy.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]), - (cupy.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]), - ], -) -def test_dataframe_init_from_arrays_cols(data, cols, index): - gd_data = data - if isinstance(data, cupy.ndarray): - # pandas can't handle cupy arrays in general - pd_data = data.get() - - # additional test for building DataFrame with gpu array whose - # cuda array interface has no `descr` attribute - numba_data = cuda.as_cuda_array(data) - else: - pd_data = data - numba_data = None - - # verify with columns & index - pdf = pd.DataFrame(pd_data, columns=cols, index=index) - gdf = cudf.DataFrame(gd_data, columns=cols, index=index) - - assert_eq(pdf, gdf, check_dtype=False) - - # verify with columns - pdf = pd.DataFrame(pd_data, columns=cols) - gdf = cudf.DataFrame(gd_data, columns=cols) - - assert_eq(pdf, gdf, check_dtype=False) - - pdf = pd.DataFrame(pd_data) - gdf = cudf.DataFrame(gd_data) - - assert_eq(pdf, gdf, check_dtype=False) - - if numba_data is not None: - gdf = cudf.DataFrame(numba_data) - assert_eq(pdf, gdf, check_dtype=False) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "col_data", - [ - range(5), - ["a", "b", "x", "y", "z"], - [1.0, 0.213, 0.34332], - ["a"], - [1], - [0.2323], - [], - ], -) -@pytest.mark.parametrize( - "assign_val", - [ - 1, - 2, - np.array(2), - cupy.array(2), - 0.32324, - np.array(0.34248), - cupy.array(0.34248), - "abc", - np.array("abc", dtype="object"), - np.array("abc", dtype="str"), - np.array("abc"), - None, - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_dataframe_assign_scalar(request, col_data, assign_val): - request.applymarker( - pytest.mark.xfail( - condition=PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION - and len(col_data) == 0, - reason="https://github.com/pandas-dev/pandas/issues/56679", - ) - ) - pdf = pd.DataFrame({"a": col_data}) - gdf = cudf.DataFrame({"a": col_data}) - - pdf["b"] = ( - cupy.asnumpy(assign_val) - if isinstance(assign_val, cupy.ndarray) - else assign_val - ) - gdf["b"] = assign_val - - assert_eq(pdf, gdf) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "col_data", - [ - 1, - 2, - np.array(2), - cupy.array(2), - 0.32324, - np.array(0.34248), - cupy.array(0.34248), - "abc", - np.array("abc", dtype="object"), - np.array("abc", dtype="str"), - np.array("abc"), - None, - ], -) -@pytest.mark.parametrize( - "assign_val", 
- [ - 1, - 2, - np.array(2), - cupy.array(2), - 0.32324, - np.array(0.34248), - cupy.array(0.34248), - "abc", - np.array("abc", dtype="object"), - np.array("abc", dtype="str"), - np.array("abc"), - None, - ], -) -def test_dataframe_assign_scalar_with_scalar_cols(col_data, assign_val): - pdf = pd.DataFrame( - { - "a": cupy.asnumpy(col_data) - if isinstance(col_data, cupy.ndarray) - else col_data - }, - index=["dummy_mandatory_index"], - ) - gdf = cudf.DataFrame({"a": col_data}, index=["dummy_mandatory_index"]) - - pdf["b"] = ( - cupy.asnumpy(assign_val) - if isinstance(assign_val, cupy.ndarray) - else assign_val - ) - gdf["b"] = assign_val - - assert_eq(pdf, gdf) - - -def test_dataframe_info_basic(): - buffer = io.StringIO() - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - Index: 10 entries, a to 1111 - Data columns (total 10 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 0 10 non-null float64 - 1 1 10 non-null float64 - 2 2 10 non-null float64 - 3 3 10 non-null float64 - 4 4 10 non-null float64 - 5 5 10 non-null float64 - 6 6 10 non-null float64 - 7 7 10 non-null float64 - 8 8 10 non-null float64 - 9 9 10 non-null float64 - dtypes: float64(10) - memory usage: 859.0+ bytes - """ - ) - df = pd.DataFrame( - np.random.randn(10, 10), - index=["a", "2", "3", "4", "5", "6", "7", "8", "100", "1111"], - ) - cudf.from_pandas(df).info(buf=buffer, verbose=True) - s = buffer.getvalue() - assert str_cmp == s - - -def test_dataframe_info_verbose_mem_usage(): - buffer = io.StringIO() - df = pd.DataFrame({"a": [1, 2, 3], "b": ["safdas", "assa", "asdasd"]}) - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - RangeIndex: 3 entries, 0 to 2 - Data columns (total 2 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 a 3 non-null int64 - 1 b 3 non-null object - dtypes: int64(1), object(1) - memory usage: 56.0+ bytes - """ - ) - cudf.from_pandas(df).info(buf=buffer, verbose=True) - s = buffer.getvalue() - assert str_cmp == s - - buffer.truncate(0) - buffer.seek(0) - - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - RangeIndex: 3 entries, 0 to 2 - Columns: 2 entries, a to b - dtypes: int64(1), object(1) - memory usage: 56.0+ bytes - """ - ) - cudf.from_pandas(df).info(buf=buffer, verbose=False) - s = buffer.getvalue() - assert str_cmp == s - - buffer.truncate(0) - buffer.seek(0) - - df = pd.DataFrame( - {"a": [1, 2, 3], "b": ["safdas", "assa", "asdasd"]}, - index=["sdfdsf", "sdfsdfds", "dsfdf"], - ) - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - Index: 3 entries, sdfdsf to dsfdf - Data columns (total 2 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 a 3 non-null int64 - 1 b 3 non-null object - dtypes: int64(1), object(1) - memory usage: 91.0 bytes - """ - ) - cudf.from_pandas(df).info(buf=buffer, verbose=True, memory_usage="deep") - s = buffer.getvalue() - assert str_cmp == s - - buffer.truncate(0) - buffer.seek(0) - - int_values = [1, 2, 3, 4, 5] - text_values = ["alpha", "beta", "gamma", "delta", "epsilon"] - float_values = [0.0, 0.25, 0.5, 0.75, 1.0] - - df = cudf.DataFrame( - { - "int_col": int_values, - "text_col": text_values, - "float_col": float_values, - } - ) - str_cmp = textwrap.dedent( - """\ - <class 'cudf.core.dataframe.DataFrame'> - RangeIndex: 5 entries, 0 to 4 - Data columns (total 3 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 - dtypes: float64(1), int64(1), object(1) - memory usage: 130.0 bytes - """ - ) - df.info(buf=buffer, verbose=True, 
memory_usage="deep") - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - -def test_dataframe_info_null_counts(): - int_values = [1, 2, 3, 4, 5] - text_values = ["alpha", "beta", "gamma", "delta", "epsilon"] - float_values = [0.0, 0.25, 0.5, 0.75, 1.0] - - df = cudf.DataFrame( - { - "int_col": int_values, - "text_col": text_values, - "float_col": float_values, - } - ) - buffer = io.StringIO() - str_cmp = textwrap.dedent( - """\ - - RangeIndex: 5 entries, 0 to 4 - Data columns (total 3 columns): - # Column Dtype - --- ------ ----- - 0 int_col int64 - 1 text_col object - 2 float_col float64 - dtypes: float64(1), int64(1), object(1) - memory usage: 130.0+ bytes - """ - ) - df.info(buf=buffer, verbose=True, null_counts=False) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - df.info(buf=buffer, verbose=True, max_cols=0) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - df = cudf.DataFrame() - - str_cmp = textwrap.dedent( - """\ - - RangeIndex: 0 entries - Empty DataFrame""" - ) - df.info(buf=buffer, verbose=True) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - df = cudf.DataFrame( - { - "a": [1, 2, 3, None, 10, 11, 12, None], - "b": ["a", "b", "c", "sd", "sdf", "sd", None, None], - } - ) - - str_cmp = textwrap.dedent( - """\ - - RangeIndex: 8 entries, 0 to 7 - Data columns (total 2 columns): - # Column Dtype - --- ------ ----- - 0 a int64 - 1 b object - dtypes: int64(1), object(1) - memory usage: 238.0+ bytes - """ - ) - pd.options.display.max_info_rows = 2 - df.info(buf=buffer, max_cols=2, null_counts=None) - pd.reset_option("display.max_info_rows") - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - str_cmp = textwrap.dedent( - """\ - - RangeIndex: 8 entries, 0 to 7 - Data columns (total 2 columns): - # Column Non-Null Count Dtype - --- ------ -------------- ----- - 0 a 6 non-null int64 - 1 b 6 non-null object - dtypes: int64(1), object(1) - memory usage: 238.0+ bytes - """ - ) - - df.info(buf=buffer, max_cols=2, null_counts=None) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - buffer.truncate(0) - buffer.seek(0) - - df.info(buf=buffer, null_counts=True) - actual_string = buffer.getvalue() - assert str_cmp == actual_string - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data1", - [ - [1, 2, 3, 4, 5, 6, 7], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], - [ - 1.9876543, - 2.9876654, - 3.9876543, - 4.1234587, - 5.23, - 6.88918237, - 7.00001, - ], - [ - -1.9876543, - -2.9876654, - -3.9876543, - -4.1234587, - -5.23, - -6.88918237, - -7.00001, - ], - [ - 1.987654321, - 2.987654321, - 3.987654321, - 0.1221, - 2.1221, - 0.112121, - -21.1212, - ], - [ - -1.987654321, - -2.987654321, - -3.987654321, - -0.1221, - -2.1221, - -0.112121, - 21.1212, - ], - ], -) -@pytest.mark.parametrize( - "data2", - [ - [1, 2, 3, 4, 5, 6, 7], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], - [ - 1.9876543, - 2.9876654, - 3.9876543, - 4.1234587, - 5.23, - 6.88918237, - 7.00001, - ], - [ - -1.9876543, - -2.9876654, - -3.9876543, - -4.1234587, - -5.23, - -6.88918237, - -7.00001, - ], - [ - 1.987654321, - 2.987654321, - 3.987654321, - 0.1221, - 2.1221, - 0.112121, - -21.1212, - ], - [ - -1.987654321, - -2.987654321, - -3.987654321, - -0.1221, - -2.1221, - -0.112121, - 21.1212, - ], - ], -) 
-@pytest.mark.parametrize("rtol", [0, 0.01, 1e-05, 1e-08, 5e-1, 50.12]) -@pytest.mark.parametrize("atol", [0, 0.01, 1e-05, 1e-08, 50.12]) -def test_cudf_isclose(data1, data2, rtol, atol): - array1 = cupy.array(data1) - array2 = cupy.array(data2) - - expected = cudf.Series(cupy.isclose(array1, array2, rtol=rtol, atol=atol)) - - actual = cudf.isclose( - cudf.Series(data1), cudf.Series(data2), rtol=rtol, atol=atol - ) - - assert_eq(expected, actual) - actual = cudf.isclose(data1, data2, rtol=rtol, atol=atol) - - assert_eq(expected, actual) - - actual = cudf.isclose( - cupy.array(data1), cupy.array(data2), rtol=rtol, atol=atol - ) - - assert_eq(expected, actual) - - actual = cudf.isclose( - np.array(data1), np.array(data2), rtol=rtol, atol=atol - ) - - assert_eq(expected, actual) - - actual = cudf.isclose( - pd.Series(data1), pd.Series(data2), rtol=rtol, atol=atol - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data1", - [ - [ - -1.9876543, - -2.9876654, - np.nan, - -4.1234587, - -5.23, - -6.88918237, - -7.00001, - ], - [ - 1.987654321, - 2.987654321, - 3.987654321, - 0.1221, - 2.1221, - np.nan, - -21.1212, - ], - ], -) -@pytest.mark.parametrize( - "data2", - [ - [ - -1.9876543, - -2.9876654, - -3.9876543, - -4.1234587, - -5.23, - -6.88918237, - -7.00001, - ], - [ - 1.987654321, - 2.987654321, - 3.987654321, - 0.1221, - 2.1221, - 0.112121, - -21.1212, - ], - [ - -1.987654321, - -2.987654321, - -3.987654321, - np.nan, - np.nan, - np.nan, - 21.1212, - ], - ], -) -@pytest.mark.parametrize("equal_nan", [True, False]) -def test_cudf_isclose_nulls(data1, data2, equal_nan): - array1 = cupy.array(data1) - array2 = cupy.array(data2) - - expected = cudf.Series(cupy.isclose(array1, array2, equal_nan=equal_nan)) - - actual = cudf.isclose( - cudf.Series(data1), cudf.Series(data2), equal_nan=equal_nan - ) - assert_eq(expected, actual, check_dtype=False) - actual = cudf.isclose(data1, data2, equal_nan=equal_nan) - assert_eq(expected, actual, check_dtype=False) - - -def test_cudf_isclose_different_index(): - s1 = cudf.Series( - [-1.9876543, -2.9876654, -3.9876543, -4.1234587, -5.23, -7.00001], - index=[0, 1, 2, 3, 4, 5], - ) - s2 = cudf.Series( - [-1.9876543, -2.9876654, -7.00001, -4.1234587, -5.23, -3.9876543], - index=[0, 1, 5, 3, 4, 2], - ) - - expected = cudf.Series([True] * 6, index=s1.index) - assert_eq(expected, cudf.isclose(s1, s2)) - - s1 = cudf.Series( - [-1.9876543, -2.9876654, -3.9876543, -4.1234587, -5.23, -7.00001], - index=[0, 1, 2, 3, 4, 5], - ) - s2 = cudf.Series( - [-1.9876543, -2.9876654, -7.00001, -4.1234587, -5.23, -3.9876543], - index=[0, 1, 5, 10, 4, 2], - ) - - expected = cudf.Series( - [True, True, True, False, True, True], index=s1.index - ) - assert_eq(expected, cudf.isclose(s1, s2)) - - s1 = cudf.Series( - [-1.9876543, -2.9876654, -3.9876543, -4.1234587, -5.23, -7.00001], - index=[100, 1, 2, 3, 4, 5], - ) - s2 = cudf.Series( - [-1.9876543, -2.9876654, -7.00001, -4.1234587, -5.23, -3.9876543], - index=[0, 1, 100, 10, 4, 2], - ) - - expected = cudf.Series( - [False, True, True, False, True, False], index=s1.index - ) - assert_eq(expected, cudf.isclose(s1, s2)) - - -@pytest.mark.parametrize( - "orient", ["dict", "list", "split", "tight", "records", "index", "series"] -) -@pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)]) -def test_dataframe_to_dict(orient, into): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [9, 5, 3]}, index=[10, 11, 12]) - pdf = df.to_pandas() - - actual = df.to_dict(orient=orient, into=into) - expected = 
pdf.to_dict(orient=orient, into=into) - if orient == "series": - assert actual.keys() == expected.keys() - for key in actual.keys(): - assert_eq(expected[key], actual[key]) - else: - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data, orient, dtype, columns", - [ - ( - {"col_1": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]}, - "columns", - None, - None, - ), - ({"col_1": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]}, "index", None, None), - ( - {"col_1": [None, 2, 1, 0], "col_2": [3, None, 1, 0]}, - "index", - None, - ["A", "B", "C", "D"], - ), - ( - { - "col_1": ["ab", "cd", "ef", "gh"], - "col_2": ["zx", "one", "two", "three"], - }, - "index", - None, - ["A", "B", "C", "D"], - ), - ( - { - "index": [("a", "b"), ("a", "c")], - "columns": [("x", 1), ("y", 2)], - "data": [[1, 3], [2, 4]], - "index_names": ["n1", "n2"], - "column_names": ["z1", "z2"], - }, - "tight", - "float64", - None, - ), - ], -) -def test_dataframe_from_dict(data, orient, dtype, columns): - expected = pd.DataFrame.from_dict( - data=data, orient=orient, dtype=dtype, columns=columns - ) - - actual = cudf.DataFrame.from_dict( - data=data, orient=orient, dtype=dtype, columns=columns - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("dtype", ["int64", "str", None]) -def test_dataframe_from_dict_transposed(dtype): - pd_data = {"a": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]} - gd_data = {key: cudf.Series(val) for key, val in pd_data.items()} - - expected = pd.DataFrame.from_dict(pd_data, orient="index", dtype=dtype) - actual = cudf.DataFrame.from_dict(gd_data, orient="index", dtype=dtype) - - gd_data = {key: cupy.asarray(val) for key, val in pd_data.items()} - actual = cudf.DataFrame.from_dict(gd_data, orient="index", dtype=dtype) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "pd_data, gd_data, orient, dtype, columns", - [ - ( - {"col_1": np.array([3, 2, 1, 0]), "col_2": np.array([3, 2, 1, 0])}, - { - "col_1": cupy.array([3, 2, 1, 0]), - "col_2": cupy.array([3, 2, 1, 0]), - }, - "columns", - None, - None, - ), - ( - {"col_1": np.array([3, 2, 1, 0]), "col_2": np.array([3, 2, 1, 0])}, - { - "col_1": cupy.array([3, 2, 1, 0]), - "col_2": cupy.array([3, 2, 1, 0]), - }, - "index", - None, - None, - ), - ( - { - "col_1": np.array([None, 2, 1, 0]), - "col_2": np.array([3, None, 1, 0]), - }, - { - "col_1": cupy.array([np.nan, 2, 1, 0]), - "col_2": cupy.array([3, np.nan, 1, 0]), - }, - "index", - None, - ["A", "B", "C", "D"], - ), - ( - { - "col_1": np.array(["ab", "cd", "ef", "gh"]), - "col_2": np.array(["zx", "one", "two", "three"]), - }, - { - "col_1": np.array(["ab", "cd", "ef", "gh"]), - "col_2": np.array(["zx", "one", "two", "three"]), - }, - "index", - None, - ["A", "B", "C", "D"], - ), - ( - { - "index": [("a", "b"), ("a", "c")], - "columns": [("x", 1), ("y", 2)], - "data": [np.array([1, 3]), np.array([2, 4])], - "index_names": ["n1", "n2"], - "column_names": ["z1", "z2"], - }, - { - "index": [("a", "b"), ("a", "c")], - "columns": [("x", 1), ("y", 2)], - "data": [cupy.array([1, 3]), cupy.array([2, 4])], - "index_names": ["n1", "n2"], - "column_names": ["z1", "z2"], - }, - "tight", - "float64", - None, - ), - ], -) -def test_dataframe_from_dict_cp_np_arrays( - pd_data, gd_data, orient, dtype, columns -): - expected = pd.DataFrame.from_dict( - data=pd_data, orient=orient, dtype=dtype, columns=columns - ) - - actual = cudf.DataFrame.from_dict( - data=gd_data, orient=orient, dtype=dtype, columns=columns - ) - - assert_eq(expected, actual, check_dtype=dtype is not None) - - -@pytest.mark.parametrize( - "df", 
- [ - pd.DataFrame({"a": [1, 2, 3, 4, 5, 10, 11, 12, 33, 55, 19]}), - pd.DataFrame( - { - "one": [1, 2, 3, 4, 5, 10], - "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], - } - ), - pd.DataFrame( - { - "one": [1, 2, 3, 4, 5, 10], - "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], - }, - index=[10, 20, 30, 40, 50, 60], - ), - pd.DataFrame( - { - "one": [1, 2, 3, 4, 5, 10], - "two": ["abc", "def", "ghi", "xyz", "pqr", "abc"], - }, - index=["a", "b", "c", "d", "e", "f"], - ), - pd.DataFrame(index=["a", "b", "c", "d", "e", "f"]), - pd.DataFrame(columns=["a", "b", "c", "d", "e", "f"]), - pd.DataFrame(index=[10, 11, 12]), - pd.DataFrame(columns=[10, 11, 12]), - pd.DataFrame(), - pd.DataFrame({"one": [], "two": []}), - pd.DataFrame({2: [], 1: []}), - pd.DataFrame( - { - 0: [1, 2, 3, 4, 5, 10], - 1: ["abc", "def", "ghi", "xyz", "pqr", "abc"], - 100: ["a", "b", "b", "x", "z", "a"], - }, - index=[10, 20, 30, 40, 50, 60], - ), - ], -) -def test_dataframe_keys(df): - gdf = cudf.from_pandas(df) - - assert_eq( - df.keys(), - gdf.keys(), - ) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series([1, 2, 3, 4, 5, 10, 11, 12, 33, 55, 19]), - pd.Series(["abc", "def", "ghi", "xyz", "pqr", "abc"]), - pd.Series( - [1, 2, 3, 4, 5, 10], - index=["abc", "def", "ghi", "xyz", "pqr", "abc"], - ), - pd.Series( - ["abc", "def", "ghi", "xyz", "pqr", "abc"], - index=[1, 2, 3, 4, 5, 10], - ), - pd.Series(index=["a", "b", "c", "d", "e", "f"], dtype="float64"), - pd.Series(index=[10, 11, 12], dtype="float64"), - pd.Series(dtype="float64"), - pd.Series([], dtype="float64"), - ], -) -def test_series_keys(ps): - gds = cudf.from_pandas(ps) - - assert_eq(ps.keys(), gds.keys()) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB")), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[10, 20]), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[7, 8]), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[7, 20, 11, 9], - ), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[100]), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame( - {"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}, - index=[100, 200, 300, 400, 500, 0], - ), - ], -) -@pytest.mark.parametrize( - "other", - [ - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - pd.DataFrame([[5, 6], [7, 8]], columns=list("BD")), - pd.DataFrame([[5, 6], [7, 8]], columns=list("DE")), - pd.DataFrame(), - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[200]), - pd.DataFrame([]), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - pd.DataFrame([], index=[100]), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - ], -) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("ignore_index", [True, False]) -def 
test_dataframe_concat_dataframe(df, other, sort, ignore_index): - pdf = df - other_pd = other - - gdf = cudf.from_pandas(df) - other_gd = cudf.from_pandas(other) - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - [pdf, other_pd], sort=sort, ignore_index=ignore_index - ) - actual = cudf.concat( - [gdf, other_gd], sort=sort, ignore_index=ignore_index - ) - - # In empty dataframe cases, Pandas & cudf differ in columns - # creation, pandas creates RangeIndex(0, 0) - # whereas cudf creates an empty Index([], dtype="object"). - check_column_type = ( - False if len(expected.columns) == len(df.columns) == 0 else True - ) - - if expected.shape != df.shape: - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_column_type=check_column_type, - ) - else: - assert_eq( - expected, - actual, - check_index_type=not gdf.empty, - check_column_type=check_column_type, - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame({12: [], 22: []}), - pd.DataFrame([[1, 2], [3, 4]], columns=[10, 20]), - pd.DataFrame([[1, 2], [3, 4]], columns=[0, 1], index=[10, 20]), - pd.DataFrame([[1, 2], [3, 4]], columns=[1, 0], index=[7, 8]), - pd.DataFrame( - { - 23: [315.3324, 3243.32432, 3232.332, -100.32], - 33: [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - 0: [315.3324, 3243.32432, 3232.332, -100.32], - 1: [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[7, 20, 11, 9], - ), - ], -) -@pytest.mark.parametrize( - "other", - [ - pd.Series([10, 11, 23, 234, 13]), - pd.Series([10, 11, 23, 234, 13], index=[11, 12, 13, 44, 33]), - {1: 1}, - {0: 10, 1: 100, 2: 102}, - ], -) -@pytest.mark.parametrize("sort", [False, True]) -def test_dataframe_concat_series(df, other, sort): - pdf = df - gdf = cudf.from_pandas(df) - - if isinstance(other, dict): - other_pd = pd.Series(other) - else: - other_pd = other - other_gd = cudf.from_pandas(other_pd) - - expected = pd.concat([pdf, other_pd], ignore_index=True, sort=sort) - actual = cudf.concat([gdf, other_gd], ignore_index=True, sort=sort) - - if expected.shape != df.shape: - # Ignore the column type comparison because pandas incorrectly - # returns pd.Index([1, 2, 3], dtype="object") instead - # of pd.Index([1, 2, 3], dtype="int64") - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_column_type=False, - check_index_type=True, - ) - else: - assert_eq(expected, actual, check_index_type=not gdf.empty) - - -def test_dataframe_concat_series_mixed_index(): - df = cudf.DataFrame({"first": [], "d": []}) - pdf = df.to_pandas() - - sr = cudf.Series([1, 2, 3, 4]) - psr = sr.to_pandas() - - assert_eq( - cudf.concat([df, sr], ignore_index=True), - pd.concat([pdf, psr], ignore_index=True), - check_dtype=False, - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[10, 20, 30]), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB")), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[10, 20]), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[7, 8]), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[7, 20, 11, 9], - ), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, 
index=[100]), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame( - {"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}, - index=[100, 200, 300, 400, 500, 0], - ), - ], -) -@pytest.mark.parametrize( - "other", - [ - [pd.DataFrame([[5, 6], [7, 8]], columns=list("AB"))], - [ - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - pd.DataFrame([[5, 6], [7, 8]], columns=list("BD")), - pd.DataFrame([[5, 6], [7, 8]], columns=list("DE")), - ], - [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()], - [ - pd.DataFrame( - {"c": [10, 11, 22, 33, 44, 100]}, index=[7, 8, 9, 10, 11, 20] - ), - pd.DataFrame(), - pd.DataFrame(), - pd.DataFrame([[5, 6], [7, 8]], columns=list("AB")), - ], - [ - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[200]), - ], - [pd.DataFrame([]), pd.DataFrame([], index=[100])], - [ - pd.DataFrame([]), - pd.DataFrame([], index=[100]), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - ], - [ - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - ], - [ - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - ], - [ - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - ], - [ - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[0, 100, 200, 300], - ), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - ], - ], -) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_concat_dataframe_lists(df, other, sort, ignore_index): - pdf = df - other_pd = other - - gdf = cudf.from_pandas(df) - other_gd = [cudf.from_pandas(o) for o in other] - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - [pdf] + other_pd, sort=sort, ignore_index=ignore_index - ) - actual = cudf.concat( - [gdf] + other_gd, sort=sort, ignore_index=ignore_index - ) - - # In some cases, Pandas creates an empty Index([], dtype="object") for - # columns whereas cudf creates a RangeIndex(0, 0). 
- check_column_type = ( - False if len(expected.columns) == len(df.columns) == 0 else True - ) - - if expected.shape != df.shape: - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_column_type=check_column_type, - ) - else: - assert_eq( - expected, - actual, - check_index_type=not gdf.empty, - check_column_type=check_column_type, - ) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame({"A": [1, 2, 3, np.nan, None, 6]}), - pd.Series([1, 2, 3, None, np.nan, 5, 6, np.nan]), - ], -) -@pytest.mark.parametrize("alias", ["bfill", "backfill"]) -def test_dataframe_bfill(df, alias): - gdf = cudf.from_pandas(df) - - with expect_warning_if(alias == "backfill"): - actual = getattr(df, alias)() - with expect_warning_if(alias == "backfill"): - expected = getattr(gdf, alias)() - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame({"A": [1, 2, 3, np.nan, None, 6]}), - pd.Series([1, 2, 3, None, np.nan, 5, 6, np.nan]), - ], -) -@pytest.mark.parametrize("alias", ["ffill", "pad"]) -def test_dataframe_ffill(df, alias): - gdf = cudf.from_pandas(df) - - with expect_warning_if(alias == "pad"): - actual = getattr(df, alias)() - with expect_warning_if(alias == "pad"): - expected = getattr(gdf, alias)() - assert_eq(expected, actual) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB")), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[10, 20]), - pd.DataFrame([[1, 2], [3, 4]], columns=list("AB"), index=[7, 8]), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - } - ), - pd.DataFrame( - { - "a": [315.3324, 3243.32432, 3232.332, -100.32], - "z": [0.3223, 0.32, 0.0000232, 0.32224], - }, - index=[7, 20, 11, 9], - ), - pd.DataFrame({"l": [10]}), - pd.DataFrame({"l": [10]}, index=[100]), - pd.DataFrame({"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}), - pd.DataFrame( - {"f": [10.2, 11.2332, 0.22, 3.3, 44.23, 10.0]}, - index=[100, 200, 300, 400, 500, 0], - ), - pd.DataFrame({"first_col": [], "second_col": [], "third_col": []}), - ], -) -@pytest.mark.parametrize( - "other", - [ - [[1, 2], [10, 100]], - [[1, 2, 10, 100, 0.1, 0.2, 0.0021]], - [[]], - [[], [], [], []], - [[0.23, 0.00023, -10.00, 100, 200, 1000232, 1232.32323]], - ], -) -@pytest.mark.parametrize("sort", [False, True]) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_dataframe_concat_lists(df, other, sort, ignore_index): - pdf = df - other_pd = [pd.DataFrame(o) for o in other] - - gdf = cudf.from_pandas(df) - other_gd = [cudf.from_pandas(o) for o in other_pd] - - with _hide_concat_empty_dtype_warning(): - expected = pd.concat( - [pdf] + other_pd, sort=sort, ignore_index=ignore_index - ) - actual = cudf.concat( - [gdf] + other_gd, sort=sort, ignore_index=ignore_index - ) - - if expected.shape != df.shape: - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_column_type=not gdf.empty, - ) - else: - assert_eq( - expected, - actual, - check_index_type=not gdf.empty, - check_column_type=len(gdf.columns) != 0, - ) - - -def test_dataframe_concat_series_without_name(): - df = cudf.DataFrame({"a": [1, 2, 3]}) - pdf = df.to_pandas() - gs = cudf.Series([1, 2, 3]) - ps = gs.to_pandas() - - assert_eq(pd.concat([pdf, ps]), cudf.concat([df, gs])) - - -def test_cudf_arrow_array_error(): - df = cudf.DataFrame({"a": [1, 2, 3]}) - - with pytest.raises( - TypeError, - match="Implicit conversion to a 
host PyArrow object via " - "__arrow_array__ is not allowed. Consider using .to_arrow()", - ): - df.__arrow_array__() - - sr = cudf.Series([1, 2, 3]) - - with pytest.raises( - TypeError, - match="Implicit conversion to a host PyArrow object via " - "__arrow_array__ is not allowed. Consider using .to_arrow()", - ): - sr.__arrow_array__() - - sr = cudf.Series(["a", "b", "c"]) - with pytest.raises( - TypeError, - match="Implicit conversion to a host PyArrow object via " - "__arrow_array__ is not allowed. Consider using .to_arrow()", - ): - sr.__arrow_array__() - - -@pytest.mark.parametrize( - "make_weights_axis_1", - [lambda _: None, lambda s: [1] * s, lambda s: np.ones(s)], -) -def test_sample_axis_1( - sample_n_frac, random_state_tuple_axis_1, make_weights_axis_1 -): - n, frac = sample_n_frac - pd_random_state, gd_random_state, checker = random_state_tuple_axis_1 - - pdf = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "float": [0.05, 0.2, 0.3, 0.2, 0.25], - "int": [1, 3, 5, 4, 2], - }, - ) - df = cudf.DataFrame.from_pandas(pdf) - - weights = make_weights_axis_1(len(pdf.columns)) - - expected = pdf.sample( - n=n, - frac=frac, - replace=False, - random_state=pd_random_state, - weights=weights, - axis=1, - ) - got = df.sample( - n=n, - frac=frac, - replace=False, - random_state=gd_random_state, - weights=weights, - axis=1, - ) - checker(expected, got) - - -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "float": [0.05, 0.2, 0.3, 0.2, 0.25], - "int": [1, 3, 5, 4, 2], - }, - ), - pd.Series([1, 2, 3, 4, 5]), - ], -) -@pytest.mark.parametrize("replace", [True, False]) -def test_sample_axis_0( - pdf, sample_n_frac, replace, random_state_tuple_axis_0, make_weights_axis_0 -): - n, frac = sample_n_frac - pd_random_state, gd_random_state, checker = random_state_tuple_axis_0 - - df = cudf.from_pandas(pdf) - - pd_weights, gd_weights = make_weights_axis_0( - len(pdf), isinstance(gd_random_state, np.random.RandomState) - ) - if ( - not replace - and not isinstance(gd_random_state, np.random.RandomState) - and gd_weights is not None - ): - pytest.skip( - "`cupy.random.RandomState` doesn't support weighted sampling " - "without replacement." 
- ) - - expected = pdf.sample( - n=n, - frac=frac, - replace=replace, - random_state=pd_random_state, - weights=pd_weights, - axis=0, - ) - - got = df.sample( - n=n, - frac=frac, - replace=replace, - random_state=gd_random_state, - weights=gd_weights, - axis=0, - ) - checker(expected, got) - - -@pytest.mark.parametrize("replace", [True, False]) -@pytest.mark.parametrize( - "random_state_lib", [cupy.random.RandomState, np.random.RandomState] -) -def test_sample_reproducibility(replace, random_state_lib): - df = cudf.DataFrame({"a": cupy.arange(0, 1024)}) - - n = 1024 - expected = df.sample(n, replace=replace, random_state=random_state_lib(10)) - out = df.sample(n, replace=replace, random_state=random_state_lib(10)) - - assert_eq(expected, out) - - -@pytest.mark.parametrize("axis", [0, 1]) -def test_sample_invalid_n_frac_combo(axis): - n, frac = 2, 0.5 - pdf = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "float": [0.05, 0.2, 0.3, 0.2, 0.25], - "int": [1, 3, 5, 4, 2], - }, - ) - df = cudf.DataFrame.from_pandas(pdf) - - assert_exceptions_equal( - lfunc=pdf.sample, - rfunc=df.sample, - lfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), - rfunc_args_and_kwargs=([], {"n": n, "frac": frac, "axis": axis}), - ) - - -@pytest.mark.parametrize("n, frac", [(100, None), (None, 3)]) -@pytest.mark.parametrize("axis", [0, 1]) -def test_oversample_without_replace(n, frac, axis): - pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}) - df = cudf.DataFrame.from_pandas(pdf) - - assert_exceptions_equal( - lfunc=pdf.sample, - rfunc=df.sample, - lfunc_args_and_kwargs=( - [], - {"n": n, "frac": frac, "axis": axis, "replace": False}, - ), - rfunc_args_and_kwargs=( - [], - {"n": n, "frac": frac, "axis": axis, "replace": False}, - ), - ) - - -@pytest.mark.parametrize("random_state", [None, cupy.random.RandomState(42)]) -def test_sample_unsupported_arguments(random_state): - df = cudf.DataFrame({"float": [0.05, 0.2, 0.3, 0.2, 0.25]}) - with pytest.raises( - NotImplementedError, - match="Random sampling with cupy does not support these inputs.", - ): - df.sample( - n=2, replace=False, random_state=random_state, weights=[1] * 5 - ) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[100, 10, 1, 0]), - pd.DataFrame(columns=["a", "b", "c", "d"]), - pd.DataFrame(columns=["a", "b", "c", "d"], index=[100]), - pd.DataFrame( - columns=["a", "b", "c", "d"], index=[100, 10000, 2131, 133] - ), - pd.DataFrame({"a": [1, 2, 3], "b": ["abc", "xyz", "klm"]}), - ], -) -def test_dataframe_empty(df): - pdf = df - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf.empty, gdf.empty) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame(), - pd.DataFrame(index=[100, 10, 1, 0]), - pd.DataFrame(columns=["a", "b", "c", "d"]), - pd.DataFrame(columns=["a", "b", "c", "d"], index=[100]), - pd.DataFrame( - columns=["a", "b", "c", "d"], index=[100, 10000, 2131, 133] - ), - pd.DataFrame({"a": [1, 2, 3], "b": ["abc", "xyz", "klm"]}), - ], -) -def test_dataframe_size(df): - pdf = df - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf.size, gdf.size) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series(dtype="float64"), - pd.Series(index=[100, 10, 1, 0], dtype="float64"), - pd.Series([], dtype="float64"), - pd.Series(["a", "b", "c", "d"]), - pd.Series(["a", "b", "c", "d"], index=[0, 1, 10, 11]), - ], -) -def test_series_empty(ps): - ps = ps - gs = cudf.from_pandas(ps) - - assert_eq(ps.empty, gs.empty) - - -@pytest.mark.parametrize( - "data", - [ - None, - [], - [1], - {"a": [10, 11, 12]}, - { - "a": [10, 11, 12], - 
"another column name": [12, 22, 34], - "xyz": [0, 10, 11], - }, - ], -) -@pytest.mark.parametrize( - "columns", - [["a"], ["another column name"], None, pd.Index(["a"], name="index name")], -) -def test_dataframe_init_with_columns(data, columns): - pdf = pd.DataFrame(data, columns=columns) - gdf = cudf.DataFrame(data, columns=columns) - - assert_eq( - pdf, - gdf, - check_index_type=len(pdf.index) != 0, - check_dtype=not (pdf.empty and len(pdf.columns)), - check_column_type=False, - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data, ignore_dtype", - [ - ([pd.Series([1, 2, 3])], False), - ([pd.Series(index=[1, 2, 3], dtype="float64")], False), - ([pd.Series(name="empty series name", dtype="float64")], False), - ( - [pd.Series([1]), pd.Series([], dtype="float64"), pd.Series([3])], - False, - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], dtype="float64"), - pd.Series([3], name="series that is named"), - ], - False, - ), - ([pd.Series([1, 2, 3], name="hi")] * 10, False), - ([pd.Series([1, 2, 3], name=None, index=[10, 11, 12])] * 10, False), - ( - [ - pd.Series([1, 2, 3], name=None, index=[10, 11, 12]), - pd.Series([1, 2, 30], name=None, index=[13, 144, 15]), - ], - True, - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], dtype="float64"), - pd.Series(index=[10, 11, 12], dtype="float64"), - ], - False, - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], name="abc", dtype="float64"), - pd.Series(index=[10, 11, 12], dtype="float64"), - ], - False, - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([1, -100, 200, -399, 400], name="abc"), - pd.Series([111, 222, 333], index=[10, 11, 12]), - ], - False, - ), - ], -) -@pytest.mark.parametrize( - "columns", - [ - None, - ["0"], - [0], - ["abc"], - [144, 13], - [2, 1, 0], - pd.Index(["abc"], name="custom_name"), - ], -) -def test_dataframe_init_from_series_list(data, ignore_dtype, columns): - gd_data = [cudf.from_pandas(obj) for obj in data] - - expected = pd.DataFrame(data, columns=columns) - actual = cudf.DataFrame(gd_data, columns=columns) - - if ignore_dtype: - # When a union is performed to generate columns, - # the order is never guaranteed. Hence sort by - # columns before comparison. 
- if not expected.columns.equals(actual.columns): - expected = expected.sort_index(axis=1) - actual = actual.sort_index(axis=1) - assert_eq( - expected.fillna(-1), - actual.fillna(-1), - check_dtype=False, - check_index_type=True, - ) - else: - assert_eq( - expected, - actual, - check_index_type=True, - check_column_type=False, - ) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data, ignore_dtype, index", - [ - ([pd.Series([1, 2, 3])], False, ["a", "b", "c"]), - ([pd.Series(index=[1, 2, 3], dtype="float64")], False, ["a", "b"]), - ( - [pd.Series(name="empty series name", dtype="float64")], - False, - ["index1"], - ), - ( - [pd.Series([1]), pd.Series([], dtype="float64"), pd.Series([3])], - False, - ["0", "2", "1"], - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], dtype="float64"), - pd.Series([3], name="series that is named"), - ], - False, - ["_", "+", "*"], - ), - ([pd.Series([1, 2, 3], name="hi")] * 10, False, ["mean"] * 10), - ( - [pd.Series([1, 2, 3], name=None, index=[10, 11, 12])] * 10, - False, - ["abc"] * 10, - ), - ( - [ - pd.Series([1, 2, 3], name=None, index=[10, 11, 12]), - pd.Series([1, 2, 30], name=None, index=[13, 144, 15]), - ], - True, - ["set_index_a", "set_index_b"], - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], dtype="float64"), - pd.Series(index=[10, 11, 12], dtype="float64"), - ], - False, - ["a", "b", "c"], - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], name="abc", dtype="float64"), - pd.Series(index=[10, 11, 12], dtype="float64"), - ], - False, - ["a", "v", "z"], - ), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([1, -100, 200, -399, 400], name="abc"), - pd.Series([111, 222, 333], index=[10, 11, 12]), - ], - False, - ["a", "v", "z"], - ), - ], -) -@pytest.mark.parametrize( - "columns", [None, ["0"], [0], ["abc"], [144, 13], [2, 1, 0]] -) -def test_dataframe_init_from_series_list_with_index( - data, - ignore_dtype, - index, - columns, -): - gd_data = [cudf.from_pandas(obj) for obj in data] - - expected = pd.DataFrame(data, columns=columns, index=index) - actual = cudf.DataFrame(gd_data, columns=columns, index=index) - - if ignore_dtype: - # When a union is performed to generate columns, - # the order is never guaranteed. Hence sort by - # columns before comparison. 
- if not expected.columns.equals(actual.columns): - expected = expected.sort_index(axis=1) - actual = actual.sort_index(axis=1) - assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) - else: - assert_eq(expected, actual, check_column_type=False) - - -@pytest.mark.parametrize( - "data, index", - [ - ([pd.Series([1, 2]), pd.Series([1, 2])], ["a", "b", "c"]), - ( - [ - pd.Series([1, 0.324234, 32424.323, -1233, 34242]), - pd.Series([], dtype="float64"), - pd.Series([3], name="series that is named"), - ], - ["_", "+"], - ), - ([pd.Series([1, 2, 3], name="hi")] * 10, ["mean"] * 9), - ], -) -def test_dataframe_init_from_series_list_with_index_error(data, index): - gd_data = [cudf.from_pandas(obj) for obj in data] - - assert_exceptions_equal( - pd.DataFrame, - cudf.DataFrame, - ([data], {"index": index}), - ([gd_data], {"index": index}), - ) - - -@pytest.mark.parametrize( - "data", - [ - [pd.Series([1, 2, 3], index=["a", "a", "a"])], - [pd.Series([1, 2, 3], index=["a", "a", "a"])] * 4, - [ - pd.Series([1, 2, 3], index=["a", "b", "a"]), - pd.Series([1, 2, 3], index=["b", "b", "a"]), - ], - [ - pd.Series([1, 2, 3], index=["a", "b", "z"]), - pd.Series([1, 2, 3], index=["u", "b", "a"]), - pd.Series([1, 2, 3], index=["u", "b", "u"]), - ], - ], -) -def test_dataframe_init_from_series_list_duplicate_index_error(data): - gd_data = [cudf.from_pandas(obj) for obj in data] - - assert_exceptions_equal( - lfunc=pd.DataFrame, - rfunc=cudf.DataFrame, - lfunc_args_and_kwargs=([], {"data": data}), - rfunc_args_and_kwargs=([], {"data": gd_data}), - check_exception_type=False, - ) - - -def test_dataframe_iterrows_itertuples(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - - with pytest.raises( - TypeError, - match=re.escape( - "cuDF does not support iteration of DataFrame " - "via itertuples. Consider using " - "`.to_pandas().itertuples()` " - "if you wish to iterate over namedtuples." - ), - ): - df.itertuples() - - with pytest.raises( - TypeError, - match=re.escape( - "cuDF does not support iteration of DataFrame " - "via iterrows. Consider using " - "`.to_pandas().iterrows()` " - "if you wish to iterate over each row." 
- ), - ): - df.iterrows() - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame( - { - "a": [1, 2, 3], - "b": [10, 22, 33], - "c": [0.3234, 0.23432, 0.0], - "d": ["hello", "world", "hello"], - } - ), - cudf.DataFrame( - { - "a": [1, 2, 3], - "b": ["hello", "world", "hello"], - "c": [0.3234, 0.23432, 0.0], - } - ), - cudf.DataFrame( - { - "int_data": [1, 2, 3], - "str_data": ["hello", "world", "hello"], - "float_data": [0.3234, 0.23432, 0.0], - "timedelta_data": cudf.Series( - [1, 2, 1], dtype="timedelta64[ns]" - ), - "datetime_data": cudf.Series( - [1, 2, 1], dtype="datetime64[ns]" - ), - } - ), - cudf.DataFrame( - { - "int_data": [1, 2, 3], - "str_data": ["hello", "world", "hello"], - "float_data": [0.3234, 0.23432, 0.0], - "timedelta_data": cudf.Series( - [1, 2, 1], dtype="timedelta64[ns]" - ), - "datetime_data": cudf.Series( - [1, 2, 1], dtype="datetime64[ns]" - ), - "category_data": cudf.Series( - ["a", "a", "b"], dtype="category" - ), - } - ), - ], -) -@pytest.mark.parametrize( - "include", - [None, "all", ["object"], ["int"], ["object", "int", "category"]], -) -def test_describe_misc_include(df, include): - pdf = df.to_pandas() - - expected = pdf.describe(include=include) - actual = df.describe(include=include) - - for col in expected.columns: - if expected[col].dtype == np.dtype("object"): - expected[col] = expected[col].fillna(-1).astype("str") - actual[col] = actual[col].fillna(-1).astype("str") - - assert_eq(expected, actual) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame( - { - "a": [1, 2, 3], - "b": [10, 22, 33], - "c": [0.3234, 0.23432, 0.0], - "d": ["hello", "world", "hello"], - } - ), - cudf.DataFrame( - { - "a": [1, 2, 3], - "b": ["hello", "world", "hello"], - "c": [0.3234, 0.23432, 0.0], - } - ), - cudf.DataFrame( - { - "int_data": [1, 2, 3], - "str_data": ["hello", "world", "hello"], - "float_data": [0.3234, 0.23432, 0.0], - "timedelta_data": cudf.Series( - [1, 2, 1], dtype="timedelta64[ns]" - ), - "datetime_data": cudf.Series( - [1, 2, 1], dtype="datetime64[ns]" - ), - } - ), - cudf.DataFrame( - { - "int_data": [1, 2, 3], - "str_data": ["hello", "world", "hello"], - "float_data": [0.3234, 0.23432, 0.0], - "timedelta_data": cudf.Series( - [1, 2, 1], dtype="timedelta64[ns]" - ), - "datetime_data": cudf.Series( - [1, 2, 1], dtype="datetime64[ns]" - ), - "category_data": cudf.Series( - ["a", "a", "b"], dtype="category" - ), - } - ), - ], -) -@pytest.mark.parametrize( - "exclude", [None, ["object"], ["int"], ["object", "int", "category"]] -) -def test_describe_misc_exclude(df, exclude): - pdf = df.to_pandas() - - expected = pdf.describe(exclude=exclude) - actual = df.describe(exclude=exclude) - - for col in expected.columns: - if expected[col].dtype == np.dtype("object"): - expected[col] = expected[col].fillna(-1).astype("str") - actual[col] = actual[col].fillna(-1).astype("str") - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame({"a": [1, 2, 3]}), - cudf.DataFrame( - {"a": [1, 2, 3], "b": ["a", "z", "c"]}, index=["a", "z", "x"] - ), - cudf.DataFrame( - { - "a": [1, 2, 3, None, 2, 1, None], - "b": ["a", "z", "c", "a", "v", "z", "z"], - } - ), - cudf.DataFrame({"a": [], "b": []}), - cudf.DataFrame({"a": [None, None], "b": [None, None]}), - cudf.DataFrame( - { - "a": ["hello", "world", "rapids", "ai", "nvidia"], - "b": cudf.Series( - [1, 21, 21, 11, 11], - dtype="timedelta64[s]", - index=["a", "b", "c", "d", " e"], - ), - }, - index=["a", "b", "c", "d", " e"], - ), - 
cudf.DataFrame( - { - "a": ["hello", None, "world", "rapids", None, "ai", "nvidia"], - "b": cudf.Series( - [1, 21, None, 11, None, 11, None], dtype="datetime64[s]" - ), - } - ), - ], -) -@pytest.mark.parametrize("numeric_only", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) -def test_dataframe_mode(df, numeric_only, dropna): - pdf = df.to_pandas() - - expected = pdf.mode(numeric_only=numeric_only, dropna=dropna) - actual = df.mode(numeric_only=numeric_only, dropna=dropna) - - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "lhs, rhs", [("a", "a"), ("a", "b"), (1, 1.0), (None, None), (None, "a")] -) -def test_equals_names(lhs, rhs): - lhs = cudf.DataFrame({lhs: [1, 2]}) - rhs = cudf.DataFrame({rhs: [1, 2]}) - - got = lhs.equals(rhs) - expect = lhs.to_pandas().equals(rhs.to_pandas()) - - assert_eq(expect, got) - - -def test_equals_dtypes(): - lhs = cudf.DataFrame({"a": [1, 2.0]}) - rhs = cudf.DataFrame({"a": [1, 2]}) - - got = lhs.equals(rhs) - expect = lhs.to_pandas().equals(rhs.to_pandas()) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "df1", - [ - pd.DataFrame({"a": [10, 11, 12]}, index=["a", "b", "z"]), - pd.DataFrame({"z": ["a"]}), - pd.DataFrame({"a": [], "b": []}), - ], -) -@pytest.mark.parametrize( - "df2", - [ - pd.DataFrame(), - pd.DataFrame({"a": ["a", "a", "c", "z", "A"], "z": [1, 2, 3, 4, 5]}), - ], -) -@pytest.mark.parametrize( - "op", - [ - operator.eq, - operator.ne, - operator.lt, - operator.gt, - operator.le, - operator.ge, - ], -) -def test_dataframe_error_equality(df1, df2, op): - gdf1 = cudf.from_pandas(df1) - gdf2 = cudf.from_pandas(df2) - - assert_exceptions_equal(op, op, ([df1, df2],), ([gdf1, gdf2],)) - - -@pytest.mark.parametrize( - "df,expected_pdf", - [ - ( - cudf.DataFrame( - { - "a": cudf.Series([1, 2, None, 3], dtype="uint8"), - "b": cudf.Series([23, None, None, 32], dtype="uint16"), - } - ), - pd.DataFrame( - { - "a": pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()), - "b": pd.Series( - [23, None, None, 32], dtype=pd.UInt16Dtype() - ), - } - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series([None, 123, None, 1], dtype="uint32"), - "b": cudf.Series( - [234, 2323, 23432, None, None, 224], dtype="uint64" - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - [None, 123, None, 1], dtype=pd.UInt32Dtype() - ), - "b": pd.Series( - [234, 2323, 23432, None, None, 224], - dtype=pd.UInt64Dtype(), - ), - } - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series( - [-10, 1, None, -1, None, 3], dtype="int8" - ), - "b": cudf.Series( - [111, None, 222, None, 13], dtype="int16" - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - [-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype() - ), - "b": pd.Series( - [111, None, 222, None, 13], dtype=pd.Int16Dtype() - ), - } - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series( - [11, None, 22, 33, None, 2, None, 3], dtype="int32" - ), - "b": cudf.Series( - [32431, None, None, 32322, 0, 10, -32324, None], - dtype="int64", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - [11, None, 22, 33, None, 2, None, 3], - dtype=pd.Int32Dtype(), - ), - "b": pd.Series( - [32431, None, None, 32322, 0, 10, -32324, None], - dtype=pd.Int64Dtype(), - ), - } - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series( - [True, None, False, None, False, True, True, False], - dtype="bool_", - ), - "b": cudf.Series( - [ - "abc", - "a", - None, - "hello world", - "foo buzz", - "", - None, - "rapids ai", - ], - dtype="object", - ), - "c": cudf.Series( - [0.1, None, 0.2, None, 3, 4, 1000, None], - 
dtype="float64", - ), - } - ), - pd.DataFrame( - { - "a": pd.Series( - [True, None, False, None, False, True, True, False], - dtype=pd.BooleanDtype(), - ), - "b": pd.Series( - [ - "abc", - "a", - None, - "hello world", - "foo buzz", - "", - None, - "rapids ai", - ], - dtype=pd.StringDtype(), - ), - "c": pd.Series( - [0.1, None, 0.2, None, 3, 4, 1000, None], - dtype=pd.Float64Dtype(), - ), - } - ), - ), - ], -) -def test_dataframe_to_pandas_nullable_dtypes(df, expected_pdf): - actual_pdf = df.to_pandas(nullable=True) - - assert_eq(actual_pdf, expected_pdf) - - -@pytest.mark.parametrize( - "data", - [ - [{"a": 1, "b": 2, "c": 3}, {"a": 4, "b": 5, "c": 6}], - [{"a": 1, "b": 2, "c": None}, {"a": None, "b": 5, "c": 6}], - [{"a": 1, "b": 2}, {"a": 1, "b": 5, "c": 6}], - [{"a": 1, "b": 2}, {"b": 5, "c": 6}], - [{}, {"a": 1, "b": 5, "c": 6}], - [{"a": 1, "b": 2, "c": 3}, {"a": 4.5, "b": 5.5, "c": 6.5}], - ], -) -def test_dataframe_init_from_list_of_dicts(data): - expect = pd.DataFrame(data) - got = cudf.DataFrame(data) - - assert_eq(expect, got) - - -def test_dataframe_pipe(): - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - - def add_int_col(df, column): - df[column] = df._constructor_sliced([10, 20, 30, 40]) - return df - - def add_str_col(df, column): - df[column] = df._constructor_sliced(["a", "b", "xyz", "ai"]) - return df - - expected = ( - pdf.pipe(add_int_col, "one") - .pipe(add_int_col, column="two") - .pipe(add_str_col, "three") - ) - actual = ( - gdf.pipe(add_int_col, "one") - .pipe(add_int_col, column="two") - .pipe(add_str_col, "three") - ) - - assert_eq(expected, actual) - - expected = ( - pdf.pipe((add_str_col, "df"), column="one") - .pipe(add_str_col, column="two") - .pipe(add_int_col, "three") - ) - actual = ( - gdf.pipe((add_str_col, "df"), column="one") - .pipe(add_str_col, column="two") - .pipe(add_int_col, "three") - ) - - assert_eq(expected, actual) - - -def test_dataframe_pipe_error(): - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - - def custom_func(df, column): - df[column] = df._constructor_sliced([10, 20, 30, 40]) - return df - - assert_exceptions_equal( - lfunc=pdf.pipe, - rfunc=gdf.pipe, - lfunc_args_and_kwargs=([(custom_func, "columns")], {"columns": "d"}), - rfunc_args_and_kwargs=([(custom_func, "columns")], {"columns": "d"}), - ) - - -@pytest.mark.parametrize( - "op", - ["count", "kurt", "kurtosis", "skew"], -) -def test_dataframe_axis1_unsupported_ops(op): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [8, 9, 10]}) - - with pytest.raises( - NotImplementedError, match="Only axis=0 is currently supported." 
- ): - getattr(df, op)(axis=1) - - -def test_dataframe_from_pandas_duplicate_columns(): - pdf = pd.DataFrame(columns=["a", "b", "c", "a"]) - pdf["a"] = [1, 2, 3] - - with pytest.raises( - ValueError, match="Duplicate column names are not allowed" - ): - cudf.from_pandas(pdf) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame( - {"a": [1, 2, 3], "b": [10, 11, 20], "c": ["a", "bcd", "xyz"]} - ), - pd.DataFrame(), - ], -) -@pytest.mark.parametrize( - "columns", - [ - None, - ["a"], - ["c", "a"], - ["b", "a", "c"], - [], - pd.Index(["c", "a"]), - cudf.Index(["c", "a"]), - ["abc", "a"], - ["column_not_exists1", "column_not_exists2"], - ], -) -@pytest.mark.parametrize("index", [["abc", "def", "ghi"]]) -def test_dataframe_constructor_columns(df, columns, index, request): - def assert_local_eq(actual, df, expected, host_columns): - check_index_type = not expected.empty - if host_columns is not None and any( - col not in df.columns for col in host_columns - ): - assert_eq( - expected, - actual, - check_dtype=False, - check_index_type=check_index_type, - ) - else: - assert_eq( - expected, - actual, - check_index_type=check_index_type, - check_column_type=False, - ) - - gdf = cudf.from_pandas(df) - host_columns = ( - columns.to_pandas() if isinstance(columns, cudf.BaseIndex) else columns - ) - - expected = pd.DataFrame(df, columns=host_columns, index=index) - actual = cudf.DataFrame(gdf, columns=columns, index=index) - - assert_local_eq(actual, df, expected, host_columns) - - -def test_dataframe_constructor_column_index_only(): - columns = ["a", "b", "c"] - index = ["r1", "r2", "r3"] - - gdf = cudf.DataFrame(index=index, columns=columns) - assert not id(gdf["a"]._column) == id(gdf["b"]._column) and not id( - gdf["b"]._column - ) == id(gdf["c"]._column) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2.5, 3], "b": [3, 4.5, 5], "c": [2.0, 3.0, 4.0]}, - {"a": [1, 2.2, 3], "b": [2.0, 3.0, 4.0], "c": [5.0, 6.0, 4.0]}, - ], -) -@pytest.mark.parametrize( - "aggs", - [ - ["min", "sum", "max"], - ("min", "sum", "max"), - {"min", "sum", "max"}, - "sum", - {"a": "sum", "b": "min", "c": "max"}, - {"a": ["sum"], "b": ["min"], "c": ["max"]}, - {"a": ("sum"), "b": ("min"), "c": ("max")}, - {"a": {"sum"}, "b": {"min"}, "c": {"max"}}, - {"a": ["sum", "min"], "b": ["sum", "max"], "c": ["min", "max"]}, - {"a": ("sum", "min"), "b": ("sum", "max"), "c": ("min", "max")}, - {"a": {"sum", "min"}, "b": {"sum", "max"}, "c": {"min", "max"}}, - ], -) -def test_agg_for_dataframes(data, aggs): - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - expect = pdf.agg(aggs).sort_index() - got = gdf.agg(aggs).sort_index() - - assert_eq(expect, got, check_dtype=True) - - -@pytest_unmark_spilling -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": [3.0, 4.0, 5.0], "c": [True, True, False]}, - {"a": [1, 2, 3], "b": [True, True, False], "c": [False, True, False]}, - ], -) -@pytest.mark.parametrize( - "aggs", - [ - ["min", "sum", "max"], - "sum", - {"a": "sum", "b": "min", "c": "max"}, - ], -) -def test_agg_for_dataframes_error(data, aggs): - gdf = cudf.DataFrame(data) - - with pytest.raises(TypeError): - gdf.agg(aggs) - - -@pytest.mark.parametrize("aggs", [{"a": np.sum, "b": np.min, "c": np.max}]) -def test_agg_for_unsupported_function(aggs): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - - with pytest.raises(NotImplementedError): - gdf.agg(aggs) - - -@pytest.mark.parametrize("aggs", ["asdf"]) -def test_agg_for_dataframe_with_invalid_function(aggs): - gdf = 
cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - - with pytest.raises( - AttributeError, - match=f"{aggs} is not a valid function for 'DataFrame' object", - ): - gdf.agg(aggs) - - -@pytest.mark.parametrize("aggs", [{"a": "asdf"}]) -def test_agg_for_series_with_invalid_function(aggs): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - - with pytest.raises( - AttributeError, - match=f"{aggs['a']} is not a valid function for 'Series' object", - ): - gdf.agg(aggs) - - -@pytest.mark.parametrize( - "aggs", - [ - "sum", - ["min", "sum", "max"], - {"a": {"sum", "min"}, "b": {"sum", "max"}, "c": {"min", "max"}}, - ], -) -def test_agg_for_dataframe_with_string_columns(aggs): - gdf = cudf.DataFrame( - {"a": ["m", "n", "o"], "b": ["t", "u", "v"], "c": ["x", "y", "z"]}, - index=["a", "b", "c"], - ) - - with pytest.raises( - NotImplementedError, - match=re.escape( - "DataFrame.agg() is not supported for " - "frames containing string columns" - ), - ): - gdf.agg(aggs) - - -@pytest_unmark_spilling -@pytest.mark.parametrize("overwrite", [True, False]) -@pytest.mark.parametrize( - "left_keys,right_keys", - [ - [("a", "b"), ("a", "b")], - [("a", "b"), ("a", "c")], - [("a", "b"), ("d", "e")], - ], -) -@pytest.mark.parametrize( - "data_left,data_right", - [ - [([1, 2, 3], [3, 4, 5]), ([1, 2, 3], [3, 4, 5])], - [ - ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), - ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), - ], - [ - ([True, False, True], [False, False, False]), - ([True, False, True], [False, False, False]), - ], - [ - ([np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]), - ([np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan]), - ], - [([1, 2, 3], [3, 4, 5]), ([1, 2, 4], [30, 40, 50])], - [ - ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), - ([1.0, 2.0, 4.0], [30.0, 40.0, 50.0]), - ], - [([1, 2, 3], [3, 4, 5]), ([10, 20, 40], [30, 40, 50])], - [ - ([1.0, 2.0, 3.0], [3.0, 4.0, 5.0]), - ([10.0, 20.0, 40.0], [30.0, 40.0, 50.0]), - ], - ], -) -def test_update_for_dataframes( - left_keys, right_keys, data_left, data_right, overwrite -): - errors = "ignore" - join = "left" - left = dict(zip(left_keys, data_left)) - right = dict(zip(right_keys, data_right)) - pdf = pd.DataFrame(left) - gdf = cudf.DataFrame(left, nan_as_null=False) - - other_pd = pd.DataFrame(right) - other_gd = cudf.DataFrame(right, nan_as_null=False) - - pdf.update(other=other_pd, join=join, overwrite=overwrite, errors=errors) - gdf.update(other=other_gd, join=join, overwrite=overwrite, errors=errors) - - assert_eq(pdf, gdf, check_dtype=False) - - -@pytest.mark.parametrize( - "join", - ["right"], -) -def test_update_for_right_join(join): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - other_gd = cudf.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) - - with pytest.raises( - NotImplementedError, match="Only left join is supported" - ): - gdf.update(other_gd, join) - - -@pytest.mark.parametrize( - "errors", - ["raise"], -) -def test_update_for_data_overlap(errors): - pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3.0, 4.0, 5.0]}) - - other_pd = pd.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) - other_gd = cudf.DataFrame({"a": [1, np.nan, 3], "b": [np.nan, 2.0, 5.0]}) - - assert_exceptions_equal( - lfunc=pdf.update, - rfunc=gdf.update, - lfunc_args_and_kwargs=([other_pd, errors], {}), - rfunc_args_and_kwargs=([other_gd, errors], {}), - ) - - -@pytest.mark.parametrize( - "gdf", - [ - cudf.DataFrame({"a": [[1], [2], [3]]}), - cudf.DataFrame( - { - "left-a": [0, 1, 2], - 
"a": [[1], None, [3]], - "right-a": ["abc", "def", "ghi"], - } - ), - cudf.DataFrame( - { - "left-a": [[], None, None], - "a": [[1], None, [3]], - "right-a": ["abc", "def", "ghi"], - } - ), - ], -) -def test_dataframe_roundtrip_arrow_list_dtype(gdf): - table = gdf.to_arrow() - expected = cudf.DataFrame.from_arrow(table) - - assert_eq(gdf, expected) - - -@pytest.mark.parametrize( - "gdf", - [ - cudf.DataFrame({"a": [{"one": 3, "two": 4, "three": 10}]}), - cudf.DataFrame( - { - "left-a": [0, 1, 2], - "a": [{"x": 0.23, "y": 43}, None, {"x": 23.9, "y": 4.3}], - "right-a": ["abc", "def", "ghi"], - } - ), - cudf.DataFrame( - { - "left-a": [{"a": 1}, None, None], - "a": [ - {"one": 324, "two": 23432, "three": 324}, - None, - {"one": 3.24, "two": 1, "three": 324}, - ], - "right-a": ["abc", "def", "ghi"], - } - ), - ], -) -def test_dataframe_roundtrip_arrow_struct_dtype(gdf): - table = gdf.to_arrow() - expected = cudf.DataFrame.from_arrow(table) - - assert_eq(gdf, expected) - - -def test_dataframe_setitem_cupy_array(): - np.random.seed(0) - pdf = pd.DataFrame(np.random.randn(10, 2)) - gdf = cudf.from_pandas(pdf) - - gpu_array = cupy.array([True, False] * 5) - pdf[gpu_array.get()] = 1.5 - gdf[gpu_array] = 1.5 - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("level", ["x", 0]) -def test_rename_for_level_MultiIndex_dataframe(level): - data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} - index = {0: 123, 1: 4, 2: 6} - pdf = pd.DataFrame( - data, - index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]), - ) - pdf.index.names = ["x", "y", "z"] - gdf = cudf.from_pandas(pdf) - - expect = pdf.rename(index=index, level=level) - got = gdf.rename(index=index, level=level) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}] -) -@pytest.mark.parametrize( - "columns", - [{"a": "f", "b": "g"}, {1: 3, 2: 4}, lambda s: 2 * s], -) -@pytest.mark.parametrize( - "level", - [0, 1], -) -def test_rename_for_level_MultiColumn_dataframe(data, columns, level): - gdf = cudf.DataFrame(data) - gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) - - pdf = gdf.to_pandas() - - expect = pdf.rename(columns=columns, level=level) - got = gdf.rename(columns=columns, level=level) - - assert_eq(expect, got) - - -def test_rename_for_level_RangeIndex_dataframe(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - pdf = gdf.to_pandas() - - expect = pdf.rename(columns={"a": "f"}, index={0: 3, 1: 4}, level=0) - got = gdf.rename(columns={"a": "f"}, index={0: 3, 1: 4}, level=0) - - assert_eq(expect, got) - - -def test_rename_for_level_is_None_MC(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) - pdf = gdf.to_pandas() - - expect = pdf.rename(columns={"a": "f"}, level=None) - got = gdf.rename(columns={"a": "f"}, level=None) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - [[1, 2, 3], 11, "a"], - [None, 22, "e"], - [[4], 33, "i"], - [[], 44, "o"], - [[5, 6], 55, "u"], - ], # nested - [ - [1, 11, "a"], - [2, 22, "e"], - [3, 33, "i"], - [4, 44, "o"], - [5, 55, "u"], - ], # non-nested - ], -) -@pytest.mark.parametrize( - ("labels", "label_to_explode"), - [ - (None, 0), - (pd.Index(["a", "b", "c"]), "a"), - ( - pd.MultiIndex.from_tuples( - [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] - ), - (0, "a"), - ), - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) 
-@pytest.mark.parametrize( - "p_index", - [ - None, - ["ia", "ib", "ic", "id", "ie"], - pd.MultiIndex.from_tuples( - [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] - ), - ], -) -def test_explode(data, labels, ignore_index, p_index, label_to_explode): - pdf = pd.DataFrame(data, index=p_index, columns=labels) - gdf = cudf.from_pandas(pdf) - - expect = pdf.explode(label_to_explode, ignore_index) - got = gdf.explode(label_to_explode, ignore_index) - - assert_eq(expect, got, check_dtype=False) - - -def test_explode_preserve_categorical(): - gdf = cudf.DataFrame( - { - "A": [[1, 2], None, [2, 3]], - "B": cudf.Series([0, 1, 2], dtype="category"), - } - ) - result = gdf.explode("A") - expected = cudf.DataFrame( - { - "A": [1, 2, None, 2, 3], - "B": cudf.Series([0, 0, 1, 2, 2], dtype="category"), - } - ) - expected.index = cudf.Index([0, 0, 1, 2, 2]) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "df,ascending,expected", - [ - ( - cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), - True, - cupy.array([1, 2, 0], dtype="int32"), - ), - ( - cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), - False, - cupy.array([0, 2, 1], dtype="int32"), - ), - ], -) -def test_dataframe_argsort(df, ascending, expected): - actual = df.argsort(ascending=ascending) - - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "data,columns,index", - [ - (pd.Series([1, 2, 3]), None, None), - (pd.Series(["a", "b", None, "c"], name="abc"), None, None), - ( - pd.Series(["a", "b", None, "c"], name="abc"), - ["abc", "b"], - [1, 2, 3], - ), - ], -) -def test_dataframe_init_from_series(data, columns, index): - expected = pd.DataFrame(data, columns=columns, index=index) - actual = cudf.DataFrame(data, columns=columns, index=index) - - assert_eq( - expected, - actual, - check_index_type=len(expected) != 0, - ) - - -def test_frame_series_where(): - gdf = cudf.DataFrame( - {"a": [1.0, 2.0, None, 3.0, None], "b": [None, 10.0, 11.0, None, 23.0]} - ) - pdf = gdf.to_pandas() - expected = gdf.where(gdf.notna(), gdf.mean()) - actual = pdf.where(pdf.notna(), pdf.mean(), axis=1) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [{"a": [1, 2, 3], "b": [1, 1, 0]}], -) -def test_frame_series_where_other(data): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expected = gdf.where(gdf["b"] == 1, cudf.NA) - actual = pdf.where(pdf["b"] == 1, pd.NA) - assert_eq( - actual.fillna(-1).values, - expected.fillna(-1).values, - check_dtype=False, - ) - - expected = gdf.where(gdf["b"] == 1, 0) - actual = pdf.where(pdf["b"] == 1, 0) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data, gkey", - [ - ( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], - }, - ["id", "val1", "val2"], - ), - ( - { - "id": [0] * 4 + [1] * 3, - "a": [10, 3, 4, 2, -3, 9, 10], - "b": [10, 23, -4, 2, -3, 9, 19], - }, - ["id", "a"], - ), - ( - { - "id": ["a", "a", "b", "b", "c", "c"], - "val": cudf.Series( - [None, None, None, None, None, None], dtype="float64" - ), - }, - ["id"], - ), - ( - { - "id": ["a", "a", "b", "b", "c", "c"], - "val1": [None, 4, 6, 8, None, 2], - "val2": [4, 5, None, 2, 9, None], - }, - ["id"], - ), - ({"id": [1.0], "val1": [2.0], "val2": [3.0]}, ["id"]), - ], -) -@pytest.mark.parametrize( - "min_per", - [0, 1, 2, 3, 4], -) -def test_pearson_corr_passing(data, gkey, min_per): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - actual = 
gdf.groupby(gkey).corr(method="pearson", min_periods=min_per) - expected = pdf.groupby(gkey).corr(method="pearson", min_periods=min_per) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("method", ["kendall", "spearman"]) -def test_pearson_corr_unsupported_methods(method): - gdf = cudf.DataFrame( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], - } - ) - - with pytest.raises( - NotImplementedError, - match="Only pearson correlation is currently supported", - ): - gdf.groupby("id").corr(method) - - -def test_pearson_corr_empty_columns(): - gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) - pdf = gdf.to_pandas() - - actual = gdf.groupby("id").corr("pearson") - expected = pdf.groupby("id").corr("pearson") - - assert_eq( - expected, - actual, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "data", - [ - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], - "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], - }, - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": [1, 1, 1, 2, 2, 2, 3, 3, 3], - "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], - }, - ], -) -@pytest.mark.parametrize("gkey", ["id", "val1", "val2"]) -def test_pearson_corr_invalid_column_types(data, gkey): - with pytest.raises( - TypeError, - match="Correlation accepts only numerical column-pairs", - ): - cudf.DataFrame(data).groupby(gkey).corr("pearson") - - -def test_pearson_corr_multiindex_dataframe(): - gdf = cudf.DataFrame( - {"a": [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [2, 3, 4, 5]} - ).set_index(["a", "b"]) - - actual = gdf.groupby(level="a").corr("pearson") - expected = gdf.to_pandas().groupby(level="a").corr("pearson") - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [np.nan, 1, 2], "b": [None, None, None]}, - {"a": [1, 2, np.nan, 2], "b": [np.nan, np.nan, np.nan, np.nan]}, - { - "a": [1, 2, np.nan, 2, None], - "b": [np.nan, np.nan, None, np.nan, np.nan], - }, - {"a": [1, 2, 2, None, 1.1], "b": [1, 2.2, 3, None, 5]}, - ], -) -@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_dataframe_constructor_nan_as_null(data, nan_as_null): - actual = cudf.DataFrame(data, nan_as_null=nan_as_null) - - if nan_as_null: - assert ( - not ( - actual.astype("float").replace( - cudf.Series([np.nan], nan_as_null=False), cudf.Series([-1]) - ) - == -1 - ) - .any() - .any() - ) - else: - actual = actual.select_dtypes(exclude=["object"]) - assert (actual.replace(np.nan, -1) == -1).any().any() - - -def test_dataframe_add_prefix(): - cdf = cudf.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) - pdf = cdf.to_pandas() - - got = cdf.add_prefix("item_") - expected = pdf.add_prefix("item_") - - assert_eq(got, expected) - - -def test_dataframe_add_suffix(): - cdf = cudf.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) - pdf = cdf.to_pandas() - - got = cdf.add_suffix("_item") - expected = pdf.add_suffix("_item") - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data, gkey", - [ - ( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], - }, - ["id"], - ), - ( - { - "id": [0, 0, 0, 0, 1, 1, 1], - "a": [10.0, 3, 4, 2.0, -3.0, 9.0, 10.0], - "b": [10.0, 23, -4.0, 2, -3.0, 9, 19.0], - }, - ["id", "a"], - ), - ], -) 
-@pytest.mark.parametrize( - "min_periods", - [0, 3], -) -@pytest.mark.parametrize( - "ddof", - [1, 2], -) -def test_groupby_covariance(data, gkey, min_periods, ddof): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - actual = gdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof) - # We observe a warning if there are too few observations to generate a - # non-singular covariance matrix _and_ there are enough that pandas will - # actually attempt to compute a value. Groups with fewer than min_periods - # inputs will be skipped altogether, so no warning occurs. - with expect_warning_if( - (pdf.groupby(gkey).count() < 2).all().all() - and (pdf.groupby(gkey).count() > min_periods).all().all(), - RuntimeWarning, - ): - expected = pdf.groupby(gkey).cov(min_periods=min_periods, ddof=ddof) - - assert_eq(expected, actual) - - -def test_groupby_covariance_multiindex_dataframe(): - gdf = cudf.DataFrame( - { - "a": [1, 1, 2, 2], - "b": [1, 1, 2, 2], - "c": [2, 3, 4, 5], - "d": [6, 8, 9, 1], - } - ).set_index(["a", "b"]) - - actual = gdf.groupby(level=["a", "b"]).cov() - expected = gdf.to_pandas().groupby(level=["a", "b"]).cov() - - assert_eq(expected, actual) - - -def test_groupby_covariance_empty_columns(): - gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) - pdf = gdf.to_pandas() - - actual = gdf.groupby("id").cov() - expected = pdf.groupby("id").cov() - - assert_eq( - expected, - actual, - check_dtype=False, - check_index_type=False, - ) - - -def test_groupby_cov_invalid_column_types(): - gdf = cudf.DataFrame( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], - "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], - }, - ) - with pytest.raises( - TypeError, - match="Covariance accepts only numerical column-pairs", - ): - gdf.groupby("id").cov() - - -def test_groupby_cov_positive_semidefinite_matrix(): - # Refer to discussions in PR #9889 re "pair-wise deletion" strategy - # being used in pandas to compute the covariance of a dataframe with - # rows containing missing values. - # Note: cuDF currently matches pandas behavior in that the covariance - # matrices are not guaranteed PSD (positive semi definite). - # https://github.com/rapidsai/cudf/pull/9889#discussion_r794158358 - gdf = cudf.DataFrame( - [[1, 2], [None, 4], [5, None], [7, 8]], columns=["v0", "v1"] - ) - actual = gdf.groupby(by=cudf.Series([1, 1, 1, 1])).cov() - actual.reset_index(drop=True, inplace=True) - - pdf = gdf.to_pandas() - expected = pdf.groupby(by=pd.Series([1, 1, 1, 1])).cov() - expected.reset_index(drop=True, inplace=True) - - assert_eq( - expected, - actual, - check_dtype=False, - ) - - -@pytest_xfail -def test_groupby_cov_for_pandas_bug_case(): - # Handles case: pandas bug using ddof with missing data. 
- # Filed an issue in Pandas on GH, link below: - # https://github.com/pandas-dev/pandas/issues/45814 - pdf = pd.DataFrame( - {"id": ["a", "a"], "val1": [1.0, 2.0], "val2": [np.nan, np.nan]} - ) - expected = pdf.groupby("id").cov(ddof=2) - - gdf = cudf.from_pandas(pdf) - actual = gdf.groupby("id").cov(ddof=2) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - np.random.RandomState(seed=10).randint(-50, 50, (25, 30)), - np.random.RandomState(seed=10).random_sample((4, 4)), - np.array([1.123, 2.343, 5.890, 0.0]), - [True, False, True, False, False], - {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, - ], -) -@pytest.mark.parametrize("periods", (-5, -1, 0, 1, 5)) -def test_diff_numeric_dtypes(data, periods): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - actual = gdf.diff(periods=periods, axis=0) - expected = pdf.diff(periods=periods, axis=0) - - assert_eq( - expected, - actual, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - ("precision", "scale"), - [(5, 2), (8, 5)], -) -@pytest.mark.parametrize( - "dtype", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype], -) -def test_diff_decimal_dtypes(precision, scale, dtype): - gdf = cudf.DataFrame( - np.random.default_rng(seed=42).uniform(10.5, 75.5, (10, 6)), - dtype=dtype(precision=precision, scale=scale), - ) - pdf = gdf.to_pandas() - - actual = gdf.diff() - expected = pdf.diff() - - assert_eq( - expected, - actual, - check_dtype=False, - ) - - -def test_diff_invalid_axis(): - gdf = cudf.DataFrame(np.array([1.123, 2.343, 5.890, 0.0])) - with pytest.raises(NotImplementedError, match="Only axis=0 is supported."): - gdf.diff(periods=1, axis=1) - - -@pytest.mark.parametrize( - "data", - [ - { - "int_col": [1, 2, 3, 4, 5], - "float_col": [1.0, 2.0, 3.0, 4.0, 5.0], - "string_col": ["a", "b", "c", "d", "e"], - }, - ["a", "b", "c", "d", "e"], - ], -) -def test_diff_unsupported_dtypes(data): - gdf = cudf.DataFrame(data) - with pytest.raises( - TypeError, - match=r"unsupported operand type\(s\)", - ): - gdf.diff() - - -def test_diff_many_dtypes(): - pdf = pd.DataFrame( - { - "dates": pd.date_range("2020-01-01", "2020-01-06", freq="D"), - "bools": [True, True, True, False, True, True], - "floats": [1.0, 2.0, 3.5, np.nan, 5.0, -1.7], - "ints": [1, 2, 3, 3, 4, 5], - "nans_nulls": [np.nan, None, None, np.nan, np.nan, None], - } - ) - gdf = cudf.from_pandas(pdf) - assert_eq(pdf.diff(), gdf.diff()) - assert_eq(pdf.diff(periods=2), gdf.diff(periods=2)) - - -def test_dataframe_assign_cp_np_array(): - m, n = 5, 3 - cp_ndarray = cupy.random.randn(m, n) - pdf = pd.DataFrame({f"f_{i}": range(m) for i in range(n)}) - gdf = cudf.DataFrame({f"f_{i}": range(m) for i in range(n)}) - pdf[[f"f_{i}" for i in range(n)]] = cupy.asnumpy(cp_ndarray) - gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "data", - [{"a": [1, 2, 3], "b": [1, 1, 0]}], -) -def test_dataframe_nunique(data): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - actual = gdf.nunique() - expected = pdf.nunique() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "columns", - [ - pd.RangeIndex(2, name="foo"), - pd.MultiIndex.from_arrays([[1, 2], [2, 3]], names=["foo", 1]), - pd.Index([3, 5], dtype=np.int8, name="foo"), - ], -) -def test_nunique_preserve_column_in_index(columns): - df = cudf.DataFrame([[1, 2]], columns=columns) - result = df.nunique().index.to_pandas() - assert_eq(result, columns, exact=True) - - -@pytest.mark.parametrize( - "data", - [{"key": [0, 1, 1, 0, 0, 1], 
"val": [1, 8, 3, 9, -3, 8]}], -) -def test_dataframe_nunique_index(data): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - actual = gdf.index.nunique() - expected = pdf.index.nunique() - - assert_eq(expected, actual) - - -def test_dataframe_rename_duplicate_column(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) - with pytest.raises( - ValueError, match="Duplicate column names are not allowed" - ): - gdf.rename(columns={"a": "b"}, inplace=True) - - -def test_dataframe_rename_columns_keep_type(): - gdf = cudf.DataFrame([[1, 2, 3]]) - gdf.columns = cudf.Index([4, 5, 6], dtype=np.int8) - result = gdf.rename({4: 50}, axis="columns").columns - expected = pd.Index([50, 5, 6], dtype=np.int8) - assert_eq(result, expected) - - -@pytest_unmark_spilling -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data", - [ - np.random.RandomState(seed=10).randint(-50, 50, (10, 10)), - np.random.RandomState(seed=10).random_sample((4, 4)), - np.array([1.123, 2.343, 5.890, 0.0]), - {"a": [1.123, 2.343, np.nan, np.nan], "b": [None, 3, 9.08, None]}, - ], -) -@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) -@pytest.mark.parametrize( - "fill_method", ["ffill", "bfill", "pad", "backfill", no_default] -) -def test_dataframe_pct_change(data, periods, fill_method): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - with expect_warning_if(fill_method is not no_default): - actual = gdf.pct_change(periods=periods, fill_method=fill_method) - with expect_warning_if( - fill_method is not no_default or pdf.isna().any().any() - ): - expected = pdf.pct_change(periods=periods, fill_method=fill_method) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_mean_timeseries(numeric_only): - gdf = cudf.datasets.timeseries() - if not numeric_only: - gdf = gdf.select_dtypes(include="number") - pdf = gdf.to_pandas() - - expected = pdf.mean(numeric_only=numeric_only) - actual = gdf.mean(numeric_only=numeric_only) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - { - "a": [1, 2, 3, 4, 5], - "b": ["a", "b", "c", "d", "e"], - "c": [1.0, 2.0, 3.0, 4.0, 5.0], - } - ], -) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_std_different_dtypes(data, numeric_only): - gdf = cudf.DataFrame(data) - if not numeric_only: - gdf = gdf.select_dtypes(include="number") - pdf = gdf.to_pandas() - - expected = pdf.std(numeric_only=numeric_only) - actual = gdf.std(numeric_only=numeric_only) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], - "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], - } - ], -) -def test_empty_numeric_only(data): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - expected = pdf.prod(numeric_only=True) - actual = gdf.prod(numeric_only=True) - assert_eq(expected, actual, check_dtype=True) - - -@pytest.fixture(params=[0, 10], ids=["empty", "10"]) -def df_eval(request): - N = request.param - if N == 0: - value = np.zeros(0, dtype="int") - return cudf.DataFrame( - { - "a": value, - "b": value, - "c": value, - "d": value, - } - ) - int_max = 10 - rng = cupy.random.default_rng(0) - return cudf.DataFrame( - { - "a": rng.integers(N, size=int_max), - "b": rng.integers(N, size=int_max), - "c": rng.integers(N, size=int_max), - "d": rng.integers(N, size=int_max), 
- } - ) - - -# Note that for now expressions do not automatically handle casting, so inputs -# need to be casted appropriately -@pytest.mark.parametrize( - "expr, dtype", - [ - ("a", int), - ("+a", int), - ("a + b", int), - ("a == b", int), - ("a / b", float), - ("a * b", int), - ("a > b", int), - ("a >= b", int), - ("a > b > c", int), - ("a > b < c", int), - ("a & b", int), - ("a & b | c", int), - ("sin(a)", float), - ("exp(sin(abs(a)))", float), - ("sqrt(floor(a))", float), - ("ceil(arctanh(a))", float), - ("(a + b) - (c * d)", int), - ("~a", int), - ("(a > b) and (c > d)", int), - ("(a > b) or (c > d)", int), - ("not (a > b)", int), - ("a + 1", int), - ("a + 1.0", float), - ("-a + 1", int), - ("+a + 1", int), - ("e = a + 1", int), - ( - """ - e = log(cos(a)) + 1.0 - f = abs(c) - exp(d) - """, - float, - ), - ("a_b_are_equal = (a == b)", int), - ("a > b", str), - ("a < '1'", str), - ('a == "1"', str), - ], -) -def test_dataframe_eval(df_eval, expr, dtype): - df_eval = df_eval.astype(dtype) - with _hide_ufunc_warnings(expr): - expect = df_eval.to_pandas().eval(expr) - got = df_eval.eval(expr) - # In the specific case where the evaluated expression is a unary function - # of a single column with no nesting, pandas will retain the name. This - # level of compatibility is out of scope for now. - assert_eq(expect, got, check_names=False) - - # Test inplace - if re.search("[^=><]=[^=]", expr) is not None: - pdf_eval = df_eval.to_pandas() - with _hide_ufunc_warnings(expr): - pdf_eval.eval(expr, inplace=True) - df_eval.eval(expr, inplace=True) - assert_eq(pdf_eval, df_eval) - - -@pytest.mark.parametrize( - "expr", - [ - """ - e = a + b - a == b - """, - "a_b_are_equal = (a == b) = c", - ], -) -def test_dataframe_eval_errors(df_eval, expr): - with pytest.raises(ValueError): - df_eval.eval(expr) - - -def test_dataframe_eval_misc(): - df = cudf.DataFrame({"a": [1, 2, 3, None, 5]}) - got = df.eval("isnull(a)") - assert_eq(got, cudf.Series.isnull(df["a"]), check_names=False) - - df.eval("c = isnull(1)", inplace=True) - assert_eq(df["c"], cudf.Series([False] * len(df), name="c")) - - -@pytest.mark.parametrize( - "gdf,subset", - [ - ( - cudf.DataFrame( - {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, - index=["falcon", "dog", "cat", "ant"], - ), - ["num_legs"], - ), - ( - cudf.DataFrame( - { - "first_name": ["John", "Anne", "John", "Beth"], - "middle_name": ["Smith", None, None, "Louise"], - } - ), - ["first_name"], - ), - ], -) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("use_subset", [True, False]) -def test_value_counts( - gdf, - subset, - sort, - ascending, - normalize, - dropna, - use_subset, -): - pdf = gdf.to_pandas() - - got = gdf.value_counts( - subset=subset if (use_subset) else None, - sort=sort, - ascending=ascending, - normalize=normalize, - dropna=dropna, - ) - expected = pdf.value_counts( - subset=subset if (use_subset) else None, - sort=sort, - ascending=ascending, - normalize=normalize, - dropna=dropna, - ) - - if not dropna: - # Convert the Pandas series to a cuDF one due to difference - # in the handling of NaNs between the two ( in cuDF and - # NaN in Pandas) when dropna=False. 
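# (Illustrative aside: with dropna=False the missing-key rows referred to
#  above surface as <NA> in the cuDF result's index but as NaN in the pandas
#  result's index, which is why the pandas expectation is routed through
#  cudf.from_pandas before the sorted comparison below.)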
- assert_eq(got.sort_index(), cudf.from_pandas(expected).sort_index()) - else: - assert_eq(got.sort_index(), expected.sort_index()) - - with pytest.raises(KeyError): - gdf.value_counts(subset=["not_a_column_name"]) - - -@pytest.fixture -def wildcard_df(): - midx = cudf.MultiIndex.from_tuples( - [(c1, c2) for c1 in "abc" for c2 in "ab"] - ) - df = cudf.DataFrame({f"{i}": [i] for i in range(6)}) - df.columns = midx - return df - - -def test_multiindex_wildcard_selection_all(wildcard_df): - expect = wildcard_df.to_pandas().loc[:, (slice(None), "b")] - got = wildcard_df.loc[:, (slice(None), "b")] - assert_eq(expect, got) - - -@pytest_xfail(reason="Not yet properly supported.") -def test_multiindex_wildcard_selection_partial(wildcard_df): - expect = wildcard_df.to_pandas().loc[:, (slice("a", "b"), "b")] - got = wildcard_df.loc[:, (slice("a", "b"), "b")] - assert_eq(expect, got) - - -@pytest_xfail(reason="Not yet properly supported.") -def test_multiindex_wildcard_selection_three_level_all(): - midx = cudf.MultiIndex.from_tuples( - [(c1, c2, c3) for c1 in "abcd" for c2 in "abc" for c3 in "ab"] - ) - df = cudf.DataFrame({f"{i}": [i] for i in range(24)}) - df.columns = midx - - expect = df.to_pandas().loc[:, (slice("a", "c"), slice("a", "b"), "b")] - got = df.loc[:, (slice(None), "b")] - assert_eq(expect, got) - - -def test_dataframe_assign_scalar_to_empty_series(): - expected = pd.DataFrame({"a": []}) - actual = cudf.DataFrame({"a": []}) - expected.a = 0 - actual.a = 0 - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {0: [1, 2, 3], 2: [10, 11, 23]}, - {("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]}, - ], -) -def test_non_string_column_name_to_arrow(data): - df = cudf.DataFrame(data) - - expected = df.to_arrow() - actual = pa.Table.from_pandas(df.to_pandas()) - - assert expected.equals(actual) - - -def test_complex_types_from_arrow(): - expected = pa.Table.from_arrays( - [ - pa.array([1, 2, 3]), - pa.array([10, 20, 30]), - pa.array([{"a": 9}, {"b": 10}, {"c": 11}]), - pa.array([[{"a": 1}], [{"b": 2}], [{"c": 3}]]), - pa.array([10, 11, 12]).cast(pa.decimal128(21, 2)), - pa.array([{"a": 9}, {"b": 10, "c": {"g": 43}}, {"c": {"a": 10}}]), - ], - names=["a", "b", "c", "d", "e", "f"], - ) - - df = cudf.DataFrame.from_arrow(expected) - actual = df.to_arrow() - - assert expected.equals(actual) - - -@pytest.mark.parametrize( - "data", - [ - { - "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], - "style": ["cup", "cup", "cup", "pack", "pack"], - "rating": [4, 4, 3.5, 15, 5], - }, - { - "brand": ["Indomie", "Yum Yum", "Indomie", "Indomie", "Indomie"], - "style": ["cup", "cup", "cup", "cup", "pack"], - "rating": [4, 4, 3.5, 4, 5], - }, - ], -) -@pytest.mark.parametrize( - "subset", [None, ["brand"], ["rating"], ["style", "rating"]] -) -@pytest.mark.parametrize("keep", ["first", "last", False]) -def test_dataframe_duplicated(data, subset, keep): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expected = pdf.duplicated(subset=subset, keep=keep) - actual = gdf.duplicated(subset=subset, keep=keep) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, - {"a": [[{"b": 567}], None] * 10}, - {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, - ], -) -def test_dataframe_transpose_complex_types(data): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expected = pdf.T - actual = gdf.T - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - 
{"col": [{"a": 1.1}, {"a": 2.1}, {"a": 10.0}, {"a": 11.2323}, None]}, - {"a": [[{"b": 567}], None] * 10}, - {"a": [decimal.Decimal(10), decimal.Decimal(20), None]}, - ], -) -def test_dataframe_values_complex_types(data): - gdf = cudf.DataFrame(data) - with pytest.raises(NotImplementedError): - gdf.values - - -def test_dataframe_from_arrow_slice(): - table = pa.Table.from_pandas( - pd.DataFrame.from_dict( - {"a": ["aa", "bb", "cc"] * 3, "b": [1, 2, 3] * 3} - ) - ) - table_slice = table.slice(3, 7) - - expected = table_slice.to_pandas() - actual = cudf.DataFrame.from_arrow(table_slice) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 4}, - {"c": 4, "a": [1, 2, 3], "b": ["x", "y", "z"]}, - {"a": [1, 2, 3], "c": 4}, - ], -) -def test_dataframe_init_from_scalar_and_lists(data): - actual = cudf.DataFrame(data) - expected = pd.DataFrame(data) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,index", - [ - ({"a": [1, 2, 3], "b": ["x", "y", "z", "z"], "c": 4}, None), - ( - { - "a": [1, 2, 3], - "b": ["x", "y", "z"], - }, - [10, 11], - ), - ( - { - "a": [1, 2, 3], - "b": ["x", "y", "z"], - }, - [10, 11], - ), - ([[10, 11], [12, 13]], ["a", "b", "c"]), - ], -) -def test_dataframe_init_length_error(data, index): - assert_exceptions_equal( - lfunc=pd.DataFrame, - rfunc=cudf.DataFrame, - lfunc_args_and_kwargs=( - [], - {"data": data, "index": index}, - ), - rfunc_args_and_kwargs=( - [], - {"data": data, "index": index}, - ), - ) - - -def test_dataframe_binop_with_mixed_date_types(): - df = pd.DataFrame( - np.random.rand(2, 2), - columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), - ) - ser = pd.Series(np.random.rand(3), index=[0, 1, 2]) - gdf = cudf.from_pandas(df) - gser = cudf.from_pandas(ser) - expected = df - ser - got = gdf - gser - assert_eq(expected, got) - - -def test_dataframe_binop_with_mixed_string_types(): - df1 = pd.DataFrame(np.random.rand(3, 3), columns=pd.Index([0, 1, 2])) - df2 = pd.DataFrame( - np.random.rand(6, 6), - columns=pd.Index([0, 1, 2, "VhDoHxRaqt", "X0NNHBIPfA", "5FbhPtS0D1"]), - ) - gdf1 = cudf.from_pandas(df1) - gdf2 = cudf.from_pandas(df2) - - expected = df2 + df1 - got = gdf2 + gdf1 - - assert_eq(expected, got) - - -def test_dataframe_binop_and_where(): - df = pd.DataFrame(np.random.rand(2, 2), columns=pd.Index([True, False])) - gdf = cudf.from_pandas(df) - - expected = df > 1 - got = gdf > 1 - - assert_eq(expected, got) - - expected = df[df > 1] - got = gdf[gdf > 1] - - assert_eq(expected, got) - - -def test_dataframe_binop_with_datetime_index(): - df = pd.DataFrame( - np.random.rand(2, 2), - columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), - ) - ser = pd.Series( - np.random.rand(2), - index=pd.Index( - [ - "2000-01-04", - "2000-01-03", - ], - dtype="datetime64[ns]", - ), - ) - gdf = cudf.from_pandas(df) - gser = cudf.from_pandas(ser) - expected = df - ser - got = gdf - gser - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "columns", - ( - [], - ["c", "a"], - ["a", "d", "b", "e", "c"], - ["a", "b", "c"], - pd.Index(["b", "a", "c"], name="custom_name"), - ), -) -@pytest.mark.parametrize("index", (None, [4, 5, 6])) -def test_dataframe_dict_like_with_columns(columns, index): - data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} - expect = pd.DataFrame(data, columns=columns, index=index) - actual = cudf.DataFrame(data, columns=columns, index=index) - if index is None and len(columns) == 0: - # We make an empty range index, 
pandas makes an empty index - expect = expect.reset_index(drop=True) - assert_eq(expect, actual) - - -def test_dataframe_init_columns_named_multiindex(): - np.random.seed(0) - data = np.random.randn(2, 2) - columns = cudf.MultiIndex.from_tuples( - [("A", "one"), ("A", "two")], names=["y", "z"] - ) - gdf = cudf.DataFrame(data, columns=columns) - pdf = pd.DataFrame(data, columns=columns.to_pandas()) - - assert_eq(gdf, pdf) - - -def test_dataframe_init_columns_named_index(): - np.random.seed(0) - data = np.random.randn(2, 2) - columns = pd.Index(["a", "b"], name="custom_name") - gdf = cudf.DataFrame(data, columns=columns) - pdf = pd.DataFrame(data, columns=columns) - - assert_eq(gdf, pdf) - - -def test_dataframe_from_pandas_sparse(): - pdf = pd.DataFrame(range(2), dtype=pd.SparseDtype(np.int64, 0)) - with pytest.raises(NotImplementedError): - cudf.DataFrame(pdf) - - -def test_dataframe_constructor_unbounded_sequence(): - class A: - def __getitem__(self, key): - return 1 - - with pytest.raises(TypeError): - cudf.DataFrame([A()]) - - with pytest.raises(TypeError): - cudf.DataFrame({"a": A()}) - - -def test_dataframe_constructor_dataframe_list(): - df = cudf.DataFrame(range(2)) - with pytest.raises(ValueError): - cudf.DataFrame([df]) - - -def test_dataframe_constructor_from_namedtuple(): - Point1 = namedtuple("Point1", ["a", "b", "c"]) - Point2 = namedtuple("Point1", ["x", "y"]) - - data = [Point1(1, 2, 3), Point2(4, 5)] - idx = ["a", "b"] - gdf = cudf.DataFrame(data, index=idx) - pdf = pd.DataFrame(data, index=idx) - - assert_eq(gdf, pdf) - - data = [Point2(4, 5), Point1(1, 2, 3)] - with pytest.raises(ValueError): - cudf.DataFrame(data, index=idx) - with pytest.raises(ValueError): - pd.DataFrame(data, index=idx) - - -@pytest.mark.parametrize( - "dtype", ["datetime64[ns]", "timedelta64[ns]", "int64", "float32"] -) -def test_dataframe_mixed_dtype_error(dtype): - pdf = pd.Series([1, 2, 3], dtype=dtype).to_frame().astype(object) - with pytest.raises(TypeError): - cudf.from_pandas(pdf) - - -@pytest.mark.parametrize( - "index_data,name", - [([10, 13], "a"), ([30, 40, 20], "b"), (["ef"], "c"), ([2, 3], "Z")], -) -def test_dataframe_reindex_with_index_names(index_data, name): - gdf = cudf.DataFrame( - { - "a": [10, 12, 13], - "b": [20, 30, 40], - "c": cudf.Series(["ab", "cd", "ef"], dtype="category"), - } - ) - if name in gdf.columns: - gdf = gdf.set_index(name) - pdf = gdf.to_pandas() - - gidx = cudf.Index(index_data, name=name) - actual = gdf.reindex(gidx) - expected = pdf.reindex(gidx.to_pandas()) - - assert_eq(actual, expected) - - actual = gdf.reindex(index_data) - expected = pdf.reindex(index_data) - - assert_eq(actual, expected) - - -@pytest.mark.parametrize("attr", ["nlargest", "nsmallest"]) -def test_dataframe_nlargest_nsmallest_str_error(attr): - gdf = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - pdf = gdf.to_pandas() - - assert_exceptions_equal( - getattr(gdf, attr), - getattr(pdf, attr), - ([], {"n": 1, "columns": ["a", "b"]}), - ([], {"n": 1, "columns": ["a", "b"]}), - ) - - -def test_series_data_no_name_with_columns(): - gdf = cudf.DataFrame(cudf.Series([1]), columns=[1]) - pdf = pd.DataFrame(pd.Series([1]), columns=[1]) - assert_eq(gdf, pdf) - - -def test_series_data_no_name_with_columns_more_than_one_raises(): - with pytest.raises(ValueError): - cudf.DataFrame(cudf.Series([1]), columns=[1, 2]) - with pytest.raises(ValueError): - pd.DataFrame(pd.Series([1]), columns=[1, 2]) - - -def test_series_data_with_name_with_columns_matching(): - gdf = 
cudf.DataFrame(cudf.Series([1], name=1), columns=[1]) - pdf = pd.DataFrame(pd.Series([1], name=1), columns=[1]) - assert_eq(gdf, pdf) - - -@pytest.mark.xfail( - version.parse(pd.__version__) < version.parse("2.0"), - reason="pandas returns Index[object] instead of RangeIndex", -) -def test_series_data_with_name_with_columns_not_matching(): - gdf = cudf.DataFrame(cudf.Series([1], name=2), columns=[1]) - pdf = pd.DataFrame(pd.Series([1], name=2), columns=[1]) - assert_eq(gdf, pdf) - - -def test_series_data_with_name_with_columns_matching_align(): - gdf = cudf.DataFrame(cudf.Series([1], name=2), columns=[1, 2]) - pdf = pd.DataFrame(pd.Series([1], name=2), columns=[1, 2]) - assert_eq(gdf, pdf) - - -@pytest.mark.parametrize("digits", [0, 1, 3, 4, 10]) -def test_dataframe_round_builtin(digits): - pdf = pd.DataFrame( - { - "a": [1.2234242333234, 323432.3243423, np.nan], - "b": ["a", "b", "c"], - "c": pd.Series([34224, 324324, 324342], dtype="datetime64[ns]"), - "d": pd.Series([224.242, None, 2424.234324], dtype="category"), - "e": [ - decimal.Decimal("342.3243234234242"), - decimal.Decimal("89.32432497687622"), - None, - ], - } - ) - gdf = cudf.from_pandas(pdf, nan_as_null=False) - - expected = round(pdf, digits) - actual = round(gdf, digits) - - assert_eq(expected, actual) - - -def test_dataframe_init_from_nested_dict(): - ordered_dict = OrderedDict( - [ - ("one", OrderedDict([("col_a", "foo1"), ("col_b", "bar1")])), - ("two", OrderedDict([("col_a", "foo2"), ("col_b", "bar2")])), - ("three", OrderedDict([("col_a", "foo3"), ("col_b", "bar3")])), - ] - ) - pdf = pd.DataFrame(ordered_dict) - gdf = cudf.DataFrame(ordered_dict) - - assert_eq(pdf, gdf) - regular_dict = {key: dict(value) for key, value in ordered_dict.items()} - - pdf = pd.DataFrame(regular_dict) - gdf = cudf.DataFrame(regular_dict) - assert_eq(pdf, gdf) - - -def test_init_from_2_categoricalindex_series_diff_categories(): - s1 = cudf.Series( - [39, 6, 4], index=cudf.CategoricalIndex(["female", "male", "unknown"]) - ) - s2 = cudf.Series( - [2, 152, 2, 242, 150], - index=cudf.CategoricalIndex(["f", "female", "m", "male", "unknown"]), - ) - result = cudf.DataFrame([s1, s2]) - expected = pd.DataFrame([s1.to_pandas(), s2.to_pandas()]) - # TODO: Remove once https://github.com/pandas-dev/pandas/issues/57592 - # is adressed - expected.columns = result.columns - assert_eq(result, expected, check_dtype=False) - - -def test_data_frame_values_no_cols_but_index(): - result = cudf.DataFrame(index=range(5)).values - expected = pd.DataFrame(index=range(5)).values - assert_eq(result, expected) - - -def test_dataframe_reduction_error(): - gdf = cudf.DataFrame( - { - "a": cudf.Series([1, 2, 3], dtype="float"), - "d": cudf.Series([10, 20, 30], dtype="timedelta64[ns]"), - } - ) - - with pytest.raises(TypeError): - gdf.sum() - - -def test_dataframe_from_generator(): - pdf = pd.DataFrame((i for i in range(5))) - gdf = cudf.DataFrame((i for i in range(5))) - assert_eq(pdf, gdf) - - -def test_dataframe_from_ndarray_dup_columns(): - with pytest.raises(ValueError): - cudf.DataFrame(np.eye(2), columns=["A", "A"]) - - -@pytest.mark.parametrize("name", ["a", 0, None, np.nan, cudf.NA]) -@pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA]) -@pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]]) -def test_dataframe_contains(name, contains, other_names): - column_names = [name] + other_names - gdf = cudf.DataFrame({c: [0] for c in column_names}) - pdf = pd.DataFrame({c: [0] for c in column_names}) - - assert_eq(gdf, pdf) - - if contains is 
cudf.NA or name is cudf.NA: - expectation = contains is cudf.NA and name is cudf.NA - assert (contains in pdf) == expectation - assert (contains in gdf) == expectation - elif gdf.columns.dtype.kind == "f": - # In some cases, the columns are converted to an Index[float] based on - # the other column names. That casts name values from None to np.nan. - expectation = contains is np.nan and (name is None or name is np.nan) - assert (contains in pdf) == expectation - assert (contains in gdf) == expectation - else: - expectation = contains == name or ( - contains is np.nan and name is np.nan - ) - assert (contains in pdf) == expectation - assert (contains in gdf) == expectation - - assert (contains in pdf) == (contains in gdf) - - -def test_dataframe_series_dot(): - pser = pd.Series(range(2)) - gser = cudf.from_pandas(pser) - - expected = pser @ pser - actual = gser @ gser - - assert_eq(expected, actual) - - pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("ab")) - gdf = cudf.from_pandas(pdf) - - expected = pser @ pdf - actual = gser @ gdf - - assert_eq(expected, actual) - - assert_exceptions_equal( - lfunc=pdf.dot, - rfunc=gdf.dot, - lfunc_args_and_kwargs=([pser], {}), - rfunc_args_and_kwargs=([gser], {}), - ) - - assert_exceptions_equal( - lfunc=pdf.dot, - rfunc=gdf.dot, - lfunc_args_and_kwargs=([pdf], {}), - rfunc_args_and_kwargs=([gdf], {}), - ) - - pser = pd.Series(range(2), index=["a", "k"]) - gser = cudf.from_pandas(pser) - - pdf = pd.DataFrame([[1, 2], [3, 4]], columns=list("ab"), index=["a", "k"]) - gdf = cudf.from_pandas(pdf) - - expected = pser @ pdf - actual = gser @ gdf - - assert_eq(expected, actual) - - actual = gdf @ [2, 3] - expected = pdf @ [2, 3] - - assert_eq(expected, actual) - - actual = pser @ [12, 13] - expected = gser @ [12, 13] - - assert_eq(expected, actual) - - -def test_dataframe_reindex_keep_colname(): - gdf = cudf.DataFrame([1], columns=cudf.Index([1], name="foo")) - result = gdf.reindex(index=[0, 1]) - expected = cudf.DataFrame( - [1, None], columns=cudf.Index([1], name="foo"), index=[0, 1] - ) - assert_eq(result, expected) - - -def test_dataframe_duplicate_index_reindex(): - gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1]) - pdf = gdf.to_pandas() - - assert_exceptions_equal( - gdf.reindex, - pdf.reindex, - lfunc_args_and_kwargs=([10, 11, 12, 13], {}), - rfunc_args_and_kwargs=([10, 11, 12, 13], {}), - ) - - -def test_dataframe_columns_set_none_raises(): - df = cudf.DataFrame({"a": [0]}) - with pytest.raises(TypeError): - df.columns = None - - -@pytest.mark.parametrize( - "columns", - [cudf.RangeIndex(1, name="foo"), pd.RangeIndex(1, name="foo"), range(1)], -) -def test_dataframe_columns_set_rangeindex(columns): - df = cudf.DataFrame([1], columns=["a"]) - df.columns = columns - result = df.columns - expected = pd.RangeIndex(1, name=getattr(columns, "name", None)) - pd.testing.assert_index_equal(result, expected, exact=True) - - -@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) -def test_dataframe_columns_set_multiindex(klass): - columns = klass.from_arrays([[10]], names=["foo"]) - df = cudf.DataFrame([1], columns=["a"]) - df.columns = columns - result = df.columns - expected = pd.MultiIndex.from_arrays([[10]], names=["foo"]) - pd.testing.assert_index_equal(result, expected, exact=True) - - -@pytest.mark.parametrize( - "klass", - [ - functools.partial(cudf.Index, name="foo"), - functools.partial(cudf.Series, name="foo"), - functools.partial(pd.Index, name="foo"), - functools.partial(pd.Series, name="foo"), - np.array, - ], -) -def 
test_dataframe_columns_set_preserve_type(klass): - df = cudf.DataFrame([1], columns=["a"]) - columns = klass([10], dtype="int8") - df.columns = columns - result = df.columns - expected = pd.Index( - [10], dtype="int8", name=getattr(columns, "name", None) - ) - pd.testing.assert_index_equal(result, expected) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - [1], - decimal.Decimal("1.0"), - ], -) -def test_dataframe_to_pandas_arrow_type_nullable_raises(scalar): - pa_array = pa.array([scalar, None]) - df = cudf.DataFrame({"a": pa_array}) - with pytest.raises(ValueError): - df.to_pandas(nullable=True, arrow_type=True) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - [1], - decimal.Decimal("1.0"), - ], -) -def test_dataframe_to_pandas_arrow_type(scalar): - pa_array = pa.array([scalar, None]) - df = cudf.DataFrame({"a": pa_array}) - result = df.to_pandas(arrow_type=True) - expected = pd.DataFrame({"a": pd.arrays.ArrowExtensionArray(pa_array)}) - pd.testing.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("axis", [None, 0, "index", 1, "columns"]) -@pytest.mark.parametrize("data", [[[1, 2], [2, 3]], [1, 2], [1]]) -def test_squeeze(axis, data): - df = cudf.DataFrame(data) - result = df.squeeze(axis=axis) - expected = df.to_pandas().squeeze(axis=axis) - assert_eq(result, expected) - - -@pytest.mark.parametrize("column", [range(1, 2), np.array([1], dtype=np.int8)]) -@pytest.mark.parametrize( - "operation", - [ - lambda df: df.where(df < 2, 2), - lambda df: df.nans_to_nulls(), - lambda df: df.isna(), - lambda df: df.notna(), - lambda df: abs(df), - lambda df: -df, - lambda df: ~df, - lambda df: df.cumsum(), - lambda df: df.replace(1, 2), - lambda df: df.replace(10, 20), - lambda df: df.clip(0, 10), - lambda df: df.rolling(1).mean(), - lambda df: df.interpolate(), - lambda df: df.shift(), - lambda df: df.sort_values(1), - lambda df: df.round(), - lambda df: df.rank(), - ], -) -def test_op_preserves_column_metadata(column, operation): - df = cudf.DataFrame([1], columns=cudf.Index(column)) - result = operation(df).columns - expected = pd.Index(column) - pd.testing.assert_index_equal(result, expected, exact=True) - - -def test_dataframe_init_with_nans(): - with cudf.option_context("mode.pandas_compatible", True): - gdf = cudf.DataFrame({"a": [1, 2, 3, np.nan]}) - assert gdf["a"].dtype == np.dtype("float64") - pdf = pd.DataFrame({"a": [1, 2, 3, np.nan]}) - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("dtype1", ["int16", "float32"]) -@pytest.mark.parametrize("dtype2", ["int16", "float32"]) -def test_dataframe_loc_int_float(dtype1, dtype2): - df = cudf.DataFrame( - {"a": [10, 11, 12, 13, 14]}, - index=cudf.Index([1, 2, 3, 4, 5], dtype=dtype1), - ) - pdf = df.to_pandas() - - gidx = cudf.Index([2, 3, 4], dtype=dtype2) - pidx = gidx.to_pandas() - - actual = df.loc[gidx] - expected = pdf.loc[pidx] - - assert_eq(actual, expected, check_index_type=True, check_dtype=True) - - -@pytest.mark.parametrize( - "data", - [ - cudf.DataFrame(range(2)), - None, - [cudf.Series(range(2))], - [[0], [1]], - {1: range(2)}, - cupy.arange(2), - ], -) -def test_init_with_index_no_shallow_copy(data): - idx = cudf.RangeIndex(2) - df = cudf.DataFrame(data, index=idx) - assert df.index is idx - - -def test_from_records_with_index_no_shallow_copy(): - idx = cudf.RangeIndex(2) - data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", " pd_data_2, 
gdf_data_1 > gdf_data_2) - assert_eq(pd_data_1 == pd_data_2, gdf_data_1 == gdf_data_2) - assert_eq(pd_data_1 <= pd_data_2, gdf_data_1 <= gdf_data_2) - assert_eq(pd_data_1 >= pd_data_2, gdf_data_1 >= gdf_data_2) - - -@pytest.mark.parametrize( - "lhs_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -@pytest.mark.parametrize( - "rhs_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_datetime_series_binops_numpy(lhs_dtype, rhs_dtype): - pd_data_1 = pd.Series( - pd.date_range("20010101", "20020215", freq="400h", name="times") - ) - pd_data_2 = pd.Series( - pd.date_range("20010101", "20020215", freq="401h", name="times") - ) - gdf_data_1 = Series(pd_data_1).astype(lhs_dtype) - gdf_data_2 = Series(pd_data_2).astype(rhs_dtype) - np_data_1 = np.array(pd_data_1).astype(lhs_dtype) - np_data_2 = np.array(pd_data_2).astype(rhs_dtype) - np.testing.assert_equal(np_data_1, gdf_data_1.to_numpy()) - np.testing.assert_equal(np_data_2, gdf_data_2.to_numpy()) - np.testing.assert_equal( - np.less(np_data_1, np_data_2), (gdf_data_1 < gdf_data_2).to_numpy() - ) - np.testing.assert_equal( - np.greater(np_data_1, np_data_2), (gdf_data_1 > gdf_data_2).to_numpy() - ) - np.testing.assert_equal( - np.equal(np_data_1, np_data_2), (gdf_data_1 == gdf_data_2).to_numpy() - ) - np.testing.assert_equal( - np.less_equal(np_data_1, np_data_2), - (gdf_data_1 <= gdf_data_2).to_numpy(), - ) - np.testing.assert_equal( - np.greater_equal(np_data_1, np_data_2), - (gdf_data_1 >= gdf_data_2).to_numpy(), - ) - - -@pytest.mark.parametrize("data", [data1(), data2()]) -def test_dt_ops(data): - pd_data = pd.Series(data.copy()) - gdf_data = Series(data.copy()) - - assert_eq(pd_data == pd_data, gdf_data == gdf_data) - assert_eq(pd_data < pd_data, gdf_data < gdf_data) - assert_eq(pd_data > pd_data, gdf_data > gdf_data) - - -# libcudf doesn't respect timezones -@pytest.mark.parametrize("data", [data1(), data2()]) -@pytest.mark.parametrize("field", fields) -def test_dt_series(data, field): - pd_data = pd.Series(data.copy()) - gdf_data = Series(pd_data) - base = getattr(pd_data.dt, field) - test = getattr(gdf_data.dt, field) - assert_eq(base, test, check_dtype=False) - - -@pytest.mark.parametrize("data", [data1(), data2()]) -@pytest.mark.parametrize("field", fields) -def test_dt_index(data, field): - pd_data = data.copy() - gdf_data = DatetimeIndex(pd_data) - assert_eq(getattr(gdf_data, field), getattr(pd_data, field), exact=False) - - -def test_setitem_datetime(): - df = DataFrame() - df["date"] = pd.date_range("20010101", "20010105").values - assert np.issubdtype(df.date.dtype, np.datetime64) - - -def test_sort_datetime(): - df = pd.DataFrame() - df["date"] = np.array( - [ - np.datetime64("2016-11-20"), - np.datetime64("2020-11-20"), - np.datetime64("2019-11-20"), - np.datetime64("1918-11-20"), - np.datetime64("2118-11-20"), - ] - ) - df["vals"] = np.random.sample(len(df["date"])) - - gdf = cudf.from_pandas(df) - - s_df = df.sort_values(by="date") - s_gdf = gdf.sort_values(by="date") - - assert_eq(s_df, s_gdf) - - -def test_issue_165(): - df_pandas = pd.DataFrame() - start_date = datetime.datetime.strptime("2000-10-21", "%Y-%m-%d") - data = [(start_date + datetime.timedelta(days=x)) for x in range(6)] - df_pandas["dates"] = data - df_pandas["num"] = [1, 2, 3, 4, 5, 6] - df_cudf = DataFrame.from_pandas(df_pandas) - - base = df_pandas.query("dates==@start_date") - test = df_cudf.query("dates==@start_date") - assert_eq(base, test) - assert len(test) > 0 - - mask = 
df_cudf.dates == start_date - base_mask = df_pandas.dates == start_date - assert_eq(mask, base_mask, check_names=False) - assert mask.to_pandas().sum() > 0 - - start_date_ts = pd.Timestamp(start_date) - test = df_cudf.query("dates==@start_date_ts") - base = df_pandas.query("dates==@start_date_ts") - assert_eq(base, test) - assert len(test) > 0 - - mask = df_cudf.dates == start_date_ts - base_mask = df_pandas.dates == start_date_ts - assert_eq(mask, base_mask, check_names=False) - assert mask.to_pandas().sum() > 0 - - start_date_np = np.datetime64(start_date_ts, "ns") - test = df_cudf.query("dates==@start_date_np") - base = df_pandas.query("dates==@start_date_np") - assert_eq(base, test) - assert len(test) > 0 - - mask = df_cudf.dates == start_date_np - base_mask = df_pandas.dates == start_date_np - assert_eq(mask, base_mask, check_names=False) - assert mask.to_pandas().sum() > 0 - - -@pytest.mark.parametrize("data", [data1(), data2()]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -def test_typecast_from_datetime(data, dtype): - pd_data = pd.Series(data.copy()) - np_data = np.array(pd_data) - gdf_data = Series(pd_data) - - np_casted = np_data.astype(dtype) - gdf_casted = gdf_data.astype(dtype) - - np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) - - -@pytest.mark.parametrize("data", [data1(), data2()]) -@pytest.mark.parametrize( - "dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_from_datetime_to_int64_to_datetime(data, dtype): - pd_data = pd.Series(data.copy()) - np_data = np.array(pd_data) - gdf_data = Series(pd_data) - - np_casted = np_data.astype(np.int64).astype(dtype) - gdf_casted = gdf_data.astype(np.int64).astype(dtype) - - np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) - - -@pytest.mark.parametrize("data", [timeseries_us_data()]) -@pytest.mark.parametrize( - "dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_to_different_datetime_resolutions(data, dtype): - pd_data = pd.Series(data.copy()) - np_data = np.array(pd_data).astype(dtype) - gdf_series = Series(pd_data).astype(dtype) - np.testing.assert_equal(np_data, gdf_series.to_numpy()) - - -@pytest.mark.parametrize( - "data", [timestamp_ms_data(), timestamp_us_data(), timestamp_ns_data()] -) -@pytest.mark.parametrize( - "dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_string_timstamp_typecast_to_different_datetime_resolutions( - data, dtype -): - pd_sr = data - gdf_sr = cudf.Series.from_pandas(pd_sr) - - expect = pd_sr.values.astype(dtype) - got = gdf_sr.astype(dtype).values_host - - np.testing.assert_equal(expect, got) - - -@pytest.mark.parametrize("data", [numerical_data()]) -@pytest.mark.parametrize("from_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize( - "to_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_to_datetime(data, from_dtype, to_dtype): - np_data = data.astype(from_dtype) - gdf_data = Series(np_data) - - np_casted = np_data.astype(to_dtype) - gdf_casted = gdf_data.astype(to_dtype) - - np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) - - -@pytest.mark.parametrize("data", [numerical_data()]) -@pytest.mark.parametrize("from_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize( - "to_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_to_from_datetime(data, from_dtype, to_dtype): - np_data = data.astype(from_dtype) - gdf_data = 
Series(np_data) - - np_casted = np_data.astype(to_dtype).astype(from_dtype) - gdf_casted = gdf_data.astype(to_dtype).astype(from_dtype) - - np.testing.assert_equal(np_casted, gdf_casted.to_numpy()) - - -@pytest.mark.parametrize("data", [numerical_data()]) -@pytest.mark.parametrize( - "from_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -@pytest.mark.parametrize( - "to_dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_from_datetime_to_datetime(data, from_dtype, to_dtype): - np_data = data.astype(from_dtype) - ser = Series(np_data) - - np_casted = np_data.astype(to_dtype) - ser_casted = ser.astype(to_dtype) - - np.testing.assert_equal(np_casted, ser_casted.to_numpy()) - - -@pytest.mark.parametrize("data", [numerical_data()]) -@pytest.mark.parametrize("nulls", ["some", "all"]) -def test_to_from_pandas_nulls(data, nulls): - pd_data = pd.Series(data.copy().astype("datetime64[ns]")) - if nulls == "some": - # Fill half the values with NaT - pd_data[list(range(0, len(pd_data), 2))] = np.datetime64("nat", "ns") - elif nulls == "all": - # Fill all the values with NaT - pd_data[:] = np.datetime64("nat", "ns") - gdf_data = Series.from_pandas(pd_data) - - expect = pd_data - got = gdf_data.to_pandas() - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_datetime_to_arrow(dtype): - timestamp = ( - cudf.datasets.timeseries( - start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={} - ) - .reset_index()["timestamp"] - .reset_index(drop=True) - ) - gdf = DataFrame({"timestamp": timestamp.astype(dtype)}) - assert_eq(gdf, DataFrame.from_arrow(gdf.to_arrow(preserve_index=False))) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([], dtype="datetime64[ns]"), - pd.Series(pd.date_range("2010-01-01", "2010-02-01")), - pd.Series([None, None], dtype="datetime64[ns]"), - ], -) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_datetime_unique(data, nulls): - psr = data.copy() - - if len(data) > 0: - if nulls == "some": - p = np.random.randint(0, len(data), 2) - psr[p] = None - - gsr = cudf.from_pandas(psr) - expected = psr.unique() - got = gsr.unique() - - # Unique does not provide a guarantee on ordering. 
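# (Illustrative aside: pandas' Series.unique() preserves first-appearance
#  order, while cuDF's unique() returns a cudf.Series with no ordering
#  guarantee, hence both sides are sorted before the comparison below.)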
- assert_eq( - pd.Series(expected).sort_values(ignore_index=True), - got.sort_values(ignore_index=True).to_pandas(), - ) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([], dtype="datetime64[ns]"), - pd.Series(pd.date_range("2010-01-01", "2010-02-01")), - pd.Series([None, None], dtype="datetime64[ns]"), - ], -) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_datetime_nunique(data, nulls): - psr = data.copy() - - if len(data) > 0: - if nulls == "some": - p = np.random.randint(0, len(data), 2) - psr[p] = None - - gsr = cudf.from_pandas(psr) - expected = psr.nunique() - got = gsr.nunique() - assert_eq(got, expected) - - -testdata = [ - ( - Series( - ["2018-01-01", None, "2019-01-31", None, "2018-01-01"], - dtype="datetime64[ms]", - ), - True, - ), - ( - Series( - [ - "2018-01-01", - "2018-01-02", - "2019-01-31", - "2018-03-01", - "2018-01-01", - ], - dtype="datetime64[ms]", - ), - False, - ), - ( - Series( - np.array( - ["2018-01-01", None, "2019-12-30"], dtype="datetime64[ms]" - ) - ), - True, - ), -] - - -@pytest.mark.parametrize("data, expected", testdata) -def test_datetime_has_null_test(data, expected): - pd_data = data.to_pandas() - count = pd_data.notna().value_counts() - expected_count = 0 - if False in count.keys(): - expected_count = count[False] - - assert_eq(expected, data.has_nulls) - assert_eq(expected_count, data.null_count) - - -def test_datetime_has_null_test_pyarrow(): - data = Series( - pa.array( - [0, np.iinfo("int64").min, np.iinfo("int64").max, None], - type=pa.timestamp("ns"), - ) - ) - expected = True - expected_count = 1 - - assert_eq(expected, data.has_nulls) - assert_eq(expected_count, data.null_count) - - -def test_datetime_dataframe(): - data = { - "timearray": np.array( - [0, 1, None, 2, 20, None, 897], dtype="datetime64[ms]" - ) - } - gdf = cudf.DataFrame(data) - pdf = pd.DataFrame(data) - - assert_eq(pdf, gdf) - - assert_eq(pdf.dropna(), gdf.dropna()) - - assert_eq(pdf.isnull(), gdf.isnull()) - - data = np.array([0, 1, None, 2, 20, None, 897], dtype="datetime64[ms]") - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(ps, gs) - - assert_eq(ps.dropna(), gs.dropna()) - - assert_eq(ps.isnull(), gs.isnull()) - - -@pytest.mark.parametrize( - "data", - [ - None, - [], - pd.Series([], dtype="float64"), - pd.Index([]), - pd.Series([1, 2, 3]), - pd.Series([0, 1, -1]), - pd.Series([0, 1, -1, 100.3, 200, 47637289]), - pd.Series(["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"]), - [1, 2, 3, 100, -123, -1, 0, 1000000000000679367], - pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}), - pd.DataFrame( - {"year": ["2015", "2016"], "month": ["2", "3"], "day": [4, 5]} - ), - pd.DataFrame( - { - "year": [2015, 2016], - "month": [2, 3], - "day": [4, 5], - "minute": [1, 100], - "second": [90, 10], - "hour": [1, 0.5], - }, - index=["a", "b"], - ), - pd.DataFrame( - { - "year": [], - "month": [], - "day": [], - "minute": [], - "second": [], - "hour": [], - }, - ), - ["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], - pd.Index([1, 2, 3, 4]), - pd.DatetimeIndex( - ["1970-01-01 00:00:00.000000001", "1970-01-01 00:00:00.000000002"], - dtype="datetime64[ns]", - freq=None, - ), - pd.DatetimeIndex( - [], - dtype="datetime64[ns]", - freq=None, - ), - pd.Series([1, 2, 3]).astype("datetime64[ns]"), - pd.Series([1, 2, 3]).astype("datetime64[us]"), - pd.Series([1, 2, 3]).astype("datetime64[ms]"), - pd.Series([1, 2, 3]).astype("datetime64[s]"), - pd.Series([1, 2, 3]).astype("datetime64[D]"), - 1, - 100, - 17, - 53.638435454, - 
np.array([1, 10, 15, 478925, 2327623467]), - np.array([0.3474673, -10, 15, 478925.34345, 2327623467]), - ], -) -@pytest.mark.parametrize("dayfirst", [True, False]) -def test_cudf_to_datetime(data, dayfirst): - pd_data = data - if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): - gd_data = cudf.from_pandas(pd_data) - else: - if type(pd_data).__module__ == np.__name__: - gd_data = cp.array(pd_data) - else: - gd_data = pd_data - - expected = pd.to_datetime(pd_data, dayfirst=dayfirst) - actual = cudf.to_datetime(gd_data, dayfirst=dayfirst) - - if isinstance(expected, pd.Series): - assert_eq(actual, expected, check_dtype=False) - else: - assert_eq(actual, expected, check_exact=False) - - -@pytest.mark.parametrize( - "data", - [ - "2", - ["1", "2", "3"], - ["1/1/1", "2/2/2", "1"], - pd.Series([1, 2, 3], dtype="timedelta64[ns]"), - pd.DataFrame( - { - "year": [2015, 2016], - "month": [2, 3], - "day": [4, 5], - "minute": [1, 100], - "second": [90, 10], - "hour": [1, 0], - "blablacol": [1, 1], - } - ), - pd.DataFrame( - { - "month": [2, 3], - "day": [4, 5], - "minute": [1, 100], - "second": [90, 10], - "hour": [1, 0], - } - ), - ], -) -def test_to_datetime_errors(data): - pd_data = data - if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): - gd_data = cudf.from_pandas(pd_data) - else: - gd_data = pd_data - - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - assert_exceptions_equal( - pd.to_datetime, - cudf.to_datetime, - ([pd_data],), - ([gd_data],), - ) - - -def test_to_datetime_not_implemented(): - with pytest.raises(NotImplementedError): - cudf.to_datetime([], exact=False) - - with pytest.raises(NotImplementedError): - cudf.to_datetime([], origin="julian") - - with pytest.raises(NotImplementedError): - cudf.to_datetime([], yearfirst=True) - - -@pytest.mark.parametrize( - "data", - [ - 1, - [], - pd.Series([], dtype="float64"), - pd.Index([]), - pd.Series([1, 2, 3]), - pd.Series([1, 2.4, 3]), - pd.Series([0, 1, -1]), - pd.Series([0, 1, -1, 100, 200, 47637]), - [10, 12, 1200, 15003], - pd.DatetimeIndex( - [], - dtype="datetime64[ns]", - freq=None, - ), - pd.Index([1, 2, 3, 4]), - ], -) -@pytest.mark.parametrize("unit", ["D", "s", "ms", "us", "ns"]) -def test_to_datetime_units(data, unit): - pd_data = data - if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): - gd_data = cudf.from_pandas(pd_data) - else: - gd_data = pd_data - - expected = pd.to_datetime(pd_data, unit=unit) - actual = cudf.to_datetime(gd_data, unit=unit) - - if isinstance(expected, pd.Series): - assert_eq(actual, expected, check_dtype=False) - else: - assert_eq(actual, expected, exact=False, check_exact=False) - - -@pytest.mark.parametrize( - "data,format", - [ - ("2012-10-11", None), - ("2012-10-11", "%Y-%m-%d"), - ("2012-10-11", "%Y-%d-%m"), - (["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], None), - (["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], "%Y-%m-%d"), - (["2012-10-11", "2010-01-01", "2016-07-07", "2014-02-02"], "%Y-%d-%m"), - (["10-11-2012", "01-01-2010", "07-07-2016", "02-02-2014"], "%m-%d-%Y"), - (["10-11-2012", "01-01-2010", "07-07-2016", "02-02-2014"], "%d-%m-%Y"), - (["10-11-2012", "01-01-2010", "07-07-2016", "02-02-2014"], None), - (["2012/10/11", "2010/01/01", "2016/07/07", "2014/02/02"], None), - (["2012/10/11", "2010/01/01", "2016/07/07", "2014/02/02"], "%Y/%m/%d"), - (["2012/10/11", "2010/01/01", "2016/07/07", "2014/02/02"], "%Y/%d/%m"), - (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], "%m/%d/%Y"), - (["10/11/2012", "01/01/2010", 
"07/07/2016", "02/02/2014"], "%d/%m/%Y"), - (["10/11/2012", "01/01/2010", "07/07/2016", "02/02/2014"], None), - (["2021-04-13 12:30:04.123456789"], "%Y-%m-%d %H:%M:%S.%f"), - (pd.Series([2015, 2020, 2021]), "%Y"), - pytest.param( - pd.Series(["1", "2", "1"]), - "%m", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/6109" - "https://github.com/pandas-dev/pandas/issues/35934" - ), - ), - pytest.param( - pd.Series(["14", "20", "10"]), - "%d", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/6109" - "https://github.com/pandas-dev/pandas/issues/35934" - ), - ), - (pd.Series([2015, 2020.0, 2021.2]), "%Y"), - ], -) -@pytest.mark.parametrize("infer_datetime_format", [True, False]) -def test_to_datetime_format(data, format, infer_datetime_format): - pd_data = data - if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)): - gd_data = cudf.from_pandas(pd_data) - else: - gd_data = pd_data - - with expect_warning_if(True, UserWarning): - expected = pd.to_datetime( - pd_data, format=format, infer_datetime_format=infer_datetime_format - ) - with expect_warning_if(not infer_datetime_format): - actual = cudf.to_datetime( - gd_data, format=format, infer_datetime_format=infer_datetime_format - ) - - if isinstance(expected, pd.Series): - assert_eq(actual, expected, check_dtype=False) - else: - assert_eq(actual, expected, check_exact=False) - - -def test_to_datetime_data_out_of_range_for_format(): - with pytest.raises(ValueError): - cudf.to_datetime("2015-02-99", format="%Y-%m-%d") - - -def test_to_datetime_different_formats_notimplemented(): - with pytest.raises(NotImplementedError): - cudf.to_datetime(["2015-02-01", "2015-02-01 10:10:10"]) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas.", -) -def test_datetime_can_cast_safely(): - sr = cudf.Series( - ["1679-01-01", "2000-01-31", "2261-01-01"], dtype="datetime64[ms]" - ) - assert sr._column.can_cast_safely(np.dtype("datetime64[ns]")) - - sr = cudf.Series( - ["1677-01-01", "2000-01-31", "2263-01-01"], dtype="datetime64[ms]" - ) - - assert sr._column.can_cast_safely(np.dtype("datetime64[ns]")) is False - - -# Cudf autocasts unsupported time_units -@pytest.mark.parametrize( - "dtype", - ["datetime64[D]", "datetime64[W]", "datetime64[M]", "datetime64[Y]"], -) -def test_datetime_array_timeunit_cast(dtype): - testdata = np.array( - [ - np.datetime64("2016-11-20"), - np.datetime64("2020-11-20"), - np.datetime64("2019-11-20"), - np.datetime64("1918-11-20"), - np.datetime64("2118-11-20"), - ], - dtype=dtype, - ) - - gs = Series(testdata) - ps = pd.Series(testdata) - - assert_eq(ps, gs) - - gdf = DataFrame() - gdf["a"] = np.arange(5) - gdf["b"] = testdata - - pdf = pd.DataFrame() - pdf["a"] = np.arange(5) - pdf["b"] = testdata - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize("timeunit", ["D", "W", "M", "Y"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_datetime_scalar_timeunit_cast(timeunit): - testscalar = np.datetime64("2016-11-20", timeunit) - - gs = Series(testscalar) - ps = pd.Series(testscalar) - - assert_eq(ps, gs, check_dtype=False) - - gdf = DataFrame() - gdf["a"] = np.arange(5) - gdf["b"] = testscalar - - pdf = pd.DataFrame() - pdf["a"] = np.arange(5) - pdf["b"] = testscalar - - assert gdf["b"].dtype == cudf.dtype("datetime64[s]") - assert_eq(pdf, gdf, check_dtype=True) - - -@pytest.mark.parametrize( - "data", - [ - ["2001-01-01", 
"2002-02-02", "2000-01-05", "NaT"], - ["2001-01-01", "2002-02-02", "2000-01-05", None], - [None, None, None, None, None], - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_str_null_to_datetime(data, dtype): - psr = pd.Series(data) - gsr = Series(data) - - assert_eq(psr.astype(dtype), gsr.astype(dtype)) - - -def test_str_to_datetime_error(): - psr = pd.Series(["2001-01-01", "2002-02-02", "2000-01-05", "None"]) - gsr = Series(["2001-01-01", "2002-02-02", "2000-01-05", "None"]) - - assert_exceptions_equal( - lfunc=psr.astype, - rfunc=gsr.astype, - lfunc_args_and_kwargs=(["datetime64[s]"],), - rfunc_args_and_kwargs=(["datetime64[s]"],), - check_exception_type=False, - ) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 10, 100, 20000], - [None] * 7, - [10, 20, 30, None, 100, 200, None], - [3223.234, 342.2332, 23423.23, 3343.23324, 23432.2323, 242.23, 233], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 10, 100, 20000], - [None] * 7, - [10, 20, 30, None, 100, 200, None], - [3223.234, 342.2332, 23423.23, 3343.23324, 23432.2323, 242.23, 233], - np.datetime64("2005-02"), - np.datetime64("2005-02-25"), - np.datetime64("2005-02-25T03:30"), - np.datetime64("nat"), - # TODO: https://github.com/pandas-dev/pandas/issues/52295 - ], -) -@pytest.mark.parametrize("data_dtype", DATETIME_TYPES) -@pytest.mark.parametrize("other_dtype", DATETIME_TYPES) -def test_datetime_subtract(data, other, data_dtype, other_dtype): - gsr = cudf.Series(data, dtype=data_dtype) - psr = gsr.to_pandas() - - if isinstance(other, np.datetime64): - gsr_other = other - psr_other = other - else: - gsr_other = cudf.Series(other, dtype=other_dtype) - psr_other = gsr_other.to_pandas() - - expected = psr - psr_other - actual = gsr - gsr_other - - assert_eq(expected, actual) - - expected = psr_other - psr - actual = gsr_other - gsr - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ], -) -@pytest.mark.parametrize( - "other_scalars", - [ - datetime.timedelta(days=768), - datetime.timedelta(seconds=768), - datetime.timedelta(microseconds=7), - datetime.timedelta(minutes=447), - datetime.timedelta(hours=447), - datetime.timedelta(weeks=734), - np.timedelta64(4, "s"), - np.timedelta64(456, "D"), - np.timedelta64(46, "h"), - np.timedelta64("nat"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -@pytest.mark.parametrize( - "op", - ["add", "sub"], -) -def test_datetime_series_ops_with_scalars(data, other_scalars, dtype, op): - gsr = cudf.Series(data=data, dtype=dtype) - psr = gsr.to_pandas() - - if op == "add": - expected = psr + other_scalars - actual = gsr + other_scalars - elif op == "sub": - expected = psr - other_scalars - actual = gsr - other_scalars - - assert_eq(expected, actual) - - if op == "add": - expected = other_scalars + psr - actual = other_scalars + gsr - - assert_eq(expected, actual) - - elif op == "sub": - assert_exceptions_equal( - 
lfunc=operator.sub, - rfunc=operator.sub, - lfunc_args_and_kwargs=([other_scalars, psr],), - rfunc_args_and_kwargs=([other_scalars, gsr],), - ) - - -@pytest.mark.parametrize("data", ["20110101", "20120101", "20130101"]) -@pytest.mark.parametrize("other_scalars", ["20110101", "20120101", "20130101"]) -@pytest.mark.parametrize("op", _cmpops) -@pytest.mark.parametrize( - "dtype", - ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"], -) -def test_datetime_series_cmpops_with_scalars(data, other_scalars, dtype, op): - gsr = cudf.Series(data=data, dtype=dtype) - psr = gsr.to_pandas() - - expect = op(psr, other_scalars) - got = op(gsr, other_scalars) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ], -) -@pytest.mark.parametrize( - "scalar", - [ - datetime.timedelta(days=768), - datetime.timedelta(seconds=768), - datetime.timedelta(microseconds=7), - pytest.param(np.timedelta64("nat"), marks=pytest.mark.xfail), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -@pytest.mark.parametrize("op", [np.add, np.subtract]) -def test_datetime_series_ops_with_cudf_scalars(data, scalar, dtype, op): - gsr = cudf.Series(data=data, dtype=dtype) - psr = gsr.to_pandas() - - expect = op(psr, scalar) - got = op(gsr, cudf.Scalar(scalar)) - - assert_eq(expect, got) - - -def test_datetime_invalid_ops(): - sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - psr = sr.to_pandas() - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([psr, pd.Timestamp(1513393355.5, unit="s")],), - rfunc_args_and_kwargs=([sr, pd.Timestamp(1513393355.5, unit="s")],), - ) - - assert_exceptions_equal( - lfunc=operator.truediv, - rfunc=operator.truediv, - lfunc_args_and_kwargs=([psr, pd.Timestamp(1513393355.5, unit="s")],), - rfunc_args_and_kwargs=([sr, pd.Timestamp(1513393355.5, unit="s")],), - ) - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([psr, psr],), - rfunc_args_and_kwargs=([sr, sr],), - ) - - assert_exceptions_equal( - lfunc=operator.floordiv, - rfunc=operator.floordiv, - lfunc_args_and_kwargs=([psr, psr],), - rfunc_args_and_kwargs=([sr, sr],), - ) - - assert_exceptions_equal( - lfunc=operator.floordiv, - rfunc=operator.floordiv, - lfunc_args_and_kwargs=([psr, pd.Timestamp(1513393355.5, unit="s")],), - rfunc_args_and_kwargs=([sr, pd.Timestamp(1513393355.5, unit="s")],), - ) - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([psr, 1],), - rfunc_args_and_kwargs=([sr, 1],), - ) - - assert_exceptions_equal( - lfunc=operator.truediv, - rfunc=operator.truediv, - lfunc_args_and_kwargs=([psr, "a"],), - rfunc_args_and_kwargs=([sr, "a"],), - ) - - assert_exceptions_equal( - lfunc=operator.mul, - rfunc=operator.mul, - lfunc_args_and_kwargs=([psr, 1],), - rfunc_args_and_kwargs=([sr, 1],), - ) - - -@pytest.mark.parametrize( - "data", - [ - [], - [1, 2, 
3], - [None, 1, 10, 11, None], - [None, None, None, None, None], - [None], - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -@pytest.mark.parametrize( - "fill_value", - [ - np.datetime64("2005-02"), - np.datetime64("2005-02-25"), - np.datetime64("2005-02-25T03:30"), - np.datetime64("nat"), - "NaT", - ], -) -def test_datetime_fillna(data, dtype, fill_value): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - expected = psr.dropna() - actual = sr.dropna() - - assert_eq(expected, actual) - - expected = psr.fillna(fill_value) - actual = sr.fillna(fill_value) - - assert_eq(expected, actual) - - expected = expected.dropna() - actual = actual.dropna() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", [[1, 2, 3, None], [], [100121, 1221312, 321312321, 1232131223]] -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -@pytest.mark.parametrize( - "date_format", - [ - "%d - %m", - "%y/%H", - "%Y", - "%I - %M / %S", - "%f", - "%j", - "%p", - "%w", - "%U", - "%W", - "%G", - "%u", - "%V", - "%b", - "%B", - "%a", - "%A", - ], -) -def test_datetime_strftime(data, dtype, date_format): - gsr = cudf.Series(data, dtype=dtype) - psr = gsr.to_pandas() - - expected = psr.dt.strftime(date_format=date_format) - actual = gsr.dt.strftime(date_format=date_format) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("date_format", ["%c", "%x", "%X"]) -def test_datetime_strftime_not_implemented_formats(date_format): - gsr = cudf.Series([1, 2, 3], dtype="datetime64[ms]") - - with pytest.raises(NotImplementedError): - gsr.dt.strftime(date_format=date_format) - - -@pytest.mark.parametrize("data", [[1, 2, 3], [], [1, 20, 1000, None]]) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -@pytest.mark.parametrize("stat", ["mean", "quantile"]) -def test_datetime_stats(data, dtype, stat): - gsr = cudf.Series(data, dtype=dtype) - psr = gsr.to_pandas() - - expected = getattr(psr, stat)() - actual = getattr(gsr, stat)() - - if len(data) == 0: - assert np.isnat(expected.to_numpy()) and np.isnat(actual.to_numpy()) - else: - assert_eq(expected, actual) - - -@pytest.mark.parametrize("op", ["max", "min", "std", "median"]) -@pytest.mark.parametrize( - "data", - [ - [], - [1, 2, 3, 100], - [10, None, 100, None, None], - [None, None, None], - [1231], - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_datetime_reductions(data, op, dtype): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - actual = getattr(sr, op)() - with expect_warning_if( - psr.size > 0 and psr.isnull().all() and op == "median", RuntimeWarning - ): - expected = getattr(psr, op)() - - if ( - expected is pd.NaT - and actual is pd.NaT - or (np.isnat(expected.to_numpy()) and np.isnat(actual)) - ): - assert True - else: - assert_eq(expected, actual) - - -@pytest.mark.parametrize("timezone", ["", "Z"]) -@pytest.mark.parametrize( - "data", - [ - "2002-10-27T04:30", - "2002-10-27T04:30:00", - "2002-10-27T04:30:00.000", - "2002-10-27T04:30:00.000000", - "2002-10-27T04:30:00.000000000", - ], -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_datetime_infer_format(data, timezone, dtype): - ts_data = [data + timezone] - sr = cudf.Series(ts_data) - psr = pd.Series(ts_data) - if not timezone: - expected = psr.astype(dtype) - actual = sr.astype(dtype) - - assert_eq(expected, actual) - else: - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - # pandas doesn't allow parsing "Z" to naive type - sr.astype(dtype) - - -def 
test_dateoffset_instance_subclass_check(): - assert not issubclass(pd.DateOffset, cudf.DateOffset) - assert not isinstance(pd.DateOffset(), cudf.DateOffset) - - -def test_datetime_to_datetime_error(): - assert_exceptions_equal( - lfunc=pd.to_datetime, - rfunc=cudf.to_datetime, - lfunc_args_and_kwargs=(["02-Oct-2017 09:30", "%d-%B-%Y %H:%M"],), - rfunc_args_and_kwargs=(["02-Oct-2017 09:30", "%d-%B-%Y %H:%M"],), - check_exception_type=False, - ) - - -def test_is_leap_year(): - data = [ - "2020-05-31 08:00:00", - None, - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - None, - "1900-02-28 07:00:00", - "1800-03-14 07:30:00", - "2100-03-14 07:30:00", - "1970-01-01 00:00:00", - "1969-12-31 12:59:00", - ] - - # Series - ps = pd.Series(data, dtype="datetime64[s]") - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_leap_year - got = gs.dt.is_leap_year - - assert_eq(expect, got) - - # DatetimeIndex - pIndex = pd.DatetimeIndex(data) - gIndex = cudf.from_pandas(pIndex) - - expect2 = pIndex.is_leap_year - got2 = gIndex.is_leap_year - - assert_eq(expect2, got2) - - -def test_quarter(): - data = [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - "1900-02-28 07:00:00", - "1800-03-14 07:30:00", - "2100-03-14 07:30:00", - "1970-01-01 00:00:00", - "1969-12-31 12:59:00", - ] - dtype = "datetime64[s]" - - # Series - ps = pd.Series(data, dtype=dtype) - gs = cudf.from_pandas(ps) - - expect = ps.dt.quarter - got = gs.dt.quarter - - assert_eq(expect, got, check_dtype=False) - - # DatetimeIndex - pIndex = pd.DatetimeIndex(data) - gIndex = cudf.from_pandas(pIndex) - - expect2 = pIndex.quarter - got2 = gIndex.quarter - - assert_eq(expect2.values, got2.values) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([], dtype="datetime64[ns]"), - pd.Series(pd.date_range("2010-01-01", "2010-02-01")), - pd.Series([None, None], dtype="datetime64[ns]"), - pd.Series("2020-05-31 08:00:00", dtype="datetime64[s]"), - pd.Series( - pd.date_range(start="2021-07-25", end="2021-07-30"), - index=["a", "b", "c", "d", "e", "f"], - ), - ], -) -def test_isocalendar_series(data): - ps = data.copy() - gs = cudf.from_pandas(ps) - - expect = ps.dt.isocalendar() - got = gs.dt.isocalendar() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - pd.DatetimeIndex([], dtype="datetime64[ns]"), - pd.DatetimeIndex([None, None], dtype="datetime64[ns]"), - pd.DatetimeIndex( - [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - ], - dtype="datetime64[ns]", - ), - pd.DatetimeIndex(["2100-03-14 07:30:00"], dtype="datetime64[ns]"), - ], -) -def test_isocalendar_index(data): - ps = data.copy() - gs = cudf.from_pandas(ps) - - expect = ps.isocalendar() - got = gs.isocalendar() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_days_in_months(dtype): - nrows = 1000 - - data = dataset_generator.rand_dataframe( - dtypes_meta=[ - {"dtype": dtype, "null_frequency": 0.4, "cardinality": nrows} - ], - rows=nrows, - use_threads=False, - seed=23, - ) - - ps = data.to_pandas()["0"] - gs = cudf.from_pandas(ps) - - assert_eq(ps.dt.days_in_month, gs.dt.days_in_month) - - -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-31", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-01-01", - "1969-12-11", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_month_start(data, dtype): - # Series - ps = pd.Series(data, dtype=dtype) - gs = 
cudf.from_pandas(ps) - - expect = ps.dt.is_month_start - got = gs.dt.is_month_start - - assert_eq(expect, got) - - -################################################################## -# Date Range Tests # -################################################################## - -date_range_test_dates_start = [ - "2000-02-13 08:41:06", # leap year - "1996-11-21 04:05:30", # non leap year - "1970-01-01 00:00:00", # unix epoch time 0 - "1831-05-08 15:23:21", -] -date_range_test_dates_end = [ - "2000-02-13 08:41:06", # leap year - "1996-11-21 04:05:30", # non leap year - "1970-01-01 00:00:00", # unix epoch time 0 - "1831-05-08 15:23:21", -] -date_range_test_periods = [1, 10, 100] -date_range_test_freq = [ - {"months": 3, "years": 1}, - {"hours": 10, "days": 57, "nanoseconds": 3}, - "83D", - "17h", - "-680min", - "110546s", - "110546789ms", - "110546789248us", -] - - -@pytest.fixture(params=date_range_test_dates_start[:]) -def start(request): - return request.param - - -@pytest.fixture(params=date_range_test_dates_end[:]) -def end(request): - return request.param - - -@pytest.fixture(params=date_range_test_periods[:]) -def periods(request): - return request.param - - -@pytest.fixture(params=date_range_test_freq[:]) -def freq(request): - return request.param - - -def test_date_range_start_end_periods(start, end, periods): - expect = pd.date_range(start=start, end=end, periods=periods, name="a") - got = cudf.date_range(start=start, end=end, periods=periods, name="a") - - np.testing.assert_allclose( - expect.to_numpy().astype("int64"), - got.to_pandas().to_numpy().astype("int64"), - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_date_range_start_end_freq(start, end, freq): - if isinstance(freq, str): - _gfreq = _pfreq = freq - else: - _gfreq = cudf.DateOffset(**freq) - _pfreq = pd.DateOffset(**freq) - - expect = pd.date_range(start=start, end=end, freq=_pfreq, name="a") - got = cudf.date_range(start=start, end=end, freq=_gfreq, name="a") - - np.testing.assert_allclose( - expect.to_numpy().astype("int64"), - got.to_pandas().to_numpy().astype("int64"), - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_date_range_start_freq_periods(start, freq, periods): - if isinstance(freq, str): - _gfreq = _pfreq = freq - else: - _gfreq = cudf.DateOffset(**freq) - _pfreq = pd.DateOffset(**freq) - - expect = pd.date_range(start=start, periods=periods, freq=_pfreq, name="a") - got = cudf.date_range(start=start, periods=periods, freq=_gfreq, name="a") - - np.testing.assert_allclose( - expect.to_numpy().astype("int64"), - got.to_pandas().to_numpy().astype("int64"), - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/46877", -) -def test_date_range_end_freq_periods(end, freq, periods): - if isinstance(freq, str): - _gfreq = _pfreq = freq - else: - _gfreq = cudf.DateOffset(**freq) - _pfreq = pd.DateOffset(**freq) - - expect = pd.date_range(end=end, periods=periods, freq=_pfreq, name="a") - got = cudf.date_range(end=end, periods=periods, freq=_gfreq, name="a") - - np.testing.assert_allclose( - expect.to_numpy().astype("int64"), - got.to_pandas().to_numpy().astype("int64"), - ) - - -def test_date_range_freq_does_not_divide_range(): - expect = pd.date_range( - "2001-01-01 00:00:00.000000", "2001-01-01 00:00:00.000010", freq="3us" - ) - got = cudf.date_range( - 
"2001-01-01 00:00:00.000000", "2001-01-01 00:00:00.000010", freq="3us" - ) - np.testing.assert_allclose( - expect.to_numpy().astype("int64"), - got.to_pandas().to_numpy().astype("int64"), - ) - - -def test_date_range_raise_overflow(): - # Fixed offset - start = np.datetime64(np.iinfo("int64").max, "ns") - periods = 2 - freq = cudf.DateOffset(nanoseconds=1) - with pytest.raises(pd.errors.OutOfBoundsDatetime): - cudf.date_range(start=start, periods=periods, freq=freq) - - # Non-fixed offset - start = np.datetime64(np.iinfo("int64").max, "ns") - periods = 2 - freq = cudf.DateOffset(months=1) - with pytest.raises(pd.errors.OutOfBoundsDatetime): - # Extending beyond the max value will trigger a warning when pandas - # does an internal conversion to a Python built-in datetime.datetime - # object, which only supports down to microsecond resolution. - with pytest.warns(UserWarning): - cudf.date_range(start=start, periods=periods, freq=freq) - - -@pytest.mark.parametrize( - "freqstr_unsupported", - [ - "1ME", - "2SME", - "3MS", - "4BME", - "5CBME", - "6SMS", - "7BMS", - "8CBMS", - "QE", - "2BQE", - "3BQS", - "10YE", - "9BYE", - "8YS", - "7BYS", - "bh", - "B", - ], -) -def test_date_range_raise_unsupported(freqstr_unsupported): - if not PANDAS_GE_220 and freqstr_unsupported.endswith("E"): - pytest.skip(reason="YE, etc. support was added in pandas 2.2") - - s, e = "2001-01-01", "2008-01-31" - pd.date_range(start=s, end=e, freq=freqstr_unsupported) - with pytest.raises(ValueError, match="does not yet support"): - cudf.date_range(start=s, end=e, freq=freqstr_unsupported) - - # We also check that these values are unsupported when using lowercase - # characters. We exclude the value 3MS (every 3 month starts) because 3ms - # is a valid frequency for every 3 milliseconds. 
- if freqstr_unsupported != "3MS": - freqstr_unsupported = freqstr_unsupported.lower() - with pytest.raises(ValueError, match="does not yet support"): - with expect_warning_if(PANDAS_GE_220): - cudf.date_range(start=s, end=e, freq=freqstr_unsupported) - - -################################################################## -# End of Date Range Test # -################################################################## - - -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-31", - "2020-02-29", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-01-01", - "1969-12-11", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_month_end(data, dtype): - # Series - ps = pd.Series(data, dtype=dtype) - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_month_end - got = gs.dt.is_month_end - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-31", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-01-01", - "1800-03-14", - "2100-03-10", - "1970-01-01", - "1969-12-11", - "2017-12-30", - "2017-12-31", - "2018-01-01", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_year_start(data, dtype): - ps = pd.Series(data, dtype=dtype) - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_year_start - got = gs.dt.is_year_start - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-31", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-12-31", - "1800-03-14", - "2017-12-30", - "2017-12-31", - "2020-12-31 08:00:00", - None, - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - None, - "1800-12-14 07:30:00", - "2100-12-14 07:30:00", - "2020-05-31", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_year_end(data, dtype): - ps = pd.Series(data, dtype=dtype) - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_year_end - got = gs.dt.is_year_end - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-01", - "2020-05-31", - "2020-02-29", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-04-1", - "1970-01-01", - "1969-12-11", - "2020-12-31", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_quarter_start(data, dtype): - # Series - ps = pd.Series(data, dtype=dtype) - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_quarter_start - got = gs.dt.is_quarter_start - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - "2020-05-01", - "2020-05-31", - "2020-02-29", - None, - "1999-12-01", - "2000-12-21", - None, - "1900-02-28", - "1800-03-14", - "2100-03-10", - "1970-04-1", - "1970-01-01", - "1969-12-11", - "2020-12-31", - ] - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_is_quarter_end(data, dtype): - # Series - ps = pd.Series(data, dtype=dtype) - gs = cudf.from_pandas(ps) - - expect = ps.dt.is_quarter_end - got = gs.dt.is_quarter_end - - assert_eq(expect, got) - - -def test_error_values(): - s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - with pytest.raises( - NotImplementedError, - match="DateTime Arrays is not yet implemented in cudf", - ): - s.values - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/52761", -) -@pytest.mark.parametrize( - "data", - [ - ( - [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:10", - "2000-12-31 04:00:05", - "1900-02-28 07:00:06", - "1800-03-14 
07:30:20", - "2100-03-14 07:30:20", - "1970-01-01 00:00:09", - "1969-12-31 12:59:10", - ] - ) - ], -) -@pytest.mark.parametrize("time_type", DATETIME_TYPES) -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"] -) -def test_ceil(data, time_type, resolution): - gs = cudf.Series(data, dtype=time_type) - ps = gs.to_pandas() - - expect = ps.dt.ceil(resolution) - got = gs.dt.ceil(resolution) - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/52761", -) -@pytest.mark.parametrize( - "data", - [ - ( - [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:10", - "2000-12-31 04:00:05", - "1900-02-28 07:00:06", - "1800-03-14 07:30:20", - "2100-03-14 07:30:20", - "1970-01-01 00:00:09", - "1969-12-31 12:59:10", - ] - ) - ], -) -@pytest.mark.parametrize("time_type", DATETIME_TYPES) -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"] -) -def test_floor(data, time_type, resolution): - gs = cudf.Series(data, dtype=time_type) - ps = gs.to_pandas() - - expect = ps.dt.floor(resolution) - got = gs.dt.floor(resolution) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ( - [ - "2020-05-31 08:00:00", - "1999-12-31 18:40:10", - "2000-12-31 04:00:05", - "1900-02-28 07:00:06", - "1800-03-14 07:30:20", - "2100-03-14 07:30:20", - "1970-01-01 00:00:09", - "1969-12-31 12:59:10", - ] - ) - ], -) -@pytest.mark.parametrize("time_type", DATETIME_TYPES) -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "min", "s", "ms", "us", "ns"] -) -def test_round(data, time_type, resolution): - gs = cudf.Series(data, dtype=time_type) - ps = gs.to_pandas() - - expect = ps.dt.round(resolution) - got = gs.dt.round(resolution) - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "idx", - [ - pd.DatetimeIndex([]), - pd.DatetimeIndex(["2010-05-31"]), - pd.date_range("2000-01-01", "2000-12-31", periods=21), - ], -) -@pytest.mark.parametrize( - "offset", - [ - "10Y", - "6M", - "M", - "31D", - "0H", - "44640T", - "44640min", - "2678000S", - "2678000000L", - "2678000000ms", - "2678000000000U", - "2678000000000us", - "2678000000000000N", - "2678000000000000ns", - ], -) -def test_first(idx, offset): - p = pd.Series(range(len(idx)), dtype="int64", index=idx) - g = cudf.from_pandas(p) - - with pytest.warns(FutureWarning): - expect = p.first(offset=offset) - with pytest.warns(FutureWarning): - got = g.first(offset=offset) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - # This test case tests correctness when start is end of month - "idx, offset", - [ - ( - pd.DatetimeIndex( - [ - "2020-01-31", - "2020-02-15", - "2020-02-29", - "2020-03-15", - "2020-03-31", - "2020-04-15", - "2020-04-30", - ] - ), - "3M", - ) - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -def test_first_start_at_end_of_month(idx, offset): - p = pd.Series(range(len(idx)), index=idx) - g = cudf.from_pandas(p) - - with pytest.warns(FutureWarning): - expect = p.first(offset=offset) - with pytest.warns(FutureWarning): - got = g.first(offset=offset) - - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - 
"idx", - [ - pd.DatetimeIndex([]), - pd.DatetimeIndex(["2010-05-31"]), - pd.date_range("2000-01-01", "2000-12-31", periods=21), - ], -) -@pytest.mark.parametrize( - "offset", - [ - "10Y", - "6M", - "M", - "31D", - "0H", - "44640T", - "44640min", - "2678000S", - "2678000000L", - "2678000000ms", - "2678000000000U", - "2678000000000us", - "2678000000000000N", - "2678000000000000ns", - ], -) -def test_last(idx, offset): - p = pd.Series(range(len(idx)), dtype="int64", index=idx) - g = cudf.from_pandas(p) - - with pytest.warns(FutureWarning): - expect = p.last(offset=offset) - with pytest.warns(FutureWarning): - got = g.last(offset=offset) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - "2020-01-31", - "2020-02-15", - "2020-02-29", - "2020-03-15", - "2020-03-31", - "2020-04-15", - "2020-04-30", - ], - [43534, 43543, 37897, 2000], - ], -) -@pytest.mark.parametrize("dtype", [None, "datetime64[ns]"]) -def test_datetime_constructor(data, dtype): - expected = pd.DatetimeIndex(data=data, dtype=dtype) - actual = cudf.DatetimeIndex(data=data, dtype=dtype) - - assert_eq(expected, actual) - - expected = pd.DatetimeIndex(data=pd.Series(data), dtype=dtype) - actual = cudf.DatetimeIndex(data=cudf.Series(data), dtype=dtype) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("op", _cmpops) -def test_datetime_binop_tz_timestamp(op): - s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - pd_tz_timestamp = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") - with pytest.raises(NotImplementedError): - op(s, pd_tz_timestamp) - - date_scalar = datetime.datetime.now(datetime.timezone.utc) - with pytest.raises(NotImplementedError): - op(s, date_scalar) - - -@pytest.mark.parametrize( - "data1", [["20110101", "20120101", None, "20140101", None]] -) -@pytest.mark.parametrize( - "data2", [["20110101", "20120101", "20130101", None, None]] -) -@pytest.mark.parametrize("op", _cmpops) -def test_datetime_series_cmpops_pandas_compatibility(data1, data2, op): - gsr1 = cudf.Series(data=data1, dtype="datetime64[ns]") - psr1 = gsr1.to_pandas() - - gsr2 = cudf.Series(data=data2, dtype="datetime64[ns]") - psr2 = gsr2.to_pandas() - - expect = op(psr1, psr2) - with cudf.option_context("mode.pandas_compatible", True): - got = op(gsr1, gsr2) - - assert_eq(expect, got) - - -def test_datetime_getitem_na(): - s = cudf.Series([1, 2, None, 3], dtype="datetime64[ns]") - assert s[2] is cudf.NaT - - -def test_daterange_pandas_compatibility(): - with cudf.option_context("mode.pandas_compatible", True): - expected = pd.date_range( - "2010-01-01", "2010-02-01", periods=10, name="times" - ) - actual = cudf.date_range( - "2010-01-01", "2010-02-01", periods=10, name="times" - ) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,dtype,freq", - [ - ([10], "datetime64[ns]", "2ns"), - ([10, 12, 14, 16], "datetime64[ns]", "2ns"), - ([10, 11, 12, 13], "datetime64[ns]", "1ns"), - ([100, 200, 300, 400], "datetime64[s]", "100s"), - ([101, 201, 301, 401], "datetime64[ms]", "100ms"), - ], -) -def test_datetime_index_with_freq(data, dtype, freq): - actual = cudf.DatetimeIndex(data, dtype=dtype, freq=freq) - expected = pd.DatetimeIndex(data, dtype=dtype, freq=freq) - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "data,dtype,freq", - [ - ([10, 1232, 13244, 13426], "datetime64[ns]", "2ns"), - ([10, 11, 12, 13], "datetime64[ns]", "1s"), - ([10000, 200, 300, 400], "datetime64[s]", "100s"), - ([107871, 201, 301, 401], "datetime64[ms]", "100ns"), - ], -) -def 
test_datetime_index_freq_error(data, dtype, freq): - assert_exceptions_equal( - pd.DatetimeIndex, - cudf.DatetimeIndex, - ([data], {"dtype": dtype, "freq": freq}), - ([data], {"dtype": dtype, "freq": freq}), - ) - - -def test_strings_with_utc_offset_not_implemented(): - with pytest.raises(NotImplementedError): - DatetimeIndex(["2022-07-22 00:00:00+02:00"]) - - -@pytest.mark.parametrize("code", ["z", "Z"]) -def test_format_timezone_not_implemented(code): - with pytest.raises(NotImplementedError): - cudf.to_datetime( - ["2020-01-01 00:00:00 UTC"], format=f"%Y-%m-%d %H:%M:%S %{code}" - ) - - -@pytest.mark.parametrize("tz", ["UTC-3", "+01:00"]) -def test_utc_offset_not_implemented(tz): - with pytest.raises((NotImplementedError, ValueError)): - cudf.to_datetime([f"2020-01-01 00:00:00{tz}"]) - - -def test_Z_utc_offset(): - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.to_datetime(["2020-01-01 00:00:00Z"]) - - result = cudf.to_datetime(["2020-01-01 00:00:00Z"]) - expected = cudf.to_datetime(["2020-01-01 00:00:00"]) - assert_eq(result, expected) - - -@pytest.mark.parametrize("arg", [True, False]) -def test_args_not_datetime_typerror(arg): - with pytest.raises(TypeError): - cudf.to_datetime([arg]) - - -@pytest.mark.parametrize( - "data, dtype", - [ - [ - [ - "2000-01-01 00:00:00.000000000", - "2000-01-01 00:00:00.000000000", - "2000-01-01 00:00:00.000000000", - ], - "datetime64[s]", - ], - [ - [ - "2000-01-01 00:00:00.000000000", - None, - "2000-01-01 00:00:00.000000000", - ], - "datetime64[s]", - ], - [ - [ - "2000-01-01 00:00:00.001000000", - "2000-01-01 00:00:00.000000000", - "2000-01-01 00:00:00.000000000", - ], - "datetime64[us]", - ], - [ - [ - "2000-01-01 00:00:00.010000000", - "2000-01-01 00:00:00.020000000", - "2000-01-01 00:00:00.030000000", - ], - "datetime64[ms]", - ], - [ - [ - "2000-01-01 00:00:00.010000000", - "2000-01-01 00:00:00.020000000", - None, - ], - "datetime64[ms]", - ], - [ - [ - "2000-01-01 00:00:00.000001000", - "2000-01-01 00:00:00.000000000", - "2000-01-01 00:00:00.000004000", - ], - "datetime64[us]", - ], - [ - [ - None, - "2000-01-01 00:00:00.000000000", - "2000-01-01 00:00:00.000004000", - ], - "datetime64[us]", - ], - [ - [ - "2000-01-01 00:00:00.000000010", - "2000-01-01 00:00:00.000000002", - "2000-01-01 00:00:00.000000000", - ], - "datetime64[ns]", - ], - [ - [ - "2000-01-01 00:00:00.000000010", - None, - "2000-01-01 00:00:00.000000000", - ], - "datetime64[ns]", - ], - [ - [ - "2000-01-01 00:00:01.000000000", - "2000-01-01 00:00:40.000000000", - "2000-01-01 00:00:59.000000000", - ], - "datetime64[s]", - ], - [ - [ - "2000-01-01 00:10:00.000000000", - "2000-01-01 00:30:40.000000000", - "2000-01-01 00:59:00.000000000", - ], - "datetime64[s]", - ], - [ - [ - "2000-01-01 07:00:00.000000000", - "2000-01-01 08:00:00.000000000", - None, - ], - "datetime64[s]", - ], - [[None, None, None], "datetime64[s]"], - [[], "datetime64[s]"], - [ - [ - "2000-01-01 00:10:00.123456789", - "2000-01-01 00:30:40.123123456", - "2000-01-01 00:59:00.675347634", - ], - "datetime64[ns]", - ], - ], -) -def test_datetime_to_str(data, dtype): - gs = cudf.Series(data, dtype=dtype) - ps = gs.to_pandas() - - with cudf.option_context("mode.pandas_compatible", True): - actual = gs.astype("str") - - expected = ps.astype("string") - - assert_eq(actual.to_pandas(nullable=True), expected) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def 
test_datetime_string_to_datetime_resolution_loss_raises(): - data = ["2020-01-01 00:00:00.00001"] - dtype = "datetime64[s]" - with pytest.raises(ValueError): - cudf.Series(data, dtype=dtype) - with pytest.raises(ValueError): - pd.Series(data, dtype=dtype) - - -def test_dateimeindex_from_noniso_string(): - data = ["20160920", "20160925"] - gdti = cudf.DatetimeIndex(data) - pdti = pd.DatetimeIndex(data) - - assert_eq(gdti, pdti) - - -@pytest.mark.parametrize("errors", ["coerce", "ignore"]) -def test_to_datetime_errors_non_scalar_not_implemented(errors): - with pytest.raises(NotImplementedError): - cudf.to_datetime([1, ""], unit="s", errors=errors) - - -@pytest.mark.parametrize( - "freqstr", - [ - "H", - "N", - "T", - "L", - "U", - "S", - ], -) -def test_datetime_raise_warning(freqstr): - t = cudf.Series( - ["2001-01-01 00:04:45", "2001-01-01 00:04:58", "2001-01-01 00:05:04"], - dtype="datetime64[ns]", - ) - with pytest.warns(FutureWarning): - t.dt.ceil(freqstr) - - -def test_timezone_pyarrow_array(): - pa_array = pa.array( - [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)], - type=pa.timestamp("ns", "UTC"), - ) - result = cudf.Series(pa_array) - expected = pa_array.to_pandas() - assert_eq(result, expected) - - -def test_to_datetime_errors_ignore_deprecated(): - with pytest.warns(FutureWarning): - cudf.to_datetime("2001-01-01 00:04:45", errors="ignore") - - -def test_date_range_freq_default(): - result = pd.date_range("2020-01-01", periods=2, name="foo") - expected = cudf.date_range("2020-01-01", periods=2, name="foo") - assert_eq(result, expected) - - -def test_date_range_tz(): - result = pd.date_range("2020-01-01", periods=2, tz="UTC") - expected = cudf.date_range("2020-01-01", periods=2, tz="UTC") - assert_eq(result, expected) - - result = pd.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") - expected = cudf.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC") - assert_eq(result, expected) - - -@pytest.mark.parametrize("meth", ["day_name", "month_name"]) -@pytest.mark.parametrize("klass", [pd.Series, pd.DatetimeIndex]) -def test_day_month_name(meth, klass): - data = [ - "2020-05-31 08:00:00", - None, - "1999-12-31 18:40:00", - "2000-12-31 04:00:00", - None, - "1900-02-28 07:00:00", - "1800-03-14 07:30:00", - "2100-03-14 07:30:00", - "1970-01-01 00:00:00", - "1969-12-31 12:59:00", - ] - - p_obj = klass(data, dtype="datetime64[s]") - g_obj = cudf.from_pandas(p_obj) - - if klass is pd.Series: - p_obj = p_obj.dt - g_obj = g_obj.dt - - expect = getattr(p_obj, meth)() - got = getattr(g_obj, meth)() - - assert_eq(expect, got) - - -@pytest.mark.parametrize("meth", ["day_name", "month_name"]) -@pytest.mark.parametrize("klass", [cudf.Series, cudf.DatetimeIndex]) -def test_day_month_name_locale_not_implemented(meth, klass): - obj = klass(cudf.date_range("2020-01-01", periods=7)) - if klass is cudf.Series: - obj = obj.dt - with pytest.raises(NotImplementedError): - getattr(obj, meth)(locale="pt_BR.utf8") - - -@pytest.mark.parametrize( - "attr", - [ - "is_month_start", - "is_month_end", - "is_quarter_end", - "is_quarter_start", - "is_year_end", - "is_year_start", - "days_in_month", - "timetz", - "time", - "date", - ], -) -def test_dti_datetime_attributes(attr): - data = [ - "2020-01-01", - "2020-01-31", - "2020-03-01", - "2020-03-31", - "2020-03-31", - "2020-12-31", - None, - ] - pd_dti = pd.DatetimeIndex(data, name="foo") - cudf_dti = cudf.from_pandas(pd_dti) - - result = getattr(cudf_dti, attr) - expected = getattr(pd_dti, attr) - if isinstance(result, np.ndarray): - # 
numpy doesn't assert object arrays with NaT correctly - tm.assert_numpy_array_equal(result, expected) - else: - assert_eq(result, expected) - - -@pytest.mark.parametrize("attr", ["freq", "unit"]) -def test_dti_properties(attr): - pd_dti = pd.DatetimeIndex( - ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]" - ) - cudf_dti = cudf.DatetimeIndex( - ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]" - ) - - result = getattr(cudf_dti, attr) - expected = getattr(pd_dti, attr) - assert result == expected - - -def test_dti_asi8(): - pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") - cudf_dti = cudf.from_pandas(pd_dti) - - result = pd_dti.asi8 - expected = cudf_dti.asi8 - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "method, kwargs", - [ - ["mean", {}], - pytest.param( - "std", - {}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - pytest.param( - "std", - {"ddof": 0}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - ], -) -def test_dti_reduction(method, kwargs): - pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") - cudf_dti = cudf.from_pandas(pd_dti) - - result = getattr(cudf_dti, method)(**kwargs) - expected = getattr(pd_dti, method)(**kwargs) - assert result == expected - - -@pytest.mark.parametrize( - "method, kwargs", - [ - ["to_pydatetime", {}], - ["to_period", {"freq": "D"}], - ["strftime", {"date_format": "%Y-%m-%d"}], - ], -) -def test_dti_methods(method, kwargs): - pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") - cudf_dti = cudf.from_pandas(pd_dti) - - result = getattr(cudf_dti, method)(**kwargs) - expected = getattr(pd_dti, method)(**kwargs) - assert_eq(result, expected) - - -def test_date_range_start_end_divisible_by_freq(): - result = cudf.date_range("2011-01-01", "2011-01-02", freq="h") - expected = pd.date_range("2011-01-01", "2011-01-02", freq="h") - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py deleted file mode 100644 index 048b3a656e3..00000000000 --- a/python/cudf/cudf/tests/test_decimal.py +++ /dev/null @@ -1,410 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
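The removed datetime tests above follow one pattern almost throughout: build the same data in pandas and in cudf, run the same operation on both, and compare the results. A minimal sketch of that pattern, using arbitrary example values rather than the deleted parametrizations:

import pandas as pd

import cudf

# Same call in pandas and cudf; compare element-wise after moving to host.
expected = pd.date_range("2020-01-01", periods=3, freq="D", name="a")
got = cudf.date_range("2020-01-01", periods=3, freq="D", name="a")
assert (expected == got.to_pandas()).all()

# The .dt accessor is checked the same way, e.g. flooring to the hour.
ser = cudf.Series(["2020-05-31 08:05:00"], dtype="datetime64[ns]")
assert (ser.dt.floor("h").to_pandas() == ser.to_pandas().dt.floor("h")).all()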
- -import decimal -from decimal import Decimal - -import numpy as np -import pyarrow as pa -import pytest -from packaging import version - -import cudf -from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype -from cudf.testing import assert_eq -from cudf.testing._utils import ( - FLOAT_TYPES, - INTEGER_TYPES, - SIGNED_TYPES, - _decimal_series, - expect_warning_if, -) - -data_ = [ - [Decimal("1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], - [Decimal("-1.1"), Decimal("2.2"), Decimal("3.3"), Decimal("4.4")], - [1], - [-1], - [1, 2, 3, 4], - [42, 17, 41], - [1, 2, None, 4], - [None, None, None], - [], -] -typ_ = [ - pa.decimal128(precision=4, scale=2), - pa.decimal128(precision=5, scale=3), - pa.decimal128(precision=6, scale=4), -] - - -@pytest.mark.parametrize("data_", data_) -@pytest.mark.parametrize("typ_", typ_) -def test_round_trip_decimal64_column(data_, typ_): - pa_arr = pa.array(data_, type=typ_) - col_64 = Decimal64Column.from_arrow(pa_arr) - assert pa_arr.equals(col_64.to_arrow()) - - -@pytest.mark.parametrize("data_", data_) -@pytest.mark.parametrize("typ_", typ_) -def test_round_trip_decimal32_column(data_, typ_): - pa_arr = pa.array(data_, type=typ_) - col_32 = Decimal32Column.from_arrow(pa_arr) - assert pa_arr.equals(col_32.to_arrow()) - - -def test_from_arrow_max_precision_decimal64(): - with pytest.raises(ValueError): - Decimal64Column.from_arrow( - pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=19)) - ) - - -def test_from_arrow_max_precision_decimal32(): - with pytest.raises(ValueError): - Decimal32Column.from_arrow( - pa.array([1, 2, 3], type=pa.decimal128(scale=0, precision=10)) - ) - - -@pytest.mark.parametrize( - "data", - [ - cudf.Series( - [ - 14.12302, - 97938.2, - np.nan, - 0.0, - -8.302014, - np.nan, - 94.31304, - -112.2314, - 0.3333333, - np.nan, - ] - ), - ], -) -@pytest.mark.parametrize("from_dtype", FLOAT_TYPES) -@pytest.mark.parametrize( - "to_dtype", - [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)], -) -def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype): - request.applymarker( - pytest.mark.xfail( - condition=version.parse(pa.__version__) >= version.parse("13.0.0") - and from_dtype == np.dtype("float32") - and to_dtype.precision > 12, - reason="https://github.com/rapidsai/cudf/issues/14169", - ) - ) - got = data.astype(from_dtype) - - pa_arr = got.to_arrow().cast( - pa.decimal128(to_dtype.precision, to_dtype.scale) - ) - expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) - - got = got.astype(to_dtype) - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - cudf.Series( - [ - 14.12302, - 38.2, - np.nan, - 0.0, - -8.302014, - np.nan, - 94.31304, - np.nan, - -112.2314, - 0.3333333, - np.nan, - ] - ), - ], -) -@pytest.mark.parametrize("from_dtype", INTEGER_TYPES) -@pytest.mark.parametrize( - "to_dtype", - [Decimal64Dtype(9, 3), Decimal64Dtype(11, 4), Decimal64Dtype(18, 9)], -) -def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype): - got = data.astype(from_dtype) - - pa_arr = ( - got.to_arrow() - .cast("float64") - .cast(pa.decimal128(to_dtype.precision, to_dtype.scale)) - ) - expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) - - got = got.astype(to_dtype) - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - cudf.Series( - [ - 14.12309, - 2.343942, - np.nan, - 0.0, - -8.302082, - np.nan, - 94.31308, - -112.2364, - 
-8.029972, - np.nan, - ] - ), - ], -) -@pytest.mark.parametrize( - "from_dtype", - [ - Decimal64Dtype(7, 2), - Decimal64Dtype(11, 4), - Decimal64Dtype(18, 10), - Decimal32Dtype(7, 2), - Decimal32Dtype(5, 3), - Decimal32Dtype(9, 5), - ], -) -@pytest.mark.parametrize( - "to_dtype", - [ - Decimal64Dtype(7, 2), - Decimal64Dtype(18, 10), - Decimal64Dtype(11, 4), - Decimal32Dtype(7, 2), - Decimal32Dtype(9, 5), - Decimal32Dtype(5, 3), - ], -) -def test_typecast_to_from_decimal(data, from_dtype, to_dtype): - if from_dtype.scale > to_dtype.MAX_PRECISION: - pytest.skip( - "This is supposed to overflow because the representation value in " - "the source exceeds the max representable in destination dtype." - ) - s = data.astype(from_dtype) - - pa_arr = s.to_arrow().cast( - pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False - ) - if isinstance(to_dtype, Decimal32Dtype): - expected = cudf.Series._from_column(Decimal32Column.from_arrow(pa_arr)) - elif isinstance(to_dtype, Decimal64Dtype): - expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr)) - - with expect_warning_if(to_dtype.scale < s.dtype.scale, UserWarning): - got = s.astype(to_dtype) - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - cudf.Series( - [ - 14.12309, - 2.343942, - np.nan, - 0.0, - -8.302082, - np.nan, - 94.31308, - -112.2364, - -8.029972, - np.nan, - ] - ), - ], -) -@pytest.mark.parametrize( - "from_dtype", - [Decimal64Dtype(7, 2), Decimal64Dtype(11, 4), Decimal64Dtype(17, 10)], -) -@pytest.mark.parametrize("to_dtype", SIGNED_TYPES) -def test_typecast_from_decimal(data, from_dtype, to_dtype): - got = data.astype(from_dtype) - pa_arr = got.to_arrow().cast(to_dtype, safe=False) - - got = got.astype(to_dtype) - expected = cudf.Series._from_column(NumericalColumn.from_arrow(pa_arr)) - - assert_eq(got, expected) - assert_eq(got.dtype, expected.dtype) - - -@pytest.mark.parametrize( - "args", - [ - # scatter to a single index - ( - ["1", "2", "3"], - Decimal64Dtype(1, 0), - Decimal(5), - 1, - ["1", "5", "3"], - ), - ( - ["1.5", "2.5", "3.5"], - Decimal64Dtype(2, 1), - Decimal("5.5"), - 1, - ["1.5", "5.5", "3.5"], - ), - ( - ["1.0042", "2.0042", "3.0042"], - Decimal64Dtype(5, 4), - Decimal("5.0042"), - 1, - ["1.0042", "5.0042", "3.0042"], - ), - # scatter via boolmask - ( - ["1", "2", "3"], - Decimal64Dtype(1, 0), - Decimal(5), - cudf.Series([True, False, True]), - ["5", "2", "5"], - ), - ( - ["1.5", "2.5", "3.5"], - Decimal64Dtype(2, 1), - Decimal("5.5"), - cudf.Series([True, True, True]), - ["5.5", "5.5", "5.5"], - ), - ( - ["1.0042", "2.0042", "3.0042"], - Decimal64Dtype(5, 4), - Decimal("5.0042"), - cudf.Series([False, False, True]), - ["1.0042", "2.0042", "5.0042"], - ), - # We will allow assigning a decimal with less precision - ( - ["1.00", "2.00", "3.00"], - Decimal64Dtype(3, 2), - Decimal(5), - 1, - ["1.00", "5.00", "3.00"], - ), - # But not truncation - ( - ["1", "2", "3"], - Decimal64Dtype(1, 0), - Decimal("5.5"), - 1, - pa.lib.ArrowInvalid, - ), - # We will allow for setting scalars into decimal columns - (["1", "2", "3"], Decimal64Dtype(1, 0), 5, 1, ["1", "5", "3"]), - # But not if it has too many digits to fit the precision - (["1", "2", "3"], Decimal64Dtype(1, 0), 50, 1, pa.lib.ArrowInvalid), - ], -) -def test_series_setitem_decimal(args): - data, dtype, item, to, expect = args - data = _decimal_series(data, dtype) - - if expect is pa.lib.ArrowInvalid: - with pytest.raises(expect): - data[to] = item - return - else: - expect = _decimal_series(expect, dtype) - data[to] = item 
- assert_eq(data, expect) - - -@pytest.mark.parametrize( - "input_obj", [[decimal.Decimal(1), cudf.NA, decimal.Decimal(3)]] -) -def test_series_construction_with_nulls(input_obj): - expect = pa.array(input_obj, from_pandas=True) - got = cudf.Series(input_obj).to_arrow() - - assert expect == got - - -@pytest.mark.parametrize( - "data", - [ - { - "a": _decimal_series( - ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0) - ) - }, - { - "a": _decimal_series( - ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0) - ), - "b": _decimal_series( - ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) - ), - "c": _decimal_series( - ["10.1", "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1) - ), - }, - { - "a": _decimal_series( - ["1", None, "3"], dtype=cudf.Decimal64Dtype(1, 0) - ), - "b": _decimal_series( - ["1.0", "2.0", None], dtype=cudf.Decimal64Dtype(2, 1) - ), - "c": _decimal_series( - [None, "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1) - ), - }, - ], -) -def test_serialize_decimal_columns(data): - df = cudf.DataFrame(data) - recreated = df.__class__.deserialize(*df.serialize()) - assert_eq(recreated, df) - - -def test_decimal_invalid_precision(): - with pytest.raises(pa.ArrowInvalid): - _ = cudf.Series([10, 20, 30], dtype=cudf.Decimal64Dtype(2, 2)) - - with pytest.raises(pa.ArrowInvalid): - _ = cudf.Series([Decimal("300")], dtype=cudf.Decimal64Dtype(2, 1)) - - -def test_decimal_overflow(): - s = cudf.Series([Decimal("0.0009384233522166997927180531650178250")]) - result = s * s - assert_eq(cudf.Decimal128Dtype(precision=38, scale=37), result.dtype) - - s = cudf.Series([1, 2], dtype=cudf.Decimal128Dtype(precision=38, scale=0)) - result = s * Decimal("1.0") - assert_eq(cudf.Decimal128Dtype(precision=38, scale=1), result.dtype) - - -def test_decimal_binop_upcast_operands(): - ser1 = cudf.Series([0.51, 1.51, 2.51]).astype(cudf.Decimal64Dtype(18, 2)) - ser2 = cudf.Series([0.90, 0.96, 0.99]).astype(cudf.Decimal128Dtype(19, 2)) - result = ser1 + ser2 - expected = cudf.Series([1.41, 2.47, 3.50]).astype( - cudf.Decimal128Dtype(20, 2) - ) - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py deleted file mode 100644 index 44270d20d59..00000000000 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
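The removed decimal tests above center on cudf's fixed-point dtypes (Decimal32Dtype, Decimal64Dtype, Decimal128Dtype) and on the precision/scale handling of casts and arithmetic. A rough, illustrative sketch of the API they exercise; the values and the (precision=2, scale=1) choice are arbitrary, not taken from the deleted parametrizations:

from decimal import Decimal

import cudf

# A fixed-point series carries an explicit precision and scale.
ser = cudf.Series(
    [Decimal("1.1"), Decimal("2.2"), None],
    dtype=cudf.Decimal64Dtype(precision=2, scale=1),
)

# Arithmetic widens the result type: multiplying two (2, 1) columns needs
# more digits, so the result dtype gains precision and scale.
result = ser * ser
print(result.dtype)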
-from __future__ import annotations - -from typing import Any - -import cupy as cp -import pandas as pd -import pytest - -import cudf -from cudf.core.buffer import as_buffer -from cudf.core.column import as_column, build_column -from cudf.core.df_protocol import ( - DataFrameObject, - _CuDFBuffer, - _CuDFColumn, - _DtypeKind, - _MaskKind, - _protocol_buffer_to_cudf_buffer, - from_dataframe, - protocol_dtype_to_cupy_dtype, -) -from cudf.testing import assert_eq - - -@pytest.fixture( - params=[ - {"a": [1, 2, 3], "b": ["x", "y", "z"]}, - {"a": [1, 2, None], "b": ["x", "y", "z"]}, - {"a": [1, 2, 3], "b": pd.Categorical(["x", "y", None])}, - ] -) -def pandas_df(request): - data = request.param - return pd.DataFrame(data) - - -def assert_validity_equal(protocol_buffer, cudf_buffer, size, null, valid): - if null == _MaskKind.BYTEMASK: - protocol_mask = _protocol_buffer_to_cudf_buffer(protocol_buffer) - assert_eq( - as_column(protocol_mask, dtype="bool"), - as_column(cudf_buffer, dtype="bool"), - ) - elif null == _MaskKind.BITMASK: - protocol_mask = _protocol_buffer_to_cudf_buffer(protocol_buffer) - cudf_mask = cudf_buffer - assert_eq( - build_column( - as_buffer(cp.zeros(10, dtype="int8")), - "int8", - size=size, - mask=protocol_mask, - children=(), - ), - build_column( - as_buffer(cp.zeros(10, dtype="int8")), - "int8", - size=size, - mask=cudf_mask, - children=(), - ), - ) - else: - raise NotImplementedError() - - -def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol): - buf, dtype = buffer_and_dtype - device_id = cp.asarray(cudfcol.data).device.id - assert buf.__dlpack_device__() == (2, device_id) - col_from_buf = build_column( - _protocol_buffer_to_cudf_buffer(buf), - protocol_dtype_to_cupy_dtype(dtype), - ) - # check that non null values are the equals as nulls are represented - # by sentinel values in the buffer. - # FIXME: In gh-10202 some minimal fixes were added to unblock CI. But - # currently only non-null values are compared, null positions are - # unchecked. 
- non_null_idxs = cudfcol.notnull() - assert_eq( - col_from_buf.apply_boolean_mask(non_null_idxs), - cudfcol.apply_boolean_mask(non_null_idxs), - ) - array_from_dlpack = cp.from_dlpack(buf.__dlpack__()).get() - col_array = cp.asarray(cudfcol.data_array_view(mode="read")).get() - assert_eq( - array_from_dlpack[non_null_idxs.values_host].flatten(), - col_array[non_null_idxs.values_host].flatten(), - ) - - -def assert_column_equal(col: _CuDFColumn, cudfcol): - assert col.size() == cudfcol.size - assert col.offset == 0 - assert col.null_count == cudfcol.null_count - assert col.num_chunks() == 1 - if col.null_count == 0: - pytest.raises(RuntimeError, col._get_validity_buffer) - assert col.get_buffers()["validity"] is None - else: - assert_validity_equal( - col.get_buffers()["validity"][0], - cudfcol.mask, - cudfcol.size, - *col.describe_null, - ) - - if col.dtype[0] == _DtypeKind.CATEGORICAL: - assert_buffer_equal(col.get_buffers()["data"], cudfcol.codes) - assert col.get_buffers()["offsets"] is None - - elif col.dtype[0] == _DtypeKind.STRING: - chars_col = build_column(data=cudfcol.data, dtype="int8") - assert_buffer_equal(col.get_buffers()["data"], chars_col) - assert_buffer_equal(col.get_buffers()["offsets"], cudfcol.children[0]) - - else: - assert_buffer_equal(col.get_buffers()["data"], cudfcol) - assert col.get_buffers()["offsets"] is None - - if col.null_count == 0: - assert col.describe_null == (0, None) - else: - assert col.describe_null == (3, 0) - - -def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame): - assert dfo.num_columns() == len(df.columns) - assert dfo.num_rows() == len(df) - assert dfo.num_chunks() == 1 - assert dfo.column_names() == tuple(df.columns) - for col in df.columns: - assert_column_equal(dfo.get_column_by_name(col), df[col]._column) - - -def assert_from_dataframe_equals(dfobj, allow_copy): - df2 = from_dataframe(dfobj, allow_copy=allow_copy) - - assert_dataframe_equal(dfobj.__dataframe__(allow_copy), df2) - if isinstance(dfobj, cudf.DataFrame): - assert_eq(dfobj, df2) - - elif isinstance(dfobj, pd.DataFrame): - assert_eq(cudf.DataFrame(dfobj), df2) - - else: - raise TypeError(f"{type(dfobj)} not supported yet.") - - -def test_from_dataframe_exception(pandas_df): - exception_msg = "This operation must copy data from CPU to GPU." - " Set `allow_copy=True` to allow it." 
- with pytest.raises(TypeError, match=exception_msg): - from_dataframe(pandas_df) - - -def assert_df_unique_dtype_cols(data): - cdf = cudf.DataFrame(data=data) - assert_from_dataframe_equals(cdf, allow_copy=False) - assert_from_dataframe_equals(cdf, allow_copy=True) - - -def test_from_dataframe(): - data = dict(a=[1, 2, 3], b=[9, 10, 11]) - df1 = cudf.DataFrame(data=data) - df2 = cudf.from_dataframe(df1) - assert_eq(df1, df2) - - df3 = cudf.from_dataframe(df2) - assert_eq(df1, df3) - - -def test_int_dtype(): - data_int = dict(a=[1, 2, 3], b=[9, 10, 11]) - assert_df_unique_dtype_cols(data_int) - - -def test_float_dtype(): - data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8]) - assert_df_unique_dtype_cols(data_float) - - -def test_categorical_dtype(): - cdf = cudf.DataFrame({"A": [1, 2, 5, 1]}) - cdf["A"] = cdf["A"].astype("category") - col = cdf.__dataframe__().get_column_by_name("A") - assert col.dtype[0] == _DtypeKind.CATEGORICAL - assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) - assert_from_dataframe_equals(cdf, allow_copy=False) - assert_from_dataframe_equals(cdf, allow_copy=True) - - -def test_bool_dtype(): - data_bool = dict(a=[True, True, False], b=[False, True, False]) - assert_df_unique_dtype_cols(data_bool) - - -def test_string_dtype(): - data_string = dict(a=["a", "b", "cdef", "", "g"]) - assert_df_unique_dtype_cols(data_string) - - -def test_mixed_dtype(): - data_mixed = dict( - int=[1, 2, 3], - float=[1.5, 2.5, 3.5], - bool=[True, False, True], - categorical=[5, 1, 5], - string=["rapidsai-cudf ", "", "df protocol"], - ) - assert_df_unique_dtype_cols(data_mixed) - - -def test_NA_int_dtype(): - data_int = dict( - a=[1, None, 3, None, 5], - b=[9, 10, None, 7, 8], - c=[6, 19, 20, 100, 1000], - ) - assert_df_unique_dtype_cols(data_int) - - -def test_NA_float_dtype(): - data_float = dict( - a=[1.4, None, 3.6, None, 5.2], - b=[9.7, 10.9, None, 7.8, 8.2], - c=[6.1, 19.2, 20.3, 100.4, 1000.5], - ) - assert_df_unique_dtype_cols(data_float) - - -def test_NA_categorical_dtype(): - df = cudf.DataFrame({"A": [1, 2, 5, 1]}) - df["B"] = df["A"].astype("category") - df.at[[1, 3], "B"] = None # Set two items to null - - # Some detailed testing for correctness of dtype and null handling: - col = df.__dataframe__().get_column_by_name("B") - assert col.dtype[0] == _DtypeKind.CATEGORICAL - assert col.null_count == 2 - assert col.describe_null == (3, 0) - assert col.num_chunks() == 1 - assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) - assert_from_dataframe_equals(df, allow_copy=False) - assert_from_dataframe_equals(df, allow_copy=True) - - -def test_NA_bool_dtype(): - data_bool = dict(a=[None, True, False], b=[False, None, None]) - assert_df_unique_dtype_cols(data_bool) - - -def test_NA_string_dtype(): - df = cudf.DataFrame({"A": ["a", "b", "cdef", "", "g"]}) - df["B"] = df["A"].astype("object") - df.at[1, "B"] = cudf.NA # Set one item to null - - # Test for correctness and null handling: - col = df.__dataframe__().get_column_by_name("B") - assert col.dtype[0] == _DtypeKind.STRING - assert col.null_count == 1 - assert col.describe_null == (3, 0) - assert col.num_chunks() == 1 - assert_from_dataframe_equals(df, allow_copy=False) - assert_from_dataframe_equals(df, allow_copy=True) - - -def test_NA_mixed_dtype(): - data_mixed = dict( - int=[1, None, 2, 3, 1000], - float=[None, 1.5, 2.5, 3.5, None], - bool=[True, None, False, None, None], - categorical=[5, 1, 5, 3, None], - string=[None, None, None, "df protocol", None], - ) - 
assert_df_unique_dtype_cols(data_mixed) - - -def test_from_cpu_df(pandas_df): - cudf.from_dataframe(pandas_df, allow_copy=True) diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py deleted file mode 100644 index ebcc35784ee..00000000000 --- a/python/cudf/cudf/tests/test_dlpack.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import itertools -from contextlib import ExitStack as does_not_raise - -import cupy -import numpy as np -import pytest -from packaging import version - -import cudf -from cudf.testing import assert_eq - -nelems = [0, 3, 10] -dtype = [np.uint16, np.int32, np.float64] -nulls = ["some", "none"] -params_1d = itertools.product(nelems, dtype, nulls) - -ncols = [0, 1, 2] -params_2d = itertools.product(ncols, nelems, dtype, nulls) - - -if version.parse(cupy.__version__) < version.parse("10"): - # fromDlpack deprecated in cupy version 10, replaced by from_dlpack - cupy_from_dlpack = cupy.fromDlpack -else: - cupy_from_dlpack = cupy.from_dlpack - - -def data_size_expectation_builder(data, nan_null_param=False): - if nan_null_param and np.isnan(data).any(): - return pytest.raises((ValueError,)) - - if len(data.shape) == 2 and data.size == 0: - return pytest.raises((ValueError, IndexError)) - else: - return does_not_raise() - - -@pytest.fixture(params=params_1d) -def data_1d(request): - nelems = request.param[0] - dtype = request.param[1] - nulls = request.param[2] - a = np.random.randint(10, size=nelems).astype(dtype) - if nulls == "some" and a.size != 0 and np.issubdtype(dtype, np.floating): - idx = np.random.choice(a.size, size=int(a.size * 0.2), replace=False) - a[idx] = np.nan - return a - - -@pytest.fixture(params=params_2d) -def data_2d(request): - ncols = request.param[0] - nrows = request.param[1] - dtype = request.param[2] - nulls = request.param[3] - a = np.random.randint(10, size=(nrows, ncols)).astype(dtype) - if nulls == "some" and a.size != 0 and np.issubdtype(dtype, np.floating): - idx = np.random.choice(a.size, size=int(a.size * 0.2), replace=False) - a.ravel()[idx] = np.nan - return np.ascontiguousarray(a) - - -def test_to_dlpack_dataframe(data_2d): - expectation = data_size_expectation_builder(data_2d) - - with expectation: - gdf = cudf.DataFrame.from_records(data_2d) - dlt = gdf.to_dlpack() - - # PyCapsules are a C-API thing so couldn't come up with a better way - assert str(type(dlt)) == "" - - -def test_to_dlpack_series(data_1d): - expectation = data_size_expectation_builder(data_1d, nan_null_param=False) - - with expectation: - gs = cudf.Series(data_1d, nan_as_null=False) - dlt = gs.to_dlpack() - - # PyCapsules are a C-API thing so couldn't come up with a better way - assert str(type(dlt)) == "" - - -def test_to_dlpack_series_null(data_1d): - expectation = data_size_expectation_builder(data_1d, nan_null_param=True) - - with expectation: - gs = cudf.Series(data_1d, nan_as_null=True) - dlt = gs.to_dlpack() - - # PyCapsules are a C-API thing so couldn't come up with a better way - assert str(type(dlt)) == "" - - -def test_to_dlpack_index(data_1d): - expectation = data_size_expectation_builder(data_1d) - - with expectation: - if np.isnan(data_1d).any(): - pytest.skip("Nulls not allowed in Index") - gi = cudf.Index(data_1d) - dlt = gi.to_dlpack() - - # PyCapsules are a C-API thing so couldn't come up with a better way - assert str(type(dlt)) == "" - - -def test_to_dlpack_cupy_1d(data_1d): - expectation = data_size_expectation_builder(data_1d, False) - with expectation: - gs = 
cudf.Series(data_1d, nan_as_null=False) - cudf_host_array = gs.to_numpy(na_value=np.nan) - dlt = gs.to_dlpack() - - cupy_array = cupy_from_dlpack(dlt) - cupy_host_array = cupy_array.get() - - assert_eq(cudf_host_array, cupy_host_array) - - -def test_to_dlpack_cupy_2d(data_2d): - expectation = data_size_expectation_builder(data_2d) - - with expectation: - gdf = cudf.DataFrame.from_records(data_2d) - cudf_host_array = np.array(gdf.to_pandas()).flatten() - dlt = gdf.to_dlpack() - - cupy_array = cupy_from_dlpack(dlt) - cupy_host_array = cupy_array.get().flatten() - - assert_eq(cudf_host_array, cupy_host_array) - - -def test_from_dlpack_cupy_1d(data_1d): - cupy_array = cupy.array(data_1d) - cupy_host_array = cupy_array.get() - dlt = cupy_array.toDlpack() - - gs = cudf.from_dlpack(dlt) - cudf_host_array = gs.to_numpy(na_value=np.nan) - - assert_eq(cudf_host_array, cupy_host_array) - - -def test_from_dlpack_cupy_2d(data_2d): - cupy_array = cupy.array(data_2d, order="F") - cupy_host_array = cupy_array.get().flatten() - dlt = cupy_array.toDlpack() - - gdf = cudf.from_dlpack(dlt) - cudf_host_array = np.array(gdf.to_pandas()).flatten() - - assert_eq(cudf_host_array, cupy_host_array) - - -def test_to_dlpack_cupy_2d_null(data_2d): - expectation = data_size_expectation_builder(data_2d, nan_null_param=True) - - with expectation: - gdf = cudf.DataFrame.from_records(data_2d, nan_as_null=True) - cudf_host_array = np.array(gdf.to_pandas()).flatten() - dlt = gdf.to_dlpack() - - cupy_array = cupy_from_dlpack(dlt) - cupy_host_array = cupy_array.get().flatten() - - assert_eq(cudf_host_array, cupy_host_array) - - -def test_to_dlpack_cupy_1d_null(data_1d): - expectation = data_size_expectation_builder(data_1d, nan_null_param=True) - - with expectation: - gs = cudf.Series(data_1d) - cudf_host_array = gs.to_numpy(na_value=np.nan) - dlt = gs.to_dlpack() - - cupy_array = cupy_from_dlpack(dlt) - cupy_host_array = cupy_array.get() - - assert_eq(cudf_host_array, cupy_host_array) - - -def test_to_dlpack_mixed_dtypes(): - df = cudf.DataFrame({"a": [1, 2, 3, 4], "b": [10.32, 0.4, -0.2, -1000.32]}) - - cudf_host_array = df.to_numpy() - dlt = df.to_dlpack() - - cupy_array = cupy_from_dlpack(dlt) - cupy_host_array = cupy_array.get() - - assert_eq(cudf_host_array, cupy_host_array) - - -@pytest.mark.parametrize( - "shape", - [ - (0, 3), - (3, 0), - (0, 0), - ], -) -def test_from_dlpack_zero_sizes(shape): - arr = cupy.empty(shape, dtype=float) - df = cudf.io.dlpack.from_dlpack(arr.__dlpack__()) - assert_eq(df, cudf.DataFrame(arr)) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py deleted file mode 100644 index 5d3d18cbe95..00000000000 --- a/python/cudf/cudf/tests/test_doctests.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -import contextlib -import doctest -import inspect -import io -import itertools -import os - -import numpy as np -import pytest -from packaging import version - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION - -pytestmark = pytest.mark.filterwarnings("ignore::FutureWarning") - -# modules that will be searched for doctests -tests = [cudf, cudf.core.groupby] - - -def _name_in_all(parent, name): - return name in getattr(parent, "__all__", []) - - -def _is_public_name(parent, name): - return not name.startswith("_") - - -def _find_doctests_in_obj(obj, finder=None, criteria=None): - """Find all doctests in an object. 
- - Parameters - ---------- - obj : module or class - The object to search for docstring examples. - finder : doctest.DocTestFinder, optional - The DocTestFinder object to use. If not provided, a DocTestFinder is - constructed. - criteria : callable, optional - Callable indicating whether to recurse over members of the provided - object. If not provided, names not defined in the object's ``__all__`` - property are ignored. - - Yields - ------ - doctest.DocTest - The next doctest found in the object. - """ - if finder is None: - finder = doctest.DocTestFinder() - if criteria is None: - criteria = _name_in_all - for docstring in finder.find(obj): - if docstring.examples: - yield docstring - for name, member in inspect.getmembers(obj): - # Only recurse over members matching the criteria - if not criteria(obj, name): - continue - # Recurse over the public API of modules (objects defined in the - # module's __all__) - if inspect.ismodule(member): - yield from _find_doctests_in_obj( - member, finder, criteria=_name_in_all - ) - # Recurse over the public API of classes (attributes not prefixed with - # an underscore) - if inspect.isclass(member): - yield from _find_doctests_in_obj( - member, finder, criteria=_is_public_name - ) - - -class TestDoctests: - @pytest.fixture(autouse=True) - def chdir_to_tmp_path(cls, tmp_path): - # Some doctests generate files, so this fixture runs the tests in a - # temporary directory. - original_directory = os.getcwd() - os.chdir(tmp_path) - yield - os.chdir(original_directory) - - @pytest.fixture(autouse=True) - def prinoptions(cls): - # TODO: NumPy now prints scalars as `np.int8(1)`, etc. this should - # be adapted evantually. - if version.parse(np.__version__) >= version.parse("2.0"): - with np.printoptions(legacy="1.25"): - yield - else: - yield - - @pytest.mark.parametrize( - "docstring", - itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]), - ids=lambda docstring: docstring.name, - ) - @pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Doctests not expected to pass on older versions of pandas", - ) - def test_docstring(self, docstring): - # We ignore differences in whitespace in the doctest output, and enable - # the use of an ellipsis "..." to match any string in the doctest - # output. An ellipsis is useful for, e.g., memory addresses or - # imprecise floating point values. - optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE - runner = doctest.DocTestRunner(optionflags=optionflags) - - # These global names are pre-defined and can be used in doctests - # without first importing them. - globals = dict( - cudf=cudf, - np=np, - ) - docstring.globs = globals - - # Capture stdout and include failing outputs in the traceback. - doctest_stdout = io.StringIO() - with contextlib.redirect_stdout(doctest_stdout): - runner.run(docstring) - results = runner.summarize() - assert not results.failed, ( - f"{results.failed} of {results.attempted} doctests failed for " - f"{docstring.name}:\n{doctest_stdout.getvalue()}" - ) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py deleted file mode 100644 index 5b1ee0ffac6..00000000000 --- a/python/cudf/cudf/tests/test_dropna.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.mark.parametrize( - "data", - [ - [], - [1.0, 2, None, 4], - ["one", "two", "three", "four"], - pd.Series(["a", "b", "c", "d"], dtype="category"), - pd.Series(pd.date_range("2010-01-01", "2010-01-04")), - ], -) -@pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dropna_series(data, nulls, inplace): - psr = pd.Series(data) - - if len(data) > 0: - if nulls == "one": - p = np.random.randint(0, 4) - psr[p] = None - elif nulls == "some": - p1, p2 = np.random.randint(0, 4, (2,)) - psr[p1] = None - psr[p2] = None - elif nulls == "all": - psr[:] = None - - gsr = cudf.from_pandas(psr) - - check_dtype = True - if gsr.null_count == len(gsr): - check_dtype = False - - expected = psr.dropna() - actual = gsr.dropna() - - if inplace: - expected = psr - actual = gsr - - assert_eq(expected, actual, check_dtype=check_dtype) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, None]}, - {"a": [1, 2, None], "b": [3, 4, 5]}, - {"a": [1, 2, None], "b": [3, 4, None]}, - {"a": [None, 1, 2], "b": [1, 2, None]}, - {"a": [None, 1, None], "b": [None, 2, None]}, - {"a": [None, None, 1], "b": [1, 2, None]}, - {"a": ["d", "e", "f"], "b": ["a", None, "c"]}, - ], -) -@pytest.mark.parametrize("how", ["all", "any"]) -@pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dropna_dataframe(data, how, axis, inplace): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - expected = pdf.dropna(axis=axis, how=how, inplace=inplace) - actual = gdf.dropna(axis=axis, how=how, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("how", ["all", "any"]) -@pytest.mark.parametrize( - "data", - [ - { - "a": cudf.Series([None, None, None], dtype="float64"), - "b": cudf.Series([1, 2, None]), - }, - { - "a": cudf.Series([np.nan, np.nan, np.nan], dtype="float64"), - "b": cudf.Series([1, 2, None]), - }, - cudf.Series([None, None, None], dtype="object"), - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -def test_dropna_with_all_nulls(how, data, axis): - gdf = cudf.DataFrame({"a": data}) - pdf = gdf.to_pandas() - - assert_eq(pdf.dropna(axis=axis, how=how), gdf.dropna(axis=axis, how=how)) - - -def test_dropna_nan_as_null(): - sr = cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False) - assert_eq(sr.dropna(), sr[:2]) - sr = sr.nans_to_nulls() - assert_eq(sr.dropna(), sr[:2]) - - df = cudf.DataFrame( - { - "a": cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False), - "b": cudf.Series([1, 2, 3, 4]), - } - ) - - got = df.dropna() - expected = df[:2] - assert_eq(expected, got) - - df = df.nans_to_nulls() - got = df.dropna() - expected = df[:2] - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data,subset", - [ - ({"a": [1, None], "b": [1, 2]}, ["a"]), - ({"a": [1, None], "b": [1, 2]}, ["b"]), - ({"a": [1, None], "b": [1, 2]}, []), - ({"a": [1, 2], "b": [1, 2]}, ["b"]), - ({"a": [1, 2, None], "b": [1, None, 2]}, ["a"]), - ({"a": [1, 2, None], "b": [1, None, 2]}, ["b"]), - ({"a": [1, 2, None], "b": [1, None, 2]}, ["a", "b"]), - ], -) -def test_dropna_subset_rows(data, subset): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf.dropna(subset=subset), gdf.dropna(subset=subset)) - - -@pytest.mark.parametrize( - "data, subset", - [ - ({"a": [1, None], "b": [1, 2]}, [0]), - ({"a": [1, 
None], "b": [1, 2]}, [1]), - ({"a": [1, None], "b": [1, 2]}, []), - ({"a": [1, 2], "b": [1, 2]}, [0]), - ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [0]), - ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [1]), - ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [0, 1]), - ], -) -def test_dropna_subset_cols(data, subset): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.dropna(axis=1, subset=subset), gdf.dropna(axis=1, subset=subset) - ) - - -# TODO: can't test with subset=[] below since Pandas -# returns empty DF when both subset=[] and thresh are specified. -@pytest.mark.parametrize("thresh", [0, 1, 2]) -@pytest.mark.parametrize("subset", [None, ["a"], ["b"], ["a", "b"]]) -def test_dropna_thresh(thresh, subset): - pdf = pd.DataFrame({"a": [1, 2, None, None], "b": [1, 2, 3, None]}) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.dropna(axis=0, thresh=thresh, subset=subset), - gdf.dropna(axis=0, thresh=thresh, subset=subset), - ) - - -@pytest.mark.parametrize("thresh", [0, 1, 2]) -@pytest.mark.parametrize("subset", [None, [0], [1], [0, 1]]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dropna_thresh_cols(thresh, subset, inplace): - pdf = pd.DataFrame( - {"a": [1, 2], "b": [3, 4], "c": [5, None], "d": [np.nan, np.nan]} - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.dropna( - axis=1, thresh=thresh, subset=subset, inplace=inplace - ) - actual = gdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq( - expected, - actual, - ) - - -@pytest.mark.parametrize( - "data", - [ - { - "key": [1, 2, 10], - "val": cudf.Series([np.nan, 3, 1], nan_as_null=False), - "abc": [np.nan, None, 1], - }, - { - "key": [None, 2, 1], - "val": cudf.Series([3, np.nan, 0.1], nan_as_null=True), - "abc": [None, 1, None], - }, - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -def test_dropna_dataframe_np_nan(data, axis): - gdf = cudf.DataFrame(data) - pd_data = { - key: value.to_pandas() if isinstance(value, cudf.Series) else value - for key, value in data.items() - } - pdf = pd.DataFrame(pd_data) - - assert_eq(pdf.dropna(axis=axis), gdf.dropna(axis=axis), check_dtype=False) - - -@pytest.mark.parametrize( - "data, dtype", - [ - ([1, float("nan"), 2], "float64"), - (["x", None, "y"], "str"), - (["x", None, "y"], "category"), - (["2020-01-20", pd.NaT, "2020-03-15"], "datetime64[ns]"), - (["1s", pd.NaT, "3d"], "timedelta64[ns]"), - ], -) -def test_dropna_index(data, dtype): - pi = pd.Index(data, dtype=dtype) - gi = cudf.from_pandas(pi) - - expect = pi.dropna() - got = gi.dropna() - - assert_eq(expect, got) - - -@pytest.mark.parametrize("data", [[[1, None, 2], [None, None, 2]]]) -@pytest.mark.parametrize("how", ["all", "any"]) -def test_dropna_multiindex(data, how): - pi = pd.MultiIndex.from_arrays(data) - gi = cudf.from_pandas(pi) - - expect = pi.dropna(how) - got = gi.dropna(how) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-02-01")], - [pd.NaT, pd.NaT, pd.Timestamp("2020-03-01")], - ], - [ - [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-02-01")], - [np.nan, np.nan, 1.0], - ], - [[1.0, np.nan, 2.0], [np.nan, np.nan, 1.0]], - ], -) -@pytest.mark.parametrize("how", ["all", "any"]) -def test_dropna_multiindex_2(data, how): - pi = pd.MultiIndex.from_arrays(data) - gi = cudf.from_pandas(pi) - - expect = pi.dropna(how) - got = gi.dropna(how) - - assert_eq(expect, got) - - -def test_ignore_index(): - pser = 
pd.Series([1, 2, np.nan], index=[2, 4, 1]) - gser = cudf.from_pandas(pser) - - result = pser.dropna(ignore_index=True) - expected = gser.dropna(ignore_index=True) - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py deleted file mode 100644 index c62b5889fdd..00000000000 --- a/python/cudf/cudf/tests/test_dtypes.py +++ /dev/null @@ -1,364 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest -from pandas.core.arrays.arrow.extension_types import ArrowIntervalType - -import cudf -from cudf.core.column import ColumnBase -from cudf.core.dtypes import ( - CategoricalDtype, - Decimal32Dtype, - Decimal64Dtype, - Decimal128Dtype, - IntervalDtype, - ListDtype, - StructDtype, -) -from cudf.testing import assert_eq -from cudf.utils.dtypes import np_to_pa_dtype - - -def test_cdt_basic(): - psr = pd.Series(["a", "b", "a", "c"], dtype="category") - sr = cudf.Series(["a", "b", "a", "c"], dtype="category") - assert isinstance(sr.dtype, CategoricalDtype) - assert_eq(sr.dtype.categories, psr.dtype.categories) - - -@pytest.mark.parametrize( - "data", [None, [], ["a"], [1], [1.0], ["a", "b", "c"]] -) -@pytest.mark.parametrize("ordered", [None, False, True]) -def test_cdt_eq(data, ordered): - dt = cudf.CategoricalDtype(categories=data, ordered=ordered) - assert dt == "category" - assert dt == dt - assert dt == cudf.CategoricalDtype(categories=None, ordered=ordered) - assert dt == cudf.CategoricalDtype(categories=data, ordered=ordered) - assert not dt == cudf.CategoricalDtype( - categories=data, ordered=not ordered - ) - - -@pytest.mark.parametrize( - "data", [None, [], ["a"], [1], [1.0], ["a", "b", "c"]] -) -@pytest.mark.parametrize("ordered", [None, False, True]) -def test_cdf_to_pandas(data, ordered): - assert ( - pd.CategoricalDtype(data, ordered) - == cudf.CategoricalDtype(categories=data, ordered=ordered).to_pandas() - ) - - -@pytest.mark.parametrize( - "value_type", - [ - int, - "int32", - np.int32, - "datetime64[ms]", - "datetime64[ns]", - "str", - "object", - ], -) -def test_list_dtype_pyarrow_round_trip(value_type): - pa_type = pa.list_(cudf.utils.dtypes.np_to_pa_dtype(np.dtype(value_type))) - expect = pa_type - got = ListDtype.from_arrow(expect).to_arrow() - assert expect.equals(got) - - -def test_list_dtype_eq(): - lhs = ListDtype("int32") - rhs = ListDtype("int32") - assert lhs == rhs - rhs = ListDtype("int64") - assert lhs != rhs - - -def test_list_nested_dtype(): - dt = ListDtype(ListDtype("int32")) - expect = ListDtype("int32") - got = dt.element_type - assert expect == got - - -@pytest.mark.parametrize( - "fields", - [ - {}, - {"a": "int64"}, - {"a": "datetime64[ms]"}, - {"a": "int32", "b": "int64"}, - ], -) -def test_struct_dtype_pyarrow_round_trip(fields): - pa_type = pa.struct( - { - k: cudf.utils.dtypes.np_to_pa_dtype(np.dtype(v)) - for k, v in fields.items() - } - ) - expect = pa_type - got = StructDtype.from_arrow(expect).to_arrow() - assert expect.equals(got) - - -def test_struct_dtype_eq(): - lhs = StructDtype( - {"a": "int32", "b": StructDtype({"c": "int64", "ab": "int32"})} - ) - rhs = StructDtype( - {"a": "int32", "b": StructDtype({"c": "int64", "ab": "int32"})} - ) - assert lhs == rhs - rhs = StructDtype({"a": "int32", "b": "int64"}) - assert lhs != rhs - lhs = StructDtype({"b": "int64", "a": "int32"}) - assert lhs != rhs - - -@pytest.mark.parametrize( - "fields", - [ - {}, - {"a": "int32"}, - {"a": "object"}, - {"a": "str"}, - {"a": 
"datetime64[D]"}, - {"a": "int32", "b": "int64"}, - {"a": "int32", "b": StructDtype({"a": "int32", "b": "int64"})}, - ], -) -def test_struct_dtype_fields(fields): - fields = {"a": "int32", "b": StructDtype({"c": "int64", "d": "int32"})} - dt = StructDtype(fields) - assert_eq(dt.fields, fields) - - -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -def test_decimal_dtype_arrow_roundtrip(decimal_type): - dt = decimal_type(4, 2) - assert dt.to_arrow() == pa.decimal128(4, 2) - assert dt == decimal_type.from_arrow(pa.decimal128(4, 2)) - - -@pytest.mark.parametrize( - "decimal_type,max_precision", - [ - (cudf.Decimal32Dtype, 9), - (cudf.Decimal64Dtype, 18), - (cudf.Decimal128Dtype, 38), - ], -) -def test_max_precision(decimal_type, max_precision): - decimal_type(scale=0, precision=max_precision) - with pytest.raises(ValueError): - decimal_type(scale=0, precision=max_precision + 1) - - -@pytest.fixture(params=["int64", "int32"]) -def subtype(request): - return request.param - - -@pytest.fixture(params=["left", "right", "both", "neither"]) -def closed(request): - return request.param - - -def test_interval_dtype_pyarrow_round_trip(subtype, closed): - pa_array = ArrowIntervalType(subtype, closed) - expect = pa_array - got = IntervalDtype.from_arrow(expect).to_arrow() - assert expect.equals(got) - - -def test_interval_dtype_from_pandas(subtype, closed): - expect = cudf.IntervalDtype(subtype, closed=closed) - pd_type = pd.IntervalDtype(subtype, closed=closed) - got = cudf.IntervalDtype.from_pandas(pd_type) - assert expect == got - - -def assert_column_array_dtype_equal(column: ColumnBase, array: pa.array): - """ - In cudf, each column holds its dtype. And since column may have child - columns, child columns also holds their datatype. This method tests - that every level of `column` matches the type of the given `array` - recursively. 
- """ - - if isinstance(column.dtype, ListDtype): - return array.type.equals( - column.dtype.to_arrow() - ) and assert_column_array_dtype_equal( - column.base_children[1], array.values - ) - elif isinstance(column.dtype, StructDtype): - return array.type.equals(column.dtype.to_arrow()) and all( - assert_column_array_dtype_equal(child, array.field(i)) - for i, child in enumerate(column.base_children) - ) - elif isinstance( - column.dtype, (Decimal128Dtype, Decimal64Dtype, Decimal32Dtype) - ): - return array.type.equals(column.dtype.to_arrow()) - elif isinstance(column.dtype, CategoricalDtype): - raise NotImplementedError() - else: - return array.type.equals(np_to_pa_dtype(column.dtype)) - - -@pytest.mark.parametrize( - "data", - [ - [[{"name": 123}]], - [ - [ - { - "IsLeapYear": False, - "data": {"Year": 1999, "Month": 7}, - "names": ["Mike", None], - }, - { - "IsLeapYear": True, - "data": {"Year": 2004, "Month": 12}, - "names": None, - }, - { - "IsLeapYear": False, - "data": {"Year": 1996, "Month": 2}, - "names": ["Rose", "Richard"], - }, - ] - ], - [ - [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}], - [ - {"human?": None, "deets": {"weight": 5.3, "age": 25}}, - {"human?": False, "deets": {"weight": 8.0, "age": 31}}, - {"human?": False, "deets": None}, - ], - [], - None, - [{"human?": None, "deets": {"weight": 6.9, "age": None}}], - ], - [ - { - "name": "var0", - "val": [ - {"name": "var1", "val": None, "type": "optional"} - ], - "type": "list", - }, - {}, - { - "name": "var2", - "val": [ - { - "name": "var3", - "val": {"field": 42}, - "type": "optional", - }, - { - "name": "var4", - "val": {"field": 3.14}, - "type": "optional", - }, - ], - "type": "list", - }, - None, - ], - ], -) -def test_lists_of_structs_dtype(data): - got = cudf.Series(data) - expected = pa.array(data) - - assert_column_array_dtype_equal(got._column, expected) - assert expected.equals(got._column.to_arrow()) - - -@pytest.mark.parametrize( - "in_dtype,expect", - [ - (np.dtype("int8"), np.dtype("int8")), - (np.int8, np.dtype("int8")), - (pd.Int8Dtype(), np.dtype("int8")), - (pd.StringDtype(), np.dtype("object")), - ("int8", np.dtype("int8")), - ("boolean", np.dtype("bool")), - ("bool_", np.dtype("bool")), - (np.bool_, np.dtype("bool")), - (int, np.dtype("int64")), - (float, np.dtype("float64")), - (cudf.ListDtype("int64"), cudf.ListDtype("int64")), - (np.dtype("U"), np.dtype("object")), - ("timedelta64[ns]", np.dtype(" NA - - -def func_eq_na(x): - return x == NA - - -def func_ne_na(x): - return x != NA - - -def func_ge_na(x): - return x >= NA - - -def func_le_na(x): - return x <= NA - - -def func_na_lt(x): - return x < NA - - -def func_na_gt(x): - return x > NA - - -def func_na_eq(x): - return x == NA - - -def func_na_ne(x): - return x != NA - - -def func_na_ge(x): - return x >= NA - - -def func_na_le(x): - return x <= NA - - -na_comparison_funcs = ( - func_lt_na, - func_gt_na, - func_eq_na, - func_ne_na, - func_ge_na, - func_le_na, - func_na_lt, - func_na_gt, - func_na_eq, - func_na_ne, - func_na_ge, - func_na_le, -) - - -@pytest.mark.parametrize("fn", na_comparison_funcs) -@pytest.mark.parametrize("ty", number_types, ids=number_ids) -def test_na_masked_comparisons(fn, ty): - device_fn = cuda.jit(device=True)(fn) - - @cuda.jit - def test_kernel(err): - unmasked = ty(1) - valid_masked = Masked(unmasked, True) - invalid_masked = Masked(unmasked, False) - - valid_cmp_na = device_fn(valid_masked) - invalid_cmp_na = device_fn(invalid_masked) - - if valid_cmp_na: - err[0] = 1 - - if invalid_cmp_na: - err[0] = 
2 - - err = cp.asarray([0], dtype="int8") - with _CUDFNumbaConfig(): - test_kernel[1, 1](err) - assert err[0] == 0 - - -# xfail because scalars do not yet cast for a comparison to NA -@pytest.mark.xfail -@pytest.mark.parametrize("fn", na_comparison_funcs) -@pytest.mark.parametrize("ty", number_types, ids=number_ids) -def test_na_scalar_comparisons(fn, ty): - device_fn = cuda.jit(device=True)(fn) - - @cuda.jit - def test_kernel(err): - unmasked = ty(1) - - unmasked_cmp_na = device_fn(unmasked) - - if unmasked_cmp_na: - err[0] = 1 - - err = cp.asarray([0], dtype="int8") - with _CUDFNumbaConfig(): - test_kernel[1, 1](err) - assert err[0] == 0 diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py deleted file mode 100644 index 47f9180dcb1..00000000000 --- a/python/cudf/cudf/tests/test_factorize.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import DataFrame, Index -from cudf.testing import assert_eq - - -@pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) -def test_factorize_series_obj(ncats, nelem): - df = DataFrame() - np.random.seed(0) - - # initialize data frame - df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) - - uvals, labels = df["cats"].factorize() - np.testing.assert_array_equal(labels.to_numpy(), sorted(set(arr))) - assert isinstance(uvals, cp.ndarray) - assert isinstance(labels, Index) - - encoder = {labels[idx]: idx for idx in range(len(labels))} - handcoded = [encoder[v] for v in arr] - np.testing.assert_array_equal(uvals.get(), handcoded) - - -@pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) -def test_factorize_index_obj(ncats, nelem): - df = DataFrame() - np.random.seed(0) - - # initialize data frame - df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) - df = df.set_index("cats") - - uvals, labels = df.index.factorize() - np.testing.assert_array_equal(labels.values.get(), sorted(set(arr))) - assert isinstance(uvals, cp.ndarray) - assert isinstance(labels, Index) - - encoder = {labels[idx]: idx for idx in range(len(labels))} - handcoded = [encoder[v] for v in arr] - np.testing.assert_array_equal(uvals.get(), handcoded) - - -def test_factorize_series_index(): - df = DataFrame() - df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"] - df["col2"] = [ - 2992443.0, - 2992447.0, - 2992466.0, - 2992440.0, - 2992441.0, - 2992442.0, - 2992444.0, - 2992445.0, - 2992446.0, - 2992448.0, - ] - assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0]) - assert_eq( - df.col1.factorize()[1].to_pandas().values, - df.to_pandas().col1.factorize()[1].values, - ) - - df = df.set_index("col2") - - assert_eq(df.col1.factorize()[0].get(), df.to_pandas().col1.factorize()[0]) - assert_eq( - df.col1.factorize()[1].to_pandas().values, - df.to_pandas().col1.factorize()[1].values, - ) - - -def test_cudf_factorize_series(): - data = [1, 2, 3, 4, 5] - - psr = pd.Series(data) - gsr = cudf.Series(data) - - expect = pd.factorize(psr) - got = cudf.factorize(gsr) - - assert len(expect) == len(got) - - np.testing.assert_array_equal(expect[0], got[0].get()) - np.testing.assert_array_equal(expect[1], got[1].values.get()) - - -def test_cudf_factorize_index(): - data = [1, 2, 3, 4, 5] - - pi = pd.Index(data) - gi = cudf.Index(data) - - expect = pd.factorize(pi) - got = cudf.factorize(gi) - - assert len(expect) == len(got) - - 
np.testing.assert_array_equal(expect[0], got[0].get()) - np.testing.assert_array_equal(expect[1], got[1].values.get()) - - -def test_cudf_factorize_array(): - data = [1, 2, 3, 4, 5] - - parr = np.array(data) - garr = cp.array(data) - - expect = pd.factorize(parr) - got = cudf.factorize(garr) - - assert len(expect) == len(got) - - np.testing.assert_array_equal(expect[0], got[0].get()) - np.testing.assert_array_equal(expect[1], got[1].get()) - - -@pytest.mark.parametrize("pandas_compatibility", [True, False]) -def test_factorize_code_pandas_compatibility(pandas_compatibility): - psr = pd.Series([1, 2, 3, 4, 5]) - gsr = cudf.from_pandas(psr) - - expect = pd.factorize(psr) - with cudf.option_context("mode.pandas_compatible", pandas_compatibility): - got = cudf.factorize(gsr) - assert_eq(got[0], expect[0]) - assert_eq(got[1], expect[1]) - if pandas_compatibility: - assert got[0].dtype == expect[0].dtype - else: - assert got[0].dtype == cudf.dtype("int8") - - -def test_factorize_result_classes(): - data = [1, 2, 3] - - labels, cats = cudf.factorize(cudf.Series(data)) - - assert isinstance(labels, cp.ndarray) - assert isinstance(cats, cudf.BaseIndex) - - labels, cats = cudf.factorize(cudf.Index(data)) - - assert isinstance(labels, cp.ndarray) - assert isinstance(cats, cudf.BaseIndex) - - labels, cats = cudf.factorize(cp.array(data)) - - assert isinstance(labels, cp.ndarray) - assert isinstance(cats, cp.ndarray) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "def", "abc", "a", "def", None], - [10, 20, 100, -10, 0, 1, None, 10, 100], - ], -) -def test_category_dtype_factorize(data): - gs = cudf.Series(data, dtype="category") - ps = gs.to_pandas() - - actual_codes, actual_uniques = gs.factorize() - expected_codes, expected_uniques = ps.factorize() - - assert_eq(actual_codes, expected_codes) - assert_eq(actual_uniques, expected_uniques) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py deleted file mode 100644 index 7e5523bb8c7..00000000000 --- a/python/cudf/cudf/tests/test_feather.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -import os -from string import ascii_letters - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.testing import assert_eq -from cudf.testing._utils import NUMERIC_TYPES - - -@pytest.fixture(params=[0, 1, 10, 100]) -def pdf(request): - types = NUMERIC_TYPES + ["bool"] - nrows = request.param - - # Create a pandas dataframe with random data of mixed types - test_pdf = pd.DataFrame( - { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) - for typ in types - } - ) - # Delete the name of the column index, and rename the row index - test_pdf.columns.name = None - test_pdf.index.name = "index" - - # Create non-numeric categorical data otherwise may get typecasted - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] - test_pdf["col_category"] = pd.Series(data, dtype="category") - - # Feather can't handle indexes properly - test_pdf = test_pdf.reset_index(drop=True) - test_pdf.index.name = None - - return test_pdf - - -@pytest.fixture -def gdf(pdf): - return cudf.DataFrame.from_pandas(pdf) - - -@pytest.fixture -def feather_file(tmp_path_factory, pdf): - fname = tmp_path_factory.mktemp("feather") / "test.feather" - pdf.to_feather(fname) - return fname - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.filterwarnings("ignore:Strings are not yet supported") -@pytest.mark.parametrize( - "columns", - [["col_int8"], ["col_category"], ["col_int32", "col_float32"], None], -) -def test_feather_reader(feather_file, columns): - expect = pa.feather.read_table(feather_file, columns=columns).to_pandas() - got = ( - cudf.read_feather(feather_file, columns=columns) - .to_arrow(preserve_index=False) - .to_pandas() - ) - - assert_eq(expect, got, check_categorical=False) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_feather_writer(tmpdir, pdf, gdf): - pdf_fname = tmpdir.join("pdf.feather") - gdf_fname = tmpdir.join("gdf.feather") - - pdf.to_feather(pdf_fname) - gdf.to_feather(gdf_fname) - - assert os.path.exists(pdf_fname) - assert os.path.exists(gdf_fname) - - expect = pa.feather.read_table(pdf_fname) - got = pa.feather.read_table(gdf_fname) - - assert pa.Table.equals(expect, got) diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py deleted file mode 100644 index 82ecd356bbf..00000000000 --- a/python/cudf/cudf/tests/test_gcs.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import io -import os - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - -gcsfs = pytest.importorskip("gcsfs") - -TEST_PROJECT = "cudf-gcs-test-project" -TEST_BUCKET = "cudf-gcs-test-bucket" - - -@pytest.fixture -def pdf(scope="module"): - df = pd.DataFrame() - df["Integer"] = np.array([2345, 11987, 9027, 9027]) - df["Float"] = np.array([9.001, 8.343, 6, 2.781]) - df["Integer2"] = np.array([2345, 106, 2088, 789277]) - df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - df["Boolean"] = np.array([True, False, True, False]) - return df - - -def test_read_csv(pdf, monkeypatch, tmpdir): - # Write to buffer - fpath = TEST_BUCKET + "test_csv_reader.csv" - buffer = pdf.to_csv(index=False) - - def mock_open(*args, **kwargs): - return io.BytesIO(buffer.encode()) - - def mock_size(*args): - return len(buffer.encode()) - - monkeypatch.setattr(gcsfs.core.GCSFileSystem, "open", mock_open) - monkeypatch.setattr(gcsfs.core.GCSFileSystem, "size", mock_size) - - # Test read from explicit path. 
- with pytest.warns(FutureWarning): - got = cudf.read_csv(f"gcs://{fpath}") - assert_eq(pdf, got) - - # AbstractBufferedFile -> PythonFile conversion - # will work fine with the monkey-patched FS if we - # pass in an fsspec file object - fs = gcsfs.core.GCSFileSystem() - with fs.open(f"gcs://{fpath}") as f: - got = cudf.read_csv(f) - assert_eq(pdf, got) - - -def test_write_orc(pdf, monkeypatch, tmpdir): - gcs_fname = TEST_BUCKET + "test_orc_writer.orc" - local_filepath = os.path.join(tmpdir, "test_orc.orc") - gdf = cudf.from_pandas(pdf) - - def mock_open(*args, **kwargs): - return open(local_filepath, "wb") - - monkeypatch.setattr(gcsfs.core.GCSFileSystem, "open", mock_open) - gdf.to_orc(f"gcs://{gcs_fname}") - - got = pd.read_orc(local_filepath) - assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py deleted file mode 100644 index 14ba9894fd3..00000000000 --- a/python/cudf/cudf/tests/test_groupby.py +++ /dev/null @@ -1,4048 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import collections -import datetime -import itertools -import operator -import string -import textwrap -from decimal import Decimal -from functools import partial - -import numpy as np -import pandas as pd -import pytest -from numba import cuda -from numpy.testing import assert_array_equal - -import rmm - -import cudf -from cudf import DataFrame, Series -from cudf.api.extensions import no_default -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.core.udf._ops import arith_ops, comparison_ops, unary_ops -from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES -from cudf.core.udf.utils import UDFError, precompiled -from cudf.testing import assert_eq -from cudf.testing._utils import ( - DATETIME_TYPES, - SIGNED_TYPES, - TIMEDELTA_TYPES, - assert_exceptions_equal, - expect_warning_if, -) -from cudf.testing.dataset_generator import rand_dataframe - -_now = np.datetime64("now") -_tomorrow = _now + np.timedelta64(1, "D") -_now = np.int64(_now.astype("datetime64[ns]")) -_tomorrow = np.int64(_tomorrow.astype("datetime64[ns]")) -_index_type_aggs = {"count", "idxmin", "idxmax", "cumcount"} - - -def assert_groupby_results_equal( - expect, got, sort=True, as_index=True, by=None, **kwargs -): - # Because we don't sort by index by default in groupby, - # sort expect and got by index before comparing. 
- if sort: - if as_index: - expect = expect.sort_index() - got = got.sort_index() - else: - assert by is not None - if isinstance(expect, (pd.DataFrame, cudf.DataFrame)): - expect = expect.sort_values(by=by).reset_index(drop=True) - else: - expect = expect.sort_values(by=by).reset_index(drop=True) - - if isinstance(got, cudf.DataFrame): - got = got.sort_values(by=by).reset_index(drop=True) - else: - got = got.sort_values(by=by).reset_index(drop=True) - - assert_eq(expect, got, **kwargs) - - -def make_frame( - dataframe_class, - nelem, - seed=0, - extra_levels=(), - extra_vals=(), - with_datetime=False, -): - np.random.seed(seed) - - df = dataframe_class() - - df["x"] = np.random.randint(0, 5, nelem) - df["y"] = np.random.randint(0, 3, nelem) - for lvl in extra_levels: - df[lvl] = np.random.randint(0, 2, nelem) - - df["val"] = np.random.random(nelem) - for val in extra_vals: - df[val] = np.random.random(nelem) - - if with_datetime: - df["datetime"] = np.random.randint( - _now, _tomorrow, nelem, dtype=np.int64 - ).astype("datetime64[ns]") - - return df - - -@pytest.fixture -def gdf(): - return DataFrame({"x": [1, 2, 3], "y": [0, 1, 1]}) - - -@pytest.fixture -def pdf(gdf): - return gdf.to_pandas() - - -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -def test_groupby_mean(nelem): - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).mean() - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).mean() - ) - assert_groupby_results_equal(got_df, expect_df) - - -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -def test_groupby_mean_3level(nelem): - lvls = "z" - bys = list("xyz") - got_df = ( - make_frame(DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys) - .mean() - ) - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem, extra_levels=lvls) - .groupby(bys) - .mean() - ) - assert_groupby_results_equal(got_df, expect_df) - - -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -def test_groupby_agg_mean_min(nelem): - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"]) - .agg(["mean", "min"]) - ) - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"]) - .agg(["mean", "min"]) - ) - assert_groupby_results_equal(got_df, expect_df) - - -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -def test_groupby_agg_min_max_dictargs(nelem): - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) - .agg({"a": "min", "b": "max"}) - ) - got_df = ( - make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) - .agg({"a": "min", "b": "max"}) - ) - assert_groupby_results_equal(expect_df, got_df) - - -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -def test_groupby_agg_min_max_dictlist(nelem): - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) - .agg({"a": ["min", "max"], "b": ["min", "max"]}) - ) - got_df = ( - make_frame(DataFrame, nelem=nelem, extra_vals="ab") - .groupby(["x", "y"]) - .agg({"a": ["min", "max"], "b": ["min", "max"]}) - ) - assert_groupby_results_equal(got_df, expect_df) - - -@pytest.mark.parametrize("as_index", [True, False]) -def test_groupby_as_index_single_agg(pdf, gdf, as_index): - gdf = gdf.groupby("y", as_index=as_index).agg({"x": "mean"}) - pdf = pdf.groupby("y", as_index=as_index).agg({"x": "mean"}) - assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y") - - -@pytest.mark.parametrize("engine", ["cudf", "jit"]) -@pytest.mark.parametrize("as_index", [True, False]) 
-@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_as_index_apply(pdf, gdf, as_index, engine): - gdf = gdf.groupby("y", as_index=as_index).apply( - lambda df: df["x"].mean(), engine=engine - ) - kwargs = {"func": lambda df: df["x"].mean(), "include_groups": False} - pdf = pdf.groupby("y", as_index=as_index).apply(**kwargs) - assert_groupby_results_equal(pdf, gdf, as_index=as_index, by="y") - - -@pytest.mark.parametrize("as_index", [True, False]) -def test_groupby_as_index_multiindex(pdf, gdf, as_index): - pdf = pd.DataFrame( - {"a": [1, 2, 1], "b": [3, 3, 3], "c": [2, 2, 3], "d": [3, 1, 2]} - ) - gdf = cudf.from_pandas(pdf) - - gdf = gdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( - {"c": "mean"} - ) - pdf = pdf.groupby(["a", "b"], as_index=as_index, sort=True).agg( - {"c": "mean"} - ) - - if as_index: - assert_eq(pdf, gdf) - else: - # column names don't match - check just the values - for gcol, pcol in zip(gdf, pdf): - assert_array_equal(gdf[gcol].to_numpy(), pdf[pcol].values) - - -def test_groupby_default(pdf, gdf): - gdf = gdf.groupby("y").agg({"x": "mean"}) - pdf = pdf.groupby("y").agg({"x": "mean"}) - assert_groupby_results_equal(pdf, gdf) - - -def test_group_keys_true(pdf, gdf): - gdf = gdf.groupby("y", group_keys=True).sum() - pdf = pdf.groupby("y", group_keys=True).sum() - assert_groupby_results_equal(pdf, gdf) - - -@pytest.mark.parametrize("as_index", [True, False]) -def test_groupby_getitem_getattr(as_index): - pdf = pd.DataFrame({"x": [1, 3, 1], "y": [1, 2, 3], "z": [1, 4, 5]}) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - pdf.groupby("x", as_index=as_index)["y"].sum(), - gdf.groupby("x", as_index=as_index)["y"].sum(), - as_index=as_index, - by="x", - ) - assert_groupby_results_equal( - pdf.groupby("x", as_index=as_index).y.sum(), - gdf.groupby("x", as_index=as_index).y.sum(), - as_index=as_index, - by="x", - ) - assert_groupby_results_equal( - pdf.groupby("x", as_index=as_index)[["y"]].sum(), - gdf.groupby("x", as_index=as_index)[["y"]].sum(), - as_index=as_index, - by="x", - ) - assert_groupby_results_equal( - pdf.groupby(["x", "y"], as_index=as_index).sum(), - gdf.groupby(["x", "y"], as_index=as_index).sum(), - as_index=as_index, - by=["x", "y"], - ) - - -def test_groupby_cats(): - df = DataFrame() - df["cats"] = pd.Categorical(list("aabaacaab")) - df["vals"] = np.random.random(len(df)) - - cats = df["cats"].values_host - vals = df["vals"].to_numpy() - - grouped = df.groupby(["cats"], as_index=False).mean() - - got_vals = grouped["vals"] - - got_cats = grouped["cats"] - - for i in range(len(got_vals)): - expect = vals[cats == got_cats[i]].mean() - np.testing.assert_almost_equal(got_vals[i], expect) - - -def test_groupby_iterate_groups(): - np.random.seed(0) - df = DataFrame() - nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) - - def assert_values_equal(arr): - np.testing.assert_array_equal(arr[0], arr) - - for name, grp in df.groupby(["key1", "key2"]): - pddf = grp.to_pandas() - for k in "key1,key2".split(","): - assert_values_equal(pddf[k].values) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply(): - np.random.seed(0) - df = DataFrame() - nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = 
np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) - - expect_grpby = df.to_pandas().groupby( - ["key1", "key2"], as_index=False, group_keys=False - ) - got_grpby = df.groupby(["key1", "key2"]) - - def foo(df): - df["out"] = df["val1"] + df["val2"] - return df - - expect = expect_grpby.apply(foo, include_groups=False) - got = got_grpby.apply(foo, include_groups=False) - assert_groupby_results_equal(expect, got) - - -def create_test_groupby_apply_args_params(): - def f1(df, k): - df["out"] = df["val1"] + df["val2"] + k - return df - - def f2(df, k, L): - df["out"] = df["val1"] - df["val2"] + (k / L) - return df - - def f3(df, k, L, m): - df["out"] = ((k * df["val1"]) + (L * df["val2"])) / m - return df - - return [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] - - -@pytest.mark.parametrize("func,args", create_test_groupby_apply_args_params()) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_args(func, args): - np.random.seed(0) - df = DataFrame() - nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) - - expect_grpby = df.to_pandas().groupby( - ["key1", "key2"], as_index=False, group_keys=False - ) - got_grpby = df.groupby(["key1", "key2"]) - expect = expect_grpby.apply(func, *args, include_groups=False) - got = got_grpby.apply(func, *args, include_groups=False) - assert_groupby_results_equal(expect, got) - - -def test_groupby_apply_grouped(): - np.random.seed(0) - df = DataFrame() - nelem = 20 - df["key1"] = range(nelem) - df["key2"] = range(nelem) - df["val1"] = range(nelem) - df["val2"] = range(nelem) - - got_grpby = df.groupby(["key1", "key2"]) - - def foo(key1, val1, com1, com2): - for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x): - com1[i] = key1[i] * 10000 + val1[i] - com2[i] = i - - got = got_grpby.apply_grouped( - foo, - incols=["key1", "val1"], - outcols={"com1": np.float64, "com2": np.int32}, - tpb=8, - ) - - got = got.to_pandas() - - expect = df.copy() - expect["com1"] = (expect["key1"] * 10000 + expect["key1"]).astype( - np.float64 - ) - expect["com2"] = np.zeros(nelem, dtype=np.int32) - - assert_groupby_results_equal(expect, got) - - -@pytest.fixture(scope="module") -def groupby_jit_data_small(): - """ - Return a small dataset for testing JIT Groupby Apply. The dataframe - contains 4 groups of size 1, 2, 3, 4 as well as an additional key - column that can be used to test subgroups within groups. This data - is useful for smoke testing basic numeric results - """ - rng = np.random.default_rng(42) - df = DataFrame() - key1 = [1] + [2] * 2 + [3] * 3 + [4] * 4 - key2 = [1, 2] * 5 - df["key1"] = key1 - df["key2"] = key2 - - df["val1"] = rng.integers(0, 10, len(key1)) - df["val2"] = rng.integers(0, 10, len(key1)) - - # randomly permute data - df = df.sample(frac=1, ignore_index=True) - return df - - -@pytest.fixture(scope="module") -def groupby_jit_data_large(groupby_jit_data_small): - """ - Larger version of groupby_jit_data_small which contains enough data - to require more than one block per group. This data is useful for - testing if JIT GroupBy algorithms scale to larger dastasets without - manifesting numerical issues such as overflow. 
- """ - max_tpb = 1024 - factor = ( - max_tpb + 1 - ) # bigger than a block but not always an exact multiple - df = cudf.concat([groupby_jit_data_small] * factor) - - return df - - -@pytest.fixture(scope="module") -def groupby_jit_data_nans(groupby_jit_data_small): - """ - Returns a modified version of groupby_jit_data_small which contains - nan values. - """ - - df = groupby_jit_data_small.sort_values(["key1", "key2"]) - df["val1"] = df["val1"].astype("float64") - df["val1"][::2] = np.nan - df = df.sample(frac=1, ignore_index=True) - return df - - -@pytest.fixture(scope="module") -def groupby_jit_datasets( - groupby_jit_data_small, groupby_jit_data_large, groupby_jit_data_nans -): - return { - "small": groupby_jit_data_small, - "large": groupby_jit_data_large, - "nans": groupby_jit_data_nans, - } - - -def run_groupby_apply_jit_test(data, func, keys, *args): - expect_groupby_obj = data.to_pandas().groupby(keys) - got_groupby_obj = data.groupby(keys) - - # compare cuDF jit to pandas - cudf_jit_result = got_groupby_obj.apply( - func, *args, engine="jit", include_groups=False - ) - pandas_result = expect_groupby_obj.apply(func, *args, include_groups=False) - assert_groupby_results_equal(cudf_jit_result, pandas_result) - - -def groupby_apply_jit_reductions_test_inner(func, data, dtype): - # ideally we'd just have: - # lambda group: getattr(group, func)() - # but the current kernel caching mechanism relies on pickle which - # does not play nice with local functions. What's below uses - # exec as a workaround to write the test functions dynamically - - funcstr = textwrap.dedent( - f""" - def func(df): - return df['val1'].{func}() - """ - ) - lcl = {} - exec(funcstr, lcl) - func = lcl["func"] - - data["val1"] = data["val1"].astype(dtype) - data["val2"] = data["val2"].astype(dtype) - - run_groupby_apply_jit_test(data, func, ["key1"]) - - -# test unary reductions -@pytest.mark.parametrize( - "dtype", - SUPPORTED_GROUPBY_NUMPY_TYPES, - ids=[str(t) for t in SUPPORTED_GROUPBY_NUMPY_TYPES], -) -@pytest.mark.parametrize( - "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"] -) -@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_apply_jit_unary_reductions( - func, dtype, dataset, groupby_jit_datasets -): - dataset = groupby_jit_datasets[dataset] - groupby_apply_jit_reductions_test_inner(func, dataset, dtype) - - -# test unary reductions for special values -def groupby_apply_jit_reductions_special_vals_inner( - func, data, dtype, special_val -): - funcstr = textwrap.dedent( - f""" - def func(df): - return df['val1'].{func}() - """ - ) - lcl = {} - exec(funcstr, lcl) - func = lcl["func"] - - data["val1"] = data["val1"].astype(dtype) - data["val2"] = data["val2"].astype(dtype) - data["val1"] = special_val - data["val2"] = special_val - - run_groupby_apply_jit_test(data, func, ["key1"]) - - -# test unary index reductions for special values -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def groupby_apply_jit_idx_reductions_special_vals_inner( - func, data, dtype, special_val -): - funcstr = textwrap.dedent( - f""" - def func(df): - return df['val1'].{func}() - """ - ) - lcl = {} - exec(funcstr, lcl) - func = lcl["func"] - - data["val1"] = data["val1"].astype(dtype) - data["val2"] = data["val2"].astype(dtype) - data["val1"] = special_val - 
data["val2"] = special_val - - run_groupby_apply_jit_test(data, func, ["key1"]) - - -@pytest.mark.parametrize("dtype", ["float64", "float32"]) -@pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"]) -@pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf]) -@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_apply_jit_reductions_special_vals( - func, dtype, dataset, groupby_jit_datasets, special_val -): - dataset = groupby_jit_datasets[dataset] - with expect_warning_if( - func in {"var", "std"} and not np.isnan(special_val), RuntimeWarning - ): - groupby_apply_jit_reductions_special_vals_inner( - func, dataset, dtype, special_val - ) - - -@pytest.mark.parametrize("dtype", ["float64"]) -@pytest.mark.parametrize("func", ["idxmax", "idxmin"]) -@pytest.mark.parametrize( - "special_val", - [ - pytest.param( - np.nan, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/13832" - ), - ), - np.inf, - -np.inf, - ], -) -@pytest.mark.parametrize("dataset", ["small", "large", "nans"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="include_groups keyword new in pandas 2.2", -) -def test_groupby_apply_jit_idx_reductions_special_vals( - func, dtype, dataset, groupby_jit_datasets, special_val -): - dataset = groupby_jit_datasets[dataset] - groupby_apply_jit_idx_reductions_special_vals_inner( - func, dataset, dtype, special_val - ) - - -@pytest.mark.parametrize("dtype", ["int32"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_sum_integer_overflow(dtype): - max = np.iinfo(dtype).max - - data = DataFrame( - { - "a": [0, 0, 0], - "b": [max, max, max], - } - ) - - def func(group): - return group["b"].sum() - - run_groupby_apply_jit_test(data, func, ["a"]) - - -@pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"]) -@pytest.mark.parametrize( - "dataset", - [ - pytest.param( - "small", - marks=[ - pytest.mark.filterwarnings( - "ignore:Degrees of Freedom <= 0 for slice" - ), - pytest.mark.filterwarnings( - "ignore:divide by zero encountered in divide" - ), - ], - ), - "large", - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype): - dataset = groupby_jit_datasets[dataset] - - dataset["val1"] = dataset["val1"].astype(dtype) - dataset["val2"] = dataset["val2"].astype(dtype) - - keys = ["key1"] - - def func(group): - return group["val1"].corr(group["val2"]) - - if np.dtype(dtype).kind == "f": - # Correlation of floating types is not yet supported: - # https://github.com/rapidsai/cudf/issues/13839 - m = ( - f"Series.corr\\(Series\\) is not " - f"supported for \\({dtype}, {dtype}\\)" - ) - with pytest.raises(UDFError, match=m): - run_groupby_apply_jit_test(dataset, func, keys) - return - with expect_warning_if(dtype in {"int32", "int64"}, RuntimeWarning): - run_groupby_apply_jit_test(dataset, func, keys) - - -@pytest.mark.parametrize("dtype", ["int32", "int64"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_correlation_zero_variance(dtype): - # pearson correlation is undefined 
when the variance of either - # variable is zero. This test ensures that the jit implementation - # returns the same result as pandas in this case. - data = DataFrame( - {"a": [0, 0, 0, 0, 0], "b": [1, 1, 1, 1, 1], "c": [2, 2, 2, 2, 2]} - ) - - def func(group): - return group["b"].corr(group["c"]) - - with expect_warning_if(dtype in {"int32", "int64"}, RuntimeWarning): - run_groupby_apply_jit_test(data, func, ["a"]) - - -@pytest.mark.parametrize("op", unary_ops) -def test_groupby_apply_jit_invalid_unary_ops_error(groupby_jit_data_small, op): - keys = ["key1"] - - def func(group): - return op(group["val1"]) - - with pytest.raises( - UDFError, - match=f"{op.__name__}\\(Series\\) is not supported by JIT GroupBy", - ): - run_groupby_apply_jit_test(groupby_jit_data_small, func, keys) - - -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_groupby_apply_jit_invalid_binary_ops_error( - groupby_jit_data_small, op -): - keys = ["key1"] - - def func(group): - return op(group["val1"], group["val2"]) - - with pytest.raises( - UDFError, - match=f"{op.__name__}\\(Series, Series\\) is not supported", - ): - run_groupby_apply_jit_test(groupby_jit_data_small, func, keys) - - -def test_groupby_apply_jit_no_df_ops(groupby_jit_data_small): - # DataFrame level operations are not yet supported. - def func(group): - return group.sum() - - with pytest.raises( - UDFError, - match="JIT GroupBy.apply\\(\\) does not support DataFrame.sum\\(\\)", - ): - run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1"]) - - -@pytest.mark.parametrize("dtype", ["uint8", "str"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_unsupported_dtype(dtype): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) - df["b"] = df["b"].astype(dtype) - - # a UDAF that doesn't actually use the input column - # with the unsupported dtype should still succeed - def func(group): - return group["c"].sum() - - run_groupby_apply_jit_test(df, func, ["a"]) - - # however a UDAF that does use the unsupported dtype - # should fail - def func(group): - return group["b"].sum() - - with pytest.raises(UDFError, match="Only columns of the following dtypes"): - run_groupby_apply_jit_test(df, func, ["a"]) - - -@pytest.mark.parametrize( - "func", - [ - lambda df: df["val1"].max() + df["val2"].min(), - lambda df: df["val1"].sum() + df["val2"].var(), - lambda df: df["val1"].mean() + df["val2"].std(), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_basic(func, groupby_jit_data_small): - run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"]) - - -def create_test_groupby_apply_jit_args_params(): - def f1(df, k): - return df["val1"].max() + df["val2"].min() + k - - def f2(df, k, L): - return df["val1"].sum() - df["val2"].var() + (k / L) - - def f3(df, k, L, m): - return ((k * df["val1"].mean()) + (L * df["val2"].std())) / m - - return [(f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] - - -@pytest.mark.parametrize( - "func,args", create_test_groupby_apply_jit_args_params() -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_args(func, args, groupby_jit_data_small): - run_groupby_apply_jit_test( - groupby_jit_data_small, func, ["key1", "key2"], *args - ) - - -@pytest.mark.skipif( - 
PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_jit_block_divergence(): - # https://github.com/rapidsai/cudf/issues/12686 - df = cudf.DataFrame( - { - "a": [0, 0, 0, 1, 1, 1], - "b": [1, 1, 1, 2, 3, 4], - } - ) - - def diverging_block(grp_df): - if grp_df["b"].mean() > 1: - return grp_df["b"].mean() - return 0 - - run_groupby_apply_jit_test(df, diverging_block, ["a"]) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_caching(): - # Make sure similar functions that differ - # by simple things like constants actually - # recompile - - # begin with a clear cache - precompiled.clear() - assert precompiled.currsize == 0 - - data = cudf.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 4, 5, 6]}) - - def f(group): - return group["b"].mean() * 2 - - # a single run should result in a cache size of 1 - run_groupby_apply_jit_test(data, f, ["a"]) - assert precompiled.currsize == 1 - - # a second run with f should not increase the count - run_groupby_apply_jit_test(data, f, ["a"]) - assert precompiled.currsize == 1 - - # changing a constant value inside the UDF should miss - def f(group): - return group["b"].mean() * 3 - - run_groupby_apply_jit_test(data, f, ["a"]) - assert precompiled.currsize == 2 - - # changing the dtypes of the columns should miss - data["b"] = data["b"].astype("float64") - run_groupby_apply_jit_test(data, f, ["a"]) - - assert precompiled.currsize == 3 - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_no_bytecode_fallback(): - # tests that a function which contains no bytecode - # attribute, but would still be executable using - # the iterative groupby apply approach, still works. 
- - gdf = cudf.DataFrame({"a": [0, 1, 1], "b": [1, 2, 3]}) - pdf = gdf.to_pandas() - - def f(group): - return group.sum() - - part = partial(f) - - expect = pdf.groupby("a").apply(part, include_groups=False) - got = gdf.groupby("a").apply(part, engine="auto", include_groups=False) - assert_groupby_results_equal(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_return_col_from_df(): - # tests a UDF that consists of purely colwise - # ops, such as `lambda group: group.x + group.y` - # which returns a column - func = lambda group: group.x + group.y # noqa:E731 - df = cudf.DataFrame( - { - "id": range(10), - "x": range(10), - "y": range(10), - } - ) - pdf = df.to_pandas() - - def func(df): - return df.x + df.y - - got = df.groupby("id").apply(func, include_groups=False) - expect = pdf.groupby("id").apply(func, include_groups=False) - # pandas seems to erroneously add an extra MI level of ids - # TODO: Figure out how pandas groupby.apply determines the columns - expect = pd.DataFrame(expect.droplevel(1), columns=got.columns) - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize("func", [lambda group: group.sum()]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_return_df(func): - # tests a UDF that reduces over a dataframe - # and produces a series with the original column names - # as its index, such as lambda group: group.sum() + group.min() - df = cudf.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, 4]}) - pdf = df.to_pandas() - - expect = pdf.groupby("a").apply(func, include_groups=False) - got = df.groupby("a").apply(func, include_groups=False) - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize("as_index", [True, False]) -def test_groupby_apply_return_reindexed_series(as_index): - def gdf_func(df): - return cudf.Series([df["a"].sum(), df["b"].min(), df["c"].max()]) - - def pdf_func(df): - return pd.Series([df["a"].sum(), df["b"].min(), df["c"].max()]) - - df = cudf.DataFrame( - { - "key": [0, 0, 1, 1, 2, 2], - "a": [1, 2, 3, 4, 5, 6], - "b": [7, 8, 9, 10, 11, 12], - "c": [13, 14, 15, 16, 17, 18], - } - ) - pdf = df.to_pandas() - - kwargs = {} - if PANDAS_GE_220: - kwargs["include_groups"] = False - - expect = pdf.groupby("key", as_index=as_index).apply(pdf_func, **kwargs) - got = df.groupby("key", as_index=as_index).apply(gdf_func, **kwargs) - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) -@pytest.mark.parametrize( - "func", - [ - "mean", - "std", - "var", - "min", - "max", - "idxmin", - "idxmax", - "count", - "sum", - "prod", - ], -) -def test_groupby_2keys_agg(nelem, func): - # gdf (Note: lack of multiIndex) - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) - ) - got_df = make_frame(DataFrame, nelem=nelem).groupby(["x", "y"]).agg(func) - - check_dtype = func not in _index_type_aggs - assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) - - -@pytest.mark.parametrize("num_groups", [2, 3, 10, 50, 100]) -@pytest.mark.parametrize("nelem_per_group", [1, 10, 100]) -@pytest.mark.parametrize( - "func", - ["min", "max", "count", "sum"], - # TODO: Replace the above line with the one below once - # https://github.com/pandas-dev/pandas/issues/40685 is resolved. 
- # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], -) -def test_groupby_agg_decimal(num_groups, nelem_per_group, func): - # The number of digits after the decimal to use. - decimal_digits = 2 - # The number of digits before the decimal to use. - whole_digits = 2 - - scale = 10**whole_digits - nelem = num_groups * nelem_per_group - - # The unique is necessary because otherwise if there are duplicates idxmin - # and idxmax may return different results than pandas (see - # https://github.com/rapidsai/cudf/issues/7756). This is not relevant to - # the current version of the test, because idxmin and idxmax simply don't - # work with pandas Series composed of Decimal objects (see - # https://github.com/pandas-dev/pandas/issues/40685). However, if that is - # ever enabled, then this issue will crop up again so we may as well have - # it fixed now. - x = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) - y = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) - - if x.size < y.size: - total_elements = x.size - y = y[: x.size] - else: - total_elements = y.size - x = x[: y.size] - - # Note that this filtering can lead to one group with fewer elements, but - # that shouldn't be a problem and is probably useful to test. - idx_col = np.tile(np.arange(num_groups), nelem_per_group)[:total_elements] - - decimal_x = pd.Series([Decimal(str(d)) for d in x]) - decimal_y = pd.Series([Decimal(str(d)) for d in y]) - - pdf = pd.DataFrame({"idx": idx_col, "x": decimal_x, "y": decimal_y}) - gdf = DataFrame( - { - "idx": idx_col, - "x": cudf.Series(decimal_x), - "y": cudf.Series(decimal_y), - } - ) - - expect_df = pdf.groupby("idx", sort=True).agg(func) - if rmm._cuda.gpu.runtimeGetVersion() < 11000: - with pytest.raises(RuntimeError): - got_df = gdf.groupby("idx", sort=True).agg(func) - else: - got_df = gdf.groupby("idx", sort=True).agg(func) - assert_eq(expect_df["x"], got_df["x"], check_dtype=False) - assert_eq(expect_df["y"], got_df["y"], check_dtype=False) - - -@pytest.mark.parametrize( - "agg", ["min", "max", "idxmin", "idxmax", "count", "sum", "prod", "mean"] -) -def test_series_groupby(agg): - s = pd.Series([1, 2, 3]) - g = Series([1, 2, 3]) - sg = s.groupby(s // 2) - gg = g.groupby(g // 2) - sa = getattr(sg, agg)() - ga = getattr(gg, agg)() - check_dtype = agg not in _index_type_aggs - assert_groupby_results_equal(sa, ga, check_dtype=check_dtype) - - -@pytest.mark.parametrize( - "agg", ["min", "max", "idxmin", "idxmax", "count", "sum", "prod", "mean"] -) -def test_series_groupby_agg(agg): - s = pd.Series([1, 2, 3]) - g = Series([1, 2, 3]) - sg = s.groupby(s // 2).agg(agg) - gg = g.groupby(g // 2).agg(agg) - check_dtype = agg not in _index_type_aggs - assert_groupby_results_equal(sg, gg, check_dtype=check_dtype) - - -@pytest.mark.parametrize( - "agg", - [ - "min", - "max", - "count", - "sum", - "prod", - "mean", - pytest.param( - "idxmin", - marks=pytest.mark.xfail(reason="gather needed for idxmin"), - ), - pytest.param( - "idxmax", - marks=pytest.mark.xfail(reason="gather needed for idxmax"), - ), - ], -) -def test_groupby_level_zero(agg): - pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[2, 5, 5]) - gdf = DataFrame.from_pandas(pdf) - pdg = pdf.groupby(level=0) - gdg = gdf.groupby(level=0) - pdresult = getattr(pdg, agg)() - gdresult = getattr(gdg, agg)() - check_dtype = agg not in _index_type_aggs - assert_groupby_results_equal(pdresult, gdresult, check_dtype=check_dtype) - - -@pytest.mark.parametrize( - "agg", - [ - "min", - "max", - "count", - "sum", - "prod", - 
"mean", - pytest.param( - "idxmin", - marks=pytest.mark.xfail(reason="gather needed for idxmin"), - ), - pytest.param( - "idxmax", - marks=pytest.mark.xfail(reason="gather needed for idxmax"), - ), - ], -) -def test_groupby_series_level_zero(agg): - pdf = pd.Series([1, 2, 3], index=[2, 5, 5]) - gdf = Series.from_pandas(pdf) - pdg = pdf.groupby(level=0) - gdg = gdf.groupby(level=0) - pdresult = getattr(pdg, agg)() - gdresult = getattr(gdg, agg)() - check_dtype = agg not in _index_type_aggs - assert_groupby_results_equal(pdresult, gdresult, check_dtype=check_dtype) - - -def test_groupby_column_name(): - pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]}) - gdf = DataFrame.from_pandas(pdf) - g = gdf.groupby("yy") - p = pdf.groupby("yy") - gxx = g["xx"].sum() - pxx = p["xx"].sum() - assert_groupby_results_equal(pxx, gxx) - - gxx = g["xx"].count() - pxx = p["xx"].count() - assert_groupby_results_equal(pxx, gxx, check_dtype=False) - - gxx = g["xx"].min() - pxx = p["xx"].min() - assert_groupby_results_equal(pxx, gxx) - - gxx = g["xx"].max() - pxx = p["xx"].max() - assert_groupby_results_equal(pxx, gxx) - - gxx = g["xx"].idxmin() - pxx = p["xx"].idxmin() - assert_groupby_results_equal(pxx, gxx, check_dtype=False) - - gxx = g["xx"].idxmax() - pxx = p["xx"].idxmax() - assert_groupby_results_equal(pxx, gxx, check_dtype=False) - - gxx = g["xx"].mean() - pxx = p["xx"].mean() - assert_groupby_results_equal(pxx, gxx) - - -def test_groupby_column_numeral(): - pdf = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [1, 2, 3]}) - gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1) - g = gdf.groupby(1) - pxx = p[0].sum() - gxx = g[0].sum() - assert_groupby_results_equal(pxx, gxx) - - pdf = pd.DataFrame({0.5: [1.0, 2.0, 3.0], 1.5: [1, 2, 3]}) - gdf = DataFrame.from_pandas(pdf) - p = pdf.groupby(1.5) - g = gdf.groupby(1.5) - pxx = p[0.5].sum() - gxx = g[0.5].sum() - assert_groupby_results_equal(pxx, gxx) - - -@pytest.mark.parametrize( - "series", - [ - [0, 1, 0], - [1, 1, 1], - [0, 1, 1], - [1, 2, 3], - [4, 3, 2], - [0, 2, 0], - pd.Series([0, 2, 0]), - pd.Series([0, 2, 0], index=[0, 2, 1]), - ], -) # noqa: E501 -def test_groupby_external_series(series): - pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) - gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series)).x.sum() - gxx = gdf.groupby(cudf.Series(series)).x.sum() - assert_groupby_results_equal(pxx, gxx) - - -@pytest.mark.parametrize("series", [[0.0, 1.0], [1.0, 1.0, 1.0, 1.0]]) -def test_groupby_external_series_incorrect_length(series): - pdf = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}) - gdf = DataFrame.from_pandas(pdf) - pxx = pdf.groupby(pd.Series(series)).x.sum() - gxx = gdf.groupby(cudf.Series(series)).x.sum() - assert_groupby_results_equal(pxx, gxx) - - -@pytest.mark.parametrize( - "level", [0, 1, "a", "b", [0, 1], ["a", "b"], ["a", 1], -1, [-1, -2]] -) -def test_groupby_levels(level): - idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (2, 2)], names=("a", "b")) - pdf = pd.DataFrame({"c": [1, 2, 3], "d": [2, 3, 4]}, index=idx) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - pdf.groupby(level=level).sum(), - gdf.groupby(level=level).sum(), - ) - - -def test_advanced_groupby_levels(): - pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1], "z": [1, 1, 1]}) - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdh = pdg.groupby(level=1).sum() - gdh = gdg.groupby(level=1).sum() - assert_groupby_results_equal(pdh, 
gdh) - pdg = pdf.groupby(["x", "y", "z"]).sum() - gdg = gdf.groupby(["x", "y", "z"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby(["z"]).sum() - gdg = gdf.groupby(["z"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby(["y", "z"]).sum() - gdg = gdf.groupby(["y", "z"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby(["x", "z"]).sum() - gdg = gdf.groupby(["x", "z"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby(["y"]).sum() - gdg = gdf.groupby(["y"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby(["x"]).sum() - gdg = gdf.groupby(["x"]).sum() - assert_groupby_results_equal(pdg, gdg) - pdh = pdg.groupby(level=0).sum() - gdh = gdg.groupby(level=0).sum() - assert_groupby_results_equal(pdh, gdh) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() - pdh = pdg.groupby(level=[0, 1]).sum() - gdh = gdg.groupby(level=[0, 1]).sum() - assert_groupby_results_equal(pdh, gdh) - pdh = pdg.groupby(level=[1, 0]).sum() - gdh = gdg.groupby(level=[1, 0]).sum() - assert_groupby_results_equal(pdh, gdh) - pdg = pdf.groupby(["x", "y"]).sum() - gdg = gdf.groupby(["x", "y"]).sum() - - assert_exceptions_equal( - lfunc=pdg.groupby, - rfunc=gdg.groupby, - lfunc_args_and_kwargs=([], {"level": 2}), - rfunc_args_and_kwargs=([], {"level": 2}), - ) - - -@pytest.mark.parametrize( - "func", - [ - lambda df: df.groupby(["x", "y", "z"]).sum(), - lambda df: df.groupby(["x", "y"]).sum(), - lambda df: df.groupby(["x", "y"]).agg("sum"), - lambda df: df.groupby(["y"]).sum(), - lambda df: df.groupby(["y"]).agg("sum"), - lambda df: df.groupby(["x"]).sum(), - lambda df: df.groupby(["x"]).agg("sum"), - lambda df: df.groupby(["x", "y"]).z.sum(), - lambda df: df.groupby(["x", "y"]).z.agg("sum"), - ], -) -def test_empty_groupby(func): - pdf = pd.DataFrame({"x": [], "y": [], "z": []}) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal(func(pdf), func(gdf), check_index_type=False) - - -def test_groupby_unsupported_columns(): - np.random.seed(12) - pd_cat = pd.Categorical( - pd.Series(np.random.choice(["a", "b", 1], 3), dtype="category") - ) - pdf = pd.DataFrame( - { - "x": [1, 2, 3], - "y": ["a", "b", "c"], - "z": ["d", "e", "f"], - "a": [3, 4, 5], - } - ) - pdf["b"] = pd_cat - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x").sum(numeric_only=True) - # cudf does not yet support numeric_only, so our default is False (unlike - # pandas, which defaults to inferring and throws a warning about it). 
- gdg = gdf.groupby("x").sum(numeric_only=True) - assert_groupby_results_equal(pdg, gdg) - - -def test_list_of_series(): - pdf = pd.DataFrame({"x": [1, 2, 3], "y": [1, 2, 1]}) - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby([pdf.x]).y.sum() - gdg = gdf.groupby([gdf.x]).y.sum() - assert_groupby_results_equal(pdg, gdg) - pdg = pdf.groupby([pdf.x, pdf.y]).y.sum() - gdg = gdf.groupby([gdf.x, gdf.y]).y.sum() - pytest.skip() - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_use_agg_column_as_index(): - pdf = pd.DataFrame() - pdf["a"] = [1, 1, 1, 3, 5] - gdf = cudf.DataFrame() - gdf["a"] = [1, 1, 1, 3, 5] - pdg = pdf.groupby("a").agg({"a": "count"}) - gdg = gdf.groupby("a").agg({"a": "count"}) - assert_groupby_results_equal(pdg, gdg, check_dtype=False) - - -def test_groupby_list_then_string(): - gdf = cudf.DataFrame() - gdf["a"] = [0, 1, 0, 1, 2] - gdf["b"] = [11, 2, 15, 12, 2] - gdf["c"] = [6, 7, 6, 7, 6] - pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg( - {"b": ["min", "max"], "c": "max"} - ) - pdg = pdf.groupby("a", as_index=True).agg( - {"b": ["min", "max"], "c": "max"} - ) - assert_groupby_results_equal(gdg, pdg) - - -def test_groupby_different_unequal_length_column_aggregations(): - gdf = cudf.DataFrame() - gdf["a"] = [0, 1, 0, 1, 2] - gdf["b"] = [11, 2, 15, 12, 2] - gdf["c"] = [11, 2, 15, 12, 2] - pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg( - {"b": "min", "c": ["max", "min"]} - ) - pdg = pdf.groupby("a", as_index=True).agg( - {"b": "min", "c": ["max", "min"]} - ) - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_single_var_two_aggs(): - gdf = cudf.DataFrame() - gdf["a"] = [0, 1, 0, 1, 2] - gdf["b"] = [11, 2, 15, 12, 2] - gdf["c"] = [11, 2, 15, 12, 2] - pdf = gdf.to_pandas() - gdg = gdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) - pdg = pdf.groupby("a", as_index=True).agg({"b": ["min", "max"]}) - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_double_var_two_aggs(): - gdf = cudf.DataFrame() - gdf["a"] = [0, 1, 0, 1, 2] - gdf["b"] = [11, 2, 15, 12, 2] - gdf["c"] = [11, 2, 15, 12, 2] - pdf = gdf.to_pandas() - gdg = gdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) - pdg = pdf.groupby(["a", "b"], as_index=True).agg({"c": ["min", "max"]}) - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_apply_basic_agg_single_column(): - gdf = DataFrame() - gdf["key"] = [0, 0, 1, 1, 2, 2, 0] - gdf["val"] = [0, 1, 2, 3, 4, 5, 6] - gdf["mult"] = gdf["key"] * gdf["val"] - pdf = gdf.to_pandas() - - gdg = gdf.groupby(["key", "val"]).mult.sum() - pdg = pdf.groupby(["key", "val"]).mult.sum() - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_multi_agg_single_groupby_series(): - pdf = pd.DataFrame( - { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), - } - ) - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby("x").y.agg(["sum", "max"]) - gdg = gdf.groupby("x").y.agg(["sum", "max"]) - - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_multi_agg_multi_groupby(): - pdf = pd.DataFrame( - { - "a": np.random.randint(0, 5, 10), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "d": np.random.randint(0, 5, 10), - } - ) - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) - assert_groupby_results_equal(pdg, gdg) - - -def test_groupby_datetime_multi_agg_multi_groupby(): - pdf = pd.DataFrame( - { - "a": pd.date_range( - datetime.datetime.now(), - 
datetime.datetime.now() + datetime.timedelta(9), - freq="D", - ), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "d": np.random.randint(0, 5, 10), - } - ) - gdf = cudf.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"]).agg(["sum", "max"]) - gdg = gdf.groupby(["a", "b"]).agg(["sum", "max"]) - - assert_groupby_results_equal(pdg, gdg) - - -@pytest.mark.parametrize( - "agg", - [ - ["min", "max", "count", "mean"], - ["mean", "var", "std"], - ["count", "mean", "var", "std"], - ], -) -def test_groupby_multi_agg_hash_groupby(agg): - alphabets = "abcdefghijklmnopqrstuvwxyz" - prefixes = alphabets[:10] - coll_dict = dict() - for prefix in prefixes: - for this_name in alphabets: - coll_dict[prefix + this_name] = float - coll_dict["id"] = int - gdf = cudf.datasets.timeseries( - start="2000", - end="2000-01-2", - dtypes=coll_dict, - freq="1s", - seed=1, - ).reset_index(drop=True) - pdf = gdf.to_pandas() - check_dtype = "count" not in agg - pdg = pdf.groupby("id").agg(agg) - gdg = gdf.groupby("id").agg(agg) - assert_groupby_results_equal(pdg, gdg, check_dtype=check_dtype) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="previous version of pandas throws a warning", -) -@pytest.mark.parametrize( - "agg", ["min", "max", "idxmax", "idxmin", "sum", "prod", "count", "mean"] -) -def test_groupby_nulls_basic(agg): - check_dtype = agg not in _index_type_aggs - - pdf = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": [1, 2, 1, 2, 1, None]}) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - getattr(pdf.groupby("a"), agg)(), - getattr(gdf.groupby("a"), agg)(), - check_dtype=check_dtype, - ) - - pdf = pd.DataFrame( - { - "a": [0, 0, 1, 1, 2, 2], - "b": [1, 2, 1, 2, 1, None], - "c": [1, 2, 1, None, 1, 2], - } - ) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - getattr(pdf.groupby("a"), agg)(), - getattr(gdf.groupby("a"), agg)(), - check_dtype=check_dtype, - ) - - pdf = pd.DataFrame( - { - "a": [0, 0, 1, 1, 2, 2], - "b": [1, 2, 1, 2, 1, None], - "c": [1, 2, None, None, 1, 2], - } - ) - gdf = cudf.from_pandas(pdf) - - # TODO: fillna() used here since we don't follow - # Pandas' null semantics. Should we change it? 
- - assert_groupby_results_equal( - getattr(pdf.groupby("a"), agg)().fillna(0), - getattr(gdf.groupby("a"), agg)().fillna(0 if agg != "prod" else 1), - check_dtype=check_dtype, - ) - - -def test_groupby_nulls_in_index(): - pdf = pd.DataFrame({"a": [None, 2, 1, 1], "b": [1, 2, 3, 4]}) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").sum(), gdf.groupby("a").sum() - ) - - -def test_groupby_all_nulls_index(): - gdf = cudf.DataFrame( - { - "a": cudf.Series([None, None, None, None], dtype="object"), - "b": [1, 2, 3, 4], - } - ) - pdf = gdf.to_pandas() - assert_groupby_results_equal( - pdf.groupby("a").sum(), gdf.groupby("a").sum() - ) - - gdf = cudf.DataFrame( - {"a": cudf.Series([np.nan, np.nan, np.nan, np.nan]), "b": [1, 2, 3, 4]} - ) - pdf = gdf.to_pandas() - assert_groupby_results_equal( - pdf.groupby("a").sum(), gdf.groupby("a").sum() - ) - - -@pytest.mark.parametrize("sort", [True, False]) -def test_groupby_sort(sort): - pdf = pd.DataFrame({"a": [2, 2, 1, 1], "b": [1, 2, 3, 4]}) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.groupby("a", sort=sort).sum(), - gdf.groupby("a", sort=sort).sum(), - check_like=not sort, - ) - - pdf = pd.DataFrame( - {"c": [-1, 2, 1, 4], "b": [1, 2, 3, 4], "a": [2, 2, 1, 1]} - ) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.groupby(["c", "b"], sort=sort).sum(), - gdf.groupby(["c", "b"], sort=sort).sum(), - check_like=not sort, - ) - - ps = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=[2, 2, 2, 3, 3, 1, 1, 1]) - gs = cudf.from_pandas(ps) - - assert_eq( - ps.groupby(level=0, sort=sort).sum().to_frame(), - gs.groupby(level=0, sort=sort).sum().to_frame(), - check_like=not sort, - ) - - ps = pd.Series( - [1, 2, 3, 4, 5, 6, 7, 8], - index=pd.MultiIndex.from_product([(1, 2), ("a", "b"), (42, 84)]), - ) - gs = cudf.from_pandas(ps) - - assert_eq( - ps.groupby(level=0, sort=sort).sum().to_frame(), - gs.groupby(level=0, sort=sort).sum().to_frame(), - check_like=not sort, - ) - - -def test_groupby_cat(): - pdf = pd.DataFrame( - {"a": [1, 1, 2], "b": pd.Series(["b", "b", "a"], dtype="category")} - ) - gdf = cudf.from_pandas(pdf) - assert_groupby_results_equal( - pdf.groupby("a").count(), - gdf.groupby("a").count(), - check_dtype=False, - ) - - -def test_groupby_index_type(): - df = cudf.DataFrame() - df["string_col"] = ["a", "b", "c"] - df["counts"] = [1, 2, 3] - res = df.groupby(by="string_col").counts.sum() - assert res.index.dtype == cudf.dtype("object") - - -@pytest.mark.parametrize( - "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] -) -@pytest.mark.parametrize("q", [0.25, 0.4, 0.5, 0.7, 1]) -def test_groupby_quantile(request, interpolation, q): - request.applymarker( - pytest.mark.xfail( - condition=(q == 0.5 and interpolation == "nearest"), - reason=( - "Pandas NaN Rounding will fail nearest interpolation at 0.5" - ), - ) - ) - - raw_data = { - "y": [None, 1, 2, 3, 4, None, 6, 7, 8, 9], - "x": [1, 2, 3, 1, 2, 2, 1, None, 3, 2], - } - # Pandas>0.25 now casts NaN in quantile operations as a float64 - # # so we are filling with zeros. 
- pdf = pd.DataFrame(raw_data).fillna(0) - gdf = DataFrame.from_pandas(pdf) - - pdg = pdf.groupby("x") - gdg = gdf.groupby("x") - - pdresult = pdg.quantile(q, interpolation=interpolation) - gdresult = gdg.quantile(q, interpolation=interpolation) - - assert_groupby_results_equal(pdresult, gdresult) - - -def test_groupby_std(): - raw_data = { - "x": [1, 2, 3, 1, 2, 2, 1, None, 3, 2], - "y": [None, 1, 2, 3, 4, None, 6, 7, 8, 9], - } - pdf = pd.DataFrame(raw_data) - gdf = DataFrame.from_pandas(pdf) - pdg = pdf.groupby("x") - gdg = gdf.groupby("x") - pdresult = pdg.std() - gdresult = gdg.std() - - assert_groupby_results_equal(pdresult, gdresult) - - -def test_groupby_size(): - pdf = pd.DataFrame( - { - "a": [1, 1, 3, 4], - "b": ["bob", "bob", "alice", "cooper"], - "c": [1, 2, 3, 4], - } - ) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").size(), - gdf.groupby("a").size(), - check_dtype=False, - ) - - assert_groupby_results_equal( - pdf.groupby(["a", "b", "c"]).size(), - gdf.groupby(["a", "b", "c"]).size(), - check_dtype=False, - ) - - sr = pd.Series(range(len(pdf))) - assert_groupby_results_equal( - pdf.groupby(sr).size(), - gdf.groupby(sr).size(), - check_dtype=False, - ) - - -@pytest.mark.parametrize("index", [None, [1, 2, 3, 4]]) -def test_groupby_cumcount(index): - pdf = pd.DataFrame( - { - "a": [1, 1, 3, 4], - "b": ["bob", "bob", "alice", "cooper"], - "c": [1, 2, 3, 4], - }, - index=index, - ) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").cumcount(), - gdf.groupby("a").cumcount(), - check_dtype=False, - ) - - assert_groupby_results_equal( - pdf.groupby(["a", "b", "c"]).cumcount(), - gdf.groupby(["a", "b", "c"]).cumcount(), - check_dtype=False, - ) - - sr = pd.Series(range(len(pdf)), index=index) - assert_groupby_results_equal( - pdf.groupby(sr).cumcount(), - gdf.groupby(sr).cumcount(), - check_dtype=False, - ) - - -@pytest.mark.parametrize("nelem", [2, 3, 1000]) -@pytest.mark.parametrize("as_index", [True, False]) -@pytest.mark.parametrize( - "agg", ["min", "max", "idxmin", "idxmax", "mean", "count"] -) -def test_groupby_datetime(nelem, as_index, agg): - if agg == "mean" and as_index is True: - return - check_dtype = agg not in ("mean", "count", "idxmin", "idxmax") - pdf = make_frame(pd.DataFrame, nelem=nelem, with_datetime=True) - gdf = make_frame(cudf.DataFrame, nelem=nelem, with_datetime=True) - pdg = pdf.groupby("datetime", as_index=as_index) - gdg = gdf.groupby("datetime", as_index=as_index) - if as_index is False: - pdres = getattr(pdg, agg)() - gdres = getattr(gdg, agg)() - else: - pdres = pdg.agg({"datetime": agg}) - gdres = gdg.agg({"datetime": agg}) - assert_groupby_results_equal( - pdres, - gdres, - check_dtype=check_dtype, - as_index=as_index, - by=["datetime"], - ) - - -def test_groupby_dropna(): - df = cudf.DataFrame({"a": [1, 1, None], "b": [1, 2, 3]}) - expect = cudf.DataFrame( - {"b": [3, 3]}, index=cudf.Series([1, None], name="a") - ) - got = df.groupby("a", dropna=False).sum() - assert_groupby_results_equal(expect, got) - - df = cudf.DataFrame( - {"a": [1, 1, 1, None], "b": [1, None, 1, None], "c": [1, 2, 3, 4]} - ) - idx = cudf.MultiIndex.from_frame( - df[["a", "b"]].drop_duplicates().sort_values(["a", "b"]), - names=["a", "b"], - ) - expect = cudf.DataFrame({"c": [4, 2, 4]}, index=idx) - got = df.groupby(["a", "b"], dropna=False).sum() - - assert_groupby_results_equal(expect, got) - - -def test_groupby_dropna_getattr(): - df = cudf.DataFrame() - df["id"] = [0, 1, 1, None, None, 3, 3] - df["val"] = 
[0, 1, 1, 2, 2, 3, 3] - got = df.groupby("id", dropna=False).val.sum() - - expect = cudf.Series( - [0, 2, 6, 4], name="val", index=cudf.Series([0, 1, 3, None], name="id") - ) - - assert_groupby_results_equal(expect, got) - - -def test_groupby_categorical_from_string(): - gdf = cudf.DataFrame() - gdf["id"] = ["a", "b", "c"] - gdf["val"] = [0, 1, 2] - gdf["id"] = gdf["id"].astype("category") - assert_groupby_results_equal( - cudf.DataFrame({"val": gdf["val"]}).set_index(keys=gdf["id"]), - gdf.groupby("id").sum(), - ) - - -def test_groupby_arbitrary_length_series(): - gdf = cudf.DataFrame({"a": [1, 1, 2], "b": [2, 3, 4]}, index=[4, 5, 6]) - gsr = cudf.Series([1.0, 2.0, 2.0], index=[3, 4, 5]) - - pdf = gdf.to_pandas() - psr = gsr.to_pandas() - - expect = pdf.groupby(psr).sum() - got = gdf.groupby(gsr).sum() - - assert_groupby_results_equal(expect, got) - - -def test_groupby_series_same_name_as_dataframe_column(): - gdf = cudf.DataFrame({"a": [1, 1, 2], "b": [2, 3, 4]}, index=[4, 5, 6]) - gsr = cudf.Series([1.0, 2.0, 2.0], name="a", index=[3, 4, 5]) - - pdf = gdf.to_pandas() - psr = gsr.to_pandas() - - expect = pdf.groupby(psr).sum() - got = gdf.groupby(gsr).sum() - - assert_groupby_results_equal(expect, got) - - -def test_group_by_series_and_column_name_in_by(): - gdf = cudf.DataFrame( - {"x": [1.0, 2.0, 3.0], "y": [1, 2, 1]}, index=[1, 2, 3] - ) - gsr0 = cudf.Series([0.0, 1.0, 2.0], name="a", index=[1, 2, 3]) - gsr1 = cudf.Series([0.0, 1.0, 3.0], name="b", index=[3, 4, 5]) - - pdf = gdf.to_pandas() - psr0 = gsr0.to_pandas() - psr1 = gsr1.to_pandas() - - expect = pdf.groupby(["x", psr0, psr1]).sum() - got = gdf.groupby(["x", gsr0, gsr1]).sum() - - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize( - "grouper", - [ - "a", - ["a"], - ["a", "b"], - np.array([0, 1, 1, 2, 3, 2]), - {0: "a", 1: "a", 2: "b", 3: "a", 4: "b", 5: "c"}, - lambda x: x + 1, - ["a", np.array([0, 1, 1, 2, 3, 2])], - ], -) -def test_grouping(grouper): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2, 3], - "b": [1, 2, 1, 2, 1, 2], - "c": [1, 2, 3, 4, 5, 6], - } - ) - gdf = cudf.from_pandas(pdf) - - for pdf_group, gdf_group in zip( - pdf.groupby(grouper), gdf.groupby(grouper) - ): - assert pdf_group[0] == gdf_group[0] - assert_eq(pdf_group[1], gdf_group[1]) - - -@pytest.mark.parametrize("agg", [lambda x: x.count(), "count"]) -@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) -def test_groupby_count(agg, by): - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]} - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.groupby(by).agg(agg) - got = gdf.groupby(by).agg(agg) - - assert_groupby_results_equal(expect, got, check_dtype=True) - - -@pytest.mark.parametrize("agg", [lambda x: x.median(), "median"]) -@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) -def test_groupby_median(agg, by): - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]} - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.groupby(by).agg(agg) - got = gdf.groupby(by).agg(agg) - - assert_groupby_results_equal(expect, got, check_dtype=False) - - -@pytest.mark.parametrize("agg", [lambda x: x.nunique(), "nunique"]) -@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) -def test_groupby_nunique(agg, by): - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 3], "b": [1, 2, 2, 2, 1], "c": [1, 2, None, 4, 5]} - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.groupby(by).nunique() - got = gdf.groupby(by).nunique() - - 
assert_groupby_results_equal(expect, got, check_dtype=False) - - -@pytest.mark.parametrize("dropna", [True, False]) -def test_nunique_dropna(dropna): - gdf = cudf.DataFrame( - { - "a": [1, 1, 2], - "b": [4, None, 5], - "c": [None, None, 7], - "d": [1, 1, 3], - } - ) - pdf = gdf.to_pandas() - - result = gdf.groupby("a")["b"].nunique(dropna=dropna) - expected = pdf.groupby("a")["b"].nunique(dropna=dropna) - assert_groupby_results_equal(result, expected, check_dtype=False) - - -@pytest.mark.parametrize( - "n", - [0, 1, 2, 10], -) -@pytest.mark.parametrize("by", ["a", ["a", "b"], ["a", "c"]]) -def test_groupby_nth(n, by): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 3], - "b": [1, 2, 2, 2, 1], - "c": [1, 2, None, 4, 5], - "d": ["a", "b", "c", "d", "e"], - } - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.groupby(by).nth(n) - got = gdf.groupby(by).nth(n) - - assert_groupby_results_equal(expect, got, check_dtype=False) - - -def test_raise_data_error(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) - gdf = cudf.from_pandas(pdf) - - assert_exceptions_equal( - pdf.groupby("a").mean, - gdf.groupby("a").mean, - ) - - -def test_multi_agg(): - gdf = cudf.DataFrame( - {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]} - ) - pdf = gdf.to_pandas() - assert_groupby_results_equal( - pdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}), - gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}), - ) - - -@pytest.mark.parametrize( - "agg", - ( - list(itertools.combinations(["count", "max", "min", "nunique"], 2)) - + [ - {"b": "min", "c": "mean"}, - {"b": "max", "c": "mean"}, - {"b": "count", "c": "mean"}, - {"b": "nunique", "c": "mean"}, - ] - ), -) -def test_groupby_agg_combinations(agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 2, 2, 3], - "b": ["a", "a", "b", "c", "d"], - "c": [1, 2, 3, 4, 5], - } - ) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").agg(agg), - gdf.groupby("a").agg(agg), - check_dtype=False, - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_apply_noempty_group(): - pdf = pd.DataFrame( - {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]} - ) - gdf = cudf.from_pandas(pdf) - - expect = ( - pdf.groupby("a", group_keys=False) - .apply(lambda x: x.iloc[[0, 1]], include_groups=False) - .reset_index(drop=True) - ) - got = ( - gdf.groupby("a") - .apply(lambda x: x.iloc[[0, 1]], include_groups=False) - .reset_index(drop=True) - ) - assert_groupby_results_equal(expect, got) - - -def test_reset_index_after_empty_groupby(): - # GH #5475 - pdf = pd.DataFrame({"a": [1, 2, 3]}) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").sum().reset_index(), - gdf.groupby("a").sum().reset_index(), - as_index=False, - by="a", - ) - - -def test_groupby_attribute_error(): - err_msg = "Test error message" - - class TestGroupBy(cudf.core.groupby.GroupBy): - @property - def _groupby(self): - raise AttributeError(err_msg) - - a = cudf.DataFrame({"a": [1, 2], "b": [2, 3]}) - gb = TestGroupBy(a, a["a"]) - - with pytest.raises(AttributeError, match=err_msg): - gb.sum() - - -@pytest.mark.parametrize( - "by", - [ - "a", - "b", - ["a"], - ["b"], - ["a", "b"], - ["b", "a"], - np.array([0, 0, 0, 1, 1, 1, 2]), - ], -) -def test_groupby_groups(by): - pdf = pd.DataFrame( - {"a": [1, 2, 1, 2, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6, 7]} - ) - gdf = cudf.from_pandas(pdf) - - pdg = pdf.groupby(by) - 
gdg = gdf.groupby(by) - - for key in pdg.groups: - assert key in gdg.groups - assert_eq(pdg.groups[key], gdg.groups[key]) - - -@pytest.mark.parametrize( - "by", - [ - "a", - "b", - ["a"], - ["b"], - ["a", "b"], - ["b", "a"], - ["a", "c"], - ["a", "b", "c"], - ], -) -def test_groupby_groups_multi(by): - pdf = pd.DataFrame( - { - "a": [1, 2, 1, 2, 1, 2, 3], - "b": ["a", "b", "a", "b", "b", "c", "c"], - "c": [1, 2, 3, 4, 5, 6, 7], - } - ) - gdf = cudf.from_pandas(pdf) - - pdg = pdf.groupby(by) - gdg = gdf.groupby(by) - - for key in pdg.groups: - assert key in gdg.groups - assert_eq(pdg.groups[key], gdg.groups[key]) - - -def test_groupby_nunique_series(): - pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, 3, 1, 1, 2]}) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a")["b"].nunique(), - gdf.groupby("a")["b"].nunique(), - check_dtype=False, - ) - - -@pytest.mark.parametrize("list_agg", [list, "collect"]) -def test_groupby_list_simple(list_agg): - pdf = pd.DataFrame({"a": [1, 1, 1, 2, 2, 2], "b": [1, 2, None, 4, 5, 6]}) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").agg({"b": list}), - gdf.groupby("a").agg({"b": list_agg}), - check_dtype=False, - ) - - -@pytest.mark.parametrize("list_agg", [list, "collect"]) -def test_groupby_list_of_lists(list_agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2, 2], - "b": [[1, 2], [3, None, 5], None, [], [7, 8], [9]], - } - ) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").agg({"b": list}), - gdf.groupby("a").agg({"b": list_agg}), - check_dtype=False, - ) - - -@pytest.mark.parametrize("list_agg", [list, "collect"]) -def test_groupby_list_of_structs(list_agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2, 2], - "b": [ - {"c": "1", "d": 1}, - {"c": "2", "d": 2}, - {"c": "3", "d": 3}, - {"c": "4", "d": 4}, - {"c": "5", "d": 5}, - {"c": "6", "d": 6}, - ], - } - ) - gdf = cudf.from_pandas(pdf) - grouped = gdf.groupby("a").agg({"b": list_agg}) - assert_groupby_results_equal( - pdf.groupby("a").agg({"b": list}), - grouped, - check_dtype=True, - ) - assert grouped["b"].dtype.element_type == gdf["b"].dtype - - -@pytest.mark.parametrize("list_agg", [list, "collect"]) -def test_groupby_list_single_element(list_agg): - pdf = pd.DataFrame({"a": [1, 2], "b": [3, None]}) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").agg({"b": list}), - gdf.groupby("a").agg({"b": list_agg}), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "agg", [list, [list, "count"], {"b": list, "c": "sum"}] -) -def test_groupby_list_strings(agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2], - "b": ["b", "a", None, "e", "d"], - "c": [1, 2, 3, 4, 5], - } - ) - gdf = cudf.from_pandas(pdf) - - assert_groupby_results_equal( - pdf.groupby("a").agg(agg), - gdf.groupby("a").agg(agg), - check_dtype=False, - ) - - -def test_groupby_list_columns_excluded(): - pdf = pd.DataFrame( - { - "a": [1, 1, 2, 2], - "b": [1, 2, 3, 4], - "c": [[1, 2], [3, 4], [5, 6], [7, 8]], - } - ) - gdf = cudf.from_pandas(pdf) - - pandas_result = pdf.groupby("a").mean(numeric_only=True) - pandas_agg_result = pdf.groupby("a").agg("mean", numeric_only=True) - - assert_groupby_results_equal( - pandas_result, - gdf.groupby("a").mean(numeric_only=True), - check_dtype=False, - ) - - assert_groupby_results_equal( - pandas_agg_result, - gdf.groupby("a").agg("mean"), - check_dtype=False, - ) - - -def test_groupby_pipe(): - pdf = pd.DataFrame({"A": "a b a b".split(), "B": [1, 2, 3, 4]}) 
- gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("A").pipe(lambda x: x.max() - x.min()) - actual = gdf.groupby("A").pipe(lambda x: x.max() - x.min()) - - assert_groupby_results_equal(expected, actual) - - -def create_test_groupby_apply_return_scalars_params(): - def f0(x): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = ticker / 10 - return full - - def f1(x, k): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = ticker / k - return full - - def f2(x, k, L): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = L * (ticker / k) - return full - - def f3(x, k, L, m): - x = x[~x["B"].isna()] - ticker = x.shape[0] - full = L * (ticker / k) % m - return full - - return [(f0, ()), (f1, (42,)), (f2, (42, 119)), (f3, (42, 119, 212.1))] - - -@pytest.mark.parametrize( - "func,args", create_test_groupby_apply_return_scalars_params() -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_groupby_apply_return_scalars(func, args): - pdf = pd.DataFrame( - { - "A": [1, 1, 2, 2, 3, 3, 4, 4, 5, 5], - "B": [ - 0.01, - np.nan, - 0.03, - 0.04, - np.nan, - 0.06, - 0.07, - 0.08, - 0.09, - 1.0, - ], - } - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("A").apply(func, *args, include_groups=False) - actual = gdf.groupby("A").apply(func, *args, include_groups=False) - - assert_groupby_results_equal(expected, actual) - - -def create_test_groupby_apply_return_series_dataframe_params(): - def f0(x): - return x - x.max() - - def f1(x): - return x.min() - x.max() - - def f2(x): - return x.min() - - def f3(x, k): - return x - x.max() + k - - def f4(x, k, L): - return x.min() - x.max() + (k / L) - - def f5(x, k, L, m): - return m * x.min() + (k / L) - - return [ - (f0, ()), - (f1, ()), - (f2, ()), - (f3, (42,)), - (f4, (42, 119)), - (f5, (41, 119, 212.1)), - ] - - -@pytest.mark.parametrize( - "func,args", create_test_groupby_apply_return_series_dataframe_params() -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Include groups missing on old versions of pandas", -) -def test_groupby_apply_return_series_dataframe(func, args): - pdf = pd.DataFrame( - {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]} - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby(["key"], group_keys=False).apply( - func, *args, include_groups=False - ) - actual = gdf.groupby(["key"]).apply(func, *args, include_groups=False) - - assert_groupby_results_equal(expected, actual) - - -@pytest.mark.parametrize( - "pdf", - [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], -) -def test_groupby_no_keys(pdf): - gdf = cudf.from_pandas(pdf) - if isinstance(pdf, pd.DataFrame): - kwargs = {"check_column_type": False} - else: - kwargs = {} - assert_groupby_results_equal( - pdf.groupby([]).max(), - gdf.groupby([]).max(), - check_dtype=False, - check_index_type=False, # Int64 v/s Float64 - **kwargs, - ) - - -@pytest.mark.parametrize( - "pdf", - [pd.DataFrame(), pd.DataFrame({"a": []}), pd.Series([], dtype="float64")], -) -def test_groupby_apply_no_keys(pdf): - gdf = cudf.from_pandas(pdf) - if isinstance(pdf, pd.DataFrame): - kwargs = {"check_column_type": False} - else: - kwargs = {} - assert_groupby_results_equal( - pdf.groupby([], group_keys=False).apply(lambda x: x.max()), - gdf.groupby([]).apply(lambda x: x.max()), - check_index_type=False, # Int64 v/s Float64 - **kwargs, - ) - - -@pytest.mark.parametrize( - "pdf", - [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [1, 2], 
"b": [2, 3]})], -) -def test_groupby_nonempty_no_keys(pdf): - gdf = cudf.from_pandas(pdf) - assert_exceptions_equal( - lambda: pdf.groupby([]), - lambda: gdf.groupby([]), - ) - - -@pytest.mark.parametrize( - "by,data", - [ - # ([], []), # error? - ([1, 1, 2, 2], [0, 0, 1, 1]), - ([1, 2, 3, 4], [0, 0, 0, 0]), - ([1, 2, 1, 2], [0, 1, 1, 1]), - ], -) -@pytest.mark.parametrize( - "dtype", - SIGNED_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["string", "category"], -) -def test_groupby_unique(by, data, dtype): - pdf = pd.DataFrame({"by": by, "data": data}) - pdf["data"] = pdf["data"].astype(dtype) - gdf = cudf.from_pandas(pdf) - - expect = pdf.groupby("by")["data"].unique() - got = gdf.groupby("by")["data"].unique() - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -@pytest.mark.parametrize( - "func", ["cummin", "cummax", "cumcount", "cumsum", "cumprod"] -) -def test_groupby_2keys_scan(nelem, func): - pdf = make_frame(pd.DataFrame, nelem=nelem) - expect_df = pdf.groupby(["x", "y"], sort=True).agg(func) - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) - # pd.groupby.cumcount returns a series. - if isinstance(expect_df, pd.Series): - expect_df = expect_df.to_frame("val") - - check_dtype = func not in _index_type_aggs - assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) - - -@pytest.mark.parametrize("nelem", [100, 1000]) -@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) -@pytest.mark.parametrize("pct", [False, True]) -def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct): - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - pdf = t.to_pandas() - pdf.columns = ["x", "y", "z"] - gdf = cudf.from_pandas(pdf) - expect_df = pdf.groupby(["x", "y"], sort=True).rank( - method=method, ascending=ascending, na_option=na_option, pct=pct - ) - got_df = gdf.groupby(["x", "y"], sort=True).rank( - method=method, ascending=ascending, na_option=na_option, pct=pct - ) - - assert_groupby_results_equal(got_df, expect_df, check_dtype=False) - - -def test_groupby_rank_fails(): - gdf = cudf.DataFrame( - {"x": [1, 2, 3, 4], "y": [1, 2, 3, 4], "z": [1, 2, 3, 4]} - ) - with pytest.raises(NotImplementedError): - gdf.groupby(["x", "y"]).rank(method="min", axis=1) - gdf = cudf.DataFrame( - { - "a": [1, 1, 1, 2, 2, 2], - "b": [[1, 2], [3, None, 5], None, [], [7, 8], [9]], - } - ) - with pytest.raises(NotImplementedError): - gdf.groupby(["a"]).rank(method="min", axis=1) - - -@pytest.mark.parametrize( - "with_nan", [False, True], ids=["just-NA", "also-NaN"] -) -@pytest.mark.parametrize("dropna", [False, True], ids=["keepna", "dropna"]) -@pytest.mark.parametrize( - "duplicate_index", [False, True], ids=["rangeindex", "dupindex"] -) -def test_groupby_scan_null_keys(with_nan, dropna, duplicate_index): - key_col = [None, 1, 2, None, 3, None, 3, 1, None, 1] - if with_nan: - df = pd.DataFrame( - {"key": pd.Series(key_col, dtype="float32"), "value": range(10)} - ) - else: - df = pd.DataFrame( - {"key": pd.Series(key_col, dtype="Int32"), "value": range(10)} - ) - - if duplicate_index: - # Non-default index with duplicates - 
df.index = [1, 2, 3, 1, 3, 2, 4, 1, 6, 10] - - cdf = cudf.from_pandas(df) - - expect = df.groupby("key", dropna=dropna).cumsum() - got = cdf.groupby("key", dropna=dropna).cumsum() - assert_eq(expect, got) - - -def test_groupby_mix_agg_scan(): - err_msg = "Cannot perform both aggregation and scan in one operation" - func = ["cumsum", "sum"] - gb = make_frame(DataFrame, nelem=10).groupby(["x", "y"], sort=True) - - gb.agg(func[0]) - gb.agg(func[1]) - gb.agg(func[1:]) - with pytest.raises(NotImplementedError, match=err_msg): - gb.agg(func) - - -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -@pytest.mark.parametrize("fill_value", [None, np.nan, 42]) -def test_groupby_shift_row(nelem, shift_perc, direction, fill_value): - pdf = make_frame(pd.DataFrame, nelem=nelem, extra_vals=["val2"]) - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - expected = pdf.groupby(["x", "y"]).shift( - periods=n_shift, fill_value=fill_value - ) - got = gdf.groupby(["x", "y"]).shift(periods=n_shift, fill_value=fill_value) - - assert_groupby_results_equal( - expected[["val", "val2"]], got[["val", "val2"]] - ) - - -@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -@pytest.mark.parametrize( - "fill_value", - [ - None, - pytest.param( - 0, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/10608" - ), - ), - pytest.param( - 42, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/10608" - ), - ), - ], -) -def test_groupby_shift_row_mixed_numerics( - nelem, shift_perc, direction, fill_value -): - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - expected = pdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) - got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] - ) - - -# TODO: Shifting list columns is currently unsupported because we cannot -# construct a null list scalar in python. Support once it is added. 
-@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -def test_groupby_shift_row_mixed(nelem, shift_perc, direction): - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - expected = pdf.groupby(["0"]).shift(periods=n_shift) - got = gdf.groupby(["0"]).shift(periods=n_shift) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] - ) - - -@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -@pytest.mark.parametrize( - "fill_value", - [ - [ - 42, - "fill", - np.datetime64(123, "ns"), - cudf.Scalar(456, dtype="timedelta64[ns]"), - ] - ], -) -def test_groupby_shift_row_mixed_fill( - nelem, shift_perc, direction, fill_value -): - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - # Pandas does not support specifying different fill_value by column, so we - # simulate it column by column - expected = pdf.copy() - for col, single_fill in zip(pdf.iloc[:, 1:], fill_value): - if isinstance(single_fill, cudf.Scalar): - single_fill = single_fill._host_value - expected[col] = ( - pdf[col] - .groupby(pdf["0"]) - .shift(periods=n_shift, fill_value=single_fill) - ) - - got = gdf.groupby(["0"]).shift(periods=n_shift, fill_value=fill_value) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] - ) - - -@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) -@pytest.mark.parametrize("fill_value", [None, 0, 42]) -def test_groupby_shift_row_zero_shift(nelem, fill_value): - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - gdf = cudf.from_pandas(t.to_pandas()) - - expected = gdf - got = gdf.groupby(["0"]).shift(periods=0, fill_value=fill_value) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] - ) - - -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -def test_groupby_diff_row(nelem, shift_perc, 
direction): - pdf = make_frame(pd.DataFrame, nelem=nelem, extra_vals=["val2"]) - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - expected = pdf.groupby(["x", "y"]).diff(periods=n_shift) - got = gdf.groupby(["x", "y"]).diff(periods=n_shift) - - assert_groupby_results_equal( - expected[["val", "val2"]], got[["val", "val2"]] - ) - - -@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) -@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) -@pytest.mark.parametrize("direction", [1, -1]) -def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction): - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - n_shift = int(nelem * shift_perc) * direction - - expected = pdf.groupby(["0"]).diff(periods=n_shift) - got = gdf.groupby(["0"]).diff(periods=n_shift) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4", "5"]], got[["1", "2", "3", "4", "5"]] - ) - - -@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) -def test_groupby_diff_row_zero_shift(nelem): - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - gdf = cudf.from_pandas(t.to_pandas()) - - expected = gdf - got = gdf.groupby(["0"]).shift(periods=0) - - assert_groupby_results_equal( - expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] - ) - - -# TODO: test for category columns when cudf.Scalar supports category type -@pytest.mark.parametrize("nelem", [10, 100, 1000]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -def test_groupby_fillna_multi_value(nelem): - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ms]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - key_col = "0" - value_cols = ["1", "2", "3", "4", "5", "6"] - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - - # fill the dataframe with the first non-null item in the column - fill_values = { - name: pdf[name].loc[pdf[name].first_valid_index()] - for name in value_cols - } - # cudf can't fillna with a pandas.Timedelta type - fill_values["4"] = fill_values["4"].to_numpy() - with pytest.warns(FutureWarning): - expect = 
pdf.groupby(key_col).fillna(value=fill_values) - with pytest.warns(FutureWarning): - got = gdf.groupby(key_col).fillna(value=fill_values) - - assert_groupby_results_equal(expect[value_cols], got[value_cols]) - - -# TODO: test for category columns when cudf.Scalar supports category type -# TODO: cudf.fillna does not support decimal column to column fill yet -@pytest.mark.parametrize("nelem", [10, 100, 1000]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -def test_groupby_fillna_multi_value_df(nelem): - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ms]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - key_col = "0" - value_cols = ["1", "2", "3", "4", "5"] - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - - # fill the dataframe with the first non-null item in the column - fill_values = { - name: pdf[name].loc[pdf[name].first_valid_index()] - for name in value_cols - } - # cudf can't fillna with a pandas.Timedelta type - fill_values["4"] = fill_values["4"].to_numpy() - fill_values = pd.DataFrame(fill_values, index=pdf.index) - with pytest.warns(FutureWarning): - expect = pdf.groupby(key_col).fillna(value=fill_values) - - fill_values = cudf.from_pandas(fill_values) - with pytest.warns(FutureWarning): - got = gdf.groupby(key_col).fillna(value=fill_values) - - assert_groupby_results_equal(expect[value_cols], got[value_cols]) - - -@pytest.mark.parametrize( - "by", - [pd.Series([1, 1, 2, 2, 3, 4]), lambda x: x % 2 == 0, pd.Grouper(level=0)], -) -@pytest.mark.parametrize( - "data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]] -) -@pytest.mark.parametrize("args", [{"value": 42}, {"method": "ffill"}]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -def test_groupby_various_by_fillna(by, data, args): - ps = pd.Series(data) - gs = cudf.from_pandas(ps) - - with pytest.warns(FutureWarning): - expect = ps.groupby(by).fillna(**args) - if isinstance(by, pd.Grouper): - by = cudf.Grouper(level=by.level) - with pytest.warns(FutureWarning): - got = gs.groupby(by).fillna(**args) - - assert_groupby_results_equal(expect, got, check_dtype=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize("nelem", [10, 100, 1000]) -@pytest.mark.parametrize("method", ["ffill", "bfill"]) -def test_groupby_fillna_method(nelem, method): - t = rand_dataframe( - dtypes_meta=[ - {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, - {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, - { - "dtype": "datetime64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "timedelta64[ns]", - "null_frequency": 0.4, - "cardinality": 10, - }, - { - "dtype": "list", - "null_frequency": 0.4, - "cardinality": 10, - "lists_max_length": 10, - "nesting_max_depth": 3, - "value_type": "int64", - }, - {"dtype": "category", "null_frequency": 0.4, 
"cardinality": 10}, - {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, - {"dtype": "str", "null_frequency": 0.4, "cardinality": 10}, - ], - rows=nelem, - use_threads=False, - seed=0, - ) - key_col = "0" - value_cols = ["1", "2", "3", "4", "5", "6", "7", "8"] - pdf = t.to_pandas() - gdf = cudf.from_pandas(pdf) - - with pytest.warns(FutureWarning): - expect = pdf.groupby(key_col).fillna(method=method) - with pytest.warns(FutureWarning): - got = gdf.groupby(key_col).fillna(method=method) - - assert_groupby_results_equal( - expect[value_cols], got[value_cols], sort=False - ) - - -@pytest.mark.parametrize( - "data", - [ - {"Speed": [380.0, 370.0, 24.0, 26.0], "Score": [50, 30, 90, 80]}, - { - "Speed": [380.0, 370.0, 24.0, 26.0], - "Score": [50, 30, 90, 80], - "Other": [10, 20, 30, 40], - }, - ], -) -@pytest.mark.parametrize("group", ["Score", "Speed"]) -def test_groupby_describe(data, group): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - got = gdf.groupby(group).describe() - expect = pdf.groupby(group).describe() - - assert_groupby_results_equal(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [], "b": []}, - {"a": [2, 1, 2, 1, 1, 3], "b": [None, 1, 2, None, 2, None]}, - {"a": [None], "b": [None]}, - {"a": [2, 1, 1], "b": [None, 1, 0], "c": [None, 0, 1]}, - ], -) -@pytest.mark.parametrize("agg", ["first", "last", ["first", "last"]]) -def test_groupby_first(data, agg): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - expect = pdf.groupby("a").agg(agg) - got = gdf.groupby("a").agg(agg) - assert_groupby_results_equal(expect, got, check_dtype=False) - - -def test_groupby_apply_series(): - def foo(x): - return x.sum() - - got = make_frame(DataFrame, 100).groupby("x").y.apply(foo) - expect = make_frame(pd.DataFrame, 100).groupby("x").y.apply(foo) - - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize( - "func,args", - [ - (lambda x, k: x + k, (42,)), - (lambda x, k, L: x + k - L, (42, 191)), - (lambda x, k, L, m: (x + k) / (L * m), (42, 191, 99.9)), - ], -) -def test_groupby_apply_series_args(func, args): - got = make_frame(DataFrame, 100).groupby("x").y.apply(func, *args) - expect = ( - make_frame(pd.DataFrame, 100) - .groupby("x", group_keys=False) - .y.apply(func, *args) - ) - - assert_groupby_results_equal(expect, got) - - -@pytest.mark.parametrize("label", [None, "left", "right"]) -@pytest.mark.parametrize("closed", [None, "left", "right"]) -def test_groupby_freq_week(label, closed): - pdf = pd.DataFrame( - { - "Publish date": [ - pd.Timestamp("2000-01-03"), - pd.Timestamp("2000-01-01"), - pd.Timestamp("2000-01-09"), - pd.Timestamp("2000-01-02"), - pd.Timestamp("2000-01-07"), - pd.Timestamp("2000-01-16"), - ], - "ID": [0, 1, 2, 3, 4, 5], - "Price": [10, 20, 30, 40, 50, 60], - } - ) - gdf = cudf.from_pandas(pdf) - expect = pdf.groupby( - pd.Grouper(key="Publish date", freq="1W", label=label, closed=closed) - ).mean() - got = gdf.groupby( - cudf.Grouper(key="Publish date", freq="1W", label=label, closed=closed) - ).mean() - assert_eq( - expect, - got, - check_like=True, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize("label", [None, "left", "right"]) -@pytest.mark.parametrize("closed", [None, "left", "right"]) -def test_groupby_freq_day(label, closed): - pdf = pd.DataFrame( - { - "Publish date": [ - pd.Timestamp("2000-01-03"), - pd.Timestamp("2000-01-01"), - pd.Timestamp("2000-01-09"), - pd.Timestamp("2000-01-02"), - pd.Timestamp("2000-01-07"), - 
pd.Timestamp("2000-01-16"), - ], - "ID": [0, 1, 2, 3, 4, 5], - "Price": [10, 20, 30, 40, 50, 60], - } - ) - gdf = cudf.from_pandas(pdf) - expect = pdf.groupby( - pd.Grouper(key="Publish date", freq="3D", label=label, closed=closed) - ).mean() - got = gdf.groupby( - cudf.Grouper(key="Publish date", freq="3D", label=label, closed=closed) - ).mean() - assert_eq( - expect, - got, - check_like=True, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize("label", [None, "left", "right"]) -@pytest.mark.parametrize("closed", [None, "left", "right"]) -def test_groupby_freq_min(label, closed): - pdf = pd.DataFrame( - { - "Publish date": [ - pd.Timestamp("2000-01-01 12:01:00"), - pd.Timestamp("2000-01-01 12:05:00"), - pd.Timestamp("2000-01-01 15:30:00"), - pd.Timestamp("2000-01-02 00:00:00"), - pd.Timestamp("2000-01-01 23:47:00"), - pd.Timestamp("2000-01-02 00:05:00"), - ], - "ID": [0, 1, 2, 3, 4, 5], - "Price": [10, 20, 30, 40, 50, 60], - } - ) - gdf = cudf.from_pandas(pdf) - expect = pdf.groupby( - pd.Grouper(key="Publish date", freq="1h", label=label, closed=closed) - ).mean() - got = gdf.groupby( - cudf.Grouper(key="Publish date", freq="1h", label=label, closed=closed) - ).mean() - assert_eq( - expect, - got, - check_like=True, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize("label", [None, "left", "right"]) -@pytest.mark.parametrize("closed", [None, "left", "right"]) -def test_groupby_freq_s(label, closed): - pdf = pd.DataFrame( - { - "Publish date": [ - pd.Timestamp("2000-01-01 00:00:02"), - pd.Timestamp("2000-01-01 00:00:07"), - pd.Timestamp("2000-01-01 00:00:02"), - pd.Timestamp("2000-01-02 00:00:15"), - pd.Timestamp("2000-01-01 00:00:05"), - pd.Timestamp("2000-01-02 00:00:09"), - ], - "ID": [0, 1, 2, 3, 4, 5], - "Price": [10, 20, 30, 40, 50, 60], - } - ) - gdf = cudf.from_pandas(pdf) - expect = pdf.groupby( - pd.Grouper(key="Publish date", freq="3s", label=label, closed=closed) - ).mean() - got = gdf.groupby( - cudf.Grouper(key="Publish date", freq="3s", label=label, closed=closed) - ).mean() - assert_eq( - expect, - got, - check_like=True, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - "pdf, group, name, obj", - [ - ( - pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), - "X", - "A", - None, - ), - ( - pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), - "X", - "B", - None, - ), - ( - pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), - "X", - "A", - pd.DataFrame({"a": [1, 2, 4, 5, 10, 11]}), - ), - ( - pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), - "Y", - 1, - pd.DataFrame({"a": [1, 2, 4, 5, 10, 11]}), - ), - ( - pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}), - "Y", - 3, - pd.DataFrame({"a": [1, 2, 0, 11]}), - ), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warnings only given on newer versions.", -) -def test_groupby_get_group(pdf, group, name, obj): - gdf = cudf.from_pandas(pdf) - - if isinstance(obj, pd.DataFrame): - gobj = cudf.from_pandas(obj) - else: - gobj = obj - - pgb = pdf.groupby(group) - ggb = gdf.groupby(group) - with expect_warning_if(obj is not None): - expected = pgb.get_group(name=name, obj=obj) - with expect_warning_if(obj is not None): - actual = ggb.get_group(name=name, obj=gobj) - - assert_groupby_results_equal(expected, actual) - - expected = pdf.iloc[pgb.indices.get(name)] - actual = gdf.iloc[ggb.indices.get(name)] - - assert_eq(expected, actual) - - 
-@pytest.mark.parametrize( - "by", - [ - "a", - ["a", "b"], - pd.Series([2, 1, 1, 2, 2]), - pd.Series(["b", "a", "a", "b", "b"]), - ], -) -@pytest.mark.parametrize("agg", ["sum", "mean", lambda df: df.mean()]) -def test_groupby_transform_aggregation(by, agg): - gdf = cudf.DataFrame( - {"a": [2, 2, 1, 2, 1], "b": [1, 1, 1, 2, 2], "c": [1, 2, 3, 4, 5]} - ) - pdf = gdf.to_pandas() - - expected = pdf.groupby(by).transform(agg) - actual = gdf.groupby(by).transform(agg) - - assert_groupby_results_equal(expected, actual) - - -def test_groupby_select_then_ffill(): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 2, 2], - "b": [1, None, None, 2, None], - "c": [3, None, None, 4, None], - } - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("a")["c"].ffill() - actual = gdf.groupby("a")["c"].ffill() - - assert_groupby_results_equal(expected, actual) - - -def test_groupby_select_then_shift(): - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5], "c": [3, 4, 5, 6, 7]} - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("a")["c"].shift(1) - actual = gdf.groupby("a")["c"].shift(1) - - assert_groupby_results_equal(expected, actual) - - -def test_groupby_select_then_diff(): - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5], "c": [3, 4, 5, 6, 7]} - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.groupby("a")["c"].diff(1) - actual = gdf.groupby("a")["c"].diff(1) - - assert_groupby_results_equal(expected, actual) - - -# TODO: Add a test including datetime64[ms] column in input data - - -@pytest.mark.parametrize("by", ["a", ["a", "b"], pd.Series([1, 2, 1, 3])]) -def test_groupby_transform_maintain_index(by): - # test that we maintain the index after a groupby transform - gdf = cudf.DataFrame( - {"a": [1, 1, 1, 2], "b": [1, 2, 1, 2]}, index=[3, 2, 1, 0] - ) - pdf = gdf.to_pandas() - assert_groupby_results_equal( - pdf.groupby(by).transform("max"), gdf.groupby(by).transform("max") - ) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data, gkey", - [ - ( - { - "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], - "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], - "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], - "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], - }, - ["id"], - ), - ( - { - "id": [0, 0, 0, 0, 1, 1, 1], - "a": [1, 3, 4, 2.0, -3.0, 9.0, 10.0], - "b": [10.0, 23, -4.0, 2, -3.0, None, 19.0], - }, - ["id", "a"], - ), - ( - { - "id": ["a", "a", "b", "b", "c", "c"], - "val1": [None, None, None, None, None, None], - }, - ["id"], - ), - ], -) -@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) -@pytest.mark.parametrize("fill_method", ["ffill", "bfill", no_default, None]) -def test_groupby_pct_change(data, gkey, periods, fill_method): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - with expect_warning_if(fill_method not in (no_default, None)): - actual = gdf.groupby(gkey).pct_change( - periods=periods, fill_method=fill_method - ) - with expect_warning_if( - ( - fill_method not in (no_default, None) - or (fill_method is not None and pdf.isna().any().any()) - ) - ): - expected = pdf.groupby(gkey).pct_change( - periods=periods, fill_method=fill_method - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("periods", [-5, 5]) -def test_groupby_pct_change_multiindex_dataframe(periods): - gdf = cudf.DataFrame( - { - "a": [1, 1, 2, 2], - "b": [1, 1, 2, 3], - "c": [2, 3, 4, 5], - "d": [6, 8, 9, 1], - } - ).set_index(["a", "b"]) - - actual = 
gdf.groupby(level=["a", "b"]).pct_change(periods) - expected = gdf.to_pandas().groupby(level=["a", "b"]).pct_change(periods) - - assert_eq(expected, actual) - - -def test_groupby_pct_change_empty_columns(): - gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) - pdf = gdf.to_pandas() - - actual = gdf.groupby("id").pct_change() - expected = pdf.groupby("id").pct_change() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("group_keys", [None, True, False]) -@pytest.mark.parametrize("by", ["A", ["A", "B"]]) -def test_groupby_group_keys(group_keys, by): - gdf = cudf.DataFrame( - { - "A": "a a a a b b".split(), - "B": [1, 1, 2, 2, 3, 3], - "C": [4, 6, 5, 9, 8, 7], - } - ) - pdf = gdf.to_pandas() - - g_group = gdf.groupby(by, group_keys=group_keys) - p_group = pdf.groupby(by, group_keys=group_keys) - - actual = g_group[["B", "C"]].apply(lambda x: x / x.sum()) - expected = p_group[["B", "C"]].apply(lambda x: x / x.sum()) - assert_eq(actual, expected) - - -@pytest.fixture -def df_ngroup(): - df = cudf.DataFrame( - { - "a": [2, 2, 1, 1, 2, 3], - "b": [1, 2, 1, 2, 1, 2], - "c": ["a", "a", "b", "c", "d", "c"], - }, - index=[1, 3, 5, 7, 4, 2], - ) - df.index.name = "foo" - return df - - -@pytest.mark.parametrize( - "by", - [ - lambda: "a", - lambda: "b", - lambda: ["a", "b"], - lambda: "c", - lambda: pd.Series([1, 2, 1, 2, 1, 2]), - lambda: pd.Series(["x", "y", "y", "x", "z", "x"]), - ], -) -@pytest.mark.parametrize("ascending", [True, False]) -def test_groupby_ngroup(by, ascending, df_ngroup): - by = by() - expected = df_ngroup.to_pandas().groupby(by).ngroup(ascending=ascending) - actual = df_ngroup.groupby(by).ngroup(ascending=ascending) - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] -) -def test_groupby_dtypes(groups): - df = cudf.DataFrame( - {"a": [1, 2, 3, 3], "b": ["x", "y", "z", "a"], "c": [10, 11, 12, 12]} - ) - pdf = df.to_pandas() - with pytest.warns(FutureWarning): - expected = pdf.groupby(groups).dtypes - with pytest.warns(FutureWarning): - actual = df.groupby(groups).dtypes - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("index_names", ["a", "b", "c", ["b", "c"]]) -def test_groupby_by_index_names(index_names): - gdf = cudf.DataFrame( - {"a": [1, 2, 3, 4], "b": ["a", "b", "a", "a"], "c": [1, 1, 2, 1]} - ).set_index(index_names) - pdf = gdf.to_pandas() - - assert_groupby_results_equal( - pdf.groupby(index_names).min(), gdf.groupby(index_names).min() - ) - - -@pytest.mark.parametrize( - "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] -) -def test_group_by_pandas_compat(groups): - with cudf.option_context("mode.pandas_compatible", True): - df = cudf.DataFrame( - { - "a": [1, 3, 2, 3, 3], - "b": ["x", "a", "y", "z", "a"], - "c": [10, 13, 11, 12, 12], - } - ) - pdf = df.to_pandas() - - assert_eq(pdf.groupby(groups).max(), df.groupby(groups).max()) - - -class TestSample: - @pytest.fixture(params=["default", "rangeindex", "intindex", "strindex"]) - def index(self, request): - n = 12 - if request.param == "rangeindex": - return cudf.RangeIndex(2, n + 2) - elif request.param == "intindex": - return cudf.Index( - [2, 3, 4, 1, 0, 5, 6, 8, 7, 9, 10, 13], dtype="int32" - ) - elif request.param == "strindex": - return cudf.Index(list(string.ascii_lowercase[:n])) - elif request.param == "default": - return None - - @pytest.fixture( - params=[ - ["a", 
"a", "b", "b", "c", "c", "c", "d", "d", "d", "d", "d"], - [1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4], - ], - ids=["str-group", "int-group"], - ) - def df(self, index, request): - return cudf.DataFrame( - {"a": request.param, "b": request.param, "v": request.param}, - index=index, - ) - - @pytest.fixture(params=["a", ["a", "b"]], ids=["single-col", "two-col"]) - def by(self, request): - return request.param - - def expected(self, df, *, n=None, frac=None): - value_counts = collections.Counter(df.a.values_host) - if n is not None: - values = list( - itertools.chain.from_iterable( - itertools.repeat(v, n) for v in value_counts.keys() - ) - ) - elif frac is not None: - values = list( - itertools.chain.from_iterable( - itertools.repeat(v, round(count * frac)) - for v, count in value_counts.items() - ) - ) - else: - raise ValueError("Must provide either n or frac") - values = cudf.Series(sorted(values), dtype=df.a.dtype) - return cudf.DataFrame({"a": values, "b": values, "v": values}) - - @pytest.mark.parametrize("n", [None, 0, 1, 2]) - def test_constant_n_no_replace(self, df, by, n): - result = df.groupby(by).sample(n=n).sort_values("a") - n = 1 if n is None else n - assert_eq(self.expected(df, n=n), result.reset_index(drop=True)) - - def test_constant_n_no_replace_too_large_raises(self, df): - with pytest.raises(ValueError): - df.groupby("a").sample(n=3) - - @pytest.mark.parametrize("n", [1, 2, 3]) - def test_constant_n_replace(self, df, by, n): - result = df.groupby(by).sample(n=n, replace=True).sort_values("a") - assert_eq(self.expected(df, n=n), result.reset_index(drop=True)) - - def test_invalid_arguments(self, df): - with pytest.raises(ValueError): - df.groupby("a").sample(n=1, frac=0.1) - - def test_not_implemented_arguments(self, df): - with pytest.raises(NotImplementedError): - # These are valid weights, but we don't implement this yet. - df.groupby("a").sample(n=1, weights=[1 / len(df)] * len(df)) - - @pytest.mark.parametrize("frac", [0, 1 / 3, 1 / 2, 2 / 3, 1]) - @pytest.mark.parametrize("replace", [False, True]) - def test_fraction_rounding(self, df, by, frac, replace): - result = ( - df.groupby(by).sample(frac=frac, replace=replace).sort_values("a") - ) - assert_eq(self.expected(df, frac=frac), result.reset_index(drop=True)) - - -class TestHeadTail: - @pytest.fixture(params=[-3, -2, -1, 0, 1, 2, 3], ids=lambda n: f"{n=}") - def n(self, request): - return request.param - - @pytest.fixture( - params=[False, True], ids=["no-preserve-order", "preserve-order"] - ) - def preserve_order(self, request): - return request.param - - @pytest.fixture - def df(self): - return cudf.DataFrame( - { - "a": [1, 0, 1, 2, 2, 1, 3, 2, 3, 3, 3], - "b": [0, 1, 2, 4, 3, 5, 6, 7, 9, 8, 10], - } - ) - - @pytest.fixture(params=[True, False], ids=["head", "tail"]) - def take_head(self, request): - return request.param - - @pytest.fixture - def expected(self, df, n, take_head, preserve_order): - if n == 0: - # We'll get an empty dataframe in this case - return df._empty_like(keep_index=True) - else: - if preserve_order: - # Should match pandas here - g = df.to_pandas().groupby("a") - if take_head: - return g.head(n=n) - else: - return g.tail(n=n) - else: - # We groupby "a" which is the first column. This - # possibly relies on an implementation detail that for - # integer group keys, cudf produces groups in sorted - # (ascending) order. 
- keyfunc = operator.itemgetter(0) - if take_head or n == 0: - # Head does group[:n] as does tail for n == 0 - slicefunc = operator.itemgetter(slice(None, n)) - else: - # Tail does group[-n:] except when n == 0 - slicefunc = operator.itemgetter( - slice(-n, None) if n else slice(0) - ) - values_to_sort = np.hstack( - [df.values_host, np.arange(len(df)).reshape(-1, 1)] - ) - expect_a, expect_b, index = zip( - *itertools.chain.from_iterable( - slicefunc(list(group)) - for _, group in itertools.groupby( - sorted(values_to_sort.tolist(), key=keyfunc), - key=keyfunc, - ) - ) - ) - return cudf.DataFrame( - {"a": expect_a, "b": expect_b}, index=index - ) - - def test_head_tail(self, df, n, take_head, expected, preserve_order): - if take_head: - actual = df.groupby("a").head(n=n, preserve_order=preserve_order) - else: - actual = df.groupby("a").tail(n=n, preserve_order=preserve_order) - assert_eq(actual, expected) - - -def test_head_tail_empty(): - # GH #13397 - - values = [1, 2, 3] - pdf = pd.DataFrame({}, index=values) - df = cudf.DataFrame({}, index=values) - - expected = pdf.groupby(pd.Series(values)).head() - got = df.groupby(cudf.Series(values)).head() - assert_eq(expected, got, check_column_type=False) - - expected = pdf.groupby(pd.Series(values)).tail() - got = df.groupby(cudf.Series(values)).tail() - - assert_eq(expected, got, check_column_type=False) - - -@pytest.mark.parametrize( - "groups", ["a", "b", "c", ["a", "c"], ["a", "b", "c"]] -) -@pytest.mark.parametrize("sort", [True, False]) -def test_group_by_pandas_sort_order(groups, sort): - with cudf.option_context("mode.pandas_compatible", True): - df = cudf.DataFrame( - { - "a": [10, 1, 10, 3, 2, 1, 3, 3], - "b": [5, 6, 7, 1, 2, 3, 4, 9], - "c": [20, 20, 10, 11, 13, 11, 12, 12], - } - ) - pdf = df.to_pandas() - - assert_eq( - pdf.groupby(groups, sort=sort).sum(), - df.groupby(groups, sort=sort).sum(), - ) - - -@pytest.mark.parametrize( - "dtype", - ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"], -) -@pytest.mark.parametrize( - "reduce_op", - [ - "min", - "max", - "idxmin", - "idxmax", - "first", - "last", - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_group_by_empty_reduction(dtype, reduce_op): - gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) - pdf = gdf.to_pandas() - - gg = gdf.groupby("a")["c"] - pg = pdf.groupby("a")["c"] - - assert_eq( - getattr(gg, reduce_op)(), getattr(pg, reduce_op)(), check_dtype=True - ) - - -@pytest.mark.parametrize( - "dtype", - ["int32", "int64", "float64", "datetime64[ns]", "timedelta64[ns]", "bool"], -) -@pytest.mark.parametrize( - "apply_op", - ["sum", "min", "max", "idxmax"], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_group_by_empty_apply(request, dtype, apply_op): - request.applymarker( - pytest.mark.xfail( - condition=(dtype == "datetime64[ns]" and apply_op == "sum"), - reason=("sum isn't supported for datetime64[ns]"), - ) - ) - - gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype) - pdf = gdf.to_pandas() - - gg = gdf.groupby("a")["c"] - pg = pdf.groupby("a")["c"] - - assert_eq( - gg.apply(apply_op), - pg.apply(apply_op), - check_dtype=True, - check_index_type=True, - ) - - -def test_groupby_consecutive_operations(): - df = cudf.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) - pdf = df.to_pandas() - - gg = df.groupby("A") - pg = pdf.groupby("A") - 
- actual = gg.nth(-1) - expected = pg.nth(-1) - - assert_groupby_results_equal(actual, expected, check_dtype=False) - - actual = gg.nth(0) - expected = pg.nth(0) - - assert_groupby_results_equal(actual, expected, check_dtype=False) - - actual = gg.cumsum() - expected = pg.cumsum() - - assert_groupby_results_equal(actual, expected, check_dtype=False) - - actual = gg.cumcount() - expected = pg.cumcount() - - assert_groupby_results_equal(actual, expected, check_dtype=False) - - actual = gg.cumsum() - expected = pg.cumsum() - - assert_groupby_results_equal(actual, expected, check_dtype=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warning only given on newer versions.", -) -def test_categorical_grouping_pandas_compatibility(): - gdf = cudf.DataFrame( - { - "key": cudf.Series([2, 1, 3, 1, 1], dtype="category"), - "a": [0, 1, 3, 2, 3], - } - ) - pdf = gdf.to_pandas() - - with cudf.option_context("mode.pandas_compatible", True): - actual = gdf.groupby("key", sort=False).sum() - with pytest.warns(FutureWarning): - # observed param deprecation. - expected = pdf.groupby("key", sort=False).sum() - assert_eq(actual, expected) - - -@pytest.mark.parametrize("normalize", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("as_index", [True, False]) -def test_group_by_value_counts(normalize, sort, ascending, dropna, as_index): - # From Issue#12789 - df = cudf.DataFrame( - { - "gender": ["male", "male", "female", "male", "female", "male"], - "education": ["low", "medium", np.nan, "low", "high", "low"], - "country": ["US", "FR", "US", "FR", "FR", "FR"], - } - ) - pdf = df.to_pandas() - - actual = df.groupby("gender", as_index=as_index).value_counts( - normalize=normalize, sort=sort, ascending=ascending, dropna=dropna - ) - expected = pdf.groupby("gender", as_index=as_index).value_counts( - normalize=normalize, sort=sort, ascending=ascending, dropna=dropna - ) - - # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` - assert_groupby_results_equal( - actual, - expected, - check_names=False, - check_index_type=False, - as_index=as_index, - by=["gender", "education"], - sort=sort, - ) - - -def test_group_by_value_counts_subset(): - # From Issue#12789 - df = cudf.DataFrame( - { - "gender": ["male", "male", "female", "male", "female", "male"], - "education": ["low", "medium", "high", "low", "high", "low"], - "country": ["US", "FR", "US", "FR", "FR", "FR"], - } - ) - pdf = df.to_pandas() - - actual = df.groupby("gender").value_counts(["education"]) - expected = pdf.groupby("gender").value_counts(["education"]) - - # TODO: Remove `check_names=False` once testing against `pandas>=2.0.0` - assert_groupby_results_equal( - actual, expected, check_names=False, check_index_type=False - ) - - -def test_group_by_value_counts_clash_with_subset(): - df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) - with pytest.raises(ValueError): - df.groupby("a").value_counts(["a"]) - - -def test_group_by_value_counts_subset_not_exists(): - df = cudf.DataFrame({"a": [1, 5, 3], "b": [2, 5, 2]}) - with pytest.raises(ValueError): - df.groupby("a").value_counts(["c"]) - - -def test_group_by_value_counts_with_count_column(): - df = cudf.DataFrame({"a": [1, 5, 3], "count": [2, 5, 2]}) - with pytest.raises(ValueError): - df.groupby("a", as_index=False).value_counts() - - -def test_groupby_internal_groups_empty(gdf): - # test that we 
don't segfault when calling the internal - # .groups() method with an empty list: - gb = gdf.groupby("y")._groupby - _, _, grouped_vals = gb.groups([]) - assert grouped_vals == [] - - -def test_groupby_shift_series_multiindex(): - idx = cudf.MultiIndex.from_tuples( - [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["f", "s"] - ) - ser = Series(range(4), index=idx) - result = ser.groupby(level=0).shift(1) - expected = ser.to_pandas().groupby(level=0).shift(1) - assert_eq(expected, result) - - -@pytest.mark.parametrize( - "func", ["min", "max", "sum", "mean", "idxmin", "idxmax"] -) -@pytest.mark.parametrize( - "by,data", - [ - ("a", {"a": [1, 2, 3]}), - (["a", "id"], {"id": [0, 0, 1], "a": [1, 2, 3]}), - ("a", {"a": [1, 2, 3], "b": ["A", "B", "C"]}), - ("id", {"id": [0, 0, 1], "a": [1, 2, 3], "b": ["A", "B", "C"]}), - (["b", "id"], {"id": [0, 0, 1], "b": ["A", "B", "C"]}), - ("b", {"b": ["A", "B", "C"]}), - ], -) -def test_group_by_reduce_numeric_only(by, data, func): - # Test that simple groupby reductions support numeric_only=True - df = cudf.DataFrame(data) - expected = getattr(df.to_pandas().groupby(by, sort=True), func)( - numeric_only=True - ) - result = getattr(df.groupby(by, sort=True), func)(numeric_only=True) - assert_eq(expected, result) - - -@pytest.mark.parametrize( - "op", ["cummax", "cummin", "cumprod", "cumsum", "mean", "median"] -) -def test_group_by_raises_string_error(op): - df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": ["a", "b", "c", "d", "e"]}) - - with pytest.raises(TypeError): - df.groupby(df.a).agg(op) - - -@pytest.mark.parametrize( - "op", - [ - "cummax", - "cummin", - "cumprod", - "cumsum", - "mean", - "median", - "prod", - "sum", - list, - ], -) -def test_group_by_raises_category_error(op): - df = cudf.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": cudf.Series(["a", "b", "c", "d", "e"], dtype="category"), - } - ) - - with pytest.raises(TypeError): - df.groupby(df.a).agg(op) - - -def test_ngroups(): - pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) - gdf = cudf.DataFrame.from_pandas(pdf) - - pgb = pdf.groupby("a") - ggb = gdf.groupby("a") - assert pgb.ngroups == ggb.ngroups - assert len(pgb) == len(ggb) - - -def test_ndim(): - pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)}) - gdf = cudf.DataFrame.from_pandas(pdf) - - pgb = pdf.groupby("a") - ggb = gdf.groupby("a") - assert pgb.ndim == ggb.ndim - - pser = pd.Series(range(3)) - gser = cudf.Series.from_pandas(pser) - pgb = pser.groupby([0, 0, 1]) - ggb = gser.groupby(cudf.Series([0, 0, 1])) - assert pgb.ndim == ggb.ndim diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py deleted file mode 100644 index c98b92f7083..00000000000 --- a/python/cudf/cudf/tests/test_hash_vocab.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-import filecmp -import os -import warnings - -import pytest - -from cudf.utils.hash_vocab_utils import hash_vocab - - -@pytest.fixture(scope="module") -def datadir(datadir): - return os.path.join( - datadir, "subword_tokenizer_data", "bert_base_cased_sampled" - ) - - -def test_correct_bert_base_vocab_hash(datadir, tmpdir): - # The vocabulary is drawn from bert-base-cased - vocab_path = os.path.join(datadir, "vocab.txt") - - groundtruth_path = os.path.join(datadir, "vocab-hash.txt") - output_path = tmpdir.join("cudf-vocab-hash.txt") - warnings.simplefilter(action="ignore", category=RuntimeWarning) - hash_vocab(vocab_path, output_path) - - assert filecmp.cmp(output_path, groundtruth_path, shallow=False) diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py deleted file mode 100644 index 430ed973f19..00000000000 --- a/python/cudf/cudf/tests/test_hdf.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import os -from string import ascii_letters - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq -from cudf.testing._utils import NUMERIC_TYPES, UNSIGNED_TYPES - -pytest.importorskip("tables") - - -@pytest.fixture(params=[0, 1, 10, 100]) -def pdf(request): - types = set(NUMERIC_TYPES + ["datetime64[ns]"] + ["bool"]) - set( - UNSIGNED_TYPES - ) - typer = {"col_" + val: val for val in types} - ncols = len(types) - nrows = request.param - - rng = np.random.default_rng(1) - # Create a pandas dataframe with random data of mixed types - test_pdf = pd.DataFrame( - rng.integers(0, 50, size=(nrows, ncols)), - columns=pd.Index([f"col_{typ}" for typ in types]), - index=pd.RangeIndex(nrows, name="test_index"), - ) - # Cast all the column dtypes to objects, rename them, and then cast to - # appropriate types - test_pdf = test_pdf.astype(typer).rename( - {"col_datetime64[ns]": "col_datetime64"}, axis=1 - ) - - # Create non-numeric categorical data otherwise may be typecasted - data = rng.choice(list(ascii_letters), size=nrows) - test_pdf["col_category"] = pd.Series(data, dtype="category") - - return (test_pdf, nrows) - - -@pytest.fixture -def gdf(pdf): - pdf, nrows = pdf - return (cudf.DataFrame.from_pandas(pdf), nrows) - - -@pytest.fixture(params=["fixed", "table"]) -def hdf_files(request, tmp_path_factory, pdf): - pdf, nrows = pdf - if request.param == "fixed": - pdf = pdf.drop("col_category", axis=1) - - fname_df = tmp_path_factory.mktemp("hdf") / "test_df.hdf" - pdf.to_hdf(fname_df, key="hdf_df_tests", format=request.param) - - fname_series = {} - for column in pdf.columns: - fname_series[column] = ( - tmp_path_factory.mktemp("hdf") / "test_series.hdf" - ) - pdf[column].to_hdf( - fname_series[column], key="hdf_series_tests", format=request.param - ) - return (fname_df, fname_series, request.param, nrows) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.filterwarnings("ignore:Strings are not yet supported") -@pytest.mark.parametrize( - "columns", - [["col_int8"], ["col_category"], ["col_int32", "col_float32"], None], -) -def test_hdf_reader(hdf_files, columns): - hdf_df_file, hdf_series, format, nrows = hdf_files - if format == "fixed" and columns is not None: - pytest.skip("Can't use columns with format 'fixed'") - if format == "table" and nrows == 0: - pytest.skip("Can't read 0 row table with format 'table'") - expect_df = pd.read_hdf(hdf_df_file, columns=columns) - got_df = cudf.read_hdf(hdf_df_file, columns=columns) - - assert_eq( - expect_df, got_df, 
check_categorical=False, check_index_type=False - ) - - for column in hdf_series.keys(): - expect_series = pd.read_hdf(hdf_series[column]) - got_series = cudf.read_hdf(hdf_series[column]) - - assert_eq(expect_series, got_series, check_index_type=False) - - -@pytest.mark.parametrize("format", ["fixed", "table"]) -@pytest.mark.parametrize("complib", ["zlib", "bzip2", "lzo", "blosc"]) -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_hdf_writer(tmpdir, pdf, gdf, complib, format): - pdf, nrows = pdf - if format == "table" and nrows == 0: - pytest.skip("Can't read 0 row table with format 'table'") - gdf, _ = gdf - - if format == "fixed": - pdf = pdf.drop("col_category", axis=1) - gdf = gdf.drop("col_category", axis=1) - - pdf_df_fname = tmpdir.join("pdf_df.hdf") - gdf_df_fname = tmpdir.join("gdf_df.hdf") - - pdf.to_hdf(pdf_df_fname, key="hdf_tests", format=format, complib=complib) - gdf.to_hdf(gdf_df_fname, key="hdf_tests", format=format, complib=complib) - - assert os.path.exists(pdf_df_fname) - assert os.path.exists(gdf_df_fname) - - expect = pd.read_hdf(pdf_df_fname) - got = pd.read_hdf(gdf_df_fname) - - assert_eq(expect, got, check_index_type=False) - - for column in pdf.columns: - pdf_series_fname = tmpdir.join(column + "_" + "pdf_series.hdf") - gdf_series_fname = tmpdir.join(column + "_" + "gdf_series.hdf") - - pdf[column].to_hdf( - pdf_series_fname, key="hdf_tests", format=format, complib=complib - ) - gdf[column].to_hdf( - gdf_series_fname, key="hdf_tests", format=format, complib=complib - ) - - assert os.path.exists(pdf_series_fname) - assert os.path.exists(gdf_series_fname) - - expect_series = pd.read_hdf(pdf_series_fname) - got_series = pd.read_hdf(gdf_series_fname) - - assert_eq(expect_series, got_series, check_index_type=False) diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py deleted file mode 100644 index 098b5192d4a..00000000000 --- a/python/cudf/cudf/tests/test_hdfs.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import os -from io import BytesIO - -import fastavro -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.testing import assert_eq - -if not os.environ.get("RUN_HDFS_TESTS"): - pytestmark = pytest.mark.skip("Env not configured to run HDFS tests") - - -basedir = "/tmp/test-hdfs" -host = "localhost" # hadoop hostname -port = 9000 # hadoop rpc port - - -@pytest.fixture -def hdfs(scope="module"): - # Default Rpc port can be 8020/9000 depending on the hdfs config - fs = pa.hdfs.connect(host=host, port=port) - try: - if not fs.exists(basedir): - fs.mkdir(basedir) - except pa.lib.ArrowIOError: - pytest.skip("hdfs config probably incorrect") - - return fs - - -@pytest.fixture -def pdf(scope="module"): - df = pd.DataFrame() - df["Integer"] = np.array([2345, 11987, 9027, 9027]) - df["Float"] = np.array([9.001, 8.343, 6, 2.781]) - df["Integer2"] = np.array([2345, 106, 2088, 789277], dtype="uint64") - df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - df["Boolean"] = np.array([True, False, True, False]) - return df - - -@pytest.mark.parametrize("test_url", [False, True]) -def test_read_csv(tmpdir, pdf, hdfs, test_url): - fname = tmpdir.mkdir("csv").join("file.csv") - # Write to local file system - pdf.to_csv(fname) - # Read from local file system as buffer - with open(fname, mode="rb") as f: - buffer = BytesIO(f.read()) - # Write to hdfs - hdfs.upload(basedir + "/test_csv_reader.csv", buffer) - - if test_url: - hd_fpath = "hdfs://{}:{}{}/test_csv_reader.csv".format( - host, port, basedir - ) - else: - hd_fpath = f"hdfs://{basedir}/test_csv_reader.csv" - - got = cudf.read_csv(hd_fpath) - - # Read pandas from byte buffer - with hdfs.open(basedir + "/test_csv_reader.csv") as f: - expect = pd.read_csv(f) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("test_url", [False, True]) -def test_write_csv(pdf, hdfs, test_url): - gdf = cudf.from_pandas(pdf) - if test_url: - hd_fpath = "hdfs://{}:{}{}/test_csv_writer.csv".format( - host, port, basedir - ) - else: - hd_fpath = f"hdfs://{basedir}/test_csv_writer.csv" - - gdf.to_csv(hd_fpath, index=False) - - assert hdfs.exists(f"{basedir}/test_csv_writer.csv") - with hdfs.open(f"{basedir}/test_csv_writer.csv", mode="rb") as f: - got = pd.read_csv(f, dtype=dict(pdf.dtypes)) - assert_eq(pdf, got) - - -@pytest.mark.parametrize("test_url", [False, True]) -def test_read_parquet(tmpdir, pdf, hdfs, test_url): - fname = tmpdir.mkdir("parquet").join("test_parquet_reader.parquet") - # Write to local file system - pdf.to_parquet(fname) - # Read from local file system as buffer - with open(fname, mode="rb") as f: - buffer = BytesIO(f.read()) - # Write to hdfs - hdfs.upload(basedir + "/test_parquet_reader.parquet", buffer) - - if test_url: - hd_fpath = "hdfs://{}:{}{}/test_parquet_reader.parquet".format( - host, port, basedir - ) - else: - hd_fpath = f"hdfs://{basedir}/test_parquet_reader.parquet" - - got = cudf.read_parquet(hd_fpath) - - # Read pandas from byte buffer - with hdfs.open(basedir + "/test_parquet_reader.parquet") as f: - expect = pd.read_parquet(f) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("test_url", [False, True]) -def test_write_parquet(pdf, hdfs, test_url): - gdf = cudf.from_pandas(pdf) - if test_url: - hd_fpath = "hdfs://{}:{}{}/test_parquet_writer.parquet".format( - host, port, basedir - ) - else: - hd_fpath = f"hdfs://{basedir}/test_parquet_writer.parquet" - - gdf.to_parquet(hd_fpath) - - assert hdfs.exists(f"{basedir}/test_parquet_writer.parquet") - with 
hdfs.open(f"{basedir}/test_parquet_writer.parquet", mode="rb") as f: - got = pd.read_parquet(f) - - assert_eq(pdf, got) - - -@pytest.mark.xfail( - reason="Writing string columns with parition_cols is incorrect" -) -@pytest.mark.parametrize("test_url", [False, True]) -def test_write_parquet_partitioned(tmpdir, pdf, hdfs, test_url): - pdf.to_parquet( - path=tmpdir.join("pandas_parquet_writer_partitioned.parquet"), - index=False, - partition_cols=["Integer", "Boolean"], - ) - gdf = cudf.from_pandas(pdf) - if test_url: - hd_fpath = "hdfs://{}:{}{}/test_parquet_partitioned.parquet".format( - host, port, basedir - ) - else: - hd_fpath = f"hdfs://{basedir}/test_parquet_partitioned.parquet" - # Clear data written from previous runs - hdfs.rm(f"{basedir}/test_parquet_partitioned.parquet", recursive=True) - gdf.to_parquet( - hd_fpath, index=False, partition_cols=["Integer", "Boolean"] - ) - - assert hdfs.exists(f"{basedir}/test_parquet_partitioned.parquet") - got = pd.read_parquet(hd_fpath) - expect = pd.read_parquet( - tmpdir.join("pandas_parquet_writer_partitioned.parquet") - ) - assert_eq(expect, got) - - -@pytest.mark.parametrize("test_url", [False, True]) -def test_read_json(tmpdir, pdf, hdfs, test_url): - fname = tmpdir.mkdir("json").join("test_json_reader.json") - # Write to local file system - # Sorting by col_name now as pandas sorts by col name while reading json - - pdf.sort_index(axis=1).to_json(fname, orient="records", lines=True) - # Read from local file system as buffer - with open(fname, mode="rb") as f: - buffer = BytesIO(f.read()) - # Write to hdfs - hdfs.upload(basedir + "/test_json_reader.json", buffer) - - if test_url: - hd_fpath = "hdfs://{}:{}{}/test_json_reader.json".format( - host, port, basedir - ) - else: - hd_fpath = f"hdfs://{basedir}/test_json_reader.json" - - got = cudf.read_json(hd_fpath, engine="cudf", orient="records", lines=True) - - # Read pandas from byte buffer - with hdfs.open(basedir + "/test_json_reader.json") as f: - expect = pd.read_json(f, lines=True) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("test_url", [False, True]) -def test_read_orc(datadir, hdfs, test_url): - fname = datadir / "orc" / "TestOrcFile.testSnappy.orc" - # Read from local file system as buffer - with open(fname, mode="rb") as f: - buffer = BytesIO(f.read()) - # Write to hdfs - hdfs.upload(basedir + "/file.orc", buffer) - - if test_url: - hd_fpath = f"hdfs://{host}:{port}{basedir}/file.orc" - else: - hd_fpath = f"hdfs://{basedir}/file.orc" - - got = cudf.read_orc(hd_fpath) - expect = pd.read_orc(buffer) - assert_eq(expect, got) - - -@pytest.mark.parametrize("test_url", [False, True]) -def test_write_orc(pdf, hdfs, test_url): - # Orc writer doesn't support writing unsigned ints - pdf["Integer2"] = pdf["Integer2"].astype("int64") - gdf = cudf.from_pandas(pdf) - if test_url: - hd_fpath = "hdfs://{}:{}{}/test_orc_writer.orc".format( - host, port, basedir - ) - else: - hd_fpath = f"hdfs://{basedir}/test_orc_writer.orc" - - gdf.to_orc(hd_fpath) - - assert hdfs.exists(f"{basedir}/test_orc_writer.orc") - with hdfs.open(f"{basedir}/test_orc_writer.orc", mode="rb") as f: - got = pd.read_orc(f) - - assert_eq(pdf, got) - - -@pytest.mark.parametrize("test_url", [False, True]) -def test_read_avro(datadir, hdfs, test_url): - fname = datadir / "avro" / "example.avro" - # Read from local file system as buffer - with open(fname, mode="rb") as f: - buffer = BytesIO(f.read()) - # Write to hdfs - hdfs.upload(basedir + "/file.avro", buffer) - - if test_url: - hd_fpath = 
f"hdfs://{host}:{port}{basedir}/file.avro" - else: - hd_fpath = f"hdfs://{basedir}/file.avro" - - got = cudf.read_avro(hd_fpath) - with open(fname, mode="rb") as f: - expect = pd.DataFrame.from_records(fastavro.reader(f)) - - for col in expect.columns: - expect[col] = expect[col].astype(got[col].dtype) - assert_eq(expect, got) - - -def test_storage_options(tmpdir, pdf, hdfs): - fname = tmpdir.mkdir("csv").join("file.csv") - # Write to local file system - pdf.to_csv(fname) - # Read from local file system as buffer - with open(fname, mode="rb") as f: - buffer = BytesIO(f.read()) - # Write to hdfs - hdfs.upload(basedir + "/file.csv", buffer) - - hd_fpath = f"hdfs://{basedir}/file.csv" - - storage_options = {"host": host, "port": port} - - got = cudf.read_csv(hd_fpath, storage_options=storage_options) - - # Read pandas from byte buffer - with hdfs.open(basedir + "/file.csv") as f: - expect = pd.read_csv(f) - - assert_eq(expect, got) - - -def test_storage_options_error(tmpdir, pdf, hdfs): - fname = tmpdir.mkdir("csv").join("file.csv") - # Write to local file system - pdf.to_csv(fname) - # Read from local file system as buffer - with open(fname, mode="rb") as f: - buffer = BytesIO(f.read()) - # Write to hdfs - hdfs.upload(basedir + "/file.csv", buffer) - - hd_fpath = f"hdfs://{host}:{port}{basedir}/file.avro" - - storage_options = {"host": host, "port": port} - - with pytest.raises(KeyError): - cudf.read_csv(hd_fpath, storage_options=storage_options) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py deleted file mode 100644 index 3f483219423..00000000000 --- a/python/cudf/cudf/tests/test_index.py +++ /dev/null @@ -1,3340 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -""" -Test related to Index -""" - -import datetime -import operator -import re - -import cupy as cp -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.api.extensions import no_default -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex -from cudf.testing import assert_eq -from cudf.testing._utils import ( - ALL_TYPES, - FLOAT_TYPES, - NUMERIC_TYPES, - OTHER_TYPES, - SERIES_OR_INDEX_NAMES, - SIGNED_INTEGER_TYPES, - UNSIGNED_TYPES, - assert_column_memory_eq, - assert_column_memory_ne, - assert_exceptions_equal, - expect_warning_if, -) -from cudf.utils.utils import search_range - - -def test_df_set_index_from_series(): - df = cudf.DataFrame() - df["a"] = list(range(10)) - df["b"] = list(range(0, 20, 2)) - - # Check set_index(Series) - df2 = df.set_index(df["b"]) - assert list(df2.columns) == ["a", "b"] - sliced_strided = df2.loc[2:6] - assert len(sliced_strided) == 3 - assert list(sliced_strided.index.values) == [2, 4, 6] - - -def test_df_set_index_from_name(): - df = cudf.DataFrame() - df["a"] = list(range(10)) - df["b"] = list(range(0, 20, 2)) - - # Check set_index(column_name) - df2 = df.set_index("b") - # 1 less column because 'b' is used as index - assert list(df2.columns) == ["a"] - sliced_strided = df2.loc[2:6] - assert len(sliced_strided) == 3 - assert list(sliced_strided.index.values) == [2, 4, 6] - - -def test_df_slice_empty_index(): - df = cudf.DataFrame() - assert isinstance(df.index, RangeIndex) - assert isinstance(df.index[:1], RangeIndex) - with pytest.raises(IndexError): - df.index[1] - - -def test_index_find_label_range_genericindex(): - # Monotonic Index - idx = 
cudf.Index(np.asarray([4, 5, 6, 10])) - assert idx.find_label_range(slice(4, 6)) == slice(0, 3, 1) - assert idx.find_label_range(slice(5, 10)) == slice(1, 4, 1) - assert idx.find_label_range(slice(0, 6)) == slice(0, 3, 1) - assert idx.find_label_range(slice(4, 11)) == slice(0, 4, 1) - - # Non-monotonic Index - idx_nm = cudf.Index(np.asarray([5, 4, 6, 10])) - assert idx_nm.find_label_range(slice(4, 6)) == slice(1, 3, 1) - assert idx_nm.find_label_range(slice(5, 10)) == slice(0, 4, 1) - # Last value not found - with pytest.raises(KeyError) as raises: - idx_nm.find_label_range(slice(0, 6)) - raises.match("not in index") - # Last value not found - with pytest.raises(KeyError) as raises: - idx_nm.find_label_range(slice(4, 11)) - raises.match("not in index") - - -def test_index_find_label_range_rangeindex(): - """Cudf specific""" - # step > 0 - # 3, 8, 13, 18 - ridx = RangeIndex(3, 20, 5) - assert ridx.find_label_range(slice(3, 8)) == slice(0, 2, 1) - assert ridx.find_label_range(slice(0, 7)) == slice(0, 1, 1) - assert ridx.find_label_range(slice(3, 19)) == slice(0, 4, 1) - assert ridx.find_label_range(slice(2, 21)) == slice(0, 4, 1) - - # step < 0 - # 20, 15, 10, 5 - ridx = RangeIndex(20, 3, -5) - assert ridx.find_label_range(slice(15, 10)) == slice(1, 3, 1) - assert ridx.find_label_range(slice(10, 15, -1)) == slice(2, 0, -1) - assert ridx.find_label_range(slice(10, 0)) == slice(2, 4, 1) - assert ridx.find_label_range(slice(30, 13)) == slice(0, 2, 1) - assert ridx.find_label_range(slice(30, 0)) == slice(0, 4, 1) - - -def test_index_comparision(): - start, stop = 10, 34 - rg = cudf.RangeIndex(start, stop) - gi = cudf.Index(np.arange(start, stop)) - assert rg.equals(gi) - assert gi.equals(rg) - assert not rg[:-1].equals(gi) - assert rg[:-1].equals(gi[:-1]) - - -@pytest.mark.parametrize( - "func", - [ - lambda x: x.min(), - lambda x: x.max(), - lambda x: x.any(), - lambda x: x.all(), - ], -) -def test_reductions(func): - x = np.asarray([4, 5, 6, 10]) - idx = cudf.Index(np.asarray([4, 5, 6, 10])) - - assert func(x) == func(idx) - - -def test_name(): - idx = cudf.Index(np.asarray([4, 5, 6, 10]), name="foo") - assert idx.name == "foo" - - -def test_index_immutable(): - start, stop = 10, 34 - rg = RangeIndex(start, stop) - with pytest.raises(TypeError): - rg[1] = 5 - gi = cudf.Index(np.arange(start, stop)) - with pytest.raises(TypeError): - gi[1] = 5 - - -def test_categorical_index(): - pdf = pd.DataFrame() - pdf["a"] = [1, 2, 3] - pdf["index"] = pd.Categorical(["a", "b", "c"]) - initial_df = cudf.from_pandas(pdf) - pdf = pdf.set_index("index") - gdf1 = cudf.from_pandas(pdf) - gdf2 = cudf.DataFrame() - gdf2["a"] = [1, 2, 3] - gdf2["index"] = pd.Categorical(["a", "b", "c"]) - assert_eq(initial_df.index, gdf2.index) - gdf2 = gdf2.set_index("index") - - assert isinstance(gdf1.index, CategoricalIndex) - assert_eq(pdf, gdf1) - assert_eq(pdf.index, gdf1.index) - assert_eq( - pdf.index.codes, - gdf1.index.codes.astype(pdf.index.codes.dtype).to_numpy(), - ) - - assert isinstance(gdf2.index, CategoricalIndex) - assert_eq(pdf, gdf2) - assert_eq(pdf.index, gdf2.index) - assert_eq( - pdf.index.codes, - gdf2.index.codes.astype(pdf.index.codes.dtype).to_numpy(), - ) - - -def test_pandas_as_index(): - # Define Pandas Indexes - pdf_int_index = pd.Index([1, 2, 3, 4, 5]) - pdf_uint_index = pd.Index([1, 2, 3, 4, 5]) - pdf_float_index = pd.Index([1.0, 2.0, 3.0, 4.0, 5.0]) - pdf_datetime_index = pd.DatetimeIndex( - [1000000, 2000000, 3000000, 4000000, 5000000] - ) - pdf_category_index = pd.CategoricalIndex(["a", "b", "c", 
"b", "a"]) - - # Define cudf Indexes - gdf_int_index = Index(pdf_int_index) - gdf_uint_index = Index(pdf_uint_index) - gdf_float_index = Index(pdf_float_index) - gdf_datetime_index = Index(pdf_datetime_index) - gdf_category_index = Index(pdf_category_index) - - # Check instance types - assert isinstance(gdf_int_index, Index) - assert isinstance(gdf_uint_index, Index) - assert isinstance(gdf_float_index, Index) - assert isinstance(gdf_datetime_index, DatetimeIndex) - assert isinstance(gdf_category_index, CategoricalIndex) - - # Check equality - assert_eq(pdf_int_index, gdf_int_index) - assert_eq(pdf_uint_index, gdf_uint_index) - assert_eq(pdf_float_index, gdf_float_index) - assert_eq(pdf_datetime_index, gdf_datetime_index) - assert_eq(pdf_category_index, gdf_category_index) - - assert_eq( - pdf_category_index.codes, - gdf_category_index.codes.astype( - pdf_category_index.codes.dtype - ).to_numpy(), - ) - - -@pytest.mark.parametrize("initial_name", SERIES_OR_INDEX_NAMES) -@pytest.mark.parametrize("name", SERIES_OR_INDEX_NAMES) -def test_index_rename(initial_name, name): - pds = pd.Index([1, 2, 3], name=initial_name) - gds = Index(pds) - - assert_eq(pds, gds) - - expect = pds.rename(name) - got = gds.rename(name) - - assert_eq(expect, got) - """ - From here on testing recursive creation - and if name is being handles in recursive creation. - """ - pds = pd.Index(expect) - gds = Index(got) - - assert_eq(pds, gds) - - pds = pd.Index(pds, name="abc") - gds = Index(gds, name="abc") - assert_eq(pds, gds) - - -def test_index_rename_inplace(): - pds = pd.Index([1, 2, 3], name="asdf") - gds = Index(pds) - - # inplace=False should yield a shallow copy - gds_renamed_deep = gds.rename("new_name", inplace=False) - - assert gds_renamed_deep._values.data_ptr == gds._values.data_ptr - - # inplace=True returns none - expected_ptr = gds._values.data_ptr - gds.rename("new_name", inplace=True) - - assert expected_ptr == gds._values.data_ptr - - -def test_index_rename_preserves_arg(): - idx1 = cudf.Index([1, 2, 3], name="orig_name") - - # this should be an entirely new object - idx2 = idx1.rename("new_name", inplace=False) - - assert idx2.name == "new_name" - assert idx1.name == "orig_name" - - # a new object but referencing the same data - idx3 = Index(idx1, name="last_name") - - assert idx3.name == "last_name" - assert idx1.name == "orig_name" - - -def test_set_index_as_property(): - cdf = cudf.DataFrame() - col1 = np.arange(10) - col2 = np.arange(0, 20, 2) - cdf["a"] = col1 - cdf["b"] = col2 - - # Check set_index(Series) - cdf.index = cdf["b"] - - assert_eq(cdf.index.to_numpy(), col2) - - with pytest.raises(ValueError): - cdf.index = [list(range(10))] - - idx = pd.Index(np.arange(0, 1000, 100)) - cdf.index = idx - assert_eq(cdf.index.to_pandas(), idx) - - df = cdf.to_pandas() - assert_eq(df.index, idx) - - head = cdf.head().to_pandas() - assert_eq(head.index, idx[:5]) - - -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_range(name, deep=True): - cidx = cudf.RangeIndex(1, 5) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_datetime(name, deep=True): - cidx = cudf.DatetimeIndex(["2001", "2002", "2003"]) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("name", ["x"]) -def 
test_index_copy_string(name, deep=True): - cidx = cudf.Index(["a", "b", "c"]) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_integer(name, deep=True): - """Test for NumericIndex Copy Casts""" - cidx = cudf.Index([1, 2, 3]) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_float(name, deep=True): - """Test for NumericIndex Copy Casts""" - cidx = cudf.Index([1.0, 2.0, 3.0]) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("name", ["x"]) -def test_index_copy_category(name, deep=True): - cidx = cudf.core.index.CategoricalIndex([1, 2, 3]) - pidx = cidx.to_pandas() - - pidx_copy = pidx.copy(name=name, deep=deep) - cidx_copy = cidx.copy(name=name, deep=deep) - - assert_column_memory_ne(cidx._values, cidx_copy._values) - assert_eq(pidx_copy, cidx_copy) - - -@pytest.mark.parametrize("deep", [True, False]) -@pytest.mark.parametrize( - "idx", - [ - cudf.DatetimeIndex(["2001", "2002", "2003"]), - cudf.Index(["a", "b", "c"]), - cudf.Index([1, 2, 3]), - cudf.Index([1.0, 2.0, 3.0]), - cudf.CategoricalIndex([1, 2, 3]), - cudf.CategoricalIndex(["a", "b", "c"]), - ], -) -@pytest.mark.parametrize("copy_on_write", [True, False]) -def test_index_copy_deep(idx, deep, copy_on_write): - """Test if deep copy creates a new instance for device data.""" - idx_copy = idx.copy(deep=deep) - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", copy_on_write) - if ( - isinstance(idx._values, cudf.core.column.StringColumn) - or not deep - or (cudf.get_option("copy_on_write") and not deep) - ): - # StringColumn is immutable hence, deep copies of a - # Index with string dtype will share the same StringColumn. - - # When `copy_on_write` is turned on, Index objects will - # have unique column object but they all point to same - # data pointers. 
- assert_column_memory_eq(idx._values, idx_copy._values) - else: - assert_column_memory_ne(idx._values, idx_copy._values) - cudf.set_option("copy_on_write", original_cow_setting) - - -@pytest.mark.parametrize("idx", [[1, None, 3, None, 5]]) -def test_index_isna(idx): - pidx = pd.Index(idx, name="idx") - gidx = cudf.Index(idx, name="idx") - assert_eq(gidx.isna(), pidx.isna()) - - -@pytest.mark.parametrize("idx", [[1, None, 3, None, 5]]) -def test_index_notna(idx): - pidx = pd.Index(idx, name="idx") - gidx = cudf.Index(idx, name="idx") - assert_eq(gidx.notna(), pidx.notna()) - - -def test_rangeindex_slice_attr_name(): - start, stop = 0, 10 - rg = RangeIndex(start, stop, name="myindex") - sliced_rg = rg[0:9] - assert_eq(rg.name, sliced_rg.name) - - -def test_from_pandas_str(): - idx = ["a", "b", "c"] - pidx = pd.Index(idx, name="idx") - gidx_1 = cudf.Index(idx, name="idx") - gidx_2 = cudf.from_pandas(pidx) - - assert_eq(gidx_1, gidx_2) - - -def test_from_pandas_gen(): - idx = [2, 4, 6] - pidx = pd.Index(idx, name="idx") - gidx_1 = cudf.Index(idx, name="idx") - gidx_2 = cudf.from_pandas(pidx) - - assert_eq(gidx_1, gidx_2) - - -def test_index_names(): - idx = Index([1, 2, 3], name="idx") - assert idx.names == ("idx",) - - -@pytest.mark.parametrize( - "data", - [ - range(0), - range(1), - range(0, 1), - range(0, 5), - range(1, 10), - range(1, 10, 1), - range(1, 10, 3), - range(10, 1, -3), - range(-5, 10), - ], -) -def test_range_index_from_range(data): - assert_eq(pd.Index(data), cudf.Index(data)) - - -@pytest.mark.parametrize( - "n", - [-10, -5, -2, 0, 1, 0, 2, 5, 10], -) -def test_empty_df_head_tail_index(n): - df = cudf.DataFrame() - pdf = pd.DataFrame() - assert_eq(df.head(n).index.values, pdf.head(n).index.values) - assert_eq(df.tail(n).index.values, pdf.tail(n).index.values) - - df = cudf.DataFrame({"a": [11, 2, 33, 44, 55]}) - pdf = pd.DataFrame({"a": [11, 2, 33, 44, 55]}) - assert_eq(df.head(n).index.values, pdf.head(n).index.values) - assert_eq(df.tail(n).index.values, pdf.tail(n).index.values) - - df = cudf.DataFrame(index=[1, 2, 3]) - pdf = pd.DataFrame(index=[1, 2, 3]) - assert_eq(df.head(n).index.values, pdf.head(n).index.values) - assert_eq(df.tail(n).index.values, pdf.tail(n).index.values) - - -@pytest.mark.parametrize( - "data,condition,other,error", - [ - (pd.Index(range(5)), pd.Index(range(5)) > 0, None, None), - (pd.Index([1, 2, 3]), pd.Index([1, 2, 3]) != 2, None, None), - (pd.Index(list("abc")), pd.Index(list("abc")) == "c", None, None), - ( - pd.Index(list("abc")), - pd.Index(list("abc")) == "c", - pd.Index(list("xyz")), - None, - ), - (pd.Index(range(5)), pd.Index(range(4)) > 0, None, ValueError), - ( - pd.Index(range(5)), - pd.Index(range(5)) > 1, - 10, - None, - ), - ( - pd.Index(np.arange(10)), - (pd.Index(np.arange(10)) % 3) == 0, - -pd.Index(np.arange(10)), - None, - ), - ( - pd.Index([1, 2, np.nan]), - pd.Index([1, 2, np.nan]) == 4, - None, - None, - ), - ( - pd.Index([1, 2, np.nan]), - pd.Index([1, 2, np.nan]) != 4, - None, - None, - ), - ( - pd.Index([-2, 3, -4, -79]), - [True, True, True], - None, - ValueError, - ), - ( - pd.Index([-2, 3, -4, -79]), - [True, True, True, False], - None, - None, - ), - ( - pd.Index([-2, 3, -4, -79]), - [True, True, True, False], - 17, - None, - ), - (pd.Index(list("abcdgh")), pd.Index(list("abcdgh")) != "g", "3", None), - ( - pd.Index(list("abcdgh")), - pd.Index(list("abcdg")) != "g", - "3", - ValueError, - ), - ( - pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), - pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) != "a", - 
"a", - None, - ), - ( - pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]), - pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) != "a", - "b", - None, - ), - ( - pd.MultiIndex.from_tuples( - list( - zip( - *[ - [ - "bar", - "bar", - "baz", - "baz", - "foo", - "foo", - "qux", - "qux", - ], - [ - "one", - "two", - "one", - "two", - "one", - "two", - "one", - "two", - ], - ] - ) - ) - ), - pd.MultiIndex.from_tuples( - list( - zip( - *[ - [ - "bar", - "bar", - "baz", - "baz", - "foo", - "foo", - "qux", - "qux", - ], - [ - "one", - "two", - "one", - "two", - "one", - "two", - "one", - "two", - ], - ] - ) - ) - ) - != "a", - None, - NotImplementedError, - ), - ], -) -def test_index_where(data, condition, other, error): - ps = data - gs = cudf.from_pandas(data) - - ps_condition = condition - if type(condition).__module__.split(".")[0] == "pandas": - gs_condition = cudf.from_pandas(condition) - else: - gs_condition = condition - - ps_other = other - if type(other).__module__.split(".")[0] == "pandas": - gs_other = cudf.from_pandas(other) - else: - gs_other = other - - if error is None: - if hasattr(ps, "dtype") and isinstance(ps.dtype, pd.CategoricalDtype): - expect = ps.where(ps_condition, other=ps_other) - got = gs.where(gs_condition, other=gs_other) - np.testing.assert_array_equal( - expect.codes, - got.codes.astype(expect.codes.dtype).fillna(-1).to_numpy(), - ) - assert_eq(expect.categories, got.categories) - else: - assert_eq( - ps.where(ps_condition, other=ps_other), - gs.where(gs_condition, other=gs_other).to_pandas(), - ) - else: - assert_exceptions_equal( - lfunc=ps.where, - rfunc=gs.where, - lfunc_args_and_kwargs=([ps_condition], {"other": ps_other}), - rfunc_args_and_kwargs=([gs_condition], {"other": gs_other}), - ) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES) -@pytest.mark.parametrize("copy", [True, False]) -def test_index_astype(dtype, copy): - pdi = pd.Index([1, 2, 3]) - gdi = cudf.from_pandas(pdi) - - actual = gdi.astype(dtype=dtype, copy=copy) - expected = pdi.astype(dtype=dtype, copy=copy) - - assert_eq(expected, actual) - assert_eq(pdi, gdi) - - -@pytest.mark.parametrize( - "data", - [ - [1, 10, 2, 100, -10], - ["z", "x", "a", "c", "b"], - [-10.2, 100.1, -100.2, 0.0, 0.23], - ], -) -def test_index_argsort(data): - pdi = pd.Index(data) - gdi = cudf.from_pandas(pdi) - - assert_eq(pdi.argsort(), gdi.argsort()) - - -@pytest.mark.parametrize( - "data", - [ - pd.Index([1, 10, 2, 100, -10], name="abc"), - pd.Index(["z", "x", "a", "c", "b"]), - pd.Index(["z", "x", "a", "c", "b"], dtype="category"), - pd.Index( - [-10.2, 100.1, -100.2, 0.0, 0.23], name="this is a float index" - ), - pd.Index([102, 1001, 1002, 0.0, 23], dtype="datetime64[ns]"), - pd.Index([13240.2, 1001, 100.2, 0.0, 23], dtype="datetime64[ns]"), - pd.RangeIndex(0, 10, 1), - pd.RangeIndex(0, -100, -2), - pd.Index([-10.2, 100.1, -100.2, 0.0, 23], dtype="timedelta64[ns]"), - ], -) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("return_indexer", [True, False]) -def test_index_sort_values(data, ascending, return_indexer): - pdi = data - gdi = cudf.from_pandas(pdi) - - expected = pdi.sort_values( - ascending=ascending, return_indexer=return_indexer - ) - actual = gdi.sort_values( - ascending=ascending, return_indexer=return_indexer - ) - - if return_indexer: - expected_indexer = expected[1] - actual_indexer = actual[1] - - assert_eq(expected_indexer, actual_indexer) - - expected = expected[0] - actual = actual[0] - - assert_eq(expected, actual) - - 
-@pytest.mark.parametrize( - "data", - [ - [1, 10, 2, 100, -10], - ["z", "x", "a", "c", "b"], - [-10.2, 100.1, -100.2, 0.0, 0.23], - ], -) -def test_index_to_series(data): - pdi = pd.Index(data) - gdi = cudf.from_pandas(pdi) - - assert_eq(pdi.to_series(), gdi.to_series()) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [4, 5, 6, 10, 20, 30], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - ["5", "6", "2", "a", "b", "c"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - [1.0, 5.0, 6.0, 0.0, 1.3], - ["ab", "cd", "ef"], - pd.Series(["1", "2", "a", "3", None], dtype="category"), - range(0, 10), - [], - [1, 1, 2, 2], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 5, 6], - [4, 5, 6, 10, 20, 30], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - ["5", "6", "2", "a", "b", "c"], - ["ab", "ef", None], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - [1.0, 5.0, 6.0, 0.0, 1.3], - range(2, 4), - pd.Series(["1", "a", "3", None], dtype="category"), - [], - [2], - ], -) -@pytest.mark.parametrize("sort", [None, False, True]) -@pytest.mark.parametrize( - "name_data,name_other", - [("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_index_difference(data, other, sort, name_data, name_other): - pd_data = pd.Index(data, name=name_data) - pd_other = pd.Index(other, name=name_other) - if ( - not PANDAS_GE_220 - and isinstance(pd_data.dtype, pd.CategoricalDtype) - and not isinstance(pd_other.dtype, pd.CategoricalDtype) - and pd_other.isnull().any() - ): - pytest.skip(reason="https://github.com/pandas-dev/pandas/issues/57318") - - if ( - not PANDAS_GE_220 - and len(pd_other) == 0 - and len(pd_data) != len(pd_data.unique()) - ): - pytest.skip(reason="Bug fixed in pandas-2.2+") - - gd_data = cudf.from_pandas(pd_data) - gd_other = cudf.from_pandas(pd_other) - - expected = pd_data.difference(pd_other, sort=sort) - actual = gd_data.difference(gd_other, sort=sort) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("other", ["a", 1, None]) -def test_index_difference_invalid_inputs(other): - pdi = pd.Index([1, 2, 3]) - gdi = cudf.Index([1, 2, 3]) - - assert_exceptions_equal( - pdi.difference, - gdi.difference, - ([other], {}), - ([other], {}), - ) - - -def test_index_difference_sort_error(): - pdi = pd.Index([1, 2, 3]) - gdi = cudf.Index([1, 2, 3]) - - assert_exceptions_equal( - pdi.difference, - gdi.difference, - ([pdi], {"sort": "A"}), - ([gdi], {"sort": "A"}), - ) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - [], - ["b", "c", "d"], - [1], - [2, 3, 4], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -def test_index_equals(data, other): - pd_data = pd.Index(data) - pd_other = pd.Index(other) - - gd_data = Index(data) - gd_other = Index(other) - - expected = pd_data.equals(pd_other) - actual = gd_data.equals(gd_other) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", 
"3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -def test_index_categories_equal(data, other): - pd_data = pd.Index(data).astype("category") - pd_other = pd.Index(other) - - gd_data = Index(data).astype("category") - gd_other = Index(other) - - expected = pd_data.equals(pd_other) - actual = gd_data.equals(gd_other) - assert_eq(expected, actual) - - expected = pd_other.equals(pd_data) - actual = gd_other.equals(gd_data) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -def test_index_equal_misc(data, other): - pd_data = pd.Index(data) - pd_other = other - - gd_data = Index(data) - gd_other = other - - expected = pd_data.equals(pd_other) - actual = gd_data.equals(gd_other) - assert_eq(expected, actual) - - expected = pd_data.equals(np.array(pd_other)) - actual = gd_data.equals(np.array(gd_other)) - assert_eq(expected, actual) - - expected = pd_data.equals(pd.Series(pd_other)) - actual = gd_data.equals(cudf.Series(gd_other)) - assert_eq(expected, actual) - - expected = pd_data.astype("category").equals(pd_other) - actual = gd_data.astype("category").equals(gd_other) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -@pytest.mark.parametrize( - "other", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - ["1", "2", "3", "4", "5", "6"], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ["a"], - ["b", "c", "d"], - [1], - [2, 3, 4], - [], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ["abcd", "defgh", "werty", "poiu"], - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Does not warn on older versions of pandas", -) -def test_index_append(data, other): - pd_data = pd.Index(data) - pd_other = pd.Index(other) - - gd_data = cudf.Index(data) - gd_other = cudf.Index(other) - - if cudf.utils.dtypes.is_mixed_with_object_dtype(gd_data, gd_other): - gd_data = gd_data.astype("str") - gd_other = gd_other.astype("str") - - with expect_warning_if( - (len(data) == 0 or len(other) == 0) and pd_data.dtype != pd_other.dtype - ): - expected = pd_data.append(pd_other) - with expect_warning_if( - (len(data) == 0 or len(other) == 0) and gd_data.dtype != gd_other.dtype - ): - actual = gd_data.append(gd_other) - if len(data) == 0 and len(other) == 0: - # Pandas default 
dtype to "object" for empty list - # cudf default dtype to "float" for empty list - assert_eq(expected, actual.astype("str")) - elif actual.dtype == "object": - assert_eq(expected.astype("str"), actual) - else: - assert_eq(expected, actual) - - -def test_index_empty_append_name_conflict(): - empty = cudf.Index([], name="foo") - non_empty = cudf.Index([1], name="bar") - expected = cudf.Index([1]) - - with pytest.warns(FutureWarning): - result = non_empty.append(empty) - assert_eq(result, expected) - - with pytest.warns(FutureWarning): - result = empty.append(non_empty) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4, 5, 6], - [10, 20, 30, 40, 50, 60], - [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - [1], - [2, 3, 4], - [10.0], - [1100.112, 2323.2322, 2323.2322], - ], -) -@pytest.mark.parametrize( - "other", - [ - ["1", "2", "3", "4", "5", "6"], - ["a"], - ["b", "c", "d"], - ["abcd", "defgh", "werty", "poiu"], - ], -) -def test_index_append_error(data, other): - gd_data = Index(data) - gd_other = Index(other) - - got_dtype = ( - gd_other.dtype - if gd_data.dtype == np.dtype("object") - else gd_data.dtype - ) - with pytest.raises( - TypeError, - match=re.escape( - f"cudf does not support appending an Index of " - f"dtype `{np.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " - f"either one of them to same dtypes." - ), - ): - gd_data.append(gd_other) - - with pytest.raises( - TypeError, - match=re.escape( - f"cudf does not support appending an Index of " - f"dtype `{np.dtype('object')}` with an Index " - f"of dtype `{got_dtype}`, please type-cast " - f"either one of them to same dtypes." - ), - ): - gd_other.append(gd_data) - - sr = gd_other.to_series() - - assert_exceptions_equal( - lfunc=gd_data.to_pandas().append, - rfunc=gd_data.append, - lfunc_args_and_kwargs=([[sr.to_pandas()]],), - rfunc_args_and_kwargs=([[sr]],), - ) - - -@pytest.mark.parametrize( - "data,other", - [ - ( - pd.Index([1, 2, 3, 4, 5, 6]), - [ - pd.Index([1, 2, 3, 4, 5, 6]), - pd.Index([1, 2, 3, 4, 5, 6, 10]), - pd.Index([]), - ], - ), - ( - pd.Index([]), - [ - pd.Index([1, 2, 3, 4, 5, 6]), - pd.Index([1, 2, 3, 4, 5, 6, 10]), - pd.Index([1, 4, 5, 6]), - ], - ), - ( - pd.Index([10, 20, 30, 40, 50, 60]), - [ - pd.Index([10, 20, 30, 40, 50, 60]), - pd.Index([10, 20, 30]), - pd.Index([40, 50, 60]), - pd.Index([10, 60]), - pd.Index([60]), - ], - ), - ( - pd.Index([]), - [ - pd.Index([10, 20, 30, 40, 50, 60]), - pd.Index([10, 20, 30]), - pd.Index([40, 50, 60]), - pd.Index([10, 60]), - pd.Index([60]), - ], - ), - ( - pd.Index(["1", "2", "3", "4", "5", "6"]), - [ - pd.Index(["1", "2", "3", "4", "5", "6"]), - pd.Index(["1", "2", "3"]), - pd.Index(["6"]), - pd.Index(["1", "6"]), - ], - ), - ( - pd.Index([]), - [ - pd.Index(["1", "2", "3", "4", "5", "6"]), - pd.Index(["1", "2", "3"]), - pd.Index(["6"]), - pd.Index(["1", "6"]), - ], - ), - ( - pd.Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), - [ - pd.Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), - pd.Index([1.0, 6.0]), - pd.Index([]), - pd.Index([6.0]), - ], - ), - ( - pd.Index([]), - [ - pd.Index([1.0, 2.0, 3.0, 4.0, 5.0, 6.0]), - pd.Index([1.0, 6.0]), - pd.Index([1.0, 2.0, 6.0]), - pd.Index([6.0]), - ], - ), - ( - pd.Index(["a"]), - [ - pd.Index(["a"]), - pd.Index(["a", "b", "c"]), - pd.Index(["c"]), - pd.Index(["d"]), - pd.Index(["ae", "hello", "world"]), - ], - ), - ( - pd.Index([]), - [ - pd.Index(["a"]), - pd.Index(["a", "b", "c"]), - pd.Index(["c"]), - pd.Index(["d"]), - pd.Index(["ae", "hello", "world"]), - pd.Index([]), - ], - ), 
- ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Does not warn on older versions of pandas", -) -def test_index_append_list(data, other): - pd_data = data - pd_other = other - - gd_data = cudf.from_pandas(data) - gd_other = [cudf.from_pandas(i) for i in other] - - with expect_warning_if( - (len(data) == 0 or any(len(d) == 0 for d in other)) - and (any(d.dtype != data.dtype for d in other)) - ): - expected = pd_data.append(pd_other) - with expect_warning_if( - (len(data) == 0 or any(len(d) == 0 for d in other)) - and (any(d.dtype != data.dtype for d in other)) - ): - actual = gd_data.append(gd_other) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) -@pytest.mark.parametrize("name", [1, "a", None]) -def test_index_basic(data, dtype, name): - pdi = pd.Index(data, dtype=dtype, name=name) - gdi = cudf.Index(data, dtype=dtype, name=name) - - assert_eq(pdi, gdi) - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("name", [1, "a", None]) -@pytest.mark.parametrize("dtype", SIGNED_INTEGER_TYPES) -def test_integer_index_apis(data, name, dtype): - pindex = pd.Index(data, dtype=dtype, name=name) - gindex = cudf.Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == dtype - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("name", [1, "a", None]) -@pytest.mark.parametrize("dtype", UNSIGNED_TYPES) -def test_unsigned_integer_index_apis(data, name, dtype): - pindex = pd.Index(data, dtype=dtype, name=name) - gindex = cudf.Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == dtype - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("name", [1, "a", None]) -@pytest.mark.parametrize("dtype", FLOAT_TYPES) -def test_float_index_apis(data, name, dtype): - pindex = pd.Index(data, dtype=dtype, name=name) - gindex = cudf.Index(data, dtype=dtype, name=name) - - assert_eq(pindex, gindex) - assert gindex.dtype == dtype - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize("categories", [[1, 2], None]) -@pytest.mark.parametrize( - "dtype", - [ - pd.CategoricalDtype([1, 2, 3], ordered=True), - pd.CategoricalDtype([1, 2, 3], ordered=False), - None, - ], -) -@pytest.mark.parametrize("ordered", [True, False]) -@pytest.mark.parametrize("name", [1, "a", None]) -def test_categorical_index_basic(data, categories, dtype, ordered, name): - # can't have both dtype and categories/ordered - if dtype is not None: - categories = None - ordered = None - pindex = pd.CategoricalIndex( - data=data, - categories=categories, - dtype=dtype, - ordered=ordered, - name=name, - ) - gindex = CategoricalIndex( - data=data, - categories=categories, - dtype=dtype, - ordered=ordered, - name=name, - ) - - assert_eq(pindex, gindex) - - -@pytest.mark.parametrize( - "data", - [ - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], - names=("number", "color"), - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], ["yellow", "violet", "pink", "white"]], - names=("number1", "color2"), - ), - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], - ), - ], -) -@pytest.mark.parametrize( - "other", - [ - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], - names=("number", "color"), - ), - pd.MultiIndex.from_arrays( - 
[[1, 2, 3, 4], ["yellow", "violet", "pink", "white"]], - names=("number1", "color2"), - ), - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["red", "blue", "red", "blue"]], - ), - ], -) -def test_multiindex_append(data, other): - pdi = data - other_pd = other - - gdi = cudf.from_pandas(data) - other_gd = cudf.from_pandas(other) - - expected = pdi.append(other_pd) - actual = gdi.append(other_gd) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) -def test_index_empty(data, dtype): - pdi = pd.Index(data, dtype=dtype) - gdi = cudf.Index(data, dtype=dtype) - - assert_eq(pdi.empty, gdi.empty) - - -@pytest.mark.parametrize("data", [[1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) -def test_index_size(data, dtype): - pdi = pd.Index(data, dtype=dtype) - gdi = cudf.Index(data, dtype=dtype) - - assert_eq(pdi.size, gdi.size) - - -@pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], [], [1], [1, 2, 3]]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) -def test_index_drop_duplicates(data, dtype): - pdi = pd.Index(data, dtype=dtype) - gdi = cudf.Index(data, dtype=dtype) - - assert_eq(pdi.drop_duplicates(), gdi.drop_duplicates()) - - -def test_dropna_bad_how(): - with pytest.raises(ValueError): - cudf.Index([1]).dropna(how="foo") - - -@pytest.mark.parametrize("data", [[1, 2, 3, 1, 2, 3, 4], []]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) -def test_index_tolist(data, dtype): - gdi = cudf.Index(data, dtype=dtype) - - with pytest.raises( - TypeError, - match=re.escape( - r"cuDF does not support conversion to host memory " - r"via the `tolist()` method. Consider using " - r"`.to_arrow().to_pylist()` to construct a Python list." - ), - ): - gdi.tolist() - - -@pytest.mark.parametrize("data", [[], [1], [1, 2, 3]]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) -def test_index_iter_error(data, dtype): - gdi = cudf.Index(data, dtype=dtype) - - with pytest.raises( - TypeError, - match=re.escape( - f"{gdi.__class__.__name__} object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." 
- ), - ): - iter(gdi) - - -@pytest.mark.parametrize("data", [[], [1], [1, 2, 3, 4, 5]]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + ["str", "category", "datetime64[ns]"] -) -def test_index_values_host(data, dtype): - gdi = cudf.Index(data, dtype=dtype) - pdi = pd.Index(data, dtype=dtype) - - np.testing.assert_array_equal(gdi.values_host, pdi.values) - - -@pytest.mark.parametrize( - "data,fill_value", - [ - ([1, 2, 3, 1, None, None], 1), - ([None, None, 3.2, 1, None, None], 10.0), - ([None, "a", "3.2", "z", None, None], "helloworld"), - (pd.Series(["a", "b", None], dtype="category"), "b"), - (pd.Series([None, None, 1.0], dtype="category"), 1.0), - ( - np.array([1, 2, 3, None], dtype="datetime64[s]"), - np.datetime64("2005-02-25"), - ), - ( - np.array( - [None, None, 122, 3242234, None, 6237846], - dtype="datetime64[ms]", - ), - np.datetime64("2005-02-25"), - ), - ], -) -def test_index_fillna(data, fill_value): - pdi = pd.Index(data) - gdi = cudf.Index(data) - - assert_eq( - pdi.fillna(fill_value), gdi.fillna(fill_value), exact=False - ) # Int64 v/s Float64 - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 1, None, None], - [None, None, 3.2, 1, None, None], - [None, "a", "3.2", "z", None, None], - pd.Series(["a", "b", None], dtype="category"), - np.array([1, 2, 3, None], dtype="datetime64[s]"), - ], -) -def test_index_to_arrow(data): - pdi = pd.Index(data) - gdi = cudf.Index(data) - - expected_arrow_array = pa.Array.from_pandas(pdi) - got_arrow_array = gdi.to_arrow() - - assert_eq(expected_arrow_array, got_arrow_array) - - -@pytest.mark.parametrize( - "data", - [ - [None, None, 3.2, 1, None, None], - [None, "a", "3.2", "z", None, None], - pd.Series(["a", "b", None], dtype="category"), - np.array([1, 2, 3, None], dtype="datetime64[s]"), - ], -) -def test_index_from_arrow(data): - pdi = pd.Index(data) - - arrow_array = pa.Array.from_pandas(pdi) - expected_index = pd.Index(arrow_array.to_pandas()) - gdi = cudf.Index.from_arrow(arrow_array) - - assert_eq(expected_index, gdi) - - -def test_multiindex_to_arrow(): - pdf = pd.DataFrame( - { - "a": [1, 2, 1, 2, 3], - "b": [1.0, 2.0, 3.0, 4.0, 5.0], - "c": np.array([1, 2, 3, None, 5], dtype="datetime64[s]"), - "d": ["a", "b", "c", "d", "e"], - } - ) - pdf["a"] = pdf["a"].astype("category") - df = cudf.from_pandas(pdf) - gdi = cudf.MultiIndex.from_frame(df) - - expected = pa.Table.from_pandas(pdf) - got = gdi.to_arrow() - - assert_eq(expected, got) - - -def test_multiindex_from_arrow(): - pdf = pd.DataFrame( - { - "a": [1, 2, 1, 2, 3], - "b": [1.0, 2.0, 3.0, 4.0, 5.0], - "c": np.array([1, 2, 3, None, 5], dtype="datetime64[s]"), - "d": ["a", "b", "c", "d", "e"], - } - ) - pdf["a"] = pdf["a"].astype("category") - ptb = pa.Table.from_pandas(pdf) - gdi = cudf.MultiIndex.from_arrow(ptb) - pdi = pd.MultiIndex.from_frame(pdf) - - assert_eq(pdi, gdi) - - -def test_index_equals_categories(): - lhs = cudf.CategoricalIndex( - ["a", "b", "c", "b", "a"], categories=["a", "b", "c"] - ) - rhs = cudf.CategoricalIndex( - ["a", "b", "c", "b", "a"], categories=["a", "b", "c", "_"] - ) - - got = lhs.equals(rhs) - expect = lhs.to_pandas().equals(rhs.to_pandas()) - - assert_eq(expect, got) - - -def test_rangeindex_arg_validation(): - with pytest.raises(TypeError): - RangeIndex("1") - - with pytest.raises(TypeError): - RangeIndex(1, "2") - - with pytest.raises(TypeError): - RangeIndex(1, 3, "1") - - with pytest.raises(ValueError): - RangeIndex(1, dtype="float64") - - with pytest.raises(ValueError): - RangeIndex(1, dtype="uint64") - - -def 
test_rangeindex_name_not_hashable(): - with pytest.raises(ValueError): - RangeIndex(range(2), name=["foo"]) - - with pytest.raises(ValueError): - RangeIndex(range(2)).copy(name=["foo"]) - - -def test_index_rangeindex_search_range(): - # step > 0 - ridx = RangeIndex(-13, 17, 4) - ri = ridx._range - for i in range(len(ridx)): - assert i == search_range(ridx[i], ri, side="left") - assert i + 1 == search_range(ridx[i], ri, side="right") - - -@pytest.mark.parametrize( - "rge", - [(1, 10, 1), (1, 10, 3), (10, -17, -1), (10, -17, -3)], -) -def test_index_rangeindex_get_item_basic(rge): - pridx = pd.RangeIndex(*rge) - gridx = cudf.RangeIndex(*rge) - - for i in range(-len(pridx), len(pridx)): - assert pridx[i] == gridx[i] - - -@pytest.mark.parametrize( - "rge", - [(1, 10, 3), (10, 1, -3)], -) -def test_index_rangeindex_get_item_out_of_bounds(rge): - gridx = cudf.RangeIndex(*rge) - with pytest.raises(IndexError): - _ = gridx[4] - - -@pytest.mark.parametrize( - "rge", - [(10, 1, 1), (-17, 10, -3)], -) -def test_index_rangeindex_get_item_null_range(rge): - gridx = cudf.RangeIndex(*rge) - - with pytest.raises(IndexError): - gridx[0] - - -@pytest.mark.parametrize( - "rge", [(-17, 21, 2), (21, -17, -3), (0, 0, 1), (0, 1, -3), (10, 0, 5)] -) -@pytest.mark.parametrize( - "sl", - [ - slice(1, 7, 1), - slice(1, 7, 2), - slice(-1, 7, 1), - slice(-1, 7, 2), - slice(-3, 7, 2), - slice(7, 1, -2), - slice(7, -3, -2), - slice(None, None, 1), - slice(0, None, 2), - slice(0, None, 3), - slice(0, 0, 3), - ], -) -def test_index_rangeindex_get_item_slices(rge, sl): - pridx = pd.RangeIndex(*rge) - gridx = cudf.RangeIndex(*rge) - - assert_eq(pridx[sl], gridx[sl]) - - -@pytest.mark.parametrize( - "idx", - [ - pd.Index([1, 2, 3]), - pd.Index(["abc", "def", "ghi"]), - pd.RangeIndex(0, 10, 1), - pd.Index([0.324, 0.234, 1.3], name="abc"), - ], -) -@pytest.mark.parametrize("names", [None, "a", "new name", ["another name"]]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_index_set_names(idx, names, inplace): - pi = idx.copy() - gi = cudf.from_pandas(idx) - - expected = pi.set_names(names=names, inplace=inplace) - actual = gi.set_names(names=names, inplace=inplace) - - if inplace: - expected, actual = pi, gi - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("idx", [pd.Index([1, 2, 3], name="abc")]) -@pytest.mark.parametrize("level", [1, [0], "abc"]) -@pytest.mark.parametrize("names", [None, "a"]) -def test_index_set_names_error(idx, level, names): - pi = idx.copy() - gi = cudf.from_pandas(idx) - - assert_exceptions_equal( - lfunc=pi.set_names, - rfunc=gi.set_names, - lfunc_args_and_kwargs=([], {"names": names, "level": level}), - rfunc_args_and_kwargs=([], {"names": names, "level": level}), - ) - - -@pytest.mark.parametrize( - "idx", - [pd.Index([1, 3, 6]), pd.Index([6, 1, 3])], # monotonic # non-monotonic -) -@pytest.mark.parametrize("key", [list(range(0, 8))]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) -def test_get_indexer_single_unique_numeric(idx, key, method): - pi = idx - gi = cudf.from_pandas(pi) - - if ( - # `method` only applicable to monotonic index - not pi.is_monotonic_increasing and method is not None - ): - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - with 
cudf.option_context("mode.pandas_compatible", True): - got = gi.get_indexer(key, method=method) - assert_eq(expected, got, check_dtype=True) - - -@pytest.mark.parametrize( - "idx", - [pd.RangeIndex(3, 100, 4)], -) -@pytest.mark.parametrize( - "key", - [ - list(range(1, 20, 3)), - list(range(20, 35, 3)), - list(range(35, 77, 3)), - list(range(77, 110, 3)), - ], -) -@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) -@pytest.mark.parametrize("tolerance", [None, 0, 1, 13, 20]) -def test_get_indexer_rangeindex(idx, key, method, tolerance): - pi = idx - gi = cudf.from_pandas(pi) - - expected = pi.get_indexer( - key, method=method, tolerance=None if method is None else tolerance - ) - got = gi.get_indexer( - key, method=method, tolerance=None if method is None else tolerance - ) - - assert_eq(expected, got) - - with cudf.option_context("mode.pandas_compatible", True): - got = gi.get_indexer( - key, method=method, tolerance=None if method is None else tolerance - ) - assert_eq(expected, got, check_dtype=True) - - -@pytest.mark.parametrize( - "idx", - [pd.RangeIndex(3, 100, 4)], -) -@pytest.mark.parametrize("key", list(range(1, 110, 3))) -def test_get_loc_rangeindex(idx, key): - pi = idx - gi = cudf.from_pandas(pi) - if ( - (key not in pi) - # Get key before the first element is KeyError - or (key < pi.start) - # Get key after the last element is KeyError - or (key >= pi.stop) - ): - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx", - [ - pd.Index([1, 3, 3, 6]), # monotonic increasing - pd.Index([6, 1, 3, 3]), # non-monotonic - pd.Index([4, 3, 2, 1, 0]), # monotonic decreasing - ], -) -@pytest.mark.parametrize("key", [0, 3, 6, 7, 4]) -def test_get_loc_duplicate_numeric(idx, key): - pi = idx - gi = cudf.from_pandas(pi) - - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx", - [ - pd.Index([-1, 2, 3, 6]), # monotonic - pd.Index([6, 1, 3, 4]), # non-monotonic - ], -) -@pytest.mark.parametrize("key", [[0, 3, 1], [6, 7]]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill", "nearest"]) -@pytest.mark.parametrize("tolerance", [None, 1, 2]) -def test_get_indexer_single_duplicate_numeric(idx, key, method, tolerance): - pi = idx - gi = cudf.from_pandas(pi) - - if not pi.is_monotonic_increasing and method is not None: - assert_exceptions_equal( - lfunc=pi.get_indexer, - rfunc=gi.get_indexer, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = pi.get_indexer( - key, method=method, tolerance=None if method is None else tolerance - ) - got = gi.get_indexer( - key, method=method, tolerance=None if method is None else tolerance - ) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx", [pd.Index(["b", "f", "m", "q"]), pd.Index(["m", "f", "b", "q"])] -) -@pytest.mark.parametrize("key", ["a", "f", "n", "z"]) -def test_get_loc_single_unique_string(idx, key): - pi = idx - gi = cudf.from_pandas(pi) - - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - 
lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx", [pd.Index(["b", "f", "m", "q"]), pd.Index(["m", "f", "b", "q"])] -) -@pytest.mark.parametrize("key", [["a", "f", "n", "z"], ["p", "p", "b"]]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_single_unique_string(idx, key, method): - pi = idx - gi = cudf.from_pandas(pi) - - if not pi.is_monotonic_increasing and method is not None: - assert_exceptions_equal( - lfunc=pi.get_indexer, - rfunc=gi.get_indexer, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx", [pd.Index(["b", "m", "m", "q"]), pd.Index(["m", "f", "m", "q"])] -) -@pytest.mark.parametrize("key", ["a", "f", "n", "z"]) -def test_get_loc_single_duplicate_string(idx, key): - pi = idx - gi = cudf.from_pandas(pi) - - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx", [pd.Index(["b", "m", "m", "q"]), pd.Index(["a", "f", "m", "q"])] -) -@pytest.mark.parametrize("key", [["a"], ["f", "n", "z"]]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_single_duplicate_string(idx, key, method): - pi = idx - gi = cudf.from_pandas(pi) - - if ( - # `method` only applicable to monotonic index - (not pi.is_monotonic_increasing and method is not None) - or not pi.is_unique - ): - assert_exceptions_equal( - lfunc=pi.get_indexer, - rfunc=gi.get_indexer, - lfunc_args_and_kwargs=([], {"key": key, "method": method}), - rfunc_args_and_kwargs=([], {"key": key, "method": method}), - ) - else: - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - with cudf.option_context("mode.pandas_compatible", True): - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got, check_dtype=True) - - -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_tuples( - [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)] - ), - pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)] - ), - pd.MultiIndex.from_tuples( - [(1, 1, 1), (1, 1, 2), (1, 1, 2), (1, 2, 3), (2, 1, 1), (2, 2, 1)] - ), - ], -) -@pytest.mark.parametrize("key", [1, (1, 2), (1, 2, 3), (2, 1, 1), (9, 9, 9)]) -def test_get_loc_multi_numeric(idx, key): - pi = idx.sort_values() - gi = cudf.from_pandas(pi) - - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_tuples( - [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 3), (2, 1, 1), (2, 2, 1)] - ), - pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 1, 1)] - ), - pd.MultiIndex.from_tuples( - [(1, 1, 1), (1, 1, 2), (1, 1, 24), (1, 2, 3), (2, 1, 1), (2, 2, 
1)] - ), - ], -) -@pytest.mark.parametrize("key", [[(1, 2, 3)], [(9, 9, 9)]]) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_multi_numeric(idx, key, method): - pi = idx.sort_values() - gi = cudf.from_pandas(pi) - - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - with cudf.option_context("mode.pandas_compatible", True): - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got, check_dtype=True) - - -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 1), (1, 1, 1), (2, 2, 1)] - ) - ], -) -@pytest.mark.parametrize( - "key, result", - [ - (1, slice(1, 5, 1)), # deviates - ((1, 2), slice(1, 3, 1)), - ((1, 2, 3), slice(1, 2, None)), - ((2, 1, 1), slice(0, 1, None)), - ((9, 9, 9), None), - ], -) -def test_get_loc_multi_numeric_deviate(idx, key, result): - pi = idx - gi = cudf.from_pandas(pi) - - with expect_warning_if( - isinstance(key, tuple), pd.errors.PerformanceWarning - ): - key_flag = key not in pi - - if key_flag: - with expect_warning_if( - isinstance(key, tuple), pd.errors.PerformanceWarning - ): - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = result - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "key", - [ - ((1, 2, 3),), - ((2, 1, 1),), - ((9, 9, 9),), - ], -) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_multi_numeric_deviate(key, method): - pi = pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] - ).sort_values() - gi = cudf.from_pandas(pi) - - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - -@pytest.mark.parametrize("method", ["ffill", "bfill"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_get_indexer_multi_error(method): - pi = pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)] - ) - gi = cudf.from_pandas(pi) - - assert_exceptions_equal( - pi.get_indexer, - gi.get_indexer, - lfunc_args_and_kwargs=( - [], - {"target": ((1, 2, 3),), "method": method}, - ), - rfunc_args_and_kwargs=( - [], - {"target": ((1, 2, 3),), "method": method}, - ), - ) - - -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "b"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "a"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "a", "b"), - ("a", "a", "b"), - ("a", "b", "c"), - ("b", "a", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "b"), - ("b", "a", "a"), - ("b", "a", "a"), - ("a", "a", "a"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), - ], -) -@pytest.mark.parametrize( - "key", ["a", ("a", "a"), ("a", "b", "c"), ("b", "c", "a"), ("z", "z", "z")] -) -def test_get_loc_multi_string(idx, key): - pi = idx.sort_values() - gi = 
cudf.from_pandas(pi) - - if key not in pi: - assert_exceptions_equal( - lfunc=pi.get_loc, - rfunc=gi.get_loc, - lfunc_args_and_kwargs=([], {"key": key}), - rfunc_args_and_kwargs=([], {"key": key}), - ) - else: - expected = pi.get_loc(key) - got = gi.get_loc(key) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "b"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "a"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), - pd.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), - ], -) -@pytest.mark.parametrize( - "key", [[("a", "b", "c"), ("b", "c", "a")], [("z", "z", "z")]] -) -@pytest.mark.parametrize("method", [None, "ffill", "bfill"]) -def test_get_indexer_multi_string(idx, key, method): - pi = idx.sort_values() - gi = cudf.from_pandas(pi) - - expected = pi.get_indexer(key, method=method) - got = gi.get_indexer(key, method=method) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "idx1", - [ - lambda: cudf.Index(["a", "b", "c"]), - lambda: cudf.RangeIndex(0, 10), - lambda: cudf.Index([1, 2, 3], dtype="category"), - lambda: cudf.Index(["a", "b", "c", "d"], dtype="category"), - lambda: cudf.MultiIndex.from_tuples( - [ - ("a", "a", "a"), - ("a", "b", "c"), - ("b", "a", "a"), - ("a", "a", "b"), - ("a", "b", "a"), - ("b", "c", "a"), - ] - ), - ], -) -@pytest.mark.parametrize( - "idx2", - [ - lambda: cudf.Index(["a", "b", "c"]), - lambda: cudf.RangeIndex(0, 10), - lambda: cudf.Index([1, 2, 3], dtype="category"), - lambda: cudf.Index(["a", "b", "c", "d"], dtype="category"), - ], -) -def test_get_indexer_invalid(idx1, idx2): - idx1 = idx1() - idx2 = idx2() - assert_eq( - idx1.get_indexer(idx2), idx1.to_pandas().get_indexer(idx2.to_pandas()) - ) - - -@pytest.mark.parametrize( - "objs", - [ - [pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)], - [pd.RangeIndex(10, 20), pd.RangeIndex(22, 40), pd.RangeIndex(50, 60)], - [pd.RangeIndex(10, 20, 2), pd.RangeIndex(20, 40, 2)], - ], -) -def test_range_index_concat(objs): - cudf_objs = [cudf.from_pandas(obj) for obj in objs] - - actual = cudf.concat(cudf_objs) - - expected = objs[0] - for obj in objs[1:]: - expected = expected.append(obj) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx1, idx2", - [ - (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), - (pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)), - (pd.RangeIndex(0, 10, 2), pd.RangeIndex(1, 5, 3)), - (pd.RangeIndex(1, 5, 3), pd.RangeIndex(0, 10, 2)), - (pd.RangeIndex(1, 10, 3), pd.RangeIndex(1, 5, 2)), - (pd.RangeIndex(1, 5, 2), pd.RangeIndex(1, 10, 3)), - (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 3)), - (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 6)), - (pd.RangeIndex(1, 100, 6), pd.RangeIndex(1, 50, 3)), - (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), - (pd.Index([0, 1, 2, 30], name="a"), pd.Index([90, 100])), - (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), - (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), - (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), - ( - pd.IntervalIndex.from_tuples([(0, 2), (0, 2), (2, 4)]), - pd.IntervalIndex.from_tuples([(0, 2), (2, 4)]), - ), - (pd.RangeIndex(0, 10), pd.Index([8, 1, 2, 4])), - (pd.Index([8, 1, 2, 4], name="a"), pd.Index([8, 1, 2, 4], name="b")), - ( - 
pd.Index([8, 1, 2, 4], name="a"), - pd.Index([], name="b", dtype="int64"), - ), - (pd.Index([], dtype="int64", name="a"), pd.Index([10, 12], name="b")), - (pd.Index([True, True, True], name="a"), pd.Index([], dtype="bool")), - ( - pd.Index([True, True, True]), - pd.Index([False, True], dtype="bool", name="b"), - ), - ], -) -@pytest.mark.parametrize("sort", [None, False, True]) -def test_union_index(idx1, idx2, sort): - expected = idx1.union(idx2, sort=sort) - - idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 - idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 - - actual = idx1.union(idx2, sort=sort) - - assert_eq(expected, actual) - - -def test_union_bool_with_other(): - idx1 = cudf.Index([True, True, True]) - idx2 = cudf.Index([0, 1], name="b") - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(cudf.errors.MixedTypeError): - idx1.union(idx2) - - -@pytest.mark.parametrize("dtype1", ["int8", "int32", "int32"]) -@pytest.mark.parametrize("dtype2", ["uint32", "uint64"]) -def test_union_unsigned_vs_signed(dtype1, dtype2): - idx1 = cudf.Index([10, 20, 30], dtype=dtype1) - idx2 = cudf.Index([0, 1], dtype=dtype2) - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(cudf.errors.MixedTypeError): - idx1.union(idx2) - - -@pytest.mark.parametrize( - "idx1, idx2", - [ - (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), - (pd.RangeIndex(0, 10), pd.RangeIndex(-10, 20)), - (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), - (pd.Index([0, 1, 2, 30], name=pd.NA), pd.Index([30, 0, 90, 100])), - (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), - (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), - ( - pd.Index(["a", "b", "c", "d", "c"]), - pd.Index(["a", "c", "z"], name="abc"), - ), - ( - pd.Index(["a", "b", "c", "d", "c"]), - pd.Index(["a", "b", "c", "d", "c"]), - ), - (pd.Index([True, False, True, True]), pd.Index([10, 11, 12, 0, 1, 2])), - (pd.Index([True, False, True, True]), pd.Index([True, True])), - (pd.RangeIndex(0, 10, name="a"), pd.Index([5, 6, 7], name="b")), - (pd.Index(["a", "b", "c"], dtype="category"), pd.Index(["a", "b"])), - (pd.Index(["a", "b", "c"], dtype="category"), pd.Index([1, 2, 3])), - (pd.Index([0, 1, 2], dtype="category"), pd.RangeIndex(0, 10)), - (pd.Index(["a", "b", "c"], name="abc"), []), - (pd.Index([], name="abc"), pd.RangeIndex(0, 4)), - (pd.Index([1, 2, 3]), pd.Index([1, 2], dtype="category")), - (pd.Index([]), pd.Index([1, 2], dtype="category")), - ], -) -@pytest.mark.parametrize("sort", [None, False, True]) -@pytest.mark.parametrize("pandas_compatible", [True, False]) -def test_intersection_index(idx1, idx2, sort, pandas_compatible): - expected = idx1.intersection(idx2, sort=sort) - - with cudf.option_context("mode.pandas_compatible", pandas_compatible): - idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 - idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 - - actual = idx1.intersection(idx2, sort=sort) - - # TODO: Resolve the bool vs ints mixed issue - # once pandas has a direction on this issue - # https://github.com/pandas-dev/pandas/issues/44000 - assert_eq( - expected, - actual, - exact=False - if (idx1.dtype.kind == "b" and idx2.dtype.kind != "b") - or (idx1.dtype.kind != "b" or idx2.dtype.kind == "b") - else True, - ) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3], - ["a", "v", "d"], - [234.243, 2432.3, None], - [True, False, True], - pd.Series(["a", " ", "v"], dtype="category"), - 
pd.IntervalIndex.from_breaks([0, 1, 2, 3]), - ], -) -@pytest.mark.parametrize( - "func", - [ - "is_numeric", - "is_boolean", - "is_integer", - "is_floating", - "is_object", - "is_categorical", - "is_interval", - ], -) -def test_index_type_methods(data, func): - pidx = pd.Index(data) - gidx = cudf.from_pandas(pidx) - - with pytest.warns(FutureWarning): - expected = getattr(pidx, func)() - with pytest.warns(FutureWarning): - actual = getattr(gidx, func)() - - if gidx.dtype == np.dtype("bool") and func == "is_object": - assert_eq(False, actual) - else: - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] -) -def test_index_datetime_ceil(resolution): - cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) - pidx = cuidx.to_pandas() - - pidx_ceil = pidx.ceil(resolution) - cuidx_ceil = cuidx.ceil(resolution) - - assert_eq(pidx_ceil, cuidx_ceil) - - -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] -) -def test_index_datetime_floor(resolution): - cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) - pidx = cuidx.to_pandas() - - pidx_floor = pidx.floor(resolution) - cuidx_floor = cuidx.floor(resolution) - - assert_eq(pidx_floor, cuidx_floor) - - -@pytest.mark.parametrize( - "resolution", ["D", "h", "min", "s", "ms", "us", "ns"] -) -def test_index_datetime_round(resolution): - cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) - pidx = cuidx.to_pandas() - - pidx_floor = pidx.round(resolution) - cuidx_floor = cuidx.round(resolution) - - assert_eq(pidx_floor, cuidx_floor) - - -@pytest.mark.parametrize( - "data,nan_idx,NA_idx", - [([1, 2, 3, None], None, 3), ([2, 3, np.nan, None], 2, 3)], -) -@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): - idx = cudf.Index(data, nan_as_null=nan_as_null) - - if nan_as_null: - if nan_idx is not None: - assert idx[nan_idx] is cudf.NA - else: - if nan_idx is not None: - assert np.isnan(idx[nan_idx]) - - if NA_idx is not None: - assert idx[NA_idx] is cudf.NA - - -@pytest.mark.parametrize( - "index", - [ - pd.Index([]), - pd.Index(["a", "b", "c", "d", "e"]), - pd.Index([0, None, 9]), - pd.date_range("2019-01-01", periods=3), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - ["this", "is"], - [0, 19, 13], - ["2019-01-01 04:00:00", "2019-01-01 06:00:00", "2018-03-02 10:00:00"], - ], -) -def test_isin_index(index, values): - pidx = index - gidx = cudf.Index.from_pandas(pidx) - - is_dt_str = ( - next(iter(values), None) == "2019-01-01 04:00:00" - and len(pidx) - and pidx.dtype.kind == "M" - ) - with expect_warning_if(is_dt_str): - got = gidx.isin(values) - with expect_warning_if(PANDAS_GE_220 and is_dt_str): - expected = pidx.isin(values) - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - pd.MultiIndex.from_arrays( - [[1, 2, 3], ["red", "blue", "green"]], names=("number", "color") - ), - pd.MultiIndex.from_arrays([[], []], names=("number", "color")), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 10, 100], ["red", "blue", "green", "pink", "white"]], - names=("number", "color"), - ), - pd.MultiIndex.from_product( - [[0, 1], ["red", "blue", "green"]], names=("number", "color") - ), - ], -) -@pytest.mark.parametrize( - "values,level,err", - [ - ([(1, "red"), (2, "blue"), (0, "green")], None, None), - (["red", "orange", "yellow"], "color", None), - (["red", "white", "yellow"], "color", None), - ([0, 1, 2, 10, 11, 15], 
"number", None), - ([0, 1, 2, 10, 11, 15], None, TypeError), - (pd.Series([0, 1, 2, 10, 11, 15]), None, TypeError), - (pd.Index([0, 1, 2, 10, 11, 15]), None, TypeError), - (pd.Index([0, 1, 2, 8, 11, 15]), "number", None), - (pd.Index(["red", "white", "yellow"]), "color", None), - ([(1, "red"), (3, "red")], None, None), - (((1, "red"), (3, "red")), None, None), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3], ["red", "blue", "green"]], - names=("number", "color"), - ), - None, - None, - ), - ( - pd.MultiIndex.from_arrays([[], []], names=("number", "color")), - None, - None, - ), - ( - pd.MultiIndex.from_arrays( - [ - [1, 2, 3, 10, 100], - ["red", "blue", "green", "pink", "white"], - ], - names=("number", "color"), - ), - None, - None, - ), - ], -) -def test_isin_multiindex(data, values, level, err): - pmdx = data - gmdx = cudf.from_pandas(data) - - if err is None: - expected = pmdx.isin(values, level=level) - if isinstance(values, pd.MultiIndex): - values = cudf.from_pandas(values) - got = gmdx.isin(values, level=level) - - assert_eq(got, expected) - else: - assert_exceptions_equal( - lfunc=pmdx.isin, - rfunc=gmdx.isin, - lfunc_args_and_kwargs=([values], {"level": level}), - rfunc_args_and_kwargs=([values], {"level": level}), - check_exception_type=False, - ) - - -range_data = [ - range(np.random.randint(0, 100)), - range(9, 12, 2), - range(20, 30), - range(100, 1000, 10), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), -] - - -@pytest.fixture(params=range_data) -def rangeindex(request): - """Create a cudf RangeIndex of different `nrows`""" - return RangeIndex(request.param) - - -@pytest.mark.parametrize( - "func", - ["nunique", "min", "max", "any", "values"], -) -def test_rangeindex_methods(rangeindex, func): - gidx = rangeindex - pidx = gidx.to_pandas() - - if func == "values": - expected = pidx.values - actual = gidx.values - else: - expected = getattr(pidx, func)() - actual = getattr(gidx, func)() - - assert_eq(expected, actual) - - -def test_index_constructor_integer(default_integer_bitwidth): - got = cudf.Index([1, 2, 3]) - expect = cudf.Index([1, 2, 3], dtype=f"int{default_integer_bitwidth}") - - assert_eq(expect, got) - - -def test_index_constructor_float(default_float_bitwidth): - got = cudf.Index([1.0, 2.0, 3.0]) - expect = cudf.Index( - [1.0, 2.0, 3.0], dtype=f"float{default_float_bitwidth}" - ) - - assert_eq(expect, got) - - -def test_rangeindex_union_default_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for union operation. - idx1 = cudf.RangeIndex(0, 2) - idx2 = cudf.RangeIndex(5, 6) - - expected = cudf.Index([0, 1, 5], dtype=f"int{default_integer_bitwidth}") - actual = idx1.union(idx2) - - assert_eq(expected, actual) - - -def test_rangeindex_intersection_default_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for intersection operation. - idx1 = cudf.RangeIndex(0, 100) - # Intersecting two RangeIndex will _always_ result in a RangeIndex, use - # regular index here to force materializing. - idx2 = cudf.Index([50, 102]) - - expected = cudf.Index([50], dtype=f"int{default_integer_bitwidth}") - actual = idx1.intersection(idx2) - - assert_eq(expected, actual) - - -def test_rangeindex_take_default_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for take operation. 
- idx = cudf.RangeIndex(0, 100) - actual = idx.take([0, 3, 7, 62]) - expected = cudf.Index( - [0, 3, 7, 62], dtype=f"int{default_integer_bitwidth}" - ) - assert_eq(expected, actual) - - -def test_rangeindex_apply_boolean_mask_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for apply boolean mask operation. - idx = cudf.RangeIndex(0, 8) - mask = [True, True, True, False, False, False, True, False] - actual = idx[mask] - expected = cudf.Index([0, 1, 2, 6], dtype=f"int{default_integer_bitwidth}") - assert_eq(expected, actual) - - -def test_rangeindex_repeat_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for repeat operation. - idx = cudf.RangeIndex(0, 3) - actual = idx.repeat(3) - expected = cudf.Index( - [0, 0, 0, 1, 1, 1, 2, 2, 2], dtype=f"int{default_integer_bitwidth}" - ) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "op, expected, expected_kind", - [ - (lambda idx: 2**idx, [2, 4, 8, 16], "int"), - (lambda idx: idx**2, [1, 4, 9, 16], "int"), - (lambda idx: idx / 2, [0.5, 1, 1.5, 2], "float"), - (lambda idx: 2 / idx, [2, 1, 2 / 3, 0.5], "float"), - (lambda idx: idx % 3, [1, 2, 0, 1], "int"), - (lambda idx: 3 % idx, [0, 1, 0, 3], "int"), - ], -) -def test_rangeindex_binops_user_option( - op, expected, expected_kind, default_integer_bitwidth -): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for binary operation. - idx = cudf.RangeIndex(1, 5) - actual = op(idx) - expected = cudf.Index( - expected, dtype=f"{expected_kind}{default_integer_bitwidth}" - ) - assert_eq( - expected, - actual, - ) - - -@pytest.mark.parametrize( - "op", [operator.add, operator.sub, operator.mul, operator.truediv] -) -def test_rangeindex_binop_diff_names_none(op): - idx1 = cudf.RangeIndex(10, 13, name="foo") - idx2 = cudf.RangeIndex(13, 16, name="bar") - result = op(idx1, idx2) - expected = op(idx1.to_pandas(), idx2.to_pandas()) - assert_eq(result, expected) - assert result.name is None - - -def test_rangeindex_join_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for join. - idx1 = cudf.RangeIndex(0, 10, name="a") - idx2 = cudf.RangeIndex(5, 15, name="b") - - actual = idx1.join(idx2, how="inner", sort=True) - expected = idx1.to_pandas().join(idx2.to_pandas(), how="inner", sort=True) - assert actual.dtype == cudf.dtype(f"int{default_integer_bitwidth}") - # exact=False to ignore dtype comparison, - # because `default_integer_bitwidth` is cudf only option - assert_eq(expected, actual, exact=False) - - -def test_rangeindex_where_user_option(default_integer_bitwidth): - # Test that RangeIndex is materialized into 32 bit index under user - # configuration for where operation. 
- idx = cudf.RangeIndex(0, 10) - mask = [True, False, True, False, True, False, True, False, True, False] - actual = idx.where(mask, -1) - expected = cudf.Index( - [0, -1, 2, -1, 4, -1, 6, -1, 8, -1], - dtype=f"int{default_integer_bitwidth}", - ) - assert_eq(expected, actual) - - -def test_rangeindex_append_return_rangeindex(): - idx = cudf.RangeIndex(0, 10) - result = idx.append([]) - assert_eq(idx, result) - - result = idx.append(cudf.Index([10])) - expected = cudf.RangeIndex(0, 11) - assert_eq(result, expected) - - -index_data = [ - range(np.random.randint(0, 100)), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), - range(0, 1), - [1, 2, 3, 1, None, None], - [None, None, 3.2, 1, None, None], - [None, "a", "3.2", "z", None, None], - pd.Series(["a", "b", None], dtype="category"), - np.array([1, 2, 3, None], dtype="datetime64[s]"), -] - - -@pytest.fixture(params=index_data) -def index(request): - """Create a cudf Index of different dtypes""" - return cudf.Index(request.param) - - -@pytest.mark.parametrize( - "func", - [ - "to_series", - "isna", - "notna", - "append", - ], -) -def test_index_methods(index, func): - gidx = index - pidx = gidx.to_pandas() - - if func == "append": - expected = pidx.append(other=pidx) - actual = gidx.append(other=gidx) - else: - expected = getattr(pidx, func)() - actual = getattr(gidx, func)() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx, values", - [ - (range(100, 1000, 10), [200, 600, 800]), - ([None, "a", "3.2", "z", None, None], ["a", "z"]), - (pd.Series(["a", "b", None], dtype="category"), [10, None]), - ], -) -def test_index_isin_values(idx, values): - gidx = cudf.Index(idx) - pidx = gidx.to_pandas() - - actual = gidx.isin(values) - expected = pidx.isin(values) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx, scalar", - [ - (range(0, -10, -2), -4), - ([None, "a", "3.2", "z", None, None], "x"), - (pd.Series(["a", "b", None], dtype="category"), 10), - ], -) -def test_index_isin_scalar_values(idx, scalar): - gidx = cudf.Index(idx) - - with pytest.raises( - TypeError, - match=re.escape( - f"only list-like objects are allowed to be passed " - f"to isin(), you passed a {type(scalar).__name__}" - ), - ): - gidx.isin(scalar) - - -def test_index_any(): - gidx = cudf.Index([1, 2, 3]) - pidx = gidx.to_pandas() - - assert_eq(pidx.any(), gidx.any()) - - -def test_index_values(): - gidx = cudf.Index([1, 2, 3]) - pidx = gidx.to_pandas() - - assert_eq(pidx.values, gidx.values) - - -def test_index_null_values(): - gidx = cudf.Index([1.0, None, 3, 0, None]) - with pytest.raises(ValueError): - gidx.values - - -def test_index_error_list_index(): - s = cudf.Series([[1, 2], [2], [4]]) - with pytest.raises( - NotImplementedError, - match=re.escape( - "Unsupported column type passed to create an " - "Index: " - ), - ): - cudf.Index(s) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3], - pytest.param( - [np.nan, 10, 15, 16], - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/49818" - ), - ), - range(0, 10), - [np.nan, None, 10, 20], - ["ab", "zx", "pq"], - ["ab", "zx", None, "pq"], - ], -) -def test_index_hasnans(data): - gs = cudf.Index(data, nan_as_null=False) - if isinstance(gs, cudf.RangeIndex): - with pytest.raises(NotImplementedError): - gs.to_pandas(nullable=True) - else: - ps = gs.to_pandas(nullable=True) - # Check type to avoid mixing Python bool and NumPy bool - assert isinstance(gs.hasnans, bool) - assert gs.hasnans == ps.hasnans - - -@pytest.mark.parametrize( - "data", - [ 
- [1, 2, 3, 1, 1, 3, 2, 3], - [np.nan, 10, 15, 16, np.nan, 10, 16], - range(0, 10), - ["ab", "zx", None, "pq", "ab", None, "zx", None], - ], -) -@pytest.mark.parametrize("keep", ["first", "last", False]) -def test_index_duplicated(data, keep): - gs = cudf.Index(data) - ps = gs.to_pandas() - - expected = ps.duplicated(keep=keep) - actual = gs.duplicated(keep=keep) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,expected_dtype", - [ - ([10, 11, 12], pd.Int64Dtype()), - ([0.1, 10.2, 12.3], pd.Float64Dtype()), - (["abc", None, "def"], pd.StringDtype()), - ], -) -def test_index_to_pandas_nullable(data, expected_dtype): - gi = cudf.Index(data) - pi = gi.to_pandas(nullable=True) - expected = pd.Index(data, dtype=expected_dtype) - - assert_eq(pi, expected) - - -class TestIndexScalarGetItem: - @pytest.fixture( - params=[range(1, 10, 2), [1, 2, 3], ["a", "b", "c"], [1.5, 2.5, 3.5]] - ) - def index_values(self, request): - return request.param - - @pytest.fixture(params=[int, np.int8, np.int32, np.int64]) - def i(self, request): - return request.param(1) - - def test_scalar_getitem(self, index_values, i): - index = cudf.Index(index_values) - - assert not isinstance(index[i], cudf.Index) - assert index[i] == index_values[i] - assert_eq(index, index.to_pandas()) - - -@pytest.mark.parametrize( - "data", - [ - [ - pd.Timestamp("1970-01-01 00:00:00.000000001"), - pd.Timestamp("1970-01-01 00:00:00.000000002"), - 12, - 20, - ], - [ - pd.Timedelta(10), - pd.Timedelta(20), - 12, - 20, - ], - [1, 2, 3, 4], - ], -) -def test_index_mixed_dtype_error(data): - pi = pd.Index(data, dtype="object") - with pytest.raises(TypeError): - cudf.Index(pi) - - -@pytest.mark.parametrize("cls", [pd.DatetimeIndex, pd.TimedeltaIndex]) -def test_index_date_duration_freq_error(cls): - s = cls([1, 2, 3], freq="infer") - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.Index(s) - - -@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) -def test_index_getitem_time_duration(dtype): - gidx = cudf.Index([1, 2, 3, 4, None], dtype=dtype) - pidx = gidx.to_pandas() - with cudf.option_context("mode.pandas_compatible", True): - for i in range(len(gidx)): - if i == 4: - assert gidx[i] is pidx[i] - else: - assert_eq(gidx[i], pidx[i]) - - -@pytest.mark.parametrize("dtype", ALL_TYPES) -def test_index_empty_from_pandas(dtype): - pidx = pd.Index([], dtype=dtype) - gidx = cudf.from_pandas(pidx) - - assert_eq(pidx, gidx) - - -def test_empty_index_init(): - pidx = pd.Index([]) - gidx = cudf.Index([]) - - assert_eq(pidx, gidx) - - -@pytest.mark.parametrize( - "data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)] -) -@pytest.mark.parametrize("data_name", [None, 1, "abc"]) -@pytest.mark.parametrize("index", [True, False]) -@pytest.mark.parametrize("name", [None, no_default, 1, "abc"]) -def test_index_to_frame(data, data_name, index, name): - pidx = pd.Index(data, name=data_name) - gidx = cudf.from_pandas(pidx) - - expected = pidx.to_frame(index=index, name=name) - actual = gidx.to_frame(index=index, name=name) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [[1, 2, 3], range(0, 10)]) -@pytest.mark.parametrize("dtype", ["str", "int64", "float64"]) -def test_index_with_index_dtype(data, dtype): - pidx = pd.Index(data) - gidx = cudf.Index(data) - - expected = pd.Index(pidx, dtype=dtype) - actual = cudf.Index(gidx, dtype=dtype) - - assert_eq(expected, actual) - - -def test_period_index_error(): - pidx = 
pd.PeriodIndex(data=[pd.Period("2020-01")]) - with pytest.raises(NotImplementedError): - cudf.from_pandas(pidx) - with pytest.raises(NotImplementedError): - cudf.Index(pidx) - with pytest.raises(NotImplementedError): - cudf.Series(pidx) - with pytest.raises(NotImplementedError): - cudf.Series(pd.Series(pidx)) - with pytest.raises(NotImplementedError): - cudf.Series(pd.array(pidx)) - - -def test_index_from_dataframe_valueerror(): - with pytest.raises(ValueError): - cudf.Index(cudf.DataFrame(range(1))) - - -def test_index_from_scalar_valueerror(): - with pytest.raises(ValueError): - cudf.Index(11) - - -@pytest.mark.parametrize("idx", [0, np.int64(0)]) -def test_index_getitem_from_int(idx): - result = cudf.Index([1, 2])[idx] - assert result == 1 - - -@pytest.mark.parametrize("idx", [1.5, True, "foo"]) -def test_index_getitem_from_nonint_raises(idx): - with pytest.raises(ValueError): - cudf.Index([1, 2])[idx] - - -@pytest.mark.parametrize( - "data", - [ - cp.ones(5, dtype=cp.float16), - np.ones(5, dtype="float16"), - pd.Series([0.1, 1.2, 3.3], dtype="float16"), - pytest.param( - pa.array(np.ones(5, dtype="float16")), - marks=pytest.mark.xfail( - reason="https://issues.apache.org/jira/browse/ARROW-13762" - ), - ), - ], -) -def test_index_raises_float16(data): - with pytest.raises(TypeError): - cudf.Index(data) - - -def test_from_pandas_rangeindex_return_rangeindex(): - pidx = pd.RangeIndex(start=3, stop=9, step=3, name="a") - result = cudf.Index.from_pandas(pidx) - expected = cudf.RangeIndex(start=3, stop=9, step=3, name="a") - assert_eq(result, expected, exact=True) - - -@pytest.mark.parametrize( - "idx", - [ - cudf.RangeIndex(1), - cudf.DatetimeIndex(np.array([1, 2], dtype="datetime64[ns]")), - cudf.TimedeltaIndex(np.array([1, 2], dtype="timedelta64[ns]")), - ], -) -def test_index_to_pandas_nullable_notimplemented(idx): - with pytest.raises(NotImplementedError): - idx.to_pandas(nullable=True) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - ], -) -def test_index_to_pandas_arrow_type_nullable_raises(scalar): - pa_array = pa.array([scalar, None]) - idx = cudf.Index(pa_array) - with pytest.raises(ValueError): - idx.to_pandas(nullable=True, arrow_type=True) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - ], -) -def test_index_to_pandas_arrow_type(scalar): - pa_array = pa.array([scalar, None]) - idx = cudf.Index(pa_array) - result = idx.to_pandas(arrow_type=True) - expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array)) - pd.testing.assert_index_equal(result, expected) - - -@pytest.mark.parametrize("data", [range(-3, 3), range(1, 3), range(0)]) -def test_rangeindex_all(data): - result = cudf.RangeIndex(data).all() - expected = cudf.Index(list(data)).all() - assert result == expected - - -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("data", [range(2), range(2, -1, -1)]) -def test_rangeindex_factorize(sort, data): - res_codes, res_uniques = cudf.RangeIndex(data).factorize(sort=sort) - exp_codes, exp_uniques = cudf.Index(list(data)).factorize(sort=sort) - assert_eq(res_codes, exp_codes) - assert_eq(res_uniques, exp_uniques) - - -def test_rangeindex_dropna(): - ri = cudf.RangeIndex(range(2)) - result = ri.dropna() - expected = ri.copy() - assert_eq(result, expected) - - -def test_rangeindex_unique_shallow_copy(): - ri_pandas = pd.RangeIndex(1) - result = ri_pandas.unique() - assert result is 
not ri_pandas - - ri_cudf = cudf.RangeIndex(1) - result = ri_cudf.unique() - assert result is not ri_cudf - assert_eq(result, ri_cudf) - - -def test_rename_shallow_copy(): - idx = pd.Index([1]) - result = idx.rename("a") - assert idx.to_numpy(copy=False) is result.to_numpy(copy=False) - - idx = cudf.Index([1]) - result = idx.rename("a") - assert idx._column is result._column - - -@pytest.mark.parametrize("data", [range(2), [10, 11, 12]]) -def test_index_contains_hashable(data): - gidx = cudf.Index(data) - pidx = gidx.to_pandas() - - assert_exceptions_equal( - lambda: [] in gidx, - lambda: [] in pidx, - lfunc_args_and_kwargs=((),), - rfunc_args_and_kwargs=((),), - ) - - -@pytest.mark.parametrize("data", [[0, 1, 2], [1.1, 2.3, 4.5]]) -@pytest.mark.parametrize("dtype", ["int32", "float32", "float64"]) -@pytest.mark.parametrize("needle", [0, 1, 2.3]) -def test_index_contains_float_int(data, dtype, needle): - gidx = cudf.Index(data=data, dtype=dtype) - pidx = gidx.to_pandas() - - actual = needle in gidx - expected = needle in pidx - - assert_eq(actual, expected) - - -def test_Index_init_with_nans(): - with cudf.option_context("mode.pandas_compatible", True): - gi = cudf.Index([1, 2, 3, np.nan]) - assert gi.dtype == np.dtype("float64") - pi = pd.Index([1, 2, 3, np.nan]) - assert_eq(pi, gi) - - -def test_index_datetime_repeat(): - gidx = cudf.date_range("2021-01-01", periods=3, freq="D") - pidx = gidx.to_pandas() - - actual = gidx.repeat(5) - expected = pidx.repeat(5) - - assert_eq(actual, expected) - - actual = gidx.to_frame().repeat(5) - - assert_eq(actual.index, expected) - - -@pytest.mark.parametrize( - "index", - [ - cudf.Index([1]), - cudf.RangeIndex(1), - cudf.MultiIndex(levels=[[0]], codes=[[0]]), - ], -) -def test_index_assignment_no_shallow_copy(index): - df = cudf.DataFrame(range(1)) - df.index = index - assert df.index is index - - -def test_bool_rangeindex_raises(): - assert_exceptions_equal( - lfunc=bool, - rfunc=bool, - lfunc_args_and_kwargs=[[pd.RangeIndex(0)]], - rfunc_args_and_kwargs=[[cudf.RangeIndex(0)]], - ) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py deleted file mode 100644 index 00ae99466bb..00000000000 --- a/python/cudf/cudf/tests/test_indexing.py +++ /dev/null @@ -1,2389 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -import weakref -from datetime import datetime -from itertools import combinations - -import cupy -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.testing import _utils as utils, assert_eq -from cudf.testing._utils import ( - INTEGER_TYPES, - assert_exceptions_equal, - expect_warning_if, -) - -index_dtypes = INTEGER_TYPES - - -@pytest.fixture -def pdf_gdf(): - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": ["c", "d", "e"]}, index=["one", "two", "three"] - ) - gdf = cudf.from_pandas(pdf) - return pdf, gdf - - -@pytest.fixture -def pdf_gdf_multi(): - pdf = pd.DataFrame(np.random.rand(7, 5)) - pdfIndex = pd.MultiIndex( - [ - ["a", "b", "c"], - ["house", "store", "forest"], - ["clouds", "clear", "storm"], - ["fire", "smoke", "clear"], - ], - [ - [0, 0, 0, 0, 1, 1, 2], - [1, 1, 1, 1, 0, 0, 2], - [0, 0, 2, 2, 2, 0, 1], - [0, 0, 0, 1, 2, 0, 1], - ], - ) - pdfIndex.names = ["alpha", "location", "weather", "sign"] - pdf.index = pdfIndex - gdf = cudf.from_pandas(pdf) - return pdf, gdf - - -@pytest.mark.parametrize( - "i1, i2, i3", - ( - [ - (slice(None, 12), slice(3, None), slice(None, None, 2)), - (range(12), range(3, 12), range(0, 9, 2)), - (np.arange(12), np.arange(3, 12), np.arange(0, 9, 2)), - (list(range(12)), list(range(3, 12)), list(range(0, 9, 2))), - ( - pd.Series(range(12)), - pd.Series(range(3, 12)), - pd.Series(range(0, 9, 2)), - ), - ( - cudf.Series(range(12)), - cudf.Series(range(3, 12)), - cudf.Series(range(0, 9, 2)), - ), - ( - [i in range(12) for i in range(20)], - [i in range(3, 12) for i in range(12)], - [i in range(0, 9, 2) for i in range(9)], - ), - ( - np.array([i in range(12) for i in range(20)], dtype=bool), - np.array([i in range(3, 12) for i in range(12)], dtype=bool), - np.array([i in range(0, 9, 2) for i in range(9)], dtype=bool), - ), - ] - + [ - ( - np.arange(12, dtype=t), - np.arange(3, 12, dtype=t), - np.arange(0, 9, 2, dtype=t), - ) - for t in index_dtypes - ] - ), - ids=( - [ - "slice", - "range", - "numpy.array", - "list", - "pandas.Series", - "Series", - "list[bool]", - "numpy.array[bool]", - ] - + ["numpy.array[%s]" % np.dtype(t).type.__name__ for t in index_dtypes] - ), -) -def test_series_indexing(i1, i2, i3): - a1 = np.arange(20) - series = cudf.Series(a1) - - # Indexing - sr1 = series.iloc[i1] - assert sr1.null_count == 0 - np.testing.assert_equal(sr1.to_numpy(), a1[:12]) - - sr2 = sr1.iloc[i2] - assert sr2.null_count == 0 - np.testing.assert_equal(sr2.to_numpy(), a1[3:12]) - - # Index with stride - sr3 = sr2.iloc[i3] - assert sr3.null_count == 0 - np.testing.assert_equal(sr3.to_numpy(), a1[3:12:2]) - - # Integer indexing - if isinstance(i1, range): - for i in i1: # Python int-s - assert series[i] == a1[i] - if isinstance(i1, np.ndarray) and i1.dtype in index_dtypes: - for i in i1: # numpy integers - assert series[i] == a1[i] - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "arg", - [ - 1, - -1, - "b", - np.int32(1), - np.uint32(1), - np.int8(1), - np.uint8(1), - np.int16(1), - np.uint16(1), - np.int64(1), - np.uint64(1), - ], -) -def test_series_get_item_iloc_defer(arg): - # Indexing for non-numeric dtype Index - ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) - gs = cudf.from_pandas(ps) - - arg_not_str = not isinstance(arg, str) - with expect_warning_if(arg_not_str): - expect = ps[arg] - with 
expect_warning_if(arg_not_str): - got = gs[arg] - - assert_eq(expect, got) - - -def test_series_iloc_defer_cudf_scalar(): - ps = pd.Series([1, 2, 3], index=pd.Index(["a", "b", "c"])) - gs = cudf.from_pandas(ps) - - for t in index_dtypes: - arg = cudf.Scalar(1, dtype=t) - got = gs.iloc[arg] - expect = 2 - assert_eq(expect, got) - - -def test_series_indexing_large_size(): - n_elem = 100_000 - gsr = cudf.Series(cupy.ones(n_elem)) - gsr[0] = None - got = gsr[gsr.isna()] - expect = cudf.Series([None], dtype="float64") - - assert_eq(expect, got) - - -@pytest.mark.parametrize("psr", [pd.Series([1, 2, 3], index=["a", "b", "c"])]) -@pytest.mark.parametrize( - "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]] -) -def test_series_get_item(psr, arg): - gsr = cudf.from_pandas(psr) - - expect = psr[arg] - got = gsr[arg] - - assert_eq(expect, got) - - -def test_dataframe_column_name_indexing(): - df = cudf.DataFrame() - data = np.asarray(range(10), dtype=np.int32) - df["a"] = data - df[1] = data - np.testing.assert_equal( - df["a"].to_numpy(), np.asarray(range(10), dtype=np.int32) - ) - np.testing.assert_equal( - df[1].to_numpy(), np.asarray(range(10), dtype=np.int32) - ) - - pdf = pd.DataFrame() - nelem = 10 - pdf["key1"] = np.random.randint(0, 5, nelem) - pdf["key2"] = np.random.randint(0, 3, nelem) - pdf[1] = np.arange(1, 1 + nelem) - pdf[2] = np.random.random(nelem) - df = cudf.from_pandas(pdf) - - assert_eq(df[df.columns], df) - assert_eq(df[df.columns[:1]], df[["key1"]]) - - for i in range(1, len(pdf.columns) + 1): - for idx in combinations(pdf.columns, i): - assert pdf[list(idx)].equals(df[list(idx)].to_pandas()) - - # test for only numeric columns - df = pd.DataFrame() - for i in range(0, 10): - df[i] = range(nelem) - gdf = cudf.DataFrame.from_pandas(df) - assert_eq(gdf, df) - - assert_eq(gdf[gdf.columns], gdf) - assert_eq(gdf[gdf.columns[:3]], gdf[[0, 1, 2]]) - - -def test_dataframe_slicing(): - df = cudf.DataFrame() - size = 123 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) - df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype( - np.int64 - ) - df["d"] = hd = np.random.random(size).astype(np.float64) - - # Row slice first 10 - first_10 = df[:10] - assert len(first_10) == 10 - assert tuple(first_10.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(first_10["a"].to_numpy(), ha[:10]) - np.testing.assert_equal(first_10["b"].to_numpy(), hb[:10]) - np.testing.assert_equal(first_10["c"].to_numpy(), hc[:10]) - np.testing.assert_equal(first_10["d"].to_numpy(), hd[:10]) - del first_10 - - # Row slice last 10 - last_10 = df[-10:] - assert len(last_10) == 10 - assert tuple(last_10.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(last_10["a"].to_numpy(), ha[-10:]) - np.testing.assert_equal(last_10["b"].to_numpy(), hb[-10:]) - np.testing.assert_equal(last_10["c"].to_numpy(), hc[-10:]) - np.testing.assert_equal(last_10["d"].to_numpy(), hd[-10:]) - del last_10 - - # Row slice [begin:end] - begin = 7 - end = 121 - subrange = df[begin:end] - assert len(subrange) == end - begin - assert tuple(subrange.columns) == ("a", "b", "c", "d") - np.testing.assert_equal(subrange["a"].to_numpy(), ha[begin:end]) - np.testing.assert_equal(subrange["b"].to_numpy(), hb[begin:end]) - np.testing.assert_equal(subrange["c"].to_numpy(), hc[begin:end]) - np.testing.assert_equal(subrange["d"].to_numpy(), hd[begin:end]) - del subrange - - -@pytest.mark.parametrize("step", [1, 2, 5]) 
-@pytest.mark.parametrize("scalar", [0, 20, 100]) -def test_dataframe_loc(scalar, step): - size = 123 - pdf = pd.DataFrame( - { - "a": np.random.randint(low=0, high=100, size=size), - "b": np.random.random(size).astype(np.float32), - "c": np.random.random(size).astype(np.float64), - "d": np.random.random(size).astype(np.float64), - } - ) - pdf.index.name = "index" - - df = cudf.DataFrame.from_pandas(pdf) - - assert_eq(df.loc[:, ["a"]], pdf.loc[:, ["a"]]) - - assert_eq(df.loc[:, "d"], pdf.loc[:, "d"]) - - # Scalar label - assert_eq(df.loc[scalar], pdf.loc[scalar]) - - # Full slice - assert_eq(df.loc[:, "c"], pdf.loc[:, "c"]) - - # Repeat with at[] - assert_eq(df.loc[:, ["a"]], df.at[:, ["a"]]) - assert_eq(df.loc[:, "d"], df.at[:, "d"]) - assert_eq(df.loc[scalar], df.at[scalar]) - assert_eq(df.loc[:, "c"], df.at[:, "c"]) - - begin = 110 - end = 122 - - assert_eq( - df.loc[begin:end:step, ["c", "d", "a"]], - pdf.loc[begin:end:step, ["c", "d", "a"]], - ) - - assert_eq(df.loc[begin:end, ["c", "d"]], pdf.loc[begin:end, ["c", "d"]]) - - # Slicing on columns: - assert_eq( - df.loc[begin:end:step, "a":"c"], pdf.loc[begin:end:step, "a":"c"] - ) - - # Slicing of size 1: - assert_eq(df.loc[begin:begin, "a"], pdf.loc[begin:begin, "a"]) - - # TODO: Pandas changes the dtype here when it shouldn't - assert_eq( - df.loc[begin, "a":"a"], pdf.loc[begin, "a":"a"], check_dtype=False - ) - - # Repeat with at[] - assert_eq( - df.loc[begin:end:step, ["c", "d", "a"]], - df.at[begin:end:step, ["c", "d", "a"]], - ) - assert_eq(df.loc[begin:end, ["c", "d"]], df.at[begin:end, ["c", "d"]]) - assert_eq(df.loc[begin:end:step, "a":"c"], df.at[begin:end:step, "a":"c"]) - assert_eq(df.loc[begin:begin, "a"], df.at[begin:begin, "a"]) - assert_eq(df.loc[begin, "a":"a"], df.at[begin, "a":"a"], check_dtype=False) - - # Make int64 index - offset = 50 - df2 = df[offset:] - pdf2 = pdf[offset:] - begin = 117 - end = 122 - assert_eq( - df2.loc[begin:end, ["c", "d", "a"]], - pdf2.loc[begin:end, ["c", "d", "a"]], - ) - - # loc with list like indexing - assert_eq(df.loc[[0]], pdf.loc[[0]]) - # loc with column like indexing - assert_eq(df.loc[cudf.Series([0])], pdf.loc[pd.Series([0])]) - assert_eq(df.loc[cudf.Series([0])._column], pdf.loc[pd.Series([0])]) - assert_eq(df.loc[np.array([0])], pdf.loc[np.array([0])]) - - -def test_dataframe_loc_duplicate_index_scalar(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 2, 1, 4, 2]) - gdf = cudf.DataFrame.from_pandas(pdf) - - pdf_sorted = pdf.sort_values(by=list(pdf.columns), axis=0) - gdf_sorted = gdf.sort_values(by=list(gdf.columns), axis=0) - - assert_eq(pdf_sorted, gdf_sorted) - - -@pytest.mark.parametrize( - "mask", - [[True, False, False, False, False], [True, False, True, False, True]], -) -@pytest.mark.parametrize("arg", ["a", slice("a", "a"), slice("a", "b")]) -def test_dataframe_loc_mask(mask, arg): - pdf = pd.DataFrame( - {"a": ["a", "b", "c", "d", "e"], "b": ["f", "g", "h", "i", "j"]} - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - assert_eq(pdf.loc[mask, arg], gdf.loc[mask, arg]) - - -def test_dataframe_loc_outbound(): - df = cudf.DataFrame() - size = 10 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) - - pdf = pd.DataFrame() - pdf["a"] = ha - pdf["b"] = hb - - assert_exceptions_equal(lambda: pdf.loc[11], lambda: df.loc[11]) - - -def test_series_loc_numerical(): - ps = pd.Series([1, 2, 3, 4, 5], index=[5, 6, 7, 8, 9]) - gs = cudf.Series.from_pandas(ps) - - assert_eq(ps.loc[5], 
gs.loc[5]) - assert_eq(ps.loc[6], gs.loc[6]) - assert_eq(ps.loc[6:8], gs.loc[6:8]) - assert_eq(ps.loc[:8], gs.loc[:8]) - assert_eq(ps.loc[6:], gs.loc[6:]) - assert_eq(ps.loc[::2], gs.loc[::2]) - assert_eq(ps.loc[[5, 8, 9]], gs.loc[[5, 8, 9]]) - assert_eq( - ps.loc[[True, False, True, False, True]], - gs.loc[[True, False, True, False, True]], - ) - assert_eq(ps.loc[[5, 8, 9]], gs.loc[cupy.array([5, 8, 9])]) - - -def test_series_loc_float_index(): - ps = pd.Series([1, 2, 3, 4, 5], index=[5.43, 6.34, 7.34, 8.0, 9.1]) - gs = cudf.Series.from_pandas(ps) - - assert_eq(ps.loc[5.43], gs.loc[5.43]) - assert_eq(ps.loc[8], gs.loc[8]) - assert_eq(ps.loc[6.1:8], gs.loc[6.1:8]) - assert_eq(ps.loc[:7.1], gs.loc[:7.1]) - assert_eq(ps.loc[6.345:], gs.loc[6.345:]) - assert_eq(ps.loc[::2], gs.loc[::2]) - assert_eq( - ps.loc[[True, False, True, False, True]], - gs.loc[[True, False, True, False, True]], - ) - - -def test_series_loc_string(): - ps = pd.Series( - [1, 2, 3, 4, 5], index=["one", "two", "three", "four", "five"] - ) - gs = cudf.Series.from_pandas(ps) - - assert_eq(ps.loc["one"], gs.loc["one"]) - assert_eq(ps.loc["five"], gs.loc["five"]) - assert_eq(ps.loc["two":"four"], gs.loc["two":"four"]) - assert_eq(ps.loc[:"four"], gs.loc[:"four"]) - assert_eq(ps.loc["two":], gs.loc["two":]) - assert_eq(ps.loc[::2], gs.loc[::2]) - assert_eq(ps.loc[["one", "four", "five"]], gs.loc[["one", "four", "five"]]) - assert_eq( - ps.loc[[True, False, True, False, True]], - gs.loc[[True, False, True, False, True]], - ) - - -def test_series_loc_datetime(): - ps = pd.Series( - [1, 2, 3, 4, 5], index=pd.date_range("20010101", "20010105") - ) - gs = cudf.Series.from_pandas(ps) - - # a few different ways of specifying a datetime label: - assert_eq(ps.loc["20010101"], gs.loc["20010101"]) - assert_eq(ps.loc["2001-01-01"], gs.loc["2001-01-01"]) - assert_eq( - ps.loc[pd.to_datetime("2001-01-01")], - gs.loc[pd.to_datetime("2001-01-01")], - ) - assert_eq( - ps.loc[np.datetime64("2001-01-01")], - gs.loc[np.datetime64("2001-01-01")], - ) - - assert_eq( - ps.loc["2001-01-02":"2001-01-05"], - gs.loc["2001-01-02":"2001-01-05"], - check_freq=False, - ) - assert_eq(ps.loc["2001-01-02":], gs.loc["2001-01-02":], check_freq=False) - assert_eq(ps.loc[:"2001-01-04"], gs.loc[:"2001-01-04"], check_freq=False) - assert_eq(ps.loc[::2], gs.loc[::2], check_freq=False) - - assert_eq( - ps.loc[["2001-01-01", "2001-01-04", "2001-01-05"]], - gs.loc[["2001-01-01", "2001-01-04", "2001-01-05"]], - ) - - assert_eq( - ps.loc[ - [ - pd.to_datetime("2001-01-01"), - pd.to_datetime("2001-01-04"), - pd.to_datetime("2001-01-05"), - ] - ], - gs.loc[ - [ - pd.to_datetime("2001-01-01"), - pd.to_datetime("2001-01-04"), - pd.to_datetime("2001-01-05"), - ] - ], - ) - assert_eq( - ps.loc[[True, False, True, False, True]], - gs.loc[[True, False, True, False, True]], - check_freq=False, - ) - - just_less_than_max = ps.index.max() - pd.Timedelta("5m") - - assert_eq( - ps.loc[:just_less_than_max], - gs.loc[:just_less_than_max], - check_freq=False, - ) - - -def test_series_loc_categorical(): - ps = pd.Series( - [1, 2, 3, 4, 5], index=pd.Categorical(["a", "b", "c", "d", "e"]) - ) - gs = cudf.Series.from_pandas(ps) - - assert_eq(ps.loc["a"], gs.loc["a"]) - assert_eq(ps.loc["e"], gs.loc["e"]) - assert_eq(ps.loc["b":"d"], gs.loc["b":"d"]) - assert_eq(ps.loc[:"d"], gs.loc[:"d"]) - assert_eq(ps.loc["b":], gs.loc["b":]) - assert_eq(ps.loc[::2], gs.loc[::2]) - - # order of categories changes, so we can only - # compare values: - assert_eq( - ps.loc[["a", "d", "e"]].values, 
gs.loc[["a", "d", "e"]].to_numpy() - ) - - assert_eq( - ps.loc[[True, False, True, False, True]], - gs.loc[[True, False, True, False, True]], - ) - - -@pytest.mark.parametrize( - "obj", - [ - pd.DataFrame( - {"a": [1, 2, 3, 4]}, - index=pd.MultiIndex.from_frame( - pd.DataFrame( - {"A": [2, 3, 1, 4], "B": ["low", "high", "high", "low"]} - ) - ), - ), - pd.Series( - [1, 2, 3, 4], - index=pd.MultiIndex.from_frame( - pd.DataFrame( - {"A": [2, 3, 1, 4], "B": ["low", "high", "high", "low"]} - ) - ), - ), - ], -) -def test_dataframe_series_loc_multiindex(obj): - pindex = pd.MultiIndex.from_frame( - pd.DataFrame({"A": [3, 2], "B": ["high", "low"]}) - ) - - gobj = cudf.from_pandas(obj) - gindex = cudf.MultiIndex.from_pandas(pindex) - - # cudf MultiIndex as arg - expected = obj.loc[pindex] - got = gobj.loc[gindex] - assert_eq(expected, got) - - # pandas MultiIndex as arg - expected = obj.loc[pindex] - got = gobj.loc[pindex] - assert_eq(expected, got) - - -@pytest.mark.parametrize("nelem", [2, 5, 20, 100]) -def test_series_iloc(nelem): - # create random cudf.Series - np.random.seed(12) - ps = pd.Series(np.random.sample(nelem)) - - # gpu cudf.Series - gs = cudf.Series(ps) - - # positive tests for indexing - np.testing.assert_allclose(gs.iloc[-1 * nelem], ps.iloc[-1 * nelem]) - np.testing.assert_allclose(gs.iloc[-1], ps.iloc[-1]) - np.testing.assert_allclose(gs.iloc[0], ps.iloc[0]) - np.testing.assert_allclose(gs.iloc[1], ps.iloc[1]) - np.testing.assert_allclose(gs.iloc[nelem - 1], ps.iloc[nelem - 1]) - - # positive tests for slice - np.testing.assert_allclose(gs.iloc[-1:1].to_numpy(), ps.iloc[-1:1]) - np.testing.assert_allclose( - gs.iloc[nelem - 1 : -1].to_numpy(), ps.iloc[nelem - 1 : -1] - ) - np.testing.assert_allclose( - gs.iloc[0 : nelem - 1].to_pandas(), ps.iloc[0 : nelem - 1] - ) - np.testing.assert_allclose(gs.iloc[0:nelem].to_pandas(), ps.iloc[0:nelem]) - np.testing.assert_allclose(gs.iloc[1:1].to_pandas(), ps.iloc[1:1]) - np.testing.assert_allclose(gs.iloc[1:2].to_pandas(), ps.iloc[1:2].values) - np.testing.assert_allclose( - gs.iloc[nelem - 1 : nelem + 1].to_pandas(), - ps.iloc[nelem - 1 : nelem + 1], - ) - np.testing.assert_allclose( - gs.iloc[nelem : nelem * 2].to_pandas(), ps.iloc[nelem : nelem * 2] - ) - - -@pytest.mark.parametrize("nelem", [2, 5, 20, 100]) -def test_dataframe_iloc(nelem): - gdf = cudf.DataFrame() - - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) - - pdf = pd.DataFrame() - pdf["a"] = ha - pdf["b"] = hb - - gdf.index.name = "index" - pdf.index.name = "index" - - assert_eq(gdf.iloc[-1:1], pdf.iloc[-1:1]) - assert_eq(gdf.iloc[nelem - 1 : -1], pdf.iloc[nelem - 1 : -1]) - assert_eq(gdf.iloc[0 : nelem - 1], pdf.iloc[0 : nelem - 1]) - assert_eq(gdf.iloc[0:nelem], pdf.iloc[0:nelem]) - assert_eq(gdf.iloc[1:1], pdf.iloc[1:1]) - assert_eq(gdf.iloc[1:2], pdf.iloc[1:2]) - assert_eq(gdf.iloc[nelem - 1 : nelem + 1], pdf.iloc[nelem - 1 : nelem + 1]) - assert_eq(gdf.iloc[nelem : nelem * 2], pdf.iloc[nelem : nelem * 2]) - - assert_eq(gdf.iloc[-1 * nelem], pdf.iloc[-1 * nelem]) - assert_eq(gdf.iloc[-1], pdf.iloc[-1]) - assert_eq(gdf.iloc[0], pdf.iloc[0]) - assert_eq(gdf.iloc[1], pdf.iloc[1]) - assert_eq(gdf.iloc[nelem - 1], pdf.iloc[nelem - 1]) - - # Repeat the above with iat[] - assert_eq(gdf.iloc[-1:1], gdf.iat[-1:1]) - assert_eq(gdf.iloc[nelem - 1 : -1], gdf.iat[nelem - 1 : -1]) - assert_eq(gdf.iloc[0 : nelem - 1], gdf.iat[0 : nelem - 1]) - assert_eq(gdf.iloc[0:nelem], gdf.iat[0:nelem]) 
- assert_eq(gdf.iloc[1:1], gdf.iat[1:1]) - assert_eq(gdf.iloc[1:2], gdf.iat[1:2]) - assert_eq(gdf.iloc[nelem - 1 : nelem + 1], gdf.iat[nelem - 1 : nelem + 1]) - assert_eq(gdf.iloc[nelem : nelem * 2], gdf.iat[nelem : nelem * 2]) - - assert_eq(gdf.iloc[-1 * nelem], gdf.iat[-1 * nelem]) - assert_eq(gdf.iloc[-1], gdf.iat[-1]) - assert_eq(gdf.iloc[0], gdf.iat[0]) - assert_eq(gdf.iloc[1], gdf.iat[1]) - assert_eq(gdf.iloc[nelem - 1], gdf.iat[nelem - 1]) - - # iloc with list like indexing - assert_eq(gdf.iloc[[0]], pdf.iloc[[0]]) - # iloc with column like indexing - assert_eq(gdf.iloc[cudf.Series([0])], pdf.iloc[pd.Series([0])]) - assert_eq(gdf.iloc[cudf.Series([0])._column], pdf.iloc[pd.Series([0])]) - assert_eq(gdf.iloc[np.array([0])], pdf.loc[np.array([0])]) - - -def test_dataframe_iloc_tuple(): - gdf = cudf.DataFrame() - nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) - - pdf = pd.DataFrame() - pdf["a"] = ha - pdf["b"] = hb - - assert_eq(gdf.iloc[1, [1]], pdf.iloc[1, [1]], check_dtype=False) - assert_eq(gdf.iloc[:, -1], pdf.iloc[:, -1]) - - -def test_dataframe_iloc_index_error(): - gdf = cudf.DataFrame() - nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) - - pdf = pd.DataFrame() - pdf["a"] = ha - pdf["b"] = hb - - with pytest.raises(IndexError): - pdf.iloc[nelem * 2] - with pytest.raises(IndexError): - gdf.iloc[nelem * 2] - - -@pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) -def test_dataframe_take(ntake): - np.random.seed(0) - df = cudf.DataFrame() - - nelem = 123 - df["ii"] = np.random.randint(0, 20, nelem) - df["ff"] = np.random.random(nelem) - - take_indices = np.random.randint(0, len(df), ntake) - - actual = df.take(take_indices) - expected = df.to_pandas().take(take_indices) - - assert actual.ii.null_count == 0 - assert actual.ff.null_count == 0 - assert_eq(actual, expected) - - -@pytest.mark.parametrize("ntake", [1, 2, 8, 9]) -def test_dataframe_take_with_multiindex(ntake): - np.random.seed(0) - df = cudf.DataFrame( - index=cudf.MultiIndex( - levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], - codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], - ) - ) - - nelem = 9 - df["ii"] = np.random.randint(0, 20, nelem) - df["ff"] = np.random.random(nelem) - - take_indices = np.random.randint(0, len(df), ntake) - - actual = df.take(take_indices) - expected = df.to_pandas().take(take_indices) - - assert_eq(actual, expected) - - -@pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) -def test_series_take(ntake): - np.random.seed(0) - nelem = 123 - - psr = pd.Series(np.random.randint(0, 20, nelem)) - gsr = cudf.Series(psr) - - take_indices = np.random.randint(0, len(gsr), ntake) - - actual = gsr.take(take_indices) - expected = psr.take(take_indices) - - assert_eq(actual, expected) - - -def test_series_take_positional(): - psr = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"]) - - gsr = cudf.Series.from_pandas(psr) - - take_indices = [1, 2, 0, 3] - - expect = psr.take(take_indices) - got = gsr.take(take_indices) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("nelem", [0, 1, 5, 20, 100]) -@pytest.mark.parametrize("slice_start", [None, 0, 1, 3, 10, -10]) -@pytest.mark.parametrize("slice_end", [None, 0, 1, 30, 50, -1]) -def test_dataframe_masked_slicing(nelem, slice_start, slice_end): - gdf = cudf.DataFrame() - 
gdf["a"] = list(range(nelem)) - gdf["b"] = list(range(nelem, 2 * nelem)) - gdf["a"] = gdf["a"]._column.set_mask(utils.random_bitmask(nelem)) - gdf["b"] = gdf["b"]._column.set_mask(utils.random_bitmask(nelem)) - - def do_slice(x): - return x[slice_start:slice_end] - - expect = do_slice(gdf.to_pandas()) - got = do_slice(gdf).to_pandas() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize("dtype", [int, float, str]) -def test_empty_boolean_mask(dtype): - gdf = cudf.datasets.randomdata(nrows=0, dtypes={"a": dtype}) - pdf = gdf.to_pandas() - - compare_val = dtype(1) - - expected = pdf[pdf.a == compare_val] - got = gdf[gdf.a == compare_val] - assert_eq(expected, got) - - expected = pdf.a[pdf.a == compare_val] - got = gdf.a[gdf.a == compare_val] - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4], - [1.0, 2.0, 3.0, 4.0], - ["one", "two", "three", "four"], - pd.Series(["a", "b", "c", "d"], dtype="category"), - pd.Series(pd.date_range("2010-01-01", "2010-01-04")), - ], -) -@pytest.mark.parametrize( - "mask", - [ - [True, True, True, True], - [False, False, False, False], - [True, False, True, False], - [True, False, False, True], - np.array([True, False, True, False]), - pd.Series([True, False, True, False]), - cudf.Series([True, False, True, False]), - ], -) -@pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) -def test_series_apply_boolean_mask(data, mask, nulls): - psr = pd.Series(data) - - if len(data) > 0: - if nulls == "one": - p = np.random.randint(0, 4) - psr[p] = None - elif nulls == "some": - p1, p2 = np.random.randint(0, 4, (2,)) - psr[p1] = None - psr[p2] = None - elif nulls == "all": - psr[:] = None - - gsr = cudf.from_pandas(psr) - - # TODO: from_pandas(psr) has dtype "float64" - # when psr has dtype "object" and is all None - if psr.dtype == "object" and nulls == "all": - gsr = cudf.Series([None, None, None, None], dtype="object") - - if isinstance(mask, cudf.Series): - expect = psr[mask.to_pandas()] - else: - expect = psr[mask] - got = gsr[mask] - - assert_eq(expect, got) - - -def test_dataframe_apply_boolean_mask(): - pdf = pd.DataFrame( - { - "a": [0, 1, 2, 3], - "b": [0.1, 0.2, None, 0.3], - "c": ["a", None, "b", "c"], - } - ) - gdf = cudf.DataFrame.from_pandas(pdf) - assert_eq(pdf[[True, False, True, False]], gdf[[True, False, True, False]]) - - -""" -This test compares cudf and Pandas DataFrame boolean indexing. 
-""" - - -@pytest.mark.parametrize( - "mask_fn", [lambda x: x, lambda x: np.array(x), lambda x: pd.Series(x)] -) -def test_dataframe_boolean_mask(mask_fn): - mask_base = [ - True, - False, - True, - False, - True, - False, - True, - False, - True, - False, - ] - pdf = pd.DataFrame({"x": range(10), "y": range(10)}) - gdf = cudf.from_pandas(pdf) - mask = mask_fn(mask_base) - assert len(mask) == gdf.shape[0] - pdf_masked = pdf[mask] - gdf_masked = gdf[mask] - assert pdf_masked.to_string().split() == gdf_masked.to_string().split() - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "key, value", - [ - (0, 4), - (1, 4), - ([0, 1], 4), - ([0, 1], [4, 5]), - (slice(0, 2), [4, 5]), - (slice(1, None), [4, 5, 6, 7]), - ([], 1), - ([], []), - (slice(None, None), 1), - (slice(-1, -3), 7), - ], -) -@pytest.mark.parametrize("nulls", ["none", "some", "all"]) -def test_series_setitem_basics(key, value, nulls): - psr = pd.Series([1, 2, 3, 4, 5]) - if nulls == "some": - psr[[0, 4]] = None - elif nulls == "all": - psr[:] = None - gsr = cudf.from_pandas(psr) - with expect_warning_if( - isinstance(value, list) and len(value) == 0 and nulls == "none" - ): - psr[key] = value - with expect_warning_if( - isinstance(value, list) and len(value) == 0 and not len(key) == 0 - ): - gsr[key] = value - assert_eq(psr, gsr, check_dtype=False) - - -def test_series_setitem_null(): - gsr = cudf.Series([1, 2, 3, 4]) - gsr[0] = None - - expect = cudf.Series([None, 2, 3, 4]) - got = gsr - assert_eq(expect, got) - - gsr = cudf.Series([None, 2, 3, 4]) - gsr[0] = 1 - - expect = cudf.Series([1, 2, 3, 4]) - got = gsr - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "key, value", - [ - (0, 4), - (1, 4), - ([0, 1], 4), - ([0, 1], [4, 5]), - (slice(0, 2), [4, 5]), - (slice(1, None), [4, 5, 6, 7]), - ([], 1), - ([], []), - (slice(None, None), 1), - (slice(-1, -3), 7), - ], -) -@pytest.mark.parametrize("nulls", ["none", "some", "all"]) -def test_series_setitem_iloc(key, value, nulls): - psr = pd.Series([1, 2, 3, 4, 5]) - if nulls == "some": - psr[[0, 4]] = None - elif nulls == "all": - psr[:] = None - gsr = cudf.from_pandas(psr) - with expect_warning_if( - isinstance(value, list) and len(value) == 0 and nulls == "none" - ): - psr.iloc[key] = value - with expect_warning_if( - isinstance(value, list) and len(value) == 0 and not len(key) == 0 - ): - gsr.iloc[key] = value - assert_eq(psr, gsr, check_dtype=False) - - -@pytest.mark.parametrize( - "key, value", - [ - pytest.param( - 0, - 0.5, - ), - ([0, 1], 0.5), - ([0, 1], [0.5, 2.5]), - (slice(0, 2), [0.5, 0.25]), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_series_setitem_dtype(key, value): - psr = pd.Series([1, 2, 3], dtype="int32") - gsr = cudf.from_pandas(psr) - - with expect_warning_if(isinstance(value, (float, list))): - psr[key] = value - with expect_warning_if(isinstance(value, (float, list))): - gsr[key] = value - - assert_eq(psr, gsr) - - -def test_series_setitem_datetime(): - psr = pd.Series(["2001", "2002", "2003"], dtype="datetime64[ns]") - gsr = cudf.from_pandas(psr) - - psr[0] = np.datetime64("2005") - gsr[0] = np.datetime64("2005") - - assert_eq(psr, gsr) - - -def test_series_setitem_datetime_coerced(): - psr = 
pd.Series(["2001", "2002", "2003"], dtype="datetime64[ns]") - gsr = cudf.from_pandas(psr) - - psr[0] = "2005" - gsr[0] = "2005" - - assert_eq(psr, gsr) - - -def test_series_setitem_categorical(): - psr = pd.Series(["a", "b", "a", "c", "d"], dtype="category") - gsr = cudf.from_pandas(psr) - - psr[0] = "d" - gsr[0] = "d" - assert_eq(psr, gsr) - - psr = psr.cat.add_categories(["e"]) - gsr = gsr.cat.add_categories(["e"]) - psr[0] = "e" - gsr[0] = "e" - assert_eq(psr, gsr) - - psr[[0, 1]] = "b" - gsr[[0, 1]] = "b" - assert_eq(psr, gsr) - - psr[0:3] = "e" - gsr[0:3] = "e" - assert_eq(psr, gsr) - - -@pytest.mark.parametrize( - "key, value", - [ - (0, "d"), - (0, "g"), - ([0, 1], "g"), - ([0, 1], None), - (slice(None, 2), "g"), - (slice(None, 2), ["g", None]), - ], -) -def test_series_setitem_string(key, value): - psr = pd.Series(["a", "b", "c", "d", "e"]) - gsr = cudf.from_pandas(psr) - psr[key] = value - gsr[key] = value - assert_eq(psr, gsr) - - psr = pd.Series(["a", None, "c", "d", "e"]) - gsr = cudf.from_pandas(psr) - psr[key] = value - gsr[key] = value - assert_eq(psr, gsr) - - -@pytest.mark.parametrize( - "key, value", - [ - ("a", 4), - ("b", 4), - ("b", np.int8(8)), - ("d", 4), - ("d", np.int8(16)), - ("d", np.float32(16)), - (["a", "b"], 4), - (["a", "b"], [4, 5]), - ([True, False, True], 4), - ([False, False, False], 4), - ([True, False, True], [4, 5]), - ], -) -def test_series_setitem_loc(key, value): - psr = pd.Series([1, 2, 3], ["a", "b", "c"]) - gsr = cudf.from_pandas(psr) - psr.loc[key] = value - gsr.loc[key] = value - assert_eq(psr, gsr) - - -@pytest.mark.parametrize( - "key, value", - [ - (1, "d"), - (2, "e"), - (4, "f"), - ([1, 3], "g"), - ([1, 3], ["g", "h"]), - ([True, False, True], "i"), - ([False, False, False], "j"), - ([True, False, True], ["k", "l"]), - ], -) -def test_series_setitem_loc_numeric_index(key, value): - psr = pd.Series(["a", "b", "c"], [1, 2, 3]) - gsr = cudf.from_pandas(psr) - psr.loc[key] = value - gsr.loc[key] = value - assert_eq(psr, gsr) - - -@pytest.mark.parametrize( - "key, value", - [ - ((0, 0), 5), - ((slice(None), 0), 5), - ((slice(None), 0), range(3)), - ((slice(None, -1), 0), range(2)), - (([0, 1], 0), 5), - ], -) -def test_dataframe_setitem_iloc(key, value, pdf_gdf): - pdf, gdf = pdf_gdf - pdf.iloc[key] = value - gdf.iloc[key] = value - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "key, value", - [ - (("one", "a"), 5), - ((slice(None), "a"), 5), - ((slice(None), "a"), range(3)), - ((slice(None), "a"), [3, 2, 1]), - ((slice(None, "two"), "a"), range(2)), - ((slice(None, "two"), "a"), [4, 5]), - ((["one", "two"], "a"), 5), - (("one", "c"), 5), - ((["one", "two"], "c"), 5), - ((slice(None), "c"), 5), - ((slice(None), "c"), range(3)), - ((slice(None), "c"), [3, 2, 1]), - ((slice(None, "two"), "c"), range(2)), - ((slice(None, "two"), "c"), [4, 5]), - ], -) -def test_dataframe_setitem_loc(key, value, pdf_gdf): - pdf, gdf = pdf_gdf - pdf.loc[key] = value - gdf.loc[key] = value - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "key, value", - [ - (("one", "a"), 5), - ((slice(None), "a"), range(3)), - ((slice(None), "a"), [3, 2, 1]), - ], -) -def test_dataframe_setitem_loc_empty_df(key, value): - pdf, gdf = pd.DataFrame(), cudf.DataFrame() - pdf.loc[key] = value - gdf.loc[key] = value - assert_eq(pdf, gdf, check_dtype=False) - - -@pytest.mark.parametrize( - "key,value", - [ - ((0, 0), 5.0), - ((slice(None), 0), 5.0), - ((slice(None), 0), np.arange(7, dtype="float64")), - ], -) -def test_dataframe_setitem_iloc_multiindex(key, value, 
pdf_gdf_multi): - pdf, gdf = pdf_gdf_multi - - pdf.iloc[key] = value - gdf.iloc[key] = value - - assert_eq(pdf, gdf) - - -def test_boolean_indexing_single_row(pdf_gdf): - pdf, gdf = pdf_gdf - assert_eq( - pdf.loc[[True, False, False], :], gdf.loc[[True, False, False], :] - ) - - -def test_iloc_negative_indices(): - psr = pd.Series([1, 2, 3, 4, 5]) - gsr = cudf.from_pandas(psr) - assert_eq(psr.iloc[[-1, -2, -4]], gsr.iloc[[-1, -2, -4]]) - - -def test_out_of_bounds_indexing(): - psr = pd.Series([1, 2, 3]) - gsr = cudf.from_pandas(psr) - - assert_exceptions_equal( - lambda: psr[[0, 1, 9]], - lambda: gsr[[0, 1, 9]], - ) - assert_exceptions_equal( - lambda: psr[[0, 1, -4]], - lambda: gsr[[0, 1, -4]], - ) - assert_exceptions_equal( - lambda: psr.__setitem__([0, 1, 9], 2), - lambda: gsr.__setitem__([0, 1, 9], 2), - ) - assert_exceptions_equal( - lambda: psr.__setitem__([0, 1, -4], 2), - lambda: gsr.__setitem__([0, 1, -4], 2), - ) - - -def test_out_of_bounds_indexing_empty(): - psr = pd.Series(dtype="int64") - gsr = cudf.from_pandas(psr) - assert_exceptions_equal( - lambda: psr.iloc.__setitem__(-1, 2), - lambda: gsr.iloc.__setitem__(-1, 2), - ) - assert_exceptions_equal( - lambda: psr.iloc.__setitem__(1, 2), - lambda: gsr.iloc.__setitem__(1, 2), - ) - - -def test_sliced_indexing(): - a = list(range(4, 4 + 150)) - b = list(range(0, 0 + 150)) - pdf = pd.DataFrame({"a": a, "b": b}) - gdf = cudf.DataFrame.from_pandas(pdf) - pdf = pdf.set_index("a") - gdf = gdf.set_index("a") - pidx = pdf.index[:75] - gidx = gdf.index[:75] - - assert_eq(pdf.loc[pidx], gdf.loc[gidx]) - - -@pytest.mark.parametrize("index", [["a"], ["a", "a"], ["a", "a", "b", "c"]]) -def test_iloc_categorical_index(index): - gdf = cudf.DataFrame({"data": range(len(index))}, index=index) - gdf.index = gdf.index.astype("category") - pdf = gdf.to_pandas() - expect = pdf.iloc[:, 0] - got = gdf.iloc[:, 0] - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "sli", - [ - slice("2001", "2002"), - slice("2002", "2001"), - slice("2001", None), - ], -) -@pytest.mark.parametrize("is_dataframe", [True, False]) -def test_loc_datetime_index(sli, is_dataframe): - sli = slice(pd.to_datetime(sli.start), pd.to_datetime(sli.stop)) - - if is_dataframe is True: - pd_data = pd.DataFrame( - {"a": [1, 2, 3]}, - index=pd.Series(["2001", "2009", "2002"], dtype="datetime64[ns]"), - ) - else: - pd_data = pd.Series( - [1, 2, 3], - pd.Series(["2001", "2009", "2002"], dtype="datetime64[ns]"), - ) - - gd_data = cudf.from_pandas(pd_data) - expect = pd_data.loc[sli] - got = gd_data.loc[sli] - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "sli", - [ - slice("2001", "2020"), - slice(None, "2020"), - ], -) -def test_loc_datetime_index_slice_not_in(sli): - pd_data = pd.Series( - [1, 2, 3], - pd.Series(["2001", "2009", "2002"], dtype="datetime64[ns]"), - ) - gd_data = cudf.from_pandas(pd_data) - with pytest.raises(KeyError): - assert_eq(pd_data.loc[sli], gd_data.loc[sli]) - - with pytest.raises(KeyError): - sli = slice(pd.to_datetime(sli.start), pd.to_datetime(sli.stop)) - assert_eq(pd_data.loc[sli], gd_data.loc[sli]) - - -@pytest.mark.parametrize( - "gdf_kwargs", - [ - {"data": {"a": range(1000)}}, - {"data": {"a": range(1000), "b": range(1000)}}, - { - "data": { - "a": range(20), - "b": range(20), - "c": ["abc", "def", "xyz", "def", "pqr"] * 4, - } - }, - {"index": [1, 2, 3]}, - {"index": range(1000)}, - {"columns": ["a", "b", "c", "d"]}, - {"columns": ["a"], "index": range(1000)}, - {"columns": ["a", "col2", "...col n"], "index": range(1000)}, - {"index": 
cudf.Series(range(1000)).astype("str")}, - { - "columns": ["a", "b", "c", "d"], - "index": cudf.Series(range(1000)).astype("str"), - }, - ], -) -@pytest.mark.parametrize( - "slice", - [ - slice(6, None), # start but no stop, [6:] - slice(None, None, 3), # only step, [::3] - slice(1, 10, 2), # start, stop, step - slice(3, -5, 2), # negative stop - slice(-2, -4), # slice is empty - slice(-10, -20, -1), # reversed slice - slice(None), # slices everything, same as [:] - slice(250, 500), - slice(250, 251), - slice(50), - slice(1, 10), - slice(10, 20), - slice(15, 24), - slice(6), - ], -) -def test_dataframe_sliced(gdf_kwargs, slice): - gdf = cudf.DataFrame(**gdf_kwargs) - pdf = gdf.to_pandas() - - actual = gdf[slice] - expected = pdf[slice] - - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "gdf", - [ - cudf.DataFrame({"a": range(10000)}), - cudf.DataFrame( - { - "a": range(10000), - "b": range(10000), - "c": range(10000), - "d": range(10000), - "e": range(10000), - "f": range(10000), - } - ), - cudf.DataFrame({"a": range(20), "b": range(20)}), - cudf.DataFrame( - { - "a": range(20), - "b": range(20), - "c": ["abc", "def", "xyz", "def", "pqr"] * 4, - } - ), - cudf.DataFrame(index=[1, 2, 3]), - cudf.DataFrame(index=range(10000)), - cudf.DataFrame(columns=["a", "b", "c", "d"]), - cudf.DataFrame(columns=["a"], index=range(10000)), - cudf.DataFrame(columns=["a", "col2", "...col n"], index=range(10000)), - cudf.DataFrame(index=cudf.Series(range(10000)).astype("str")), - cudf.DataFrame( - columns=["a", "b", "c", "d"], - index=cudf.Series(range(10000)).astype("str"), - ), - ], -) -@pytest.mark.parametrize( - "slice", - [slice(6), slice(1), slice(7), slice(1, 3)], -) -def test_dataframe_iloc_index(gdf, slice): - pdf = gdf.to_pandas() - - actual = gdf.iloc[:, slice] - expected = pdf.iloc[:, slice] - - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "data", - [ - [[0], [1], [2]], - [[0, 1], [2, 3], [4, 5]], - [[[0, 1], [2]], [[3, 4]], [[5, 6]]], - [None, [[0, 1], [2]], [[3, 4], [5, 6]]], - [[], [[0, 1], [2]], [[3, 4], [5, 6]]], - [[], [["a", "b"], None], [["c", "d"], []]], - ], -) -@pytest.mark.parametrize( - "key", [[], [0], [0, 1], [0, 1, 0], slice(None), slice(0, 2), slice(1, 3)] -) -def test_iloc_with_lists(data, key): - psr = pd.Series(data) - gsr = cudf.Series(data) - assert_eq(psr.iloc[key], gsr.iloc[key]) - - pdf = pd.DataFrame({"a": data, "b": data}) - gdf = cudf.DataFrame({"a": data, "b": data}) - assert_eq(pdf.iloc[key], gdf.iloc[key]) - - -@pytest.mark.parametrize("key", [5, -10, "0", "a", np.array(5), np.array("a")]) -def test_loc_bad_key_type(key): - psr = pd.Series([1, 2, 3]) - gsr = cudf.from_pandas(psr) - assert_exceptions_equal(lambda: psr[key], lambda: gsr[key]) - assert_exceptions_equal(lambda: psr.loc[key], lambda: gsr.loc[key]) - - -@pytest.mark.parametrize("key", ["b", 1.0, np.array("b")]) -def test_loc_bad_key_type_string_index(key): - psr = pd.Series([1, 2, 3], index=["a", "1", "c"]) - gsr = cudf.from_pandas(psr) - assert_exceptions_equal(lambda: psr[key], lambda: gsr[key]) - assert_exceptions_equal(lambda: psr.loc[key], lambda: gsr.loc[key]) - - -def test_loc_zero_dim_array(): - psr = pd.Series([1, 2, 3]) - gsr = cudf.from_pandas(psr) - - assert_eq(psr[np.array(0)], gsr[np.array(0)]) - assert_eq(psr[np.array([0])[0]], gsr[np.array([0])[0]]) - - -@pytest.mark.parametrize( - "arg", - [ - slice(None), - slice((1, 2), None), - slice(None, (1, 2)), - (1, 1), - pytest.param( - (1, slice(None)), - marks=pytest.mark.xfail( - 
reason="https://github.com/pandas-dev/pandas/issues/46704" - ), - ), - 1, - 2, - ], -) -def test_loc_series_multiindex(arg): - gsr = cudf.DataFrame( - {"a": [1, 1, 2], "b": [1, 2, 3], "c": ["a", "b", "c"]} - ).set_index(["a", "b"])["c"] - psr = gsr.to_pandas() - assert_eq(psr.loc[arg], gsr.loc[arg]) - - -@pytest.mark.parametrize( - "arg", - [ - slice(None, None, -1), - slice(None, -1, -1), - slice(4, -1, -1), - slice(None, None, -3), - slice(None, -1, -3), - slice(4, -1, -3), - ], -) -@pytest.mark.parametrize( - "pobj", [pd.DataFrame({"a": [1, 2, 3, 4, 5]}), pd.Series([1, 2, 3, 4, 5])] -) -def test_iloc_before_zero_terminate(arg, pobj): - gobj = cudf.from_pandas(pobj) - - assert_eq(pobj.iloc[arg], gobj.iloc[arg]) - - -def test_iloc_decimal(): - sr = cudf.Series(["1.00", "2.00", "3.00", "4.00"]).astype( - cudf.Decimal64Dtype(scale=2, precision=3) - ) - got = sr.iloc[[3, 2, 1, 0]] - expect = cudf.Series( - ["4.00", "3.00", "2.00", "1.00"], - ).astype(cudf.Decimal64Dtype(scale=2, precision=3)) - assert_eq(expect.reset_index(drop=True), got.reset_index(drop=True)) - - -@pytest.mark.parametrize( - ("key, value"), - [ - ( - ([0], ["x", "y"]), - [10, 20], - ), - ( - ([0, 2], ["x", "y"]), - [[10, 30], [20, 40]], - ), - ( - (0, ["x", "y"]), - [10, 20], - ), - ( - ([0, 2], "x"), - [10, 20], - ), - ], -) -def test_dataframe_loc_inplace_update(key, value): - gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - pdf = gdf.to_pandas() - - actual = gdf.loc[key] = value - expected = pdf.loc[key] = value - - assert_eq(expected, actual) - - -def test_dataframe_loc_inplace_update_string_index(): - gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}, index=list("abc")) - pdf = gdf.to_pandas() - - actual = gdf.loc[["a"], ["x", "y"]] = [10, 20] - expected = pdf.loc[["a"], ["x", "y"]] = [10, 20] - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - ("key, value"), - [ - ([0], [10, 20]), - ([0, 2], [[10, 30], [20, 40]]), - (([0, 2], [0, 1]), [[10, 30], [20, 40]]), - (([0, 2], 0), [10, 30]), - ((0, [0, 1]), [20, 40]), - ], -) -def test_dataframe_iloc_inplace_update(key, value): - gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - pdf = gdf.to_pandas() - - actual = gdf.iloc[key] = value - expected = pdf.iloc[key] = value - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "loc_key", - [([0, 2], ["x", "y"])], -) -@pytest.mark.parametrize( - "iloc_key", - [[0, 2]], -) -@pytest.mark.parametrize( - ("data, index"), - [ - ( - {"x": [10, 20], "y": [30, 40]}, - [0, 2], - ) - ], -) -def test_dataframe_loc_iloc_inplace_update_with_RHS_dataframe( - loc_key, iloc_key, data, index -): - gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - pdf = gdf.to_pandas() - - actual = gdf.loc[loc_key] = cudf.DataFrame(data, index=cudf.Index(index)) - expected = pdf.loc[loc_key] = pd.DataFrame(data, index=pd.Index(index)) - assert_eq(expected, actual) - - actual = gdf.iloc[iloc_key] = cudf.DataFrame(data, index=cudf.Index(index)) - expected = pdf.iloc[iloc_key] = pd.DataFrame(data, index=pd.Index(index)) - assert_eq(expected, actual) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="No warning in older versions of pandas", -) -def test_dataframe_loc_inplace_update_with_invalid_RHS_df_columns(): - gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - pdf = gdf.to_pandas() - - actual = gdf.loc[[0, 2], ["x", "y"]] = cudf.DataFrame( - {"b": [10, 20], "y": [30, 40]}, index=cudf.Index([0, 2]) - ) - with pytest.warns(FutureWarning): - # Seems to be a false warning 
from pandas, - # but nevertheless catching it. - expected = pdf.loc[[0, 2], ["x", "y"]] = pd.DataFrame( - {"b": [10, 20], "y": [30, 40]}, index=pd.Index([0, 2]) - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - ("key, value"), - [ - (([0, 2], ["x", "y"]), [[10, 30, 50], [20, 40, 60]]), - (([0], ["x", "y"]), [[10], [20]]), - ], -) -def test_dataframe_loc_inplace_update_shape_mismatch(key, value): - gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - with pytest.raises(ValueError, match="shape mismatch:"): - gdf.loc[key] = value - - -@pytest.mark.parametrize( - ("key, value"), - [ - ([0, 2], [[10, 30, 50], [20, 40, 60]]), - ([0], [[10], [20]]), - ], -) -def test_dataframe_iloc_inplace_update_shape_mismatch(key, value): - gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - with pytest.raises(ValueError, match="shape mismatch:"): - gdf.iloc[key] = value - - -def test_dataframe_loc_inplace_update_shape_mismatch_RHS_df(): - gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - with pytest.raises(ValueError, match="shape mismatch:"): - gdf.loc[([0, 2], ["x", "y"])] = cudf.DataFrame( - {"x": [10, 20]}, index=cudf.Index([0, 2]) - ) - - -def test_dataframe_iloc_inplace_update_shape_mismatch_RHS_df(): - gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) - with pytest.raises(ValueError, match="shape mismatch:"): - gdf.iloc[[0, 2]] = cudf.DataFrame( - {"x": [10, 20]}, index=cudf.Index([0, 2]) - ) - - -@pytest.mark.parametrize( - "array,is_error", - [ - (cupy.arange(20, 40).reshape(-1, 2), False), - (cupy.arange(20, 50).reshape(-1, 3), True), - (np.arange(20, 40).reshape(-1, 2), False), - (np.arange(20, 30).reshape(-1, 1), False), - (cupy.arange(20, 30).reshape(-1, 1), False), - ], -) -def test_dataframe_indexing_setitem_np_cp_array(array, is_error): - gdf = cudf.DataFrame({"a": range(10), "b": range(10)}) - pdf = gdf.to_pandas() - if not is_error: - gdf.loc[:, ["a", "b"]] = array - pdf.loc[:, ["a", "b"]] = cupy.asnumpy(array) - - assert_eq(gdf, pdf) - else: - assert_exceptions_equal( - lfunc=pdf.loc.__setitem__, - rfunc=gdf.loc.__setitem__, - lfunc_args_and_kwargs=( - [(slice(None, None, None), ["a", "b"]), cupy.asnumpy(array)], - {}, - ), - rfunc_args_and_kwargs=( - [(slice(None, None, None), ["a", "b"]), array], - {}, - ), - ) - - -def test_iloc_single_row_with_nullable_column(): - # see https://github.com/rapidsai/cudf/issues/11349 - pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.4]}) - df = cudf.from_pandas(pdf) - - df.iloc[0] # before the fix for #11349 this would segfault - assert_eq(pdf.iloc[0], df.iloc[0]) - - -def test_loc_single_row_from_slice(): - # see https://github.com/rapidsai/cudf/issues/11930 - pdf = pd.DataFrame({"a": [10, 20, 30], "b": [1, 2, 3]}).set_index("a") - df = cudf.from_pandas(pdf) - assert_eq(pdf.loc[5:10], df.loc[5:10]) - - -@pytest.mark.parametrize("indexer", ["loc", "iloc"]) -@pytest.mark.parametrize( - "mask", - [[False, True], [False, False, True, True, True]], - ids=["too-short", "too-long"], -) -def test_boolean_mask_wrong_length(indexer, mask): - s = pd.Series([1, 2, 3, 4]) - - indexee = getattr(s, indexer) - with pytest.raises(IndexError): - indexee[mask] - - c = cudf.from_pandas(s) - indexee = getattr(c, indexer) - with pytest.raises(IndexError): - indexee[mask] - - -@pytest.mark.parametrize("indexer", ["loc", "iloc"]) -def test_boolean_mask_columns(indexer): - df = pd.DataFrame(np.zeros((3, 3))) - cdf = cudf.from_pandas(df) - mask = [True, False, True] - expect = getattr(df, indexer)[:, mask] - got = 
getattr(cdf, indexer)[:, mask] - - assert_eq(expect, got) - - -@pytest.mark.parametrize("indexer", ["loc", "iloc"]) -@pytest.mark.parametrize( - "mask", - [[False, True], [False, False, True, True, True]], - ids=["too-short", "too-long"], -) -def test_boolean_mask_columns_wrong_length(indexer, mask): - df = pd.DataFrame(np.zeros((3, 3))) - cdf = cudf.from_pandas(df) - - with pytest.raises(IndexError): - getattr(df, indexer)[:, mask] - with pytest.raises(IndexError): - getattr(cdf, indexer)[:, mask] - - -def test_boolean_mask_columns_iloc_series(): - df = pd.DataFrame(np.zeros((3, 3))) - cdf = cudf.from_pandas(df) - - mask = pd.Series([True, False, True], dtype=bool) - with pytest.raises(NotImplementedError): - df.iloc[:, mask] - - with pytest.raises(NotImplementedError): - cdf.iloc[:, mask] - - -@pytest.mark.parametrize("index_type", ["single", "slice"]) -def test_loc_timestamp_issue_8585(index_type): - # https://github.com/rapidsai/cudf/issues/8585 - start = pd.Timestamp( - datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") - ) - end = pd.Timestamp(datetime.strptime("2021-03-12 11:00", "%Y-%m-%d %H:%M")) - timestamps = pd.date_range(start, end, periods=12) - value = np.random.normal(size=12) - df = pd.DataFrame(value, index=timestamps, columns=["value"]) - cdf = cudf.from_pandas(df) - if index_type == "single": - index = pd.Timestamp( - datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M") - ) - elif index_type == "slice": - index = slice(start, end, None) - else: - raise ValueError("Invalid index type") - expect = df.loc[index] - actual = cdf.loc[index] - assert_eq(expect, actual) - - -@pytest.mark.parametrize( - "index_type", - [ - "single", - pytest.param( - "slice", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/8585" - ), - ), - pytest.param( - "date_range", - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/8585" - ), - ), - ], -) -def test_loc_multiindex_timestamp_issue_8585(index_type): - # https://github.com/rapidsai/cudf/issues/8585 - start = pd.Timestamp( - datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") - ) - end = pd.Timestamp(datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M")) - timestamps = pd.date_range(start, end, periods=4) - labels = ["A", "B", "C"] - index = pd.MultiIndex.from_product( - [timestamps, labels], names=["timestamp", "label"] - ) - value = np.random.normal(size=12) - df = pd.DataFrame(value, index=index, columns=["value"]) - cdf = cudf.from_pandas(df) - start = pd.Timestamp( - datetime.strptime("2021-03-12 01:00", "%Y-%m-%d %H:%M") - ) - end = pd.Timestamp(datetime.strptime("2021-03-12 02:00", "%Y-%m-%d %H:%M")) - if index_type == "single": - index = pd.Timestamp( - datetime.strptime("2021-03-12 03:00", "%Y-%m-%d %H:%M") - ) - elif index_type == "slice": - index = slice(start, end, None) - elif index_type == "date_range": - index = pd.date_range(start, end, periods=2) - else: - raise ValueError("Invalid index type") - expect = df.loc[index] - actual = cdf.loc[index] - assert_eq(expect, actual) - - -def test_loc_repeated_index_label_issue_8693(): - # https://github.com/rapidsai/cudf/issues/8693 - s = pd.Series([1, 2, 3, 4], index=[0, 1, 1, 2]) - cs = cudf.from_pandas(s) - expect = s.loc[1] - actual = cs.loc[1] - assert_eq(expect, actual) - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13268") -@pytest.mark.parametrize( - "indexer", [(..., 0), (0, ...)], ids=["row_ellipsis", "column_ellipsis"] -) -def test_loc_ellipsis_as_slice_issue_13268(indexer): - # 
https://github.com/rapidsai/cudf/issues/13268 - df = pd.DataFrame(np.arange(4).reshape(2, 2)) - cdf = cudf.from_pandas(df) - - expect = df.loc[indexer] - actual = cdf.loc[indexer] - assert_eq(expect, actual) - - -@pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/13269 " - "and https://github.com/rapidsai/cudf/issues/13273" -) -def test_loc_repeated_column_label_issue_13269(): - # https://github.com/rapidsai/cudf/issues/13269 - # https://github.com/rapidsai/cudf/issues/13273 - df = pd.DataFrame(np.arange(4).reshape(2, 2)) - cdf = cudf.from_pandas(df) - - expect = df.loc[:, [0, 1, 0]] - actual = cdf.loc[:, [0, 1, 0]] - assert_eq(expect, actual) - - -def test_loc_column_boolean_mask_issue_13270(): - # https://github.com/rapidsai/cudf/issues/13270 - df = pd.DataFrame(np.arange(4).reshape(2, 2)) - cdf = cudf.from_pandas(df) - expect = df.loc[:, [True, True]] - actual = cdf.loc[:, [True, True]] - assert_eq(expect, actual) - - -@pytest.mark.parametrize("indexer", [[1], [0, 2]]) -def test_iloc_integer_categorical_issue_13013(indexer): - # https://github.com/rapidsai/cudf/issues/13013 - s = pd.Series([0, 1, 2]) - index = pd.Categorical(indexer) - expect = s.iloc[index] - c = cudf.from_pandas(s) - actual = c.iloc[index] - assert_eq(expect, actual) - - -def test_iloc_incorrect_boolean_mask_length_issue_13015(): - # https://github.com/rapidsai/cudf/issues/13015 - s = pd.Series([0, 1, 2]) - with pytest.raises(IndexError): - s.iloc[[True, False]] - c = cudf.from_pandas(s) - with pytest.raises(IndexError): - c.iloc[[True, False]] - - -def test_iloc_column_boolean_mask_issue_13265(): - # https://github.com/rapidsai/cudf/issues/13265 - df = pd.DataFrame(np.arange(4).reshape(2, 2)) - cdf = cudf.from_pandas(df) - expect = df.iloc[:, [True, True]] - actual = cdf.iloc[:, [True, True]] - assert_eq(expect, actual) - - -def test_iloc_repeated_column_label_issue_13266(): - # https://github.com/rapidsai/cudf/issues/13266 - # https://github.com/rapidsai/cudf/issues/13273 - df = pd.DataFrame(np.arange(4).reshape(2, 2)) - cdf = cudf.from_pandas(df) - - with pytest.raises(NotImplementedError): - cdf.iloc[:, [0, 1, 0]] - - -@pytest.mark.parametrize( - "indexer", - [ - (..., 0), - (0, ...), - ], - ids=["row_ellipsis", "column_ellipsis"], -) -def test_iloc_ellipsis_as_slice_issue_13267(indexer): - # https://github.com/rapidsai/cudf/issues/13267 - df = pd.DataFrame(np.arange(4).reshape(2, 2)) - cdf = cudf.from_pandas(df) - - expect = df.iloc[indexer] - actual = cdf.iloc[indexer] - assert_eq(expect, actual) - - -@pytest.mark.parametrize( - "indexer", - [ - 0, - (slice(None), 0), - ([0, 2], 1), - (slice(None), slice(None)), - (slice(None), [1, 0]), - (0, 0), - (1, [1, 0]), - ([1, 0], 0), - ([1, 2], [0, 1]), - ], -) -def test_iloc_multiindex_lookup_as_label_issue_13515(indexer): - # https://github.com/rapidsai/cudf/issues/13515 - df = pd.DataFrame( - {"a": [1, 1, 3], "b": [2, 3, 4], "c": [1, 6, 7], "d": [1, 8, 9]} - ).set_index(["a", "b"]) - cdf = cudf.from_pandas(df) - - expect = df.iloc[indexer] - actual = cdf.iloc[indexer] - assert_eq(expect, actual) - - -def test_loc_unsorted_index_slice_lookup_keyerror_issue_12833(): - # https://github.com/rapidsai/cudf/issues/12833 - df = pd.DataFrame({"a": [1, 2, 3]}, index=[7, 0, 4]) - cdf = cudf.from_pandas(df) - - # Check that pandas don't change their mind - with pytest.raises(KeyError): - df.loc[1:5] - - with pytest.raises(KeyError): - cdf.loc[1:5] - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13379") -@pytest.mark.parametrize("index", 
[range(5), list(range(5))]) -def test_loc_missing_label_keyerror_issue_13379(index): - # https://github.com/rapidsai/cudf/issues/13379 - df = pd.DataFrame({"a": index}, index=index) - cdf = cudf.from_pandas(df) - # Check that pandas don't change their mind - with pytest.raises(KeyError): - df.loc[[0, 5]] - - with pytest.raises(KeyError): - cdf.loc[[0, 5]] - - -@pytest.mark.parametrize("series", [True, False], ids=["Series", "DataFrame"]) -def test_loc_repeated_label_ordering_issue_13658(series): - # https://github.com/rapidsai/cudf/issues/13658 - values = range(2048) - index = [1 for _ in values] - if series: - frame = cudf.Series(values, index=index) - else: - frame = cudf.DataFrame({"a": values}, index=index) - expect = frame.to_pandas().loc[[1]] - actual = frame.loc[[1]] - assert_eq(actual, expect) - - -@pytest.mark.parametrize("index", [None, [2, 1, 3, 5, 4]]) -def test_loc_bool_key_numeric_index_raises(index): - ser = cudf.Series(range(5), index=index) - with pytest.raises(KeyError): - ser.loc[True] - - -class TestLocIndexWithOrder: - # https://github.com/rapidsai/cudf/issues/12833 - @pytest.fixture(params=["increasing", "decreasing", "neither"]) - def order(self, request): - return request.param - - @pytest.fixture(params=[-1, 1], ids=["reverse", "forward"]) - def take_order(self, request): - return request.param - - @pytest.fixture(params=["float", "int", "string", "range"]) - def dtype(self, request): - return request.param - - @pytest.fixture - def index(self, order, dtype): - if dtype == "string": - index = ["a", "h", "f", "z"] - elif dtype == "int": - index = [-1, 10, 7, 14] - elif dtype == "float": - index = [-1.5, 7.10, 2.4, 11.2] - elif dtype == "range": - if order == "increasing": - return cudf.RangeIndex(2, 10, 3) - elif order == "decreasing": - return cudf.RangeIndex(10, 1, -3) - else: - return cudf.RangeIndex(10, 20, 3) - else: - raise ValueError(f"Unhandled index dtype {dtype}") - if order == "decreasing": - return sorted(index, reverse=True) - elif order == "increasing": - return sorted(index) - elif order == "neither": - return index - else: - raise ValueError(f"Unhandled index order {order}") - - @pytest.fixture - def df(self, index): - return cudf.DataFrame({"a": range(len(index))}, index=index) - - def test_loc_index_inindex_slice(self, df, take_order): - pdf = df.to_pandas() - lo = pdf.index[1] - hi = pdf.index[-2] - expect = pdf.loc[lo:hi:take_order] - actual = df.loc[lo:hi:take_order] - assert_eq(expect, actual) - - def test_loc_index_inindex_subset(self, df, take_order): - pdf = df.to_pandas() - vals = [pdf.index[0], pdf.index[2]][::take_order] - expect = pdf.loc[vals] - actual = df.loc[vals] - assert_eq(expect, actual) - - def test_loc_index_notinindex_slice( - self, request, df, order, dtype, take_order - ): - pdf = df.to_pandas() - lo = pdf.index[1] - hi = pdf.index[-2] - if isinstance(lo, str): - lo = chr(ord(lo) - 1) - hi = chr(ord(hi) + 1) - else: - lo -= 1 - hi += 1 - if order == "neither" and dtype != "range": - with pytest.raises(KeyError): - pdf.loc[lo:hi:take_order] - with pytest.raises(KeyError): - df.loc[lo:hi:take_order] - else: - expect = pdf.loc[lo:hi:take_order] - actual = df.loc[lo:hi:take_order] - assert_eq(expect, actual) - - -@pytest.mark.parametrize( - "arg", - [ - (2, ("one", "second")), - (slice(None, None, None), ("two", "first")), - (1, ("one", "first")), - (slice(None, None, None), ("two", "second")), - (slice(None, None, None), ("two", "first", "three")), - (3, ("two", "first", "three")), - (slice(None, None, None), ("two",)), - (0, 
("two",)), - ], -) -def test_loc_dataframe_column_multiindex(arg): - gdf = cudf.DataFrame( - [list("abcd"), list("efgh"), list("ijkl"), list("mnop")], - columns=cudf.MultiIndex.from_product( - [["one", "two"], ["first", "second"], ["three"]] - ), - ) - pdf = gdf.to_pandas() - - assert_eq(gdf.loc[arg], pdf.loc[arg]) - - -@pytest.mark.parametrize( - "arg", [slice(2, 4), slice(2, 5), slice(2.3, 5), slice(4.6, 6)] -) -def test_series_iloc_float_int(arg): - gs = cudf.Series(range(4), index=[2.0, 3.0, 4.5, 5.5]) - ps = gs.to_pandas() - - actual = gs.loc[arg] - expected = ps.loc[arg] - - assert_eq(actual, expected) - - -def test_iloc_loc_mixed_dtype(): - df = cudf.DataFrame({"a": ["a", "b"], "b": [0, 1]}) - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(TypeError): - df.iloc[0] - with pytest.raises(TypeError): - df.loc[0] - df = df.astype("str") - pdf = df.to_pandas() - - assert_eq(df.iloc[0], pdf.iloc[0]) - assert_eq(df.loc[0], pdf.loc[0]) - - -def test_loc_setitem_categorical_integer_not_position_based(): - gdf = cudf.DataFrame(range(3), index=cudf.CategoricalIndex([1, 2, 3])) - pdf = gdf.to_pandas() - gdf.loc[1] = 10 - pdf.loc[1] = 10 - assert_eq(gdf, pdf) - - -@pytest.mark.parametrize("typ", ["datetime64[ns]", "timedelta64[ns]"]) -@pytest.mark.parametrize("idx_method, key", [["iloc", 0], ["loc", "a"]]) -def test_series_iloc_scalar_datetimelike_return_pd_scalar( - typ, idx_method, key -): - obj = cudf.Series([1, 2, 3], index=list("abc"), dtype=typ) - with cudf.option_context("mode.pandas_compatible", True): - result = getattr(obj, idx_method)[key] - expected = getattr(obj.to_pandas(), idx_method)[key] - assert result == expected - - -@pytest.mark.parametrize("typ", ["datetime64[ns]", "timedelta64[ns]"]) -@pytest.mark.parametrize( - "idx_method, row_key, col_key", [["iloc", 0, 0], ["loc", "a", "a"]] -) -def test_dataframe_iloc_scalar_datetimelike_return_pd_scalar( - typ, idx_method, row_key, col_key -): - obj = cudf.DataFrame( - [1, 2, 3], index=list("abc"), columns=["a"], dtype=typ - ) - with cudf.option_context("mode.pandas_compatible", True): - result = getattr(obj, idx_method)[row_key, col_key] - expected = getattr(obj.to_pandas(), idx_method)[row_key, col_key] - assert result == expected - - -@pytest.mark.parametrize("idx_method, key", [["iloc", 0], ["loc", "a"]]) -def test_series_iloc_scalar_interval_return_pd_scalar(idx_method, key): - iidx = cudf.IntervalIndex.from_breaks([1, 2, 3]) - obj = cudf.Series(iidx, index=list("ab")) - with cudf.option_context("mode.pandas_compatible", True): - result = getattr(obj, idx_method)[key] - expected = getattr(obj.to_pandas(), idx_method)[key] - assert result == expected - - -@pytest.mark.parametrize( - "idx_method, row_key, col_key", [["iloc", 0, 0], ["loc", "a", "a"]] -) -def test_dataframe_iloc_scalar_interval_return_pd_scalar( - idx_method, row_key, col_key -): - iidx = cudf.IntervalIndex.from_breaks([1, 2, 3]) - obj = cudf.DataFrame({"a": iidx}, index=list("ab")) - with cudf.option_context("mode.pandas_compatible", True): - result = getattr(obj, idx_method)[row_key, col_key] - expected = getattr(obj.to_pandas(), idx_method)[row_key, col_key] - assert result == expected - - -def test_scalar_loc_row_categoricalindex(): - df = cudf.DataFrame( - range(4), index=cudf.CategoricalIndex(["a", "a", "b", "c"]) - ) - result = df.loc["a"] - expected = df.to_pandas().loc["a"] - assert_eq(result, expected) - - -@pytest.mark.parametrize("klass", [cudf.DataFrame, cudf.Series]) -@pytest.mark.parametrize("indexer", ["iloc", "loc"]) 
-def test_iloc_loc_no_circular_reference(klass, indexer): - obj = klass([0]) - ref = weakref.ref(obj) - getattr(obj, indexer)[0] - del obj - assert ref() is None - - -def test_loc_setitem_empty_dataframe(): - pdf = pd.DataFrame(index=["index_1", "index_2", "index_3"]) - gdf = cudf.from_pandas(pdf) - pdf.loc[["index_1"], "new_col"] = "A" - gdf.loc[["index_1"], "new_col"] = "A" - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "data", - [ - [15, 14, 12, 10, 1], - [1, 10, 12, 14, 15], - ], -) -@pytest.mark.parametrize( - "scalar", - [ - 1, - 10, - 15, - 14, - 0, - 2, - ], -) -def test_loc_datetime_monotonic_with_ts(data, scalar): - gdf = cudf.DataFrame( - {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5]}, - index=cudf.Index(data, dtype="datetime64[ns]"), - ) - pdf = gdf.to_pandas() - - i = pd.Timestamp(scalar) - - actual = gdf.loc[i:] - expected = pdf.loc[i:] - - assert_eq(actual, expected) - - actual = gdf.loc[:i] - expected = pdf.loc[:i] - - assert_eq(actual, expected) - - -@pytest.mark.parametrize("data", [[15, 14, 3, 10, 1]]) -@pytest.mark.parametrize("scalar", [1, 10, 15, 14, 0, 2]) -def test_loc_datetime_random_with_ts(data, scalar): - gdf = cudf.DataFrame( - {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5]}, - index=cudf.Index(data, dtype="datetime64[ns]"), - ) - pdf = gdf.to_pandas() - - i = pd.Timestamp(scalar) - - if i not in pdf.index: - assert_exceptions_equal( - lambda: pdf.loc[i:], - lambda: gdf.loc[i:], - lfunc_args_and_kwargs=([],), - rfunc_args_and_kwargs=([],), - ) - assert_exceptions_equal( - lambda: pdf.loc[:i], - lambda: gdf.loc[:i], - lfunc_args_and_kwargs=([],), - rfunc_args_and_kwargs=([],), - ) - else: - actual = gdf.loc[i:] - expected = pdf.loc[i:] - - assert_eq(actual, expected) - - actual = gdf.loc[:i] - expected = pdf.loc[:i] - - assert_eq(actual, expected) - - -def test_sliced_categorical_as_ordered(): - df = cudf.DataFrame({"a": list("caba"), "b": list(range(4))}) - df["a"] = df["a"].astype("category") - df = df.iloc[:2] - result = df["a"].cat.as_ordered() - expected = cudf.Series( - ["c", "a"], - dtype=cudf.CategoricalDtype(list("abc"), ordered=True), - name="a", - ) - assert_eq(result, expected) - - -def test_duplicate_labels_raises(): - df = cudf.DataFrame([[1, 2]], columns=["a", "b"]) - with pytest.raises(ValueError): - df[["a", "a"]] - with pytest.raises(ValueError): - df.loc[:, ["a", "a"]] - - -@pytest.mark.parametrize("indexer", ["iloc", "loc"]) -@pytest.mark.parametrize("dtype", ["category", "timedelta64[ns]"]) -def test_loc_iloc_setitem_col_slice_non_cupy_types(indexer, dtype): - df_pd = pd.DataFrame(range(2), dtype=dtype) - df_cudf = cudf.DataFrame.from_pandas(df_pd) - getattr(df_pd, indexer)[:, 0] = getattr(df_pd, indexer)[:, 0] - getattr(df_cudf, indexer)[:, 0] = getattr(df_cudf, indexer)[:, 0] - assert_eq(df_pd, df_cudf) diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py deleted file mode 100644 index c76a49103e2..00000000000 --- a/python/cudf/cudf/tests/test_interpolate.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -import pytest - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - - -@pytest.mark.parametrize( - "data", - [ - # basics - {"A": [1.0, 2.0, 3.0], "B": [4.0, 5.0, 6.0]}, - {"A": [1.0, None, 3.0], "B": [4.0, None, 6.0]}, - {"A": [None, 2.0, 3.0], "B": [4.0, 5.0, None]}, - ], -) -@pytest.mark.parametrize("method", ["linear"]) -@pytest.mark.parametrize("axis", [0]) -def test_interpolate_dataframe(data, method, axis): - # Pandas interpolate methods do not seem to work - # with nullable dtypes yet, so this method treats - # NAs as NaNs - # https://github.com/pandas-dev/pandas/issues/40252 - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expect = pdf.interpolate(method=method, axis=axis) - got = gdf.interpolate(method=method, axis=axis) - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data", - [ - [1.0, 2.0, 3.0], - [1.0, None, 3.0], - [None, 2.0, None, 4.0], - [1.0, None, 3.0, None], - [None, None, 3.0, 4.0], - [1.0, 2.0, None, None], - [None, None, None, None], - [0.1, 0.2, 0.3], - ], -) -@pytest.mark.parametrize("method", ["linear"]) -@pytest.mark.parametrize("axis", [0]) -def test_interpolate_series(data, method, axis): - gsr = cudf.Series(data) - psr = gsr.to_pandas() - - is_str_dtype = psr.dtype == "object" - with expect_warning_if(is_str_dtype): - expect = psr.interpolate(method=method, axis=axis) - with expect_warning_if(is_str_dtype): - got = gsr.interpolate(method=method, axis=axis) - - assert_eq(expect, got, check_dtype=psr.dtype != "object") - - -@pytest.mark.parametrize( - "data,index", [([2.0, None, 4.0, None, 2.0], [1, 2, 3, 2, 1])] -) -def test_interpolate_series_unsorted_index(data, index): - gsr = cudf.Series(data, index=index) - psr = gsr.to_pandas() - - expect = psr.interpolate(method="values") - got = gsr.interpolate(method="values") - - assert_eq(expect, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data", - [ - [1.0, 2.0, 3.0, 4.0], - [None, 2.0, 3.0, 4.0], - [1.0, 2.0, 3.0, None], - [None, None, 3.0, 4.0], - [1.0, 2.0, None, None], - [1.0, None, 3.0, None], - [None, 2.0, None, 4.0], - [None, None, None, None], - ], -) -@pytest.mark.parametrize("index", [[0, 1, 2, 3], [0, 2, 4, 6], [0, 3, 4, 9]]) -@pytest.mark.parametrize("method", ["index", "values"]) -def test_interpolate_series_values_or_index(data, index, method): - gsr = cudf.Series(data, index=index) - psr = gsr.to_pandas() - - is_str_dtype = gsr.dtype == "object" - with expect_warning_if(is_str_dtype): - expect = psr.interpolate(method=method) - with expect_warning_if(is_str_dtype): - got = gsr.interpolate(method=method) - - assert_eq(expect, got, check_dtype=psr.dtype != "object") - - -@pytest.mark.parametrize( - "data,kwargs", - [ - ( - {"A": ["a", "b", "c"], "B": ["d", "e", "f"]}, - {"axis": 0, "method": "linear"}, - ), - ({"A": [1, 2, 3]}, {"method": "pad", "limit_direction": "forward"}), - ({"A": [1, 2, 3]}, {"method": "ffill", "limit_direction": "forward"}), - ({"A": [1, 2, 3]}, {"method": "bfill", "limit_direction": "backward"}), - ( - {"A": [1, 2, 3]}, - {"method": "backfill", "limit_direction": "backward"}, - ), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < 
PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Does not fail on older versions of pandas", -) -def test_interpolate_dataframe_error_cases(data, kwargs): - gsr = cudf.DataFrame(data) - psr = gsr.to_pandas() - - assert_exceptions_equal( - lfunc=psr.interpolate, - rfunc=gsr.interpolate, - lfunc_args_and_kwargs=([], kwargs), - rfunc_args_and_kwargs=([], kwargs), - ) - - -def test_interpolate_noop_new_column(): - ser = cudf.Series([1.0, 2.0, 3.0]) - result = ser.interpolate() - assert ser._column is not result._column diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py deleted file mode 100644 index 5e1dd33fbf1..00000000000 --- a/python/cudf/cudf/tests/test_interval.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_GE_220 -from cudf.testing import assert_eq - - -@pytest.mark.parametrize( - "data1, data2", - [(1, 2), (1.0, 2.0), (3, 4.0)], -) -@pytest.mark.parametrize("data3, data4", [(6, 10), (5.0, 9.0), (2, 6.0)]) -@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -def test_create_interval_series(data1, data2, data3, data4, closed): - expect = pd.Series(pd.Interval(data1, data2, closed), dtype="interval") - got = cudf.Series(pd.Interval(data1, data2, closed), dtype="interval") - assert_eq(expect, got) - - expect_two = pd.Series( - [pd.Interval(data1, data2, closed), pd.Interval(data3, data4, closed)], - dtype="interval", - ) - got_two = cudf.Series( - [pd.Interval(data1, data2, closed), pd.Interval(data3, data4, closed)], - dtype="interval", - ) - assert_eq(expect_two, got_two) - - expect_three = pd.Series( - [ - pd.Interval(data1, data2, closed), - pd.Interval(data3, data4, closed), - pd.Interval(data1, data2, closed), - ], - dtype="interval", - ) - got_three = cudf.Series( - [ - pd.Interval(data1, data2, closed), - pd.Interval(data3, data4, closed), - pd.Interval(data1, data2, closed), - ], - dtype="interval", - ) - assert_eq(expect_three, got_three) - - -@pytest.mark.parametrize( - "data1, data2", - [(1, 2), (1.0, 2.0), (3, 4.0)], -) -@pytest.mark.parametrize("data3, data4", [(6, 10), (5.0, 9.0), (2, 6.0)]) -@pytest.mark.parametrize("closed", ["left", "right", "both", "neither"]) -def test_create_interval_df(data1, data2, data3, data4, closed): - # df for both pandas and cudf only works when interval is in a list - expect = pd.DataFrame( - [pd.Interval(data1, data2, closed)], dtype="interval" - ) - got = cudf.DataFrame([pd.Interval(data1, data2, closed)], dtype="interval") - assert_eq(expect, got) - - expect_two = pd.DataFrame( - { - "a": [ - pd.Interval(data1, data2, closed), - pd.Interval(data3, data4, closed), - ], - "b": [ - pd.Interval(data3, data4, closed), - pd.Interval(data1, data2, closed), - ], - }, - dtype="interval", - ) - got_two = cudf.DataFrame( - { - "a": [ - pd.Interval(data1, data2, closed), - pd.Interval(data3, data4, closed), - ], - "b": [ - pd.Interval(data3, data4, closed), - pd.Interval(data1, data2, closed), - ], - }, - dtype="interval", - ) - assert_eq(expect_two, got_two) - - expect_three = pd.DataFrame( - { - "a": [ - pd.Interval(data1, data2, closed), - pd.Interval(data3, data4, closed), - pd.Interval(data1, data2, closed), - ], - "b": [ - pd.Interval(data3, data4, closed), - pd.Interval(data1, data2, closed), - pd.Interval(data3, data4, closed), - ], - "c": [ - pd.Interval(data1, data2, closed), - pd.Interval(data1, data2, closed), - pd.Interval(data3, 
data4, closed), - ], - }, - dtype="interval", - ) - - got_three = cudf.DataFrame( - { - "a": [ - pd.Interval(data1, data2, closed), - pd.Interval(data3, data4, closed), - pd.Interval(data1, data2, closed), - ], - "b": [ - pd.Interval(data3, data4, closed), - pd.Interval(data1, data2, closed), - pd.Interval(data3, data4, closed), - ], - "c": [ - pd.Interval(data1, data2, closed), - pd.Interval(data1, data2, closed), - pd.Interval(data3, data4, closed), - ], - }, - dtype="interval", - ) - assert_eq(expect_three, got_three) - - -def test_create_interval_index_from_list(): - interval_list = [ - np.nan, - pd.Interval(2.0, 3.0, closed="right"), - pd.Interval(3.0, 4.0, closed="right"), - ] - - expected = pd.Index(interval_list) - actual = cudf.Index(interval_list) - - assert_eq(expected, actual) - - -def test_interval_index_unique(): - interval_list = [ - np.nan, - pd.Interval(2.0, 3.0, closed="right"), - pd.Interval(3.0, 4.0, closed="right"), - np.nan, - pd.Interval(3.0, 4.0, closed="right"), - pd.Interval(3.0, 4.0, closed="right"), - ] - pi = pd.Index(interval_list) - gi = cudf.from_pandas(pi) - - expected = pi.unique() - actual = gi.unique() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("box", [pd.Series, pd.IntervalIndex]) -@pytest.mark.parametrize("tz", ["US/Eastern", None]) -@pytest.mark.skipif( - condition=not PANDAS_GE_220, - reason="ME frequency new in pandas 2.2", -) -def test_interval_with_datetime(tz, box): - dti = pd.date_range( - start=pd.Timestamp("20180101", tz=tz), - end=pd.Timestamp("20181231", tz=tz), - freq="ME", - ) - pobj = box(pd.IntervalIndex.from_breaks(dti)) - if tz is None: - gobj = cudf.from_pandas(pobj) - assert_eq(pobj, gobj) - else: - with pytest.raises(NotImplementedError): - cudf.from_pandas(pobj) - - -def test_from_pandas_intervaldtype(): - dtype = pd.IntervalDtype("int64", closed="left") - result = cudf.from_pandas(dtype) - expected = cudf.IntervalDtype("int64", closed="left") - assert_eq(result, expected) - - -def test_intervaldtype_eq_string_with_attributes(): - dtype = cudf.IntervalDtype("int64", closed="left") - assert dtype == "interval" - assert dtype == "interval[int64, left]" - - -def test_reduction_return_interval_pandas_compatible(): - ii = pd.IntervalIndex.from_tuples( - [("2017-01-03", "2017-01-04")], dtype="interval[datetime64[ns], right]" - ) - cudf_ii = cudf.IntervalIndex.from_pandas(ii) - with cudf.option_context("mode.pandas_compatible", True): - result = cudf_ii.min() - expected = ii.min() - assert result == expected diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py deleted file mode 100644 index 9a95f0e01ab..00000000000 --- a/python/cudf/cudf/tests/test_join_order.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- -import itertools -import operator -import string -from collections import defaultdict - -import numpy as np -import pytest - -import cudf -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.testing import assert_eq - - -@pytest.fixture(params=[False, True], ids=["unsorted", "sorted"]) -def sort(request): - return request.param - - -@pytest.fixture -def left(): - left_key = [1, 3, 2, 1, 1, 2, 5, 1, 4, 5, 8, 12, 12312, 1] * 100 - left_val = list(range(len(left_key))) - return cudf.DataFrame({"key": left_key, "val": left_val}) - - -@pytest.fixture -def right(): - right_key = [12312, 12312, 3, 2, 1, 1, 5, 7, 2] * 200 - right_val = list( - itertools.islice(itertools.cycle(string.ascii_letters), len(right_key)) - ) - return cudf.DataFrame({"key": right_key, "val": right_val}) - - -# Behaviour in sort=False case didn't match documentation in many -# cases prior to https://github.com/pandas-dev/pandas/pull/54611 -# (released as part of pandas 2.2) -if PANDAS_GE_220: - # Behaviour in sort=False case didn't match documentation in many - # cases prior to https://github.com/pandas-dev/pandas/pull/54611 - # (released as part of pandas 2.2) - def expected(left, right, sort, *, how): - left = left.to_pandas() - right = right.to_pandas() - return left.merge(right, on="key", how=how, sort=sort) - -else: - - def expect_inner(left, right, sort): - left_key = left.key.values_host.tolist() - left_val = left.val.values_host.tolist() - right_key = right.key.values_host.tolist() - right_val = right.val.values_host.tolist() - - right_have = defaultdict(list) - for i, k in enumerate(right_key): - right_have[k].append(i) - keys = [] - val_x = [] - val_y = [] - for k, v in zip(left_key, left_val): - if k not in right_have: - continue - for i in right_have[k]: - keys.append(k) - val_x.append(v) - val_y.append(right_val[i]) - - if sort: - # Python sort is stable, so this will preserve input order for - # equal items. - keys, val_x, val_y = zip( - *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) - ) - return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) - - def expect_left(left, right, sort): - left_key = left.key.values_host.tolist() - left_val = left.val.values_host.tolist() - right_key = right.key.values_host.tolist() - right_val = right.val.values_host.tolist() - - right_have = defaultdict(list) - for i, k in enumerate(right_key): - right_have[k].append(i) - keys = [] - val_x = [] - val_y = [] - for k, v in zip(left_key, left_val): - if k not in right_have: - right_vals = [None] - else: - right_vals = [right_val[i] for i in right_have[k]] - - for rv in right_vals: - keys.append(k) - val_x.append(v) - val_y.append(rv) - - if sort: - # Python sort is stable, so this will preserve input order for - # equal items. 
- keys, val_x, val_y = zip( - *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) - ) - return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) - - def expect_outer(left, right, sort): - left_key = left.key.values_host.tolist() - left_val = left.val.values_host.tolist() - right_key = right.key.values_host.tolist() - right_val = right.val.values_host.tolist() - right_have = defaultdict(list) - for i, k in enumerate(right_key): - right_have[k].append(i) - keys = [] - val_x = [] - val_y = [] - for k, v in zip(left_key, left_val): - if k not in right_have: - right_vals = [None] - else: - right_vals = [right_val[i] for i in right_have[k]] - for rv in right_vals: - keys.append(k) - val_x.append(v) - val_y.append(rv) - left_have = set(left_key) - for k, v in zip(right_key, right_val): - if k not in left_have: - keys.append(k) - val_x.append(None) - val_y.append(v) - - # Python sort is stable, so this will preserve input order for - # equal items. - # outer joins are always sorted, but we test both sort values - keys, val_x, val_y = zip( - *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0)) - ) - return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y}) - - def expected(left, right, sort, *, how): - if how == "inner": - return expect_inner(left, right, sort) - elif how == "outer": - return expect_outer(left, right, sort) - elif how == "left": - return expect_left(left, right, sort) - elif how == "right": - return expect_left(right, left, sort).rename( - {"val_x": "val_y", "val_y": "val_x"}, axis=1 - ) - else: - raise NotImplementedError() - - -@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"]) -def test_join_ordering_pandas_compat(request, left, right, sort, how): - request.applymarker( - pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION - and how == "right", - reason="TODO: Result ording of suffix'ed columns is incorrect", - ) - ) - with cudf.option_context("mode.pandas_compatible", True): - actual = left.merge(right, on="key", how=how, sort=sort) - expect = expected(left, right, sort, how=how) - assert_eq(expect, actual) - - -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("on_index", [True, False]) -@pytest.mark.parametrize("left_unique", [True, False]) -@pytest.mark.parametrize("left_monotonic", [True, False]) -@pytest.mark.parametrize("right_unique", [True, False]) -@pytest.mark.parametrize("right_monotonic", [True, False]) -def test_merge_combinations( - request, - how, - sort, - on_index, - left_unique, - left_monotonic, - right_unique, - right_monotonic, -): - request.applymarker( - pytest.mark.xfail( - condition=how == "outer" - and on_index - and left_unique - and not left_monotonic - and right_unique - and not right_monotonic, - reason="https://github.com/pandas-dev/pandas/issues/55992", - ) - ) - left = [2, 3] - if left_unique: - left.append(4 if left_monotonic else 1) - else: - left.append(3 if left_monotonic else 2) - - right = [2, 3] - if right_unique: - right.append(4 if right_monotonic else 1) - else: - right.append(3 if right_monotonic else 2) - - left = cudf.DataFrame({"key": left}) - right = cudf.DataFrame({"key": right}) - - if on_index: - left = left.set_index("key") - right = right.set_index("key") - on_kwargs = {"left_index": True, "right_index": True} - else: - on_kwargs = {"on": "key"} - - with cudf.option_context("mode.pandas_compatible", True): - result = cudf.merge(left, right, how=how, 
sort=sort, **on_kwargs) - if on_index: - left = left.reset_index() - right = right.reset_index() - - if how in ["left", "right", "inner"]: - if how in ["left", "inner"]: - expected, other, other_unique = left, right, right_unique - else: - expected, other, other_unique = right, left, left_unique - if how == "inner": - keep_values = set(left["key"].values_host).intersection( - right["key"].values_host - ) - keep_mask = expected["key"].isin(keep_values) - expected = expected[keep_mask] - if sort: - expected = expected.sort_values("key") - if not other_unique: - other_value_counts = other["key"].value_counts() - repeats = other_value_counts.reindex( - expected["key"].values, fill_value=1 - ) - repeats = repeats.astype(np.intp) - expected = expected["key"].repeat(repeats.values) - expected = expected.to_frame() - elif how == "outer": - if on_index and left_unique and left["key"].equals(right["key"]): - expected = cudf.DataFrame({"key": left["key"]}) - else: - left_counts = left["key"].value_counts() - right_counts = right["key"].value_counts() - expected_counts = left_counts.mul(right_counts, fill_value=1) - expected_counts = expected_counts.astype(np.intp) - expected = expected_counts.index.values_host.repeat( - expected_counts.values_host - ) - expected = cudf.DataFrame({"key": expected}) - expected = expected.sort_values("key") - - if on_index: - expected = expected.set_index("key") - else: - expected = expected.reset_index(drop=True) - - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py deleted file mode 100644 index b1ce69e58ef..00000000000 --- a/python/cudf/cudf/tests/test_joining.py +++ /dev/null @@ -1,2277 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from itertools import combinations, product, repeat - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing import assert_eq -from cudf.testing._utils import ( - INTEGER_TYPES, - NUMERIC_TYPES, - TIMEDELTA_TYPES, - assert_exceptions_equal, - expect_warning_if, -) - -_JOIN_TYPES = ("left", "inner", "outer", "right", "leftanti", "leftsemi") - - -def make_params(): - np.random.seed(0) - - hows = _JOIN_TYPES - - # Test specific cases (1) - aa = [0, 0, 4, 5, 5] - bb = [0, 0, 2, 3, 5] - for how in hows: - yield (aa, bb, how) - - # Test specific cases (2) - aa = [0, 0, 1, 2, 3] - bb = [0, 1, 2, 2, 3] - for how in hows: - yield (aa, bb, how) - - # Test large random integer inputs - aa = np.random.randint(0, 50, 100) - bb = np.random.randint(0, 50, 100) - for how in hows: - yield (aa, bb, how) - - # Test floating point inputs - aa = np.random.random(50) - bb = np.random.random(50) - for how in hows: - yield (aa, bb, how) - - -def pd_odd_joins(left, right, join_type): - if join_type == "leftanti": - return left[~left.index.isin(right.index)][left.columns] - elif join_type == "leftsemi": - return left[left.index.isin(right.index)][left.columns] - - -def assert_join_results_equal(expect, got, how, **kwargs): - if how not in _JOIN_TYPES: - raise ValueError(f"Unrecognized join type {how}") - if how == "right": - got = got[expect.columns] - - if isinstance(expect, (pd.Series, cudf.Series)): - return assert_eq( - expect.sort_values().reset_index(drop=True), - got.sort_values().reset_index(drop=True), - **kwargs, - ) - elif isinstance(expect, (pd.DataFrame, cudf.DataFrame)): - if 
not len( - expect.columns - ): # can't sort_values() on a df without columns - return assert_eq(expect, got, **kwargs) - - assert_eq( - expect.sort_values(expect.columns.to_list()).reset_index( - drop=True - ), - got.sort_values(got.columns.to_list()).reset_index(drop=True), - **kwargs, - ) - elif isinstance(expect, (pd.Index, cudf.Index)): - return assert_eq(expect.sort_values(), got.sort_values(), **kwargs) - else: - raise ValueError(f"Not a join result: {type(expect).__name__}") - - -@pytest.mark.parametrize("aa,bb,how", make_params()) -def test_dataframe_join_how(aa, bb, how): - df = cudf.DataFrame() - df["a"] = aa - df["b"] = bb - - def work_pandas(df, how): - df1 = df.set_index("a") - df2 = df.set_index("b") - if how == "leftanti": - joined = pd_odd_joins(df1, df2, "leftanti") - elif how == "leftsemi": - joined = pd_odd_joins(df1, df2, "leftsemi") - else: - joined = df1.join(df2, how=how, sort=True) - return joined - - def work_gdf(df): - df1 = df.set_index("a") - df2 = df.set_index("b") - joined = df1.join(df2, how=how, sort=True) - return joined - - expect = work_pandas(df.to_pandas(), how) - got = work_gdf(df) - expecto = expect.copy() - goto = got.copy() - - expect = expect.astype(np.float64).fillna(np.nan)[expect.columns] - got = got.astype(np.float64).fillna(np.nan)[expect.columns] - - assert got.index.name is None - - assert list(expect.columns) == list(got.columns) - if how in {"left", "inner", "right", "leftanti", "leftsemi"}: - assert_eq(sorted(expect.index.values), sorted(got.index.values)) - if how != "outer": - # Newly introduced ambiguous ValueError thrown when - # an index and column have the same name. Rename the - # index so sorts work. - # TODO: What is the less hacky way? - expect.index.name = "bob" - got.index.name = "mary" - assert_join_results_equal(expect, got, how=how) - # if(how=='right'): - # _sorted_check_series(expect['a'], expect['b'], - # got['a'], got['b']) - # else: - # _sorted_check_series(expect['b'], expect['a'], got['b'], - # got['a']) - else: - for c in expecto.columns: - _check_series(expecto[c].fillna(-1), goto[c].fillna(-1)) - - -def _check_series(expect, got): - magic = 0xDEADBEAF - - direct_equal = np.all(expect.values == got.to_numpy()) - nanfilled_equal = np.all( - expect.fillna(magic).values == got.fillna(magic).to_numpy() - ) - msg = "direct_equal={}, nanfilled_equal={}".format( - direct_equal, nanfilled_equal - ) - assert direct_equal or nanfilled_equal, msg - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="bug in older version of pandas", -) -def test_dataframe_join_suffix(): - np.random.seed(0) - - df = cudf.DataFrame(np.random.randint(0, 5, (5, 3)), columns=list("abc")) - - left = df.set_index("a") - right = df.set_index("c") - msg = ( - "there are overlapping columns but lsuffix and rsuffix are not defined" - ) - with pytest.raises(ValueError, match=msg): - left.join(right) - - got = left.join(right, lsuffix="_left", rsuffix="_right", sort=True) - expect = left.to_pandas().join( - right.to_pandas(), - lsuffix="_left", - rsuffix="_right", - sort=True, - ) - # TODO: Retain result index name - expect.index.name = None - assert_eq(got, expect) - - got_sorted = got.sort_values(by=["b_left", "c", "b_right"], axis=0) - expect_sorted = expect.sort_values(by=["b_left", "c", "b_right"], axis=0) - assert_eq(got_sorted, expect_sorted) - - -def test_dataframe_join_cats(): - lhs = cudf.DataFrame() - lhs["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc")) - lhs["b"] = bb = np.arange(len(lhs)) - 
lhs = lhs.set_index("a") - - rhs = cudf.DataFrame() - rhs["a"] = pd.Categorical(list("abcac"), categories=list("abc")) - rhs["c"] = cc = np.arange(len(rhs)) - rhs = rhs.set_index("a") - - got = lhs.join(rhs) - expect = lhs.to_pandas().join(rhs.to_pandas()) - - # Note: pandas make an object Index after joining - assert_join_results_equal(expect, got, how="inner") - - # Just do some rough checking here. - assert list(got.columns) == ["b", "c"] - assert len(got) > 0 - assert set(got.index.to_pandas()) & set("abc") - assert set(got["b"].to_numpy()) & set(bb) - assert set(got["c"].to_numpy()) & set(cc) - - -def test_dataframe_join_combine_cats(): - lhs = cudf.DataFrame({"join_index": ["a", "b", "c"], "data_x": [1, 2, 3]}) - rhs = cudf.DataFrame({"join_index": ["b", "c", "d"], "data_y": [2, 3, 4]}) - - lhs["join_index"] = lhs["join_index"].astype("category") - rhs["join_index"] = rhs["join_index"].astype("category") - - lhs = lhs.set_index("join_index") - rhs = rhs.set_index("join_index") - - lhs_pd = lhs.to_pandas() - rhs_pd = rhs.to_pandas() - - lhs_pd.index = lhs_pd.index.astype("object") - rhs_pd.index = rhs_pd.index.astype("object") - - expect = lhs_pd.join(rhs_pd, how="outer") - expect.index = expect.index.astype("category") - got = lhs.join(rhs, how="outer") - - assert_eq(expect.index.sort_values(), got.index.sort_values()) - - -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_dataframe_join_mismatch_cats(how): - pdf1 = pd.DataFrame( - { - "join_col": ["a", "b", "c", "d", "e"], - "data_col_left": [10, 20, 30, 40, 50], - } - ) - pdf2 = pd.DataFrame( - {"join_col": ["c", "e", "f"], "data_col_right": [6, 7, 8]} - ) - - pdf1["join_col"] = pdf1["join_col"].astype("category") - pdf2["join_col"] = pdf2["join_col"].astype("category") - - gdf1 = cudf.from_pandas(pdf1) - gdf2 = cudf.from_pandas(pdf2) - - gdf1 = gdf1.set_index("join_col") - gdf2 = gdf2.set_index("join_col") - - pdf1 = pdf1.set_index("join_col") - pdf2 = pdf2.set_index("join_col") - join_gdf = gdf1.join(gdf2, how=how, sort=True) - join_pdf = pdf1.join(pdf2, how=how) - - got = join_gdf.fillna(-1).to_pandas() - expect = join_pdf.fillna(-1) # note: cudf join doesn't mask NA - - # We yield a categorical here whereas pandas gives Object. 
- expect.index = expect.index.astype("category") - # cudf creates the columns in different order than pandas for right join - if how == "right": - got = got[["data_col_left", "data_col_right"]] - - expect.data_col_right = expect.data_col_right.astype(np.int64) - expect.data_col_left = expect.data_col_left.astype(np.int64) - - assert_join_results_equal(expect, got, how=how, check_categorical=False) - - -@pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None]) -def test_dataframe_merge_on(on): - np.random.seed(0) - - # Make cuDF - df_left = cudf.DataFrame() - nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) - df_left["left_val"] = np.arange(nelem) - - df_right = cudf.DataFrame() - nelem = 500 - df_right["key1"] = np.random.randint(0, 30, nelem) - df_right["key2"] = np.random.randint(0, 50, nelem) - df_right["right_val"] = np.arange(nelem) - - # Make pandas DF - pddf_left = df_left.to_pandas() - pddf_right = df_right.to_pandas() - - # Expected result (from pandas) - pddf_joined = pddf_left.merge(pddf_right, on=on, how="left") - - # Test (from cuDF; doesn't check for ordering) - join_result = df_left.merge(df_right, on=on, how="left") - join_result_cudf = cudf.merge(df_left, df_right, on=on, how="left") - - join_result["right_val"] = ( - join_result["right_val"].astype(np.float64).fillna(np.nan) - ) - - join_result_cudf["right_val"] = ( - join_result_cudf["right_val"].astype(np.float64).fillna(np.nan) - ) - - for col in list(pddf_joined.columns): - if col.count("_y") > 0: - join_result[col] = ( - join_result[col].astype(np.float64).fillna(np.nan) - ) - join_result_cudf[col] = ( - join_result_cudf[col].astype(np.float64).fillna(np.nan) - ) - - # Test dataframe equality (ignore order of rows and columns) - cdf_result = ( - join_result.to_pandas() - .sort_values(list(pddf_joined.columns)) - .reset_index(drop=True) - ) - - pdf_result = pddf_joined.sort_values( - list(pddf_joined.columns) - ).reset_index(drop=True) - - assert_join_results_equal(cdf_result, pdf_result, how="left") - - merge_func_result_cdf = ( - join_result_cudf.to_pandas() - .sort_values(list(pddf_joined.columns)) - .reset_index(drop=True) - ) - - assert_join_results_equal(merge_func_result_cdf, cdf_result, how="left") - - -def test_dataframe_merge_on_unknown_column(): - np.random.seed(0) - - # Make cuDF - df_left = cudf.DataFrame() - nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) - df_left["left_val"] = np.arange(nelem) - - df_right = cudf.DataFrame() - nelem = 500 - df_right["key1"] = np.random.randint(0, 30, nelem) - df_right["key2"] = np.random.randint(0, 50, nelem) - df_right["right_val"] = np.arange(nelem) - - with pytest.raises(KeyError) as raises: - df_left.merge(df_right, on="bad_key", how="left") - raises.match("bad_key") - - -def test_dataframe_merge_no_common_column(): - np.random.seed(0) - - # Make cuDF - df_left = cudf.DataFrame() - nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) - df_left["left_val"] = np.arange(nelem) - - df_right = cudf.DataFrame() - nelem = 500 - df_right["key3"] = np.random.randint(0, 30, nelem) - df_right["key4"] = np.random.randint(0, 50, nelem) - df_right["right_val"] = np.arange(nelem) - - with pytest.raises(ValueError) as raises: - df_left.merge(df_right, how="left") - raises.match("No common columns to perform merge on") - - -def test_dataframe_empty_merge(): - gdf1 = 
cudf.DataFrame({"a": [], "b": []}) - gdf2 = cudf.DataFrame({"a": [], "c": []}) - - expect = cudf.DataFrame({"a": [], "b": [], "c": []}) - got = gdf1.merge(gdf2, how="left", on=["a"]) - - assert_join_results_equal(expect, got, how="left") - - -def test_dataframe_merge_order(): - gdf1 = cudf.DataFrame() - gdf2 = cudf.DataFrame() - gdf1["id"] = [10, 11] - gdf1["timestamp"] = [1, 2] - gdf1["a"] = [3, 4] - - gdf2["id"] = [4, 5] - gdf2["a"] = [7, 8] - - gdf = gdf1.merge(gdf2, how="left", on=["id", "a"]) - - df1 = pd.DataFrame() - df2 = pd.DataFrame() - df1["id"] = [10, 11] - df1["timestamp"] = [1, 2] - df1["a"] = [3, 4] - - df2["id"] = [4, 5] - df2["a"] = [7, 8] - - df = df1.merge(df2, how="left", on=["id", "a"]) - assert_join_results_equal(df, gdf, how="left") - - -@pytest.mark.parametrize( - "pairs", - [ - ("", ""), - ("", "a"), - ("", "ab"), - ("", "abc"), - ("", "b"), - ("", "bcd"), - ("", "cde"), - ("a", "a"), - ("a", "ab"), - ("a", "abc"), - ("a", "b"), - ("a", "bcd"), - ("a", "cde"), - ("ab", "ab"), - ("ab", "abc"), - ("ab", "b"), - ("ab", "bcd"), - ("ab", "cde"), - ("abc", "abc"), - ("abc", "b"), - ("abc", "bcd"), - ("abc", "cde"), - ("b", "b"), - ("b", "bcd"), - ("b", "cde"), - ("bcd", "bcd"), - ("bcd", "cde"), - ("cde", "cde"), - ], -) -@pytest.mark.parametrize("max", [5, 1000]) -@pytest.mark.parametrize("rows", [1, 5, 100]) -@pytest.mark.parametrize("how", ["left", "inner", "outer"]) -def test_dataframe_pairs_of_triples(pairs, max, rows, how): - np.random.seed(0) - - pdf_left = pd.DataFrame() - pdf_right = pd.DataFrame() - for left_column in pairs[0]: - pdf_left[left_column] = np.random.randint(0, max, rows) - for right_column in pairs[1]: - pdf_right[right_column] = np.random.randint(0, max, rows) - gdf_left = cudf.from_pandas(pdf_left) - gdf_right = cudf.from_pandas(pdf_right) - if not set(pdf_left.columns).intersection(pdf_right.columns): - with pytest.raises( - pd.errors.MergeError, - match="No common columns to perform merge on", - ): - pdf_left.merge(pdf_right) - with pytest.raises( - ValueError, match="No common columns to perform merge on" - ): - gdf_left.merge(gdf_right) - elif not [value for value in pdf_left if value in pdf_right]: - with pytest.raises( - pd.errors.MergeError, - match="No common columns to perform merge on", - ): - pdf_left.merge(pdf_right) - with pytest.raises( - ValueError, match="No common columns to perform merge on" - ): - gdf_left.merge(gdf_right) - else: - pdf_result = pdf_left.merge(pdf_right, how=how) - gdf_result = gdf_left.merge(gdf_right, how=how) - assert np.array_equal(gdf_result.columns, pdf_result.columns) - for column in gdf_result: - gdf_col_result_sorted = gdf_result[column].fillna(-1).sort_values() - pd_col_result_sorted = pdf_result[column].fillna(-1).sort_values() - assert np.array_equal( - gdf_col_result_sorted.to_pandas().values, - pd_col_result_sorted.values, - ) - - -def test_safe_merging_with_left_empty(): - np.random.seed(0) - - pairs = ("bcd", "b") - pdf_left = pd.DataFrame() - pdf_right = pd.DataFrame() - for left_column in pairs[0]: - pdf_left[left_column] = np.random.randint(0, 10, 0) - for right_column in pairs[1]: - pdf_right[right_column] = np.random.randint(0, 10, 5) - gdf_left = cudf.from_pandas(pdf_left) - gdf_right = cudf.from_pandas(pdf_right) - - pdf_result = pdf_left.merge(pdf_right) - gdf_result = gdf_left.merge(gdf_right) - # Simplify test because pandas does not consider empty Index and RangeIndex - # to be equivalent. TODO: Allow empty Index objects to have equivalence. 
- assert len(pdf_result) == len(gdf_result) - - -@pytest.mark.parametrize("how", ["left", "inner", "outer"]) -@pytest.mark.parametrize("left_empty", [True, False]) -@pytest.mark.parametrize("right_empty", [True, False]) -def test_empty_joins(how, left_empty, right_empty): - pdf = pd.DataFrame({"x": [1, 2, 3]}) - - if left_empty: - left = pdf.head(0) - else: - left = pdf - if right_empty: - right = pdf.head(0) - else: - right = pdf - - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - - expected = left.merge(right, how=how) - result = gleft.merge(gright, how=how) - assert len(expected) == len(result) - - -def test_merge_left_index_zero(): - left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]) - right = pd.DataFrame( - {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6] - ) - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - pd_merge = left.merge(right, left_on="x", right_on="y") - gd_merge = gleft.merge(gright, left_on="x", right_on="y") - - assert_join_results_equal(pd_merge, gd_merge, how="left") - - -@pytest.mark.parametrize( - "kwargs", - [ - {"left_index": True, "right_on": "y"}, - {"right_index": True, "left_on": "x"}, - {"left_on": "x", "right_on": "y"}, - {"left_index": True, "right_index": True}, - ], -) -def test_merge_left_right_index_left_right_on_zero_kwargs(kwargs): - left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]) - right = pd.DataFrame( - {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6] - ) - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - pd_merge = left.merge(right, **kwargs) - gd_merge = gleft.merge(gright, **kwargs) - assert_join_results_equal(pd_merge, gd_merge, how="left") - - -@pytest.mark.parametrize( - "kwargs", - [ - {"left_index": True, "right_on": "y"}, - {"right_index": True, "left_on": "x"}, - {"left_on": "x", "right_on": "y"}, - {"left_index": True, "right_index": True}, - ], -) -def test_merge_left_right_index_left_right_on_kwargs(kwargs): - left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 5, 6]) - right = pd.DataFrame( - {"y": [10, 20, 30, 6, 5, 4]}, index=[1, 2, 3, 4, 5, 7] - ) - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - pd_merge = left.merge(right, **kwargs) - gd_merge = gleft.merge(gright, **kwargs) - assert_join_results_equal(pd_merge, gd_merge, how="left") - - -def test_indicator(): - gdf = cudf.DataFrame({"x": [1, 2, 1]}) - gdf.merge(gdf, indicator=False) - - with pytest.raises(NotImplementedError) as info: - gdf.merge(gdf, indicator=True) - - assert "indicator=False" in str(info.value) - - -def test_merge_suffixes(): - pdf = cudf.DataFrame({"x": [1, 2, 1]}) - gdf = cudf.DataFrame({"x": [1, 2, 1]}) - assert_join_results_equal( - gdf.merge(gdf, suffixes=("left", "right")), - pdf.merge(pdf, suffixes=("left", "right")), - how="left", - ) - - assert_exceptions_equal( - lfunc=pdf.merge, - rfunc=gdf.merge, - lfunc_args_and_kwargs=([pdf], {"lsuffix": "left", "rsuffix": "right"}), - rfunc_args_and_kwargs=([gdf], {"lsuffix": "left", "rsuffix": "right"}), - ) - - -def test_merge_left_on_right_on(): - left = pd.DataFrame({"xx": [1, 2, 3, 4, 5, 6]}) - right = pd.DataFrame({"xx": [10, 20, 30, 6, 5, 4]}) - - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - - assert_join_results_equal( - left.merge(right, on="xx"), gleft.merge(gright, on="xx"), how="left" - ) - - assert_join_results_equal( - left.merge(right, left_on="xx", right_on="xx"), - gleft.merge(gright, left_on="xx", right_on="xx"), - 
how="left", - ) - - -def test_merge_on_index_retained(): - df = cudf.DataFrame() - df["a"] = [1, 2, 3, 4, 5] - df["b"] = ["a", "b", "c", "d", "e"] - df.index = [5, 3, 4, 2, 1] - - df2 = cudf.DataFrame() - df2["a2"] = [1, 2, 3, 4, 5] - df2["res"] = ["a", "b", "c", "d", "e"] - - pdf = df.to_pandas() - pdf2 = df2.to_pandas() - - gdm = df.merge(df2, left_index=True, right_index=True, how="left") - pdm = pdf.merge(pdf2, left_index=True, right_index=True, how="left") - gdm["a2"] = gdm["a2"].astype("float64") - assert_eq(gdm.sort_index(), pdm.sort_index()) - - -@pytest.mark.parametrize( - "kwargs", - [ - {"left_index": True, "right_on": "y"}, - {"right_index": True, "left_on": "x"}, - {"left_on": "x", "right_on": "y"}, - ], -) -def test_merge_left_right_index_left_right_on_kwargs2(kwargs): - left = pd.DataFrame({"x": [1, 2, 3]}, index=[10, 20, 30]) - right = pd.DataFrame({"y": [10, 20, 30]}, index=[1, 2, 30]) - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - gd_merge = gleft.merge(gright, **kwargs) - pd_merge = left.merge(right, **kwargs) - if pd_merge.empty: - assert gd_merge.empty - - -@pytest.mark.parametrize( - "hows", [{"how": "inner"}, {"how": "left"}, {"how": "outer"}] -) -@pytest.mark.parametrize( - "ons", - [ - {"on": "a"}, - {"on": ["a", "b"]}, - {"on": ["b", "a"]}, - {"on": ["a", "aa", "b"]}, - {"on": ["b", "a", "aa"]}, - ], -) -def test_merge_sort(ons, hows): - kwargs = {} - kwargs.update(hows) - kwargs.update(ons) - kwargs["sort"] = True - a = [4, 6, 9, 5, 2, 4, 1, 8, 1] - b = [9, 8, 7, 8, 3, 9, 7, 9, 2] - aa = [8, 9, 2, 9, 3, 1, 2, 3, 4] - left = pd.DataFrame({"a": a, "b": b, "aa": aa}) - right = left.copy(deep=True) - - left.index = [6, 5, 4, 7, 5, 5, 5, 4, 4] - right.index = [5, 4, 1, 9, 4, 3, 5, 4, 4] - - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - gd_merge = gleft.merge(gright, **kwargs) - - pd_merge = left.merge(right, **kwargs) - # require the join keys themselves to be sorted correctly - # the non-key columns will NOT match pandas ordering - assert_join_results_equal( - pd_merge[kwargs["on"]], gd_merge[kwargs["on"]], how="left" - ) - pd_merge = pd_merge.drop(kwargs["on"], axis=1) - gd_merge = gd_merge.drop(kwargs["on"], axis=1) - if not pd_merge.empty: - # check to make sure the non join key columns are the same - pd_merge = pd_merge.sort_values(list(pd_merge.columns)).reset_index( - drop=True - ) - gd_merge = gd_merge.sort_values(list(gd_merge.columns)).reset_index( - drop=True - ) - - assert_join_results_equal(pd_merge, gd_merge, how="left") - - -@pytest.mark.parametrize( - "kwargs", - [ - {"left_on": ["a"], "left_index": False, "right_index": True}, - {"right_on": ["b"], "left_index": True, "right_index": False}, - ], -) -def test_merge_sort_on_indexes(kwargs): - left_index = kwargs["left_index"] - right_index = kwargs["right_index"] - kwargs["sort"] = True - a = [4, 6, 9, 5, 2, 4, 1, 8, 1] - left = pd.DataFrame({"a": a}) - right = pd.DataFrame({"b": a}) - - left.index = [6, 5, 4, 7, 5, 5, 5, 4, 4] - right.index = [5, 4, 1, 9, 4, 3, 5, 4, 4] - - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - gd_merge = gleft.merge(gright, **kwargs) - - if left_index and right_index: - check_if_sorted = gd_merge[["a", "b"]].to_pandas() - check_if_sorted.index.name = "index" - definitely_sorted = check_if_sorted.sort_values(["index", "a", "b"]) - definitely_sorted.index.name = None - assert_eq(gd_merge, definitely_sorted) - elif left_index: - assert gd_merge["b"].is_monotonic_increasing - elif right_index: - assert 
gd_merge["a"].is_monotonic_increasing - - -@pytest.mark.parametrize( - "dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_join_datetimes_index(dtype): - datetimes = pd.Series(pd.date_range("20010101", "20010102", freq="12h")) - pdf_lhs = pd.DataFrame(index=[1, 0, 1, 2, 0, 0, 1]) - pdf_rhs = pd.DataFrame({"d": datetimes}) - gdf_lhs = cudf.from_pandas(pdf_lhs) - gdf_rhs = cudf.from_pandas(pdf_rhs) - - gdf_rhs["d"] = gdf_rhs["d"].astype(dtype) - - pdf = pdf_lhs.join(pdf_rhs, sort=True) - gdf = gdf_lhs.join(gdf_rhs, sort=True) - - assert gdf["d"].dtype == cudf.dtype(dtype) - - assert_join_results_equal(pdf, gdf, how="inner", check_dtype=False) - - -def test_join_with_different_names(): - left = pd.DataFrame({"a": [0, 1, 2.0, 3, 4, 5, 9]}) - right = pd.DataFrame({"b": [12, 5, 3, 9.0, 5], "c": [1, 2, 3, 4, 5.0]}) - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - pd_merge = left.merge(right, how="outer", left_on=["a"], right_on=["b"]) - gd_merge = gleft.merge(gright, how="outer", left_on=["a"], right_on=["b"]) - assert_join_results_equal(pd_merge, gd_merge, how="outer") - - -def test_join_same_name_different_order(): - left = pd.DataFrame({"a": [0, 0], "b": [1, 2]}) - right = pd.DataFrame({"a": [1, 2], "b": [0, 0]}) - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - pd_merge = left.merge(right, left_on=["a", "b"], right_on=["b", "a"]) - gd_merge = gleft.merge(gright, left_on=["a", "b"], right_on=["b", "a"]) - assert_join_results_equal(pd_merge, gd_merge, how="left") - - -def test_join_empty_table_dtype(): - left = pd.DataFrame({"a": []}) - right = pd.DataFrame({"b": [12, 5, 3, 9.0, 5], "c": [1, 2, 3, 4, 5.0]}) - gleft = cudf.from_pandas(left) - gright = cudf.from_pandas(right) - pd_merge = left.merge(right, how="left", left_on=["a"], right_on=["b"]) - gd_merge = gleft.merge(gright, how="left", left_on=["a"], right_on=["b"]) - assert_eq(pd_merge["a"].dtype, gd_merge["a"].dtype) - - -@pytest.mark.parametrize("how", ["outer", "inner", "left", "right"]) -@pytest.mark.parametrize( - "column_a", - [ - ( - pd.Series([None, 1, 2, 3, 4, 5, 6, 7], dtype=np.float64), - pd.Series([8, 9, 10, 11, 12, None, 14, 15], dtype=np.float64), - ) - ], -) -@pytest.mark.parametrize( - "column_b", - [ - ( - pd.Series([0, 1, 0, None, 1, 0, 0, 0], dtype=np.float64), - pd.Series([None, 1, 2, 1, 2, 2, 0, 0], dtype=np.float64), - ) - ], -) -@pytest.mark.parametrize( - "column_c", - [ - ( - pd.Series(["dog", "cat", "fish", "bug"] * 2), - pd.Series(["bird", "cat", "mouse", "snake"] * 2), - ), - ( - pd.Series(["dog", "cat", "fish", "bug"] * 2).astype("category"), - pd.Series(["bird", "cat", "mouse", "snake"] * 2).astype( - "category" - ), - ), - ], -) -def test_join_multi(how, column_a, column_b, column_c): - index = ["b", "c"] - df1 = pd.DataFrame() - df1["a1"] = column_a[0] - df1["b"] = column_b[0] - df1["c"] = column_c[0] - df1 = df1.set_index(index) - gdf1 = cudf.from_pandas(df1) - - df2 = pd.DataFrame() - df2["a2"] = column_a[1] - df2["b"] = column_b[1] - df2["c"] = column_c[1] - df2 = df2.set_index(index) - gdf2 = cudf.from_pandas(df2) - - gdf_result = gdf1.join(gdf2, how=how, sort=True) - pdf_result = df1.join(df2, how=how, sort=True) - - # Make sure columns are in the same order - columns = pdf_result.columns.values - gdf_result = gdf_result[columns] - pdf_result = pdf_result[columns] - - assert_join_results_equal(pdf_result, gdf_result, how="inner") - - -@pytest.mark.parametrize( - "kwargs", - [ - { - "left_on": ["a", "b"], - "right_on": 
["a", "b"], - "left_index": False, - "right_index": False, - }, # left and right on, no indices - { - "left_on": None, - "right_on": None, - "left_index": True, - "right_index": True, - }, # left_index and right_index, no on - { - "left_on": ["a", "b"], - "right_on": None, - "left_index": False, - "right_index": True, - }, # left on and right_index - { - "left_on": None, - "right_on": ["a", "b"], - "left_index": True, - "right_index": False, - }, # right_on and left_index - ], -) -def test_merge_multi(kwargs): - left = cudf.DataFrame( - { - "a": [1, 2, 3, 4, 3, 5, 6], - "b": [1, 3, 5, 7, 5, 9, 0], - "c": ["o", "p", "q", "r", "s", "t", "u"], - "d": ["v", "w", "x", "y", "z", "1", "2"], - } - ) - right = cudf.DataFrame( - { - "a": [0, 9, 3, 4, 3, 7, 8], - "b": [2, 4, 5, 7, 5, 6, 8], - "c": ["a", "b", "c", "d", "e", "f", "g"], - "d": ["j", "i", "j", "k", "l", "m", "n"], - } - ) - - if ( - kwargs["left_on"] is not None - and kwargs["right_on"] is not None - and kwargs["left_index"] is False - and kwargs["right_index"] is False - ): - left = left.set_index(["c", "d"]) - right = right.set_index(["c", "d"]) - elif ( - kwargs["left_on"] is None - and kwargs["right_on"] is None - and kwargs["left_index"] is True - and kwargs["right_index"] is True - ): - left = left.set_index(["a", "b"]) - right = right.set_index(["a", "b"]) - elif kwargs["left_on"] is not None and kwargs["right_index"] is True: - left = left.set_index(["c", "d"]) - right = right.set_index(["a", "b"]) - elif kwargs["right_on"] is not None and kwargs["left_index"] is True: - left = left.set_index(["a", "b"]) - right = right.set_index(["c", "d"]) - - gleft = left.to_pandas() - gright = right.to_pandas() - - kwargs["sort"] = True - expect = gleft.merge(gright, **kwargs) - got = left.merge(right, **kwargs) - - assert_eq(expect.sort_index().index, got.sort_index().index) - - expect.index = range(len(expect)) - got.index = range(len(got)) - expect = expect.sort_values(list(expect.columns)) - got = got.sort_values(list(got.columns)) - expect.index = range(len(expect)) - got.index = range(len(got)) - - assert_join_results_equal(expect, got, how="left") - - -@pytest.mark.parametrize("dtype_l", INTEGER_TYPES) -@pytest.mark.parametrize("dtype_r", INTEGER_TYPES) -def test_typecast_on_join_int_to_int(dtype_l, dtype_r): - other_data = ["a", "b", "c"] - - join_data_l = cudf.Series([1, 2, 3], dtype=dtype_l) - join_data_r = cudf.Series([1, 2, 4], dtype=dtype_r) - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r)) - - exp_join_data = [1, 2] - exp_other_data = ["a", "b"] - exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - - expect = cudf.DataFrame( - { - "join_col": exp_join_col, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - - got = gdf_l.merge(gdf_r, on="join_col", how="inner") - - assert_join_results_equal(expect, got, how="inner") - - -@pytest.mark.parametrize("dtype_l", ["float32", "float64"]) -@pytest.mark.parametrize("dtype_r", ["float32", "float64"]) -def test_typecast_on_join_float_to_float(dtype_l, dtype_r): - other_data = ["a", "b", "c", "d", "e", "f"] - - join_data_l = cudf.Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) - join_data_r = cudf.Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - exp_dtype = 
np.result_type(np.dtype(dtype_l), np.dtype(dtype_r)) - - if dtype_l != dtype_r: - exp_join_data = [1, 2, 3, 4.5] - exp_other_data = ["a", "b", "c", "e"] - else: - exp_join_data = [1, 2, 3, 0.9, 4.5] - exp_other_data = ["a", "b", "c", "d", "e"] - - exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - - expect = cudf.DataFrame( - { - "join_col": exp_join_col, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - - got = gdf_l.merge(gdf_r, on="join_col", how="inner") - - assert_join_results_equal(expect, got, how="inner") - - -@pytest.mark.parametrize("dtype_l", NUMERIC_TYPES) -@pytest.mark.parametrize("dtype_r", NUMERIC_TYPES) -def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r): - if ( - ("int" in dtype_l or "long" in dtype_l) - and ("int" in dtype_r or "long" in dtype_r) - ) or ("float" in dtype_l and "float" in dtype_r): - pytest.skip("like types not tested in this function") - - other_data = ["a", "b", "c", "d", "e", "f"] - - join_data_l = cudf.Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l) - join_data_r = cudf.Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r) - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r)) - - exp_join_data = [1, 2, 3] - exp_other_data = ["a", "b", "c"] - exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - - expect = cudf.DataFrame( - { - "join_col": exp_join_col, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - - got = gdf_l.merge(gdf_r, on="join_col", how="inner") - - assert_join_results_equal(expect, got, how="inner") - - -def test_typecast_on_join_no_float_round(): - other_data = ["a", "b", "c", "d", "e"] - - join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype="int8") - join_data_r = cudf.Series([1, 2, 3, 4.01, 4.99], dtype="float32") - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - exp_join_data = [1, 2, 3, 4, 5] - exp_Bx = ["a", "b", "c", "d", "e"] - exp_By = ["a", "b", "c", None, None] - exp_join_col = cudf.Series(exp_join_data, dtype="float32") - - expect = cudf.DataFrame( - {"join_col": exp_join_col, "B_x": exp_Bx, "B_y": exp_By} - ) - - got = gdf_l.merge(gdf_r, on="join_col", how="left") - - assert_join_results_equal(expect, got, how="left") - - -@pytest.mark.parametrize( - "dtypes", - [ - (np.dtype("int8"), np.dtype("int16")), - (np.dtype("int16"), np.dtype("int32")), - (np.dtype("int32"), np.dtype("int64")), - (np.dtype("uint8"), np.dtype("uint16")), - (np.dtype("uint16"), np.dtype("uint32")), - (np.dtype("uint32"), np.dtype("uint64")), - (np.dtype("float32"), np.dtype("float64")), - (np.dtype("int32"), np.dtype("float32")), - (np.dtype("uint32"), np.dtype("float32")), - ], -) -def test_typecast_on_join_overflow_unsafe(dtypes): - dtype_l, dtype_r = dtypes - if dtype_l.kind in {"i", "u"}: - dtype_l_max = np.iinfo(dtype_l).max - elif dtype_l.kind == "f": - dtype_l_max = np.finfo(dtype_r).max - - lhs = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}, dtype=dtype_l) - rhs = cudf.DataFrame({"a": [1, 2, 3, 4, dtype_l_max + 1]}, dtype=dtype_r) - - p_lhs = lhs.to_pandas() - p_rhs = rhs.to_pandas() - - with expect_warning_if( - (dtype_l.kind == "f" and dtype_r.kind in {"i", "u"}) - or (dtype_l.kind in {"i", "u"} and dtype_r.kind == "f"), - UserWarning, - ): - expect = p_lhs.merge(p_rhs, on="a", how="left") - got = lhs.merge(rhs, on="a", how="left") - - # The dtypes here won't match exactly 
because pandas does some unsafe - # conversions (with a warning that we are catching above) that we don't - # want to match. - assert_join_results_equal(expect, got, how="left", check_dtype=False) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(5, 2), - Decimal64Dtype(7, 5), - Decimal64Dtype(12, 7), - Decimal128Dtype(20, 5), - ], -) -def test_decimal_typecast_inner(dtype): - other_data = ["a", "b", "c", "d", "e"] - - join_data_l = cudf.Series(["1.6", "9.5", "7.2", "8.7", "2.3"]).astype( - dtype - ) - join_data_r = cudf.Series(["1.6", "9.5", "7.2", "4.5", "2.3"]).astype( - dtype - ) - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - exp_join_data = ["1.6", "9.5", "7.2", "2.3"] - exp_other_data = ["a", "b", "c", "e"] - - exp_join_col = cudf.Series(exp_join_data).astype(dtype) - - expected = cudf.DataFrame( - { - "join_col": exp_join_col, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - - got = gdf_l.merge(gdf_r, on="join_col", how="inner") - - assert_join_results_equal(expected, got, how="inner") - assert_eq(dtype, got["join_col"].dtype) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(7, 3), - Decimal64Dtype(9, 5), - Decimal64Dtype(14, 10), - Decimal128Dtype(21, 9), - ], -) -def test_decimal_typecast_left(dtype): - other_data = ["a", "b", "c", "d"] - - join_data_l = cudf.Series(["95.05", "384.26", "74.22", "1456.94"]).astype( - dtype - ) - join_data_r = cudf.Series( - ["95.05", "62.4056", "74.22", "1456.9472"] - ).astype(dtype) - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - exp_join_data = ["95.05", "74.22", "384.26", "1456.94"] - exp_other_data_x = ["a", "c", "b", "d"] - exp_other_data_y = ["a", "c", None, None] - - exp_join_col = cudf.Series(exp_join_data).astype(dtype) - - expected = cudf.DataFrame( - { - "join_col": exp_join_col, - "B_x": exp_other_data_x, - "B_y": exp_other_data_y, - } - ) - - got = gdf_l.merge(gdf_r, on="join_col", how="left") - - assert_join_results_equal(expected, got, how="left") - assert_eq(dtype, got["join_col"].dtype) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(7, 3), - Decimal64Dtype(10, 5), - Decimal64Dtype(18, 9), - Decimal128Dtype(22, 8), - ], -) -def test_decimal_typecast_outer(dtype): - other_data = ["a", "b", "c"] - join_data_l = cudf.Series(["741.248", "1029.528", "3627.292"]).astype( - dtype - ) - join_data_r = cudf.Series(["9284.103", "1029.528", "948.637"]).astype( - dtype - ) - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - exp_join_data = ["9284.103", "948.637", "1029.528", "741.248", "3627.292"] - exp_other_data_x = [None, None, "b", "a", "c"] - exp_other_data_y = ["a", "c", "b", None, None] - exp_join_col = cudf.Series(exp_join_data).astype(dtype) - expected = cudf.DataFrame( - { - "join_col": exp_join_col, - "B_x": exp_other_data_x, - "B_y": exp_other_data_y, - } - ) - got = gdf_l.merge(gdf_r, on="join_col", how="outer") - - assert_join_results_equal(expected, got, how="outer") - assert_eq(dtype, got["join_col"].dtype) - - -@pytest.mark.parametrize( - "dtype_l", - [Decimal64Dtype(7, 3), Decimal64Dtype(9, 5)], -) -@pytest.mark.parametrize( - "dtype_r", - [Decimal64Dtype(8, 3), Decimal64Dtype(11, 6)], -) -def test_mixed_decimal_typecast(dtype_l, dtype_r): - other_data = ["a", "b", "c", "d"] - - join_data_l = 
cudf.Series(["95.05", "34.6", "74.22", "14.94"]).astype( - dtype_r - ) - join_data_r = cudf.Series(["95.05", "62.4056", "74.22", "1.42"]).astype( - dtype_l - ) - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - with pytest.raises( - TypeError, - match="Decimal columns can only be merged with decimal columns " - "of the same precision and scale", - ): - gdf_l.merge(gdf_r, on="join_col", how="inner") - - -@pytest.mark.parametrize( - "dtype_l", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -@pytest.mark.parametrize( - "dtype_r", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_typecast_on_join_dt_to_dt(dtype_l, dtype_r): - other_data = ["a", "b", "c", "d", "e"] - join_data_l = cudf.Series( - ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01", "2019-08-15"] - ).astype(dtype_l) - join_data_r = cudf.Series( - ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01", "2019-08-16"] - ).astype(dtype_r) - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - exp_dtype = max(np.dtype(dtype_l), np.dtype(dtype_r)) - - exp_join_data = ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01"] - exp_other_data = ["a", "b", "c", "d"] - exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - - expect = cudf.DataFrame( - { - "join_col": exp_join_col, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - - got = gdf_l.merge(gdf_r, on="join_col", how="inner") - - assert_join_results_equal(expect, got, how="inner") - - -@pytest.mark.parametrize("dtype_l", ["category", "str", "int32", "float32"]) -@pytest.mark.parametrize("dtype_r", ["category", "str", "int32", "float32"]) -def test_typecast_on_join_categorical(dtype_l, dtype_r): - if not (dtype_l == "category" or dtype_r == "category"): - pytest.skip("at least one side must be category for this set of tests") - if dtype_l == "category" and dtype_r == "category": - pytest.skip("Can't determine which categorical to use") - - other_data = ["a", "b", "c", "d", "e"] - join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype=dtype_l) - join_data_r = cudf.Series([1, 2, 3, 4, 6], dtype=dtype_r) - if dtype_l == "category": - exp_dtype = join_data_l.dtype.categories.dtype - elif dtype_r == "category": - exp_dtype = join_data_r.dtype.categories.dtype - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - exp_join_data = [1, 2, 3, 4] - exp_other_data = ["a", "b", "c", "d"] - exp_join_col = cudf.Series(exp_join_data, dtype=exp_dtype) - - expect = cudf.DataFrame( - { - "join_col": exp_join_col, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - - got = gdf_l.merge(gdf_r, on="join_col", how="inner") - assert_join_results_equal(expect, got, how="inner") - - -def make_categorical_dataframe(categories, ordered=False): - dtype = CategoricalDtype(categories=categories, ordered=ordered) - data = cudf.Series(categories).astype(dtype) - return cudf.DataFrame({"key": data}) - - -def test_categorical_typecast_inner(): - # Inner join casting rules for categoricals - - # Equal categories, equal ordering -> common categorical - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([1, 2, 3], ordered=False) - result = left.merge(right, how="inner", on="key") - - expect_dtype = 
CategoricalDtype(categories=[1, 2, 3], ordered=False) - expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - - assert_join_results_equal( - expect_data, result["key"], how="inner", check_categorical=False - ) - - # Equal categories, unequal ordering -> error - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([1, 2, 3], ordered=True) - - with pytest.raises(TypeError): - result = left.merge(right, how="inner", on="key") - - # Unequal categories - # Neither ordered -> unordered categorical with intersection - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([2, 3, 4], ordered=False) - - result = left.merge(right, how="inner", on="key") - - expect_dtype = cudf.CategoricalDtype(categories=[2, 3], ordered=False) - expect_data = cudf.Series([2, 3], dtype=expect_dtype, name="key") - assert_join_results_equal( - expect_data, result["key"], how="inner", check_categorical=False - ) - - # One is ordered -> error - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([2, 3, 4], ordered=True) - - with pytest.raises(TypeError): - result = left.merge(right, how="inner", on="key") - - # Both are ordered -> error - left = make_categorical_dataframe([1, 2, 3], ordered=True) - right = make_categorical_dataframe([2, 3, 4], ordered=True) - - with pytest.raises(TypeError): - result = left.merge(right, how="inner", on="key") - - -def test_categorical_typecast_left(): - # TODO: generalize to right or write another test - # Left join casting rules for categoricals - - # equal categories, neither ordered -> common dtype - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([1, 2, 3], ordered=False) - - result = left.merge(right, on="key", how="left") - - expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) - expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - - assert_join_results_equal(expect_data, result["key"], how="left") - - # equal categories, unequal ordering -> error - left = make_categorical_dataframe([1, 2, 3], ordered=True) - right = make_categorical_dataframe([1, 2, 3], ordered=False) - - with pytest.raises(TypeError): - result = left.merge(right, on="key", how="left") - with pytest.raises(TypeError): - result = right.merge(left, on="key", how="left") - - # unequal categories neither ordered -> left dtype - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([2, 3, 4], ordered=False) - - result = left.merge(right, on="key", how="left") - expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) - expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - - assert_join_results_equal(expect_data, result["key"], how="left") - - # unequal categories, unequal ordering -> error - left = make_categorical_dataframe([1, 2, 3], ordered=True) - right = make_categorical_dataframe([2, 3, 4], ordered=False) - - with pytest.raises(TypeError): - result = left.merge(right, on="key", how="left") - - # unequal categories, right ordered -> error - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([2, 3, 4], ordered=True) - - with pytest.raises(TypeError): - result = left.merge(right, on="key", how="left") - - # unequal categories, both ordered -> error - left = make_categorical_dataframe([1, 2, 3], ordered=True) - right = make_categorical_dataframe([2, 3, 4], ordered=True) - - 
with pytest.raises(TypeError): - result = left.merge(right, on="key", how="left") - - -def test_categorical_typecast_outer(): - # Outer join casting rules for categoricals - - # equal categories, neither ordered -> common dtype - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([1, 2, 3], ordered=False) - result = left.merge(right, on="key", how="outer") - - expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) - expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - - assert_join_results_equal(expect_data, result["key"], how="outer") - - # equal categories, both ordered -> common dtype - left = make_categorical_dataframe([1, 2, 3], ordered=True) - right = make_categorical_dataframe([1, 2, 3], ordered=True) - result = left.merge(right, on="key", how="outer") - - expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=True) - expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - - assert_join_results_equal(expect_data, result["key"], how="outer") - - # equal categories, one ordered -> error - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([1, 2, 3], ordered=True) - - with pytest.raises(TypeError): - result = left.merge(right, how="outer", on="key") - with pytest.raises(TypeError): - result = right.merge(left, how="outer", on="key") - - # unequal categories, neither ordered -> superset - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([2, 3, 4], ordered=False) - result = left.merge(right, on="key", how="outer") - - expect_dtype = CategoricalDtype(categories=[1, 2, 3, 4], ordered=False) - expect_data = cudf.Series([1, 2, 3, 4], dtype=expect_dtype, name="key") - - assert_join_results_equal(expect_data, result["key"], how="outer") - - # unequal categories, one ordered -> error - left = make_categorical_dataframe([1, 2, 3], ordered=False) - right = make_categorical_dataframe([2, 3, 4], ordered=True) - - with pytest.raises(TypeError): - result = left.merge(right, how="outer", on="key") - with pytest.raises(TypeError): - result = right.merge(left, how="outer", on="key") - - # unequal categories, both ordered -> error - left = make_categorical_dataframe([1, 2, 3], ordered=True) - right = make_categorical_dataframe([2, 3, 4], ordered=True) - with pytest.raises(TypeError): - result = left.merge(right, how="outer", on="key") - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) -def test_categorical_typecast_inner_one_cat(dtype): - data = np.array([1, 2, 3], dtype=dtype) - - left = make_categorical_dataframe(data) - right = left.astype(left["key"].dtype.categories.dtype) - - result = left.merge(right, on="key", how="inner") - assert result["key"].dtype == left["key"].dtype.categories.dtype - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) -def test_categorical_typecast_left_one_cat(dtype): - data = np.array([1, 2, 3], dtype=dtype) - - left = make_categorical_dataframe(data) - right = left.astype(left["key"].dtype.categories.dtype) - - result = left.merge(right, on="key", how="left") - assert result["key"].dtype == left["key"].dtype - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"]) -def test_categorical_typecast_outer_one_cat(dtype): - data = np.array([1, 2, 3], dtype=dtype) - - left = make_categorical_dataframe(data) - right = left.astype(left["key"].dtype.categories.dtype) - - result = left.merge(right, on="key", how="outer") - assert result["key"].dtype == 
left["key"].dtype.categories.dtype - - -@pytest.mark.parametrize( - ("lhs", "rhs"), - [ - (["a", "b"], ["a"]), - (["a"], ["a", "b"]), - (["a", "b"], ["b"]), - (["b"], ["a", "b"]), - (["a"], ["a"]), - ], -) -@pytest.mark.parametrize("how", ["left", "right", "outer", "inner"]) -@pytest.mark.parametrize("level", ["a", "b", 0, 1]) -def test_index_join(lhs, rhs, how, level): - l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_pdf = pd.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4]}) - l_df = cudf.from_pandas(l_pdf) - r_df = cudf.from_pandas(r_pdf) - p_lhs = l_pdf.set_index(lhs).index - p_rhs = r_pdf.set_index(rhs).index - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - - expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) - got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) - - assert_join_results_equal(expected, got, how=how) - - -def test_index_join_corner_cases(): - l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_pdf = pd.DataFrame( - {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} - ) - l_df = cudf.from_pandas(l_pdf) - r_df = cudf.from_pandas(r_pdf) - - # Join when column name doesn't match with level - lhs = ["a", "b"] - # level and rhs don't match - rhs = ["c"] - level = "b" - how = "outer" - p_lhs = l_pdf.set_index(lhs).index - p_rhs = r_pdf.set_index(rhs).index - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) - got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) - - assert_join_results_equal(expected, got, how=how) - - # sort is supported only in case of two non-MultiIndex join - # Join when column name doesn't match with level - lhs = ["a"] - # level and rhs don't match - rhs = ["a"] - level = "b" - how = "left" - p_lhs = l_pdf.set_index(lhs).index - p_rhs = r_pdf.set_index(rhs).index - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - expected = p_lhs.join(p_rhs, how=how, sort=True) - got = g_lhs.join(g_rhs, how=how, sort=True) - - assert_join_results_equal(expected, got, how=how) - - # Pandas Index.join on categorical column returns generic column - # but cudf will be returning a categorical column itself. 
- lhs = ["a", "b"] - rhs = ["a"] - level = "a" - how = "inner" - l_df["a"] = l_df["a"].astype("category") - r_df["a"] = r_df["a"].astype("category") - p_lhs = l_pdf.set_index(lhs).index - p_rhs = r_pdf.set_index(rhs).index - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - expected = p_lhs.join(p_rhs, level=level, how=how).to_frame(index=False) - got = g_lhs.join(g_rhs, level=level, how=how).to_frame(index=False) - - got["a"] = got["a"].astype(expected["a"].dtype) - - assert_join_results_equal(expected, got, how=how) - - -def test_index_join_exception_cases(): - l_df = cudf.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]}) - r_df = cudf.DataFrame( - {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]} - ) - - # Join between two MultiIndex - lhs = ["a", "b"] - rhs = ["a", "c"] - level = "a" - how = "outer" - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - - with pytest.raises(TypeError): - g_lhs.join(g_rhs, level=level, how=how) - - # Improper level value, level should be an int or scalar value - level = ["a"] - rhs = ["a"] - g_lhs = l_df.set_index(lhs).index - g_rhs = r_df.set_index(rhs).index - with pytest.raises(ValueError): - g_lhs.join(g_rhs, level=level, how=how) - - -def test_typecast_on_join_indexes(): - join_data_l = cudf.Series([1, 2, 3, 4, 5], dtype="int8") - join_data_r = cudf.Series([1, 2, 3, 4, 6], dtype="int32") - other_data = ["a", "b", "c", "d", "e"] - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - gdf_l = gdf_l.set_index("join_col") - gdf_r = gdf_r.set_index("join_col") - - exp_join_data = [1, 2, 3, 4] - exp_other_data = ["a", "b", "c", "d"] - - expect = cudf.DataFrame( - { - "join_col": exp_join_data, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - expect = expect.set_index("join_col") - - got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - - assert_join_results_equal(expect, got, how="inner") - - -def test_typecast_on_join_multiindices(): - join_data_l_0 = cudf.Series([1, 2, 3, 4, 5], dtype="int8") - join_data_l_1 = cudf.Series([2, 3, 4.1, 5.9, 6], dtype="float32") - join_data_l_2 = cudf.Series([7, 8, 9, 0, 1], dtype="float32") - - join_data_r_0 = cudf.Series([1, 2, 3, 4, 5], dtype="int32") - join_data_r_1 = cudf.Series([2, 3, 4, 5, 6], dtype="int32") - join_data_r_2 = cudf.Series([7, 8, 9, 0, 0], dtype="float64") - - other_data = ["a", "b", "c", "d", "e"] - - gdf_l = cudf.DataFrame( - { - "join_col_0": join_data_l_0, - "join_col_1": join_data_l_1, - "join_col_2": join_data_l_2, - "B": other_data, - } - ) - gdf_r = cudf.DataFrame( - { - "join_col_0": join_data_r_0, - "join_col_1": join_data_r_1, - "join_col_2": join_data_r_2, - "B": other_data, - } - ) - - gdf_l = gdf_l.set_index(["join_col_0", "join_col_1", "join_col_2"]) - gdf_r = gdf_r.set_index(["join_col_0", "join_col_1", "join_col_2"]) - - exp_join_data_0 = cudf.Series([1, 2], dtype="int32") - exp_join_data_1 = cudf.Series([2, 3], dtype="float64") - exp_join_data_2 = cudf.Series([7, 8], dtype="float64") - exp_other_data = cudf.Series(["a", "b"]) - - expect = cudf.DataFrame( - { - "join_col_0": exp_join_data_0, - "join_col_1": exp_join_data_1, - "join_col_2": exp_join_data_2, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - expect = expect.set_index(["join_col_0", "join_col_1", "join_col_2"]) - got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - - assert_join_results_equal(expect, got, how="inner") - - -def 
test_typecast_on_join_indexes_matching_categorical(): - join_data_l = cudf.Series(["a", "b", "c", "d", "e"], dtype="category") - join_data_r = cudf.Series(["a", "b", "c", "d", "e"], dtype="str") - other_data = [1, 2, 3, 4, 5] - - gdf_l = cudf.DataFrame({"join_col": join_data_l, "B": other_data}) - gdf_r = cudf.DataFrame({"join_col": join_data_r, "B": other_data}) - - gdf_l = gdf_l.set_index("join_col") - gdf_r = gdf_r.set_index("join_col") - - exp_join_data = ["a", "b", "c", "d", "e"] - exp_other_data = [1, 2, 3, 4, 5] - - expect = cudf.DataFrame( - { - "join_col": exp_join_data, - "B_x": exp_other_data, - "B_y": exp_other_data, - } - ) - expect = expect.set_index("join_col") - got = gdf_l.join(gdf_r, how="inner", lsuffix="_x", rsuffix="_y") - - assert_join_results_equal(expect, got, how="inner") - - -@pytest.mark.parametrize( - "lhs", - [ - cudf.Series([1, 2, 3], name="a"), - cudf.DataFrame({"a": [2, 3, 4], "c": [4, 5, 6]}), - ], -) -@pytest.mark.parametrize( - "rhs", - [ - cudf.Series([1, 2, 3], name="b"), - cudf.DataFrame({"b": [2, 3, 4], "c": [4, 5, 6]}), - ], -) -@pytest.mark.parametrize( - "how", ["left", "inner", "outer", "leftanti", "leftsemi"] -) -@pytest.mark.parametrize( - "kwargs", - [ - {"left_on": "a", "right_on": "b"}, - {"left_index": True, "right_on": "b"}, - {"left_on": "a", "right_index": True}, - {"left_index": True, "right_index": True}, - ], -) -def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): - if how in ("leftsemi", "leftanti") and ( - kwargs.get("left_index") or kwargs.get("right_index") - ): - pytest.skip("Index joins not compatible with leftsemi and leftanti") - - check_lhs = lhs.copy() - check_rhs = rhs.copy() - if isinstance(lhs, cudf.Series): - check_lhs = lhs.to_frame() - if isinstance(rhs, cudf.Series): - check_rhs = rhs.to_frame() - - expect = cudf.merge(check_lhs, check_rhs, how=how, **kwargs) - got = cudf.merge(lhs, rhs, how=how, **kwargs) - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.xfail(reason="Cannot sort values of list dtype") -@pytest.mark.parametrize( - "how", ["left", "inner", "right", "leftanti", "leftsemi"] -) -def test_merge_with_lists(how): - pd_left = pd.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [[1, 2, 3], [4, 5], None, [6], [7, 8, None], []], - "c": ["a", "b", "c", "d", "e", "f"], - } - ) - pd_right = pd.DataFrame( - { - "a": [4, 3, 2, 1, 0, -1], - "d": [[[1, 2], None], [], [[3, 4]], None, [[5], [6, 7]], [[8]]], - } - ) - - gd_left = cudf.from_pandas(pd_left) - gd_right = cudf.from_pandas(pd_right) - - expect = pd_left.merge(pd_right, on="a") - got = gd_left.merge(gd_right, on="a") - - assert_join_results_equal(expect, got, how=how) - - -def test_join_renamed_index(): - df = cudf.DataFrame( - {0: [1, 2, 3, 4, 5], 1: [1, 2, 3, 4, 5], "c": [1, 2, 3, 4, 5]} - ).set_index([0, 1]) - df.index.names = ["a", "b"] # doesn't actually change df._index._data - - expect = df.to_pandas().merge( - df.to_pandas(), left_index=True, right_index=True - ) - got = df.merge(df, left_index=True, right_index=True, how="inner") - assert_join_results_equal(expect, got, how="inner") - - -@pytest.mark.parametrize( - "lhs_col, lhs_idx, rhs_col, rhs_idx, on", - [ - (["A", "B"], "L0", ["B", "C"], "L0", ["B"]), - (["A", "B"], "L0", ["B", "C"], "L0", ["L0"]), - (["A", "B"], "L0", ["B", "C"], "L0", ["B", "L0"]), - (["A", "B"], "L0", ["C", "L0"], "A", ["A"]), - (["A", "B"], "L0", ["C", "L0"], "A", ["L0"]), - (["A", "B"], "L0", ["C", "L0"], "A", ["A", "L0"]), - ], -) -@pytest.mark.parametrize( - "how", ["left", "inner", "right", 
"outer", "leftanti", "leftsemi"] -) -def test_join_merge_with_on(lhs_col, lhs_idx, rhs_col, rhs_idx, on, how): - lhs_data = {col_name: [4, 5, 6] for col_name in lhs_col} - lhs_index = cudf.Index([0, 1, 2], name=lhs_idx) - - rhs_data = {col_name: [4, 5, 6] for col_name in rhs_col} - rhs_index = cudf.Index([2, 3, 4], name=rhs_idx) - - gd_left = cudf.DataFrame(lhs_data, lhs_index) - gd_right = cudf.DataFrame(rhs_data, rhs_index) - pd_left = gd_left.to_pandas() - pd_right = gd_right.to_pandas() - - expect = pd_left.merge(pd_right, on=on).sort_index(axis=1, ascending=False) - got = gd_left.merge(gd_right, on=on).sort_index(axis=1, ascending=False) - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.parametrize( - "on", - ["A", "L0"], -) -@pytest.mark.parametrize( - "how", ["left", "inner", "right", "outer", "leftanti", "leftsemi"] -) -def test_join_merge_invalid_keys(on, how): - gd_left = cudf.DataFrame( - {"A": [1, 2, 3], "B": [4, 5, 6]}, index=cudf.Index([0, 1, 2], name="C") - ) - gd_right = cudf.DataFrame( - {"D": [2, 3, 4], "E": [7, 8, 0]}, index=cudf.Index([0, 2, 4], name="F") - ) - pd_left = gd_left.to_pandas() - pd_right = gd_right.to_pandas() - - with pytest.raises(KeyError): - pd_left.merge(pd_right, on=on) - gd_left.merge(gd_right, on=on) - - -@pytest.mark.parametrize( - "str_data", - [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]], -) -@pytest.mark.parametrize("num_keys", [1, 2, 3]) -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_string_join_key(str_data, num_keys, how): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_keys): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - if len(other_data) == 0: - pdf["a"] = pdf["a"].astype("str") - pdf2 = pdf.copy() - gdf2 = gdf.copy() - - expect = pdf.merge(pdf2, on=list(range(num_keys)), how=how) - got = gdf.merge(gdf2, on=list(range(num_keys)), how=how) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] # reorder columns - - if how == "right": - got = got[expect.columns] # reorder columns - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.parametrize( - "str_data_nulls", - [ - ["a", "b", "c"], - ["a", "b", "f", "g"], - ["f", "g", "h", "i", "j"], - ["f", "g", "h"], - [None, None, None, None, None], - [], - ], -) -def test_string_join_key_nulls(str_data_nulls): - str_data = ["a", "b", "c", "d", "e"] - other_data = [1, 2, 3, 4, 5] - - other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - pdf["key"] = pd.Series(str_data, dtype="str") - gdf["key"] = cudf.Series(str_data, dtype="str") - pdf["vals"] = other_data - gdf["vals"] = other_data - - pdf2 = pd.DataFrame() - gdf2 = cudf.DataFrame() - pdf2["key"] = pd.Series(str_data_nulls, dtype="str") - gdf2["key"] = cudf.Series(str_data_nulls, dtype="str") - pdf2["vals"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["vals"] = cudf.Series(other_data_nulls, dtype="int64") - - expect = pdf.merge(pdf2, on="key", how="left") - got = gdf.merge(gdf2, on="key", how="left") - got["vals_y"] = got["vals_y"].fillna(-1) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - expect["vals_y"] = expect["vals_y"].fillna(-1).astype("int64") - - assert_join_results_equal(expect, got, how="left") - - 
-@pytest.mark.parametrize( - "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] -) -@pytest.mark.parametrize("num_cols", [1, 2, 3]) -@pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) -def test_string_join_non_key(str_data, num_cols, how): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_cols): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - if len(other_data) == 0: - pdf["a"] = pdf["a"].astype("str") - - pdf2 = pdf.copy() - gdf2 = gdf.copy() - - expect = pdf.merge(pdf2, on=["a"], how=how) - got = gdf.merge(gdf2, on=["a"], how=how) - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - if how == "right": - got = got[expect.columns] # reorder columns - - assert_join_results_equal(expect, got, how=how) - - -@pytest.mark.parametrize( - "str_data_nulls", - [ - ["a", "b", "c"], - ["a", "b", "f", "g"], - ["f", "g", "h", "i", "j"], - ["f", "g", "h"], - [None, None, None, None, None], - [], - ], -) -def test_string_join_non_key_nulls(str_data_nulls): - str_data = ["a", "b", "c", "d", "e"] - other_data = [1, 2, 3, 4, 5] - - other_data_nulls = [6, 7, 8, 9, 10][: len(str_data_nulls)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - pdf["vals"] = pd.Series(str_data, dtype="str") - gdf["vals"] = cudf.Series(str_data, dtype="str") - pdf["key"] = other_data - gdf["key"] = other_data - - pdf2 = pd.DataFrame() - gdf2 = cudf.DataFrame() - pdf2["vals"] = pd.Series(str_data_nulls, dtype="str") - gdf2["vals"] = cudf.Series(str_data_nulls, dtype="str") - pdf2["key"] = pd.Series(other_data_nulls, dtype="int64") - gdf2["key"] = cudf.Series(other_data_nulls, dtype="int64") - - expect = pdf.merge(pdf2, on="key", how="left") - got = gdf.merge(gdf2, on="key", how="left") - - if len(expect) == 0 and len(got) == 0: - expect = expect.reset_index(drop=True) - got = got[expect.columns] - - assert_join_results_equal(expect, got, how="left") - - -def test_string_join_values_nulls(): - left_dict = [ - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "LEFT NO MATCH 1", "a": -1.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 1", "a": 1.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "MATCH 2", "a": 2.0}, - {"b": "LEFT NO MATCH 2", "a": -2.0}, - {"b": "MATCH 3", "a": 3.0}, - {"b": "MATCH 3", "a": 3.0}, - ] - - right_dict = [ - {"b": "RIGHT NO MATCH 1", "c": -1.0}, - {"b": "MATCH 3", "c": 3.0}, - {"b": "MATCH 2", "c": 2.0}, - {"b": "RIGHT NO MATCH 2", "c": -2.0}, - {"b": "RIGHT NO MATCH 3", "c": -3.0}, - {"b": "MATCH 1", "c": 1.0}, - ] - - left_pdf = pd.DataFrame(left_dict) - right_pdf = pd.DataFrame(right_dict) - - left_gdf = cudf.DataFrame.from_pandas(left_pdf) - right_gdf = cudf.DataFrame.from_pandas(right_pdf) - - expect = left_pdf.merge(right_pdf, how="left", on="b") - got = left_gdf.merge(right_gdf, how="left", on="b") - - expect = expect.sort_values(by=["a", "b", "c"]).reset_index(drop=True) - got = got.sort_values(by=["a", "b", "c"]).reset_index(drop=True) - - assert_join_results_equal(expect, got, how="left") - - -@pytest.mark.parametrize( - "left_on,right_on", - [ - *product(["a", "b", "c"], ["a", "b"]), - *zip(combinations(["a", "b", "c"], 2), repeat(["a", "b"])), - ], -) -def test_merge_mixed_index_columns(left_on, right_on): - left = pd.DataFrame({"a": [1, 2, 1, 2], "b": [2, 3, 3, 
4]}).set_index("a") - right = pd.DataFrame({"a": [1, 2, 1, 3], "b": [2, 30, 3, 4]}).set_index( - "a" - ) - - left["c"] = 10 - - expect = left.merge(right, left_on=left_on, right_on=right_on, how="outer") - cleft = cudf.from_pandas(left) - cright = cudf.from_pandas(right) - got = cleft.merge(cright, left_on=left_on, right_on=right_on, how="outer") - assert_join_results_equal(expect, got, how="outer") - - -def test_merge_multiindex_columns(): - lhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) - lhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) - rhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}) - rhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "z")]) - expect = lhs.merge(rhs, on=[("a", "x")], how="inner") - - lhs = cudf.from_pandas(lhs) - rhs = cudf.from_pandas(rhs) - got = lhs.merge(rhs, on=[("a", "x")], how="inner") - - assert_join_results_equal(expect, got, how="inner") - - -def test_join_multiindex_empty(): - lhs = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]}, index=["a", "b", "c"]) - lhs.columns = pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]) - rhs = pd.DataFrame(index=["a", "c", "d"]) - g_lhs = cudf.from_pandas(lhs) - g_rhs = cudf.from_pandas(rhs) - assert_exceptions_equal( - lfunc=lhs.join, - rfunc=g_lhs.join, - lfunc_args_and_kwargs=([rhs], {"how": "inner"}), - rfunc_args_and_kwargs=([g_rhs], {"how": "inner"}), - check_exception_type=False, - ) - - -def test_join_on_index_with_duplicate_names(): - # although index levels with duplicate names are poorly supported - # overall, we *should* be able to join on them: - lhs = pd.DataFrame({"a": [1, 2, 3]}) - rhs = pd.DataFrame({"b": [1, 2, 3]}) - lhs.index = pd.MultiIndex.from_tuples( - [(1, 1), (1, 2), (2, 1)], names=["x", "x"] - ) - rhs.index = pd.MultiIndex.from_tuples( - [(1, 1), (1, 3), (2, 1)], names=["x", "x"] - ) - expect = lhs.join(rhs, how="inner") - - lhs = cudf.from_pandas(lhs) - rhs = cudf.from_pandas(rhs) - got = lhs.join(rhs, how="inner") - - assert_join_results_equal(expect, got, how="inner") - - -def test_join_redundant_params(): - lhs = cudf.DataFrame( - {"a": [1, 2, 3], "c": [2, 3, 4]}, index=cudf.Index([0, 1, 2], name="c") - ) - rhs = cudf.DataFrame( - {"b": [1, 2, 3]}, index=cudf.Index([0, 1, 2], name="a") - ) - with pytest.raises(ValueError): - lhs.merge(rhs, on="a", left_index=True) - with pytest.raises(ValueError): - lhs.merge(rhs, left_on="a", left_index=True, right_index=True) - with pytest.raises(ValueError): - lhs.merge(rhs, right_on="a", left_index=True, right_index=True) - with pytest.raises(ValueError): - lhs.merge(rhs, left_on="c", right_on="b") - - -def test_join_multiindex_index(): - # test joining a MultiIndex with an Index with overlapping name - lhs = ( - cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}) - .set_index(["a", "b"]) - .index - ) - rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index("a").index - expect = lhs.to_pandas().join(rhs.to_pandas(), how="inner") - got = lhs.join(rhs, how="inner") - assert_join_results_equal(expect, got, how="inner") - - -def test_dataframe_join_on(): - """Verify that specifying the on parameter gives a NotImplementedError.""" - df = cudf.DataFrame({"a": [1, 2, 3]}) - with pytest.raises(NotImplementedError): - df.join(df, on="a") - - -def test_index_join_return_indexers_notimplemented(): - index = cudf.RangeIndex(start=0, stop=20, step=2) - other = cudf.Index([4, 4, 3, 3]) - with pytest.raises(NotImplementedError): - index.join(other, how="left", return_indexers=True) - - -@pytest.mark.parametrize("how", ["inner", "outer"]) -def 
test_index_join_names(request, how): - idx1 = cudf.Index([10, 1, 2, 4, 2, 1], name="a") - idx2 = cudf.Index([-10, 2, 3, 1, 2], name="b") - request.applymarker( - pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/57065", - ) - ) - pidx1 = idx1.to_pandas() - pidx2 = idx2.to_pandas() - - expected = pidx1.join(pidx2, how=how) - actual = idx1.join(idx2, how=how) - assert_join_results_equal(actual, expected, how=how) - - -@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) -def test_join_datetime_timedelta_error(dtype): - df1 = cudf.DataFrame({"a": cudf.Series([10, 20, 30], dtype=dtype)}) - df2 = df1.astype("int") - - with pytest.raises(TypeError): - df1.merge(df2) - - -@pytest.mark.parametrize("dtype1", TIMEDELTA_TYPES) -@pytest.mark.parametrize("dtype2", TIMEDELTA_TYPES) -def test_merge_timedelta_types(dtype1, dtype2): - df1 = cudf.DataFrame({"a": cudf.Series([10, 20, 30], dtype=dtype1)}) - df2 = cudf.DataFrame({"a": cudf.Series([20, 500, 33240], dtype=dtype2)}) - - pdf1 = df1.to_pandas() - pdf2 = df2.to_pandas() - actual = df1.merge(df2) - expected = pdf1.merge(pdf2) - - # Pandas is materializing the index, which is unnecessary - # hence the special handling. - assert_eq( - actual, - expected, - check_index_type=False - if isinstance(actual.index, cudf.RangeIndex) - and isinstance(expected.index, pd.Index) - else True, - check_dtype=len(actual) > 0, - ) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py deleted file mode 100644 index c81c2d1d94b..00000000000 --- a/python/cudf/cudf/tests/test_json.py +++ /dev/null @@ -1,1446 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import copy -import gzip -import itertools -import os -from io import BytesIO, StringIO -from pathlib import Path - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.testing import assert_eq -from cudf.testing._utils import ( - DATETIME_TYPES, - NUMERIC_TYPES, - TIMEDELTA_TYPES, - expect_warning_if, -) - - -def make_numeric_dataframe(nrows, dtype): - df = pd.DataFrame() - df["col1"] = np.arange(nrows, dtype=dtype) - df["col2"] = np.arange(1, 1 + nrows, dtype=dtype) - return df - - -@pytest.fixture(params=[0, 1, 10, 100]) -def pdf(request): - types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"] - nrows = request.param - - # Create a pandas dataframe with random data of mixed types - test_pdf = pd.DataFrame( - { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) - for typ in types - } - ) - # Delete the name of the column index, and rename the row index - test_pdf.columns.name = None - test_pdf.index.name = "test_index" - - return test_pdf - - -@pytest.fixture -def gdf(pdf): - return cudf.DataFrame.from_pandas(pdf) - - -@pytest.fixture(params=[0, 1, 10, 100]) -def gdf_writer_types(request): - # datetime64[us], datetime64[ns] are unsupported due to a bug in parser - types = ( - NUMERIC_TYPES - + ["datetime64[s]", "datetime64[ms]"] - + TIMEDELTA_TYPES - + ["bool", "str"] - ) - typer = {"col_" + val: val for val in types} - ncols = len(types) - nrows = request.param - - # Create a pandas dataframe with random data of mixed types - test_pdf = cudf.DataFrame( - [list(range(ncols * i, ncols * (i + 1))) for i in range(nrows)], - columns=pd.Index([f"col_{typ}" for typ in types]), - ) - - # Cast all the column dtypes to objects, rename them, and then cast to - # appropriate types - test_pdf = 
test_pdf.astype(typer) - - return test_pdf - - -index_params = [True, False] -compression_params = ["gzip", "bz2", "zip", "xz", None] -orient_params = ["columns", "records", "table", "split"] -params = itertools.product(index_params, compression_params, orient_params) - - -@pytest.fixture(params=params) -def json_files(request, tmp_path_factory, pdf): - index, compression, orient = request.param - if index is False and orient not in ("split", "table"): - pytest.skip( - "'index=False' is only valid when 'orient' is 'split' or " - "'table'" - ) - if index is False and orient == "table": - pytest.skip("'index=False' isn't valid when 'orient' is 'table'") - if index is True and orient not in ("split", "table", "index", "columns"): - pytest.skip("'index=False' isn't valid when 'orient' is 'table'") - fname_df = tmp_path_factory.mktemp("json") / "test_df.json" - fname_series = tmp_path_factory.mktemp("json") / "test_series.json" - pdf.to_json(fname_df, index=index, compression=compression, orient=orient) - pdf["col_int32"].to_json( - fname_series, index=index, compression=compression, orient=orient - ) - return (fname_df, fname_series, orient, compression) - - -@pytest.mark.filterwarnings("ignore:Strings are not yet supported") -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_json_reader(json_files): - path_df, path_series, orient, compression = json_files - expect_df = pd.read_json(path_df, orient=orient, compression=compression) - got_df = cudf.read_json(path_df, orient=orient, compression=compression) - if len(expect_df) == 0: - expect_df = expect_df.reset_index(drop=True) - expect_df.columns = expect_df.columns.astype("object") - if len(got_df) == 0: - got_df = got_df.reset_index(drop=True) - - assert_eq(expect_df, got_df, check_categorical=False) - - # Only these orients are allowed for Series, but isn't enforced by Pandas - if orient in ("split", "records", "index"): - expect_series = pd.read_json( - path_series, orient=orient, compression=compression, typ="series" - ) - got_series = cudf.read_json( - path_series, orient=orient, compression=compression, typ="series" - ) - if len(expect_series) == 0: - expect_series = expect_series.reset_index(drop=True) - if len(got_df) == 0: - got_series = got_series.reset_index(drop=True) - - assert_eq(expect_series, got_series) - - -@pytest.mark.filterwarnings("ignore:Can't infer compression") -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_json_writer(tmpdir, pdf, gdf): - pdf_df_fname = tmpdir.join("pdf_df.json") - gdf_df_fname = tmpdir.join("gdf_df.json") - - pdf.to_json(pdf_df_fname) - gdf.to_json(gdf_df_fname) - - assert os.path.exists(pdf_df_fname) - assert os.path.exists(gdf_df_fname) - - expect_df = pd.read_json(pdf_df_fname) - got_df = pd.read_json(gdf_df_fname) - - assert_eq(expect_df, got_df) - - for column in pdf.columns: - pdf_series_fname = tmpdir.join(column + "_" + "pdf_series.json") - gdf_series_fname = tmpdir.join(column + "_" + "gdf_series.json") - - pdf[column].to_json(pdf_series_fname) - gdf[column].to_json(gdf_series_fname) - - assert os.path.exists(pdf_series_fname) - assert os.path.exists(gdf_series_fname) - - expect_series = pd.read_json(pdf_series_fname, typ="series") - got_series = pd.read_json(gdf_series_fname, typ="series") - - assert_eq(expect_series, got_series) - - # Make sure results align for regular strings, not just files - pdf_string = pdf[column].to_json() - gdf_string = pdf[column].to_json() - assert_eq(pdf_string, gdf_string) - - -@pytest.mark.parametrize( - "lines", [True, False], 
ids=["lines=True", "lines=False"] -) -def test_cudf_json_writer(pdf, lines): - # removing datetime column because pandas doesn't support it - for col_name in pdf.columns: - if "datetime" in col_name: - pdf.drop(col_name, axis=1, inplace=True) - gdf = cudf.DataFrame.from_pandas(pdf) - pdf_string = pdf.to_json(orient="records", lines=lines) - gdf_string = gdf.to_json(orient="records", lines=lines, engine="cudf") - - assert_eq(pdf_string, gdf_string) - - gdf_string = gdf.to_json( - orient="records", lines=lines, engine="cudf", rows_per_chunk=8 - ) - - assert_eq(pdf_string, gdf_string) - - -def test_cudf_json_writer_read(gdf_writer_types): - dtypes = { - col_name: col_name[len("col_") :] - for col_name in gdf_writer_types.columns - } - gdf_string = gdf_writer_types.to_json( - orient="records", lines=True, engine="cudf" - ) - gdf2 = cudf.read_json( - StringIO(gdf_string), - lines=True, - engine="cudf", - dtype=dict(dtypes), - ) - pdf2 = pd.read_json(StringIO(gdf_string), lines=True, dtype=dict(dtypes)) - - # Bug in pandas https://github.com/pandas-dev/pandas/issues/28558 - if pdf2.empty: - pdf2.reset_index(drop=True, inplace=True) - pdf2.columns = pdf2.columns.astype("object") - - # Pandas moved to consistent datetimes parsing format: - # https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#datetimes-are-now-parsed-with-a-consistent-format - for unit in ["s", "ms"]: - if f"col_datetime64[{unit}]" in pdf2.columns: - pdf2[f"col_datetime64[{unit}]"] = ( - pd.to_datetime(pdf2[f"col_datetime64[{unit}]"], format="mixed") - .dt.tz_localize(None) - .astype(f"datetime64[{unit}]") - ) - assert_eq(pdf2, gdf2) - - -@pytest.mark.parametrize( - "jsonl_string, expected", - [ - # fixed width - ("""{"a":10, "b":1.1}\n {"a":20, "b":2.1}\n""", None), - # simple list - ("""{"a":[1, 2, 3], "b":1.1}\n {"a":[]}\n""", None), - # simple struct - ("""{"a":{"c": 123 }, "b":1.1}\n {"a": {"c": 456}}\n""", None), - # list of lists - ("""{"a":[[], [1, 2], [3, 4]], "b":1.1}\n""", None), - ("""{"a":[null, [1, 2], [null, 4]], "b":1.1}\n""", None), - # list of structs - # error ("""{"a":[null, {}], "b":1.1}\n""", None), - ( - """{"a":[null, {"L": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}\n""", - None, - ), - ( - """{"a":[{"L": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}\n""", - None, - ), - # struct of lists - ( - """{"a":{"L": [1, 2, 3]}, "b":1.1}\n {"a": {"L": [4, 5, 6]}}\n""", - None, - ), - ("""{"a":{"L": [1, 2, null]}, "b":1.1}\n {"a": {"L": []}}\n""", None), - # struct of structs - ( - """{"a":{"L": {"M": 123}}, "b":1.1} - {"a": {"L": {"M": 456}}}\n""", - None, - ), - ( - """{"a":{"L": {"M": null}}, "b":1.1}\n {"a": {"L": {}}}\n""", - """{"a":{"L": {}}, "b":1.1}\n {"a": {"L": {}}}\n""", - ), - # list of structs of lists - ("""{"a":[{"L": [1, 2, 3]}, {"L": [4, 5, 6]}], "b":1.1}\n""", None), - ("""{"a":[{"L": [1, 2, null]}, {"L": []}], "b":1.1}\n""", None), - # struct of lists of structs - ("""{"a":{"L": [{"M": 123}, {"M": 456}]}, "b":1.1}\n""", None), - ( - """{"a":{"L": [{"M": null}, {}]}, "b":1.1}\n""", - """{"a":{"L": [{}, {}]}, "b":1.1}\n""", - ), - ], -) -def test_cudf_json_roundtrip(jsonl_string, expected): - gdf = cudf.read_json( - StringIO(jsonl_string), - lines=True, - engine="cudf", - # dtype=dict(dtypes), - ) - expected = jsonl_string if expected is None else expected - gdf_string = gdf.to_json( - orient="records", lines=True, engine="cudf", include_nulls=False - ) - assert_eq(gdf_string, expected.replace(" ", "")) - - -@pytest.mark.parametrize("sink", ["string", "file"]) -def test_cudf_json_writer_sinks(sink, 
tmp_path_factory): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - target = None - if sink == "string": - target = StringIO() - elif sink == "file": - target = tmp_path_factory.mktemp("json") / "test_df.json" - df.to_json(target, engine="cudf") - if sink == "string": - assert ( - target.getvalue() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]' - ) - elif sink == "file": - assert os.path.exists(target) - with open(target, "r") as f: - assert f.read() == '[{"a":1,"b":4},{"a":2,"b":5},{"a":3,"b":6}]' - - -@pytest.fixture( - params=["string", "filepath", "pathobj", "bytes_io", "string_io", "url"] -) -def json_input(request, tmp_path_factory): - input_type = request.param - buffer = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n" - fname = tmp_path_factory.mktemp("json") / "test_df.json" - if not os.path.isfile(fname): - with open(str(fname), "w") as fp: - fp.write(buffer) - - if input_type == "string": - return buffer - if input_type == "filepath": - return str(fname) - if input_type == "pathobj": - return Path(fname) - if input_type == "bytes_io": - return BytesIO(buffer.encode()) - if input_type == "string_io": - return StringIO(buffer) - if input_type == "url": - return Path(fname).as_uri() - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["auto", "cudf", "pandas"]) -def test_json_lines_basic(json_input, engine): - can_warn = isinstance(json_input, str) and not json_input.endswith(".json") - with expect_warning_if(can_warn): - cu_df = cudf.read_json(json_input, engine=engine, lines=True) - with expect_warning_if(can_warn): - pd_df = pd.read_json(json_input, lines=True) - - assert all(cu_df.dtypes == ["int64", "int64", "int64"]) - for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): - assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["auto", "cudf", "pandas"]) -def test_nonexistent_json_correct_error(engine): - json_input = "doesnotexist.json" - with pytest.raises(FileNotFoundError): - cudf.read_json(json_input, engine=engine) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["auto", "cudf"]) -def test_json_lines_multiple(tmpdir, json_input, engine): - tmp_file1 = tmpdir.join("MultiInputs1.json") - tmp_file2 = tmpdir.join("MultiInputs2.json") - - with expect_warning_if( - isinstance(json_input, str) and not json_input.endswith(".json") - ): - pdf = pd.read_json(json_input, lines=True) - pdf.to_json(tmp_file1, compression="infer", lines=True, orient="records") - pdf.to_json(tmp_file2, compression="infer", lines=True, orient="records") - - cu_df = cudf.read_json([tmp_file1, tmp_file2], engine=engine, lines=True) - pd_df = pd.concat([pdf, pdf]) - - assert all(cu_df.dtypes == ["int64", "int64", "int64"]) - for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): - assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize("engine", ["auto", "cudf"]) -def test_json_read_directory(tmpdir, 
json_input, engine): - with expect_warning_if( - isinstance(json_input, str) and not json_input.endswith(".json") - ): - pdf = pd.read_json(json_input, lines=True) - pdf.to_json( - tmpdir.join("MultiInputs1.json"), - compression="infer", - lines=True, - orient="records", - ) - pdf.to_json( - tmpdir.join("MultiInputs2.json"), - compression="infer", - lines=True, - orient="records", - ) - pdf.to_json( - tmpdir.join("MultiInputs3.json"), - compression="infer", - lines=True, - orient="records", - ) - - cu_df = cudf.read_json(tmpdir, engine=engine, lines=True) - pd_df = pd.concat([pdf, pdf, pdf]) - - assert all(cu_df.dtypes == ["int64", "int64", "int64"]) - for cu_col, pd_col in zip(cu_df.columns, pd_df.columns): - assert str(cu_col) == str(pd_col) - np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy()) - - -def test_json_lines_byte_range(json_input): - # include the first row and half of the second row - # should parse the first two rows - will_warn = isinstance(json_input, str) and not json_input.endswith( - ".json" - ) - with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(0, 15) - ) - assert df.shape == (2, 3) - - # include half of the second row and half of the third row - # should parse only the third row - with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(15, 10) - ) - assert df.shape == (1, 3) - - # include half of the second row and entire third row - # should parse only the third row - with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(15, 0) - ) - assert df.shape == (1, 3) - - # include half of the second row till past the end of the file - # should parse only the third row - with expect_warning_if(will_warn): - df = cudf.read_json( - copy.deepcopy(json_input), lines=True, byte_range=(10, 50) - ) - assert df.shape == (1, 3) - - -def test_json_lines_dtypes(json_input): - with expect_warning_if( - isinstance(json_input, str) and not json_input.endswith(".json") - ): - df = cudf.read_json( - json_input, lines=True, dtype={1: "int", 2: "short", 0: "float"} - ) - assert all(df.dtypes == ["float64", "int64", "int16"]) - - -@pytest.mark.parametrize( - "ext, out_comp, in_comp", - [ - (".geez", "gzip", "gzip"), - (".beez", "bz2", "bz2"), - (".gz", "gzip", "infer"), - (".bz2", "bz2", "infer"), - (".data", None, "infer"), - (".txt", None, None), - ("", None, None), - ], -) -def test_json_lines_compression(tmpdir, ext, out_comp, in_comp): - fname = tmpdir.mkdir("gdf_json").join("tmp_json_compression" + ext) - - nrows = 20 - pd_df = make_numeric_dataframe(nrows, np.int32) - pd_df.to_json(fname, compression=out_comp, lines=True, orient="records") - - cu_df = cudf.read_json( - str(fname), - compression=in_comp, - lines=True, - dtype={"col1": "int32", "col2": "int32"}, - ) - assert_eq(pd_df, cu_df) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_json_engine_selection(): - json = "[1, 2, 3]" - - # should use the cudf engine - df = cudf.read_json(StringIO(json), lines=True) - # column names are strings when parsing with cudf - for col_name in df.columns: - assert isinstance(col_name, str) - - # should use the pandas engine - df = cudf.read_json(StringIO(json), lines=False, engine="pandas") - # column names are ints when parsing with pandas - for col_name in df.columns: - assert isinstance(col_name, int) - - # should use the pandas engine - df = cudf.read_json(StringIO(json), lines=True, 
engine="pandas") - # column names are ints when parsing with pandas - for col_name in df.columns: - assert isinstance(col_name, int) - - -def test_json_bool_values(): - buffer = "[true,1]\n[false,false]\n[true,true]" - cu_df = cudf.read_json(StringIO(buffer), lines=True) - pd_df = pd.read_json(StringIO(buffer), lines=True) - - # types should be ['bool', 'int64'] - np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) - np.testing.assert_array_equal(pd_df[0], cu_df["0"].to_numpy()) - # boolean values should be converted to 0/1 - np.testing.assert_array_equal(pd_df[1], cu_df["1"].to_numpy()) - - cu_df = cudf.read_json( - StringIO(buffer), lines=True, dtype={"0": "bool", "1": "long"} - ) - np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes) - - -def test_json_bad_protocol_string(): - test_string = StringIO('{"field": "s3://path"}') - - expect = pd.DataFrame([{"field": "s3://path"}]) - got = cudf.read_json(test_string, lines=True) - - assert_eq(expect, got) - - -def test_json_corner_case_with_escape_and_double_quote_char_with_pandas( - tmpdir, -): - fname = tmpdir.mkdir("gdf_json").join("tmp_json_escape_double_quote") - - pdf = pd.DataFrame( - { - "a": ['ab"cd', "\\\b", "\r\\", "'"], - "b": ["a\tb\t", "\\", '\\"', "\t"], - "c": ["aeiou", "try", "json", "cudf"], - } - ) - pdf.to_json(fname, compression="infer", lines=True, orient="records") - - df = cudf.read_json( - fname, compression="infer", lines=True, orient="records" - ) - pdf = pd.read_json( - fname, compression="infer", lines=True, orient="records" - ) - - assert_eq(cudf.DataFrame(pdf), df) - - -def test_json_corner_case_with_escape_and_double_quote_char_with_strings(): - str_buffer = StringIO( - """{"a":"ab\\"cd","b":"a\\tb\\t","c":"aeiou"} - {"a":"\\\\\\b","b":"\\\\","c":"try"} - {"a":"\\r\\\\","b":"\\\\\\"","c":"json"} - {"a":"\'","b":"\\t","c":"cudf"}""" - ) - - df = cudf.read_json( - str_buffer, compression="infer", lines=True, orient="records" - ) - - expected = { - "a": ['ab"cd', "\\\b", "\r\\", "'"], - "b": ["a\tb\t", "\\", '\\"', "\t"], - "c": ["aeiou", "try", "json", "cudf"], - } - - num_rows = df.shape[0] - for col_name in df._data: - for i in range(num_rows): - assert expected[col_name][i] == df[col_name][i] - - -def test_json_to_json_special_characters(): - df = cudf.DataFrame( - { - "'a'": ['ab"cd', "\\\b", "\r\\", "'"], - "b": ["a\tb\t", "\\", '\\"', "\t"], - "c": ["aeiou", "try", "json", "cudf"], - } - ) - - actual = StringIO() - df.to_json(actual, engine="cudf", lines=True, orient="records") - expected = StringIO() - df.to_pandas().to_json(expected, lines=True, orient="records") - assert expected.getvalue() == actual.getvalue() - - -@pytest.mark.parametrize( - "gdf,pdf", - [ - ( - cudf.DataFrame( - { - "int col": cudf.Series( - [1, 2, None, 2, 2323, 234, None], dtype="int64" - ) - } - ), - pd.DataFrame( - { - "int col": pd.Series( - [1, 2, None, 2, 2323, 234, None], dtype=pd.Int64Dtype() - ) - } - ), - ), - ( - cudf.DataFrame( - { - "int64 col": cudf.Series( - [1, 2, None, 2323, None], dtype="int64" - ), - "string col": cudf.Series( - ["abc", "a", None, "", None], dtype="str" - ), - "float col": cudf.Series( - [0.234, None, 234234.2343, None, 0.0], dtype="float64" - ), - "bool col": cudf.Series( - [None, True, False, None, True], dtype="bool" - ), - "categorical col": cudf.Series( - [1, 2, 1, None, 2], dtype="category" - ), - "datetime col": cudf.Series( - [1231233, None, 2323234, None, 1], - dtype="datetime64[ns]", - ), - "timedelta col": cudf.Series( - [None, 34687236, 2323234, 1, None], - 
dtype="timedelta64[ns]", - ), - } - ), - pd.DataFrame( - { - "int64 col": pd.Series( - [1, 2, None, 2323, None], dtype=pd.Int64Dtype() - ), - "string col": pd.Series( - ["abc", "a", None, "", None], dtype=pd.StringDtype() - ), - "float col": pd.Series( - [0.234, None, 234234.2343, None, 0.0], dtype="float64" - ), - "bool col": pd.Series( - [None, True, False, None, True], - dtype=pd.BooleanDtype(), - ), - "categorical col": pd.Series( - [1, 2, 1, None, 2], dtype="category" - ), - "datetime col": pd.Series( - [1231233, None, 2323234, None, 1], - dtype="datetime64[ns]", - ), - "timedelta col": pd.Series( - [None, 34687236, 2323234, 1, None], - dtype="timedelta64[ns]", - ), - } - ), - ), - ], -) -def test_json_to_json_compare_contents(gdf, pdf): - expected_json = pdf.to_json(lines=True, orient="records") - with pytest.warns(UserWarning): - actual_json = gdf.to_json(lines=True, orient="records") - - assert expected_json == actual_json - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["cudf", "pandas"]) -def test_default_integer_bitwidth(default_integer_bitwidth, engine): - buf = BytesIO() - pd.DataFrame({"a": range(10)}).to_json(buf, lines=True, orient="records") - buf.seek(0) - df = cudf.read_json(buf, engine=engine, lines=True, orient="records") - - assert df["a"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize( - "engine", - [ - "cudf", - "pandas", - ], -) -def test_default_integer_bitwidth_partial(default_integer_bitwidth, engine): - buf = BytesIO() - pd.DataFrame({"a": range(10), "b": range(10, 20)}).to_json( - buf, lines=True, orient="records" - ) - buf.seek(0) - df = cudf.read_json( - buf, engine=engine, lines=True, orient="records", dtype={"b": "i8"} - ) - - assert df["a"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") - assert df["b"].dtype == np.dtype("i8") - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["cudf", "pandas"]) -def test_default_integer_bitwidth_extremes(default_integer_bitwidth, engine): - # Test that integer columns in json are _inferred_ as 32 bit columns. - buf = StringIO( - '{"u8":18446744073709551615, "i8":9223372036854775807}\n' - '{"u8": 0, "i8": -9223372036854775808}' - ) - df = cudf.read_json(buf, engine=engine, lines=True, orient="records") - - assert df["u8"].dtype == np.dtype(f"u{default_integer_bitwidth//8}") - assert df["i8"].dtype == np.dtype(f"i{default_integer_bitwidth//8}") - - -def test_default_float_bitwidth(default_float_bitwidth): - # Test that float columns in json are _inferred_ as 32 bit columns. 
- df = cudf.read_json( - StringIO('{"a": 1.0, "b": 2.5}\n{"a": 3.5, "b": 4.0}'), - engine="cudf", - lines=True, - orient="records", - ) - assert df["a"].dtype == np.dtype(f"f{default_float_bitwidth//8}") - assert df["b"].dtype == np.dtype(f"f{default_float_bitwidth//8}") - - -def test_json_nested_basic(): - bytes_obj = BytesIO() - data = { - "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], - "c2": [["l11", "l21"], ["l12", "l22"]], - } - pdf = pd.DataFrame(data) - pdf.to_json(bytes_obj, orient="records") - - df = cudf.read_json(bytes_obj, engine="cudf", orient="records") - bytes_obj.seek(0) - pdf = pd.read_json(bytes_obj, orient="records") - - assert_eq(pdf, df) - - -@pytest.mark.parametrize( - "data", - [ - { - "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], - "c2": [["l11", "l21"], ["l12", "l22"]], - }, - # Essential test case to handle omissions - { - "c1": [{"f2": "sf21"}, {"f1": "sf12"}], - "c2": [["l11", "l21"], []], - }, - # empty input - {}, - ], -) -@pytest.mark.parametrize("lines", [True, False]) -def test_json_nested_lines(data, lines): - bytes = BytesIO() - pdf = pd.DataFrame(data) - pdf.to_json(bytes, orient="records", lines=lines) - bytes.seek(0) - df = cudf.read_json(bytes, engine="cudf", orient="records", lines=lines) - bytes.seek(0) - pdf = pd.read_json(bytes, orient="records", lines=lines) - # In the second test-case we need to take a detour via pyarrow - # Pandas omits "f1" in first row, so we have to enforce a common schema, - # such that pandas would have the f1 member with null - # Also, pyarrow chooses to select different ordering of a nested column - # children though key-value pairs are correct. - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) - assert df.to_arrow().equals(pa_table_pdf) - - -def test_json_nested_data(): - json_str = ( - '[{"0":{},"2":{}},{"1":[[""],[]],"2":{"2":""}},' - '{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]' - ) - df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records") - pdf = pd.read_json(StringIO(json_str), orient="records") - pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) - assert df.to_arrow().equals(pa_table_pdf) - - -def test_json_empty_types(): - json_str = """ {} - {"a": [], "b": {}} - {"a": []} - {"b": {}} - {"c": {"d": []}} - {"e": [{}]} - """ - df = cudf.read_json(StringIO(json_str), orient="records", lines=True) - pdf = pd.read_json(StringIO(json_str), orient="records", lines=True) - assert_eq(df, pdf) - - -def test_json_types_data(): - # 0:<0:string,1:float> - # 1:list - # 2:<0:bool> - json_str = ( - '[{"0":null,"2":{}},' - '{"1":[123],"0":{"0":"foo","1":123.4},"2":{"0":false}},' - '{"0":{},"1":[],"2":{"0":null}}]' - ) - df = cudf.read_json(StringIO(json_str), engine="cudf", orient="records") - pdf = pd.read_json(StringIO(json_str), orient="records") - pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) - assert df.to_arrow().equals(pa_table_pdf) - - -@pytest.mark.parametrize( - "col_type,json_str,expected_data", - [ - # without quotes - ("int", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes - ("int", '[{"k": "1"}, {"k": "2"}]', [1, 2]), - # with quotes, mixed - ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes, null, mixed - ( - "int", - '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), 
- # without quotes, null - ( - "int", - '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - # without quotes - ("float", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes - ("float", '[{"k": "1"}, {"k": "2"}]', [1, 2]), - # with quotes, mixed - ( - "float", - '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', - [1, 2, 3, 4], - ), - # with quotes, null, mixed - ( - "float", - '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - # with quotes, NAN - ( - "float", - '[{"k": "1"}, {"k": "2"}, {"k": NaN}, {"k": "4"}]', - [1, 2, np.nan, 4], - ), - # without quotes - ("str", '[{"k": 1}, {"k": 2}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes - ("str", '[{"k": "1"}, {"k": "2"}]', [1, 2]), - # with quotes, mixed - ("str", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes, null, mixed - ( - "str", - '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - # without quotes, null - ( - "str", - '[{"k": 1}, {"k": 2}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - ], -) -def test_json_quoted_values_with_schema(col_type, json_str, expected_data): - actual = cudf.read_json( - StringIO(json_str), - engine="cudf", - orient="records", - dtype={"k": col_type}, - ) - expected = cudf.DataFrame({"k": expected_data}, dtype=col_type) - - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "col_type,json_str,expected_data", - [ - # with quotes, mixed - ("int", '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', [1, 2, 3, 4]), - # with quotes, null, mixed - ( - "int", - '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', - [1, 2, None, 4], - ), - # with quotes, mixed - ( - "str", - '[{"k": "1"}, {"k": "2"}, {"k": 3}, {"k": 4}]', - ["1", "2", "3", "4"], - ), - # with quotes, null, mixed - ( - "str", - '[{"k": "1"}, {"k": "2"}, {"k": null}, {"k": 4}]', - ["1", "2", None, "4"], - ), - ], -) -def test_json_quoted_values(col_type, json_str, expected_data): - actual = cudf.read_json( - StringIO(json_str), - engine="cudf", - orient="records", - dtype={"k": col_type}, - ) - expected = cudf.DataFrame({"k": expected_data}, dtype=col_type) - - assert_eq(expected, actual) - assert_eq(expected_data, actual.k.to_arrow().to_pylist()) - - -@pytest.mark.parametrize( - "keep_quotes,result", - [ - ( - True, - { - "c1": [ - {"f1": '"sf11"', "f2": '"sf21"'}, - {"f1": '"sf12"', "f2": '"sf22"'}, - ], - "c2": [['"l11"', '"l21"'], ['"l12"', '"l22"']], - }, - ), - ( - False, - { - "c1": [ - {"f1": "sf11", "f2": "sf21"}, - {"f1": "sf12", "f2": "sf22"}, - ], - "c2": [["l11", "l21"], ["l12", "l22"]], - }, - ), - ], -) -def test_json_keep_quotes(keep_quotes, result): - bytes_file = BytesIO() - data = { - "c1": [{"f1": "sf11", "f2": "sf21"}, {"f1": "sf12", "f2": "sf22"}], - "c2": [["l11", "l21"], ["l12", "l22"]], - } - pdf = pd.DataFrame(data) - pdf.to_json(bytes_file, orient="records", lines=True) - - actual = cudf.read_json( - bytes_file, - orient="records", - lines=True, - keep_quotes=keep_quotes, - ) - expected = pd.DataFrame(result) - - assert_eq(actual, expected) - - -def test_json_dtypes_nested_data(): - # a: StructDtype({'a': StructDtype({'b': dtype('float64')}), - # 'b': dtype('int64')}) - # b: ListDtype(ListDtype(float64)) - actual_json_str = ( - '{"a":{"a":{"b":10.0},"b":11},"b":[[10.0,1.1],[12.0,23.0]]}\n' - '{"a":{"a":{"b":107.0},"b":5},"b":[[10.0,11.2],[12.0,0.23]]}\n' - '{"a":{"a":{"b":50.7},"b":2},"b":[[10.0,11.3],[12.0,2.3]]}\n' - '{"a":{"a":{"b":1.2},"b":67},"b":[[6.0,7.0]]}\n' - 
'{"a":{"a":{"b":40.1},"b":1090},"b":null}\n' - ) - - """ - In [3]: df - Out[3]: - a b - 0 {'a': {'b': 10.0}, 'b': 11} [[10.0, 1.1], [12.0, 23.0]] - 1 {'a': {'b': 107.0}, 'b': 5} [[10.0, 11.2], [12.0, 0.23]] - 2 {'a': {'b': 50.7}, 'b': 2} [[10.0, 11.3], [12.0, 2.3]] - 3 {'a': {'b': 1.2}, 'b': 67} [[6.0, 7.0]] - 4 {'a': {'b': 40.1}, 'b': 1090} None - """ - - # a: StructDtype({'a': StructDtype({'b': dtype('int64')}), - # 'b': dtype('float64')}) - # b: ListDtype(ListDtype(int64)) - expected_json_str = ( - '{"a":{"a":{"b":10},"b":11.0},"b":[[10,1],[12,23]]}\n' - '{"a":{"a":{"b":107},"b":5.0},"b":[[10,11],[12,0]]}\n' - '{"a":{"a":{"b":50},"b":2.0},"b":[[10,11],[12,2]]}\n' - '{"a":{"a":{"b":1},"b":67.0},"b":[[6,7]]}\n' - '{"a":{"a":{"b":40},"b":1090.0},"b":null}\n' - ) - - """ - In [7]: df - Out[7]: - a b - 0 {'a': {'b': 10}, 'b': 11.0} [[10, 1], [12, 23]] - 1 {'a': {'b': 107}, 'b': 5.0} [[10, 11], [12, 0]] - 2 {'a': {'b': 50}, 'b': 2.0} [[10, 11], [12, 2]] - 3 {'a': {'b': 1}, 'b': 67.0} [[6, 7]] - 4 {'a': {'b': 40}, 'b': 1090.0} None - """ - - df = cudf.read_json( - StringIO(actual_json_str), - engine="cudf", - orient="records", - lines=True, - dtype={ - "a": cudf.StructDtype( - { - "a": cudf.StructDtype({"b": cudf.dtype("int64")}), - "b": cudf.dtype("float64"), - } - ), - "b": cudf.ListDtype(cudf.ListDtype("int64")), - }, - ) - - pdf = pd.read_json( - StringIO(expected_json_str), - orient="records", - lines=True, - ) - - assert_eq(df, pdf) - - pdf.columns = pdf.columns.astype("str") - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) - assert df.to_arrow().equals(pa_table_pdf) - - -@pytest.mark.parametrize( - "tag, data", - [ - ( - "normal", - """\ -{"a": 1, "b": 2} -{"a": 3, "b": 4}""", - ), - ( - "multiple", - """\ - { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } - { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } - { "a": { "y" : 6}, "b" : [6 ], "c": 13 } - { "a": { "y" : 6}, "b" : [7 ], "c": 14 }""", - ), - ( - "reordered", - """\ - { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } - { "a": { "y" : 6}, "c": 12 , "b" : [4, 5 ]} - { "b" : [6 ], "a": { "y" : 6}, "c": 13} - { "c" : 14, "a": { "y" : 6}, "b" : [7 ]} -""", - ), - ( - "missing", - """ - { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } - { "a": { "y" : 6}, "b" : [4, 5 ] } - { "a": { "y" : 6}, "c": 13 } - { "a": { "y" : 6}, "b" : [7 ], "c": 14 } -""", - ), - pytest.param( - "dtype_mismatch", - """\ - { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } - { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } - { "a": { "y" : 6}, "b" : [6 ], "c": 13 } - { "a": { "y" : 6}, "b" : [7 ], "c": 14.0 }""", - ), - ], -) -class TestNestedJsonReaderCommon: - @pytest.mark.parametrize("chunk_size", [10, 100, 1024, 1024 * 1024]) - def test_chunked_nested_json_reader(self, tag, data, chunk_size): - expected = cudf.read_json(StringIO(data), lines=True) - - source_size = len(data) - chunks = [] - for chunk_start in range(0, source_size, chunk_size): - chunks.append( - cudf.read_json( - StringIO(data), - byte_range=[chunk_start, chunk_size], - lines=True, - ) - ) - df = cudf.concat(chunks, ignore_index=True) - assert expected.to_arrow().equals(df.to_arrow()) - - @pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/pull/57439", - ) - def test_order_nested_json_reader(self, tag, data): - expected = pd.read_json(StringIO(data), lines=True) - target = cudf.read_json(StringIO(data), lines=True) - # Using pyarrow instead of assert_eq because pandas - # doesn't handle nested values 
comparisons correctly - if tag == "dtype_mismatch": - with pytest.raises(AssertionError): - # pandas parses integer values in float representation - # as integer - assert pa.Table.from_pandas(expected).equals(target.to_arrow()) - elif tag == "missing": - with pytest.raises(AssertionError): - # pandas inferences integer with nulls as float64 - assert pa.Table.from_pandas(expected).equals(target.to_arrow()) - else: - assert pa.Table.from_pandas(expected).equals(target.to_arrow()) - - -def test_json_round_trip_gzip(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": ["abc", "def", "ghi"]}) - bytes = BytesIO() - with gzip.open(bytes, mode="wb") as fo: - with pytest.warns(UserWarning): - df.to_json(fo, orient="records", lines=True) - bytes.seek(0) - with gzip.open(bytes, mode="rb") as fo: - written_df = cudf.read_json(fo, orient="records", lines=True) - assert_eq(written_df, df) - - # Testing writing from middle of the file. - loc = bytes.tell() - - with gzip.open(bytes, mode="wb") as fo: - fo.seek(loc) - with pytest.warns(UserWarning): - df.to_json(fo, orient="records", lines=True) - bytes.seek(loc) - with gzip.open(bytes, mode="rb") as fo: - fo.seek(loc) - written_df = cudf.read_json(fo, orient="records", lines=True) - assert_eq(written_df, df) - - -@pytest.mark.parametrize( - "data", - [ - # # empty input - # assert failing due to missing index size information - "", - "[]", - "[]\n[]\n[]", - # simple values - """[1]\n[2]\n[3]""", - """[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]""", - # nulls - """[1, 2, 3]\n[4, 5, null]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, null]\n[7, 8, 9]\n[null, null, null]""", - """[1, 2, 3]\n[4, 5, null]\n[]""", - # missing - """[1, 2, 3]\n[4, 5 ]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9, 10]""", - """[1, 2, 3]\n[4, 5, 6, {}]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, 6, []]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, 6, {"a": 10}]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, 6, [10]]\n[7, 8, 9]""", - # mixed - """[1, 2, 3]\n[4, 5, {}]\n[7, 8, 9]""", - """[1, 2, {}]\n[4, 5, 6]\n[7, 8, 9]""", - """[1, 2, 3]\n[4, 5, [6]]\n[7, 8, 9]""", - """[1, 2, [3]]\n[4, 5, 6]\n[7, 8, 9]""", - # nested - """[1, 2, [3]]\n[4, 5, [6]]\n[7, 8, [9]]""", - """[1, 2, {"a": 3}]\n[4, 5, {"b": 6}]\n[7, 8, {"c": 9}]""", - """[1, 2, [{"a": 3}, {"a": 3}]] - [4, 5, [{"b": 6}, {"b": 6}, {}, {"b": 6}]] - [7, 8, [{}]]""", - """[1, 2, {"a": [3, 3, 3]}] - [4, 5, {"b": [6, 6]}] - [7, 8, {"c": 9}]""", - """[1, 2, [{"a": 3}, {"a": null}]] - [4, 5, [{"b": [6.0, 6, 06]}, {"b": [6]}, {}, {"b": null}]] - [7, 8, [{}]]""", - ], -) -@pytest.mark.parametrize("lines", [True, False]) -def test_json_array_of_arrays(data, lines): - data = data if lines else "[" + data.replace("\n", ",") + "]" - pdf = pd.read_json(StringIO(data), orient="values", lines=lines) - df = cudf.read_json( - StringIO(data), - engine="cudf", - orient="values", - lines=lines, - ) - # if mixed with dict/list type, replace other types with None. 
- if 2 in pdf.columns and any( - pdf[2].apply(lambda x: isinstance(x, dict) or isinstance(x, list)) - ): - pdf[2] = pdf[2].apply( - lambda x: x if isinstance(x, dict) or isinstance(x, list) else None - ) - # TODO: Replace string column names with integer column names - # for values orient in cudf json reader - pdf.rename(columns={name: str(name) for name in pdf.columns}, inplace=True) - # assert_eq(pdf, df) - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=df.to_arrow().schema, safe=False - ) - assert df.to_arrow().equals(pa_table_pdf) - - -@pytest.mark.parametrize( - "jsonl_string", - [ - # simple list with mixed types - """{"a":[123, {}], "b":1.1}""", - """{"a":[123, {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[{"L": 123}, 123], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[123, {"0": 123}, 12.3], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[123, {"0": 123}, null], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":["123", {"0": 123}], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":[{"0": 123}, "123"], "b":1.0}\n {"b":1.1}\n {"b":2.1}""", - """{"a":["123", {"0": 123}, "123"], "b":1.0}\n {"b":1.1}""", - """{"a":[123]}\n {"a":[{"0": 123}], "b":1.0}\n {"b":1.1}""", - """{"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n {"b":1.1}""", - """{"a":[{"0": 123}]}\n {"a": []}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - """{"b":1.0, "a":[{"0": 123}]}\n {"a":[123]}\n {"b":1.1}\n{"a": []}""", - """{"a": []}\n {"a":[{"0": 123}]}\n {"a":[123], "b":1.0}\n{"b":1.1}""", - """{"a": []}\n {"a":[123], "b":1.0}\n {"a":[{"0": 123}]}\n{"b":1.1}""", - # nested list with mixed types - """{"a":[123, [{"0": 123}, {}]], "b":1.0} - {"b":1.1} - {"a":[]} - {"a":[123]} - {"a":[[123], []]}""", - """{"a":[], "b":1.0} - {"a":[[[456]]]} - {"a":[[123]]} - {"a":[123]}""", - """{"a":[123], "b":1.0} - {"b":1.1} - {"b":2.1} - {"a":[[[[[[]]]]]]}""", - """{"a":[123], "b":1.0} - {"a":[[[[[[]]]]]]} - {"a":[[[[[[]]]]], [[[[[]]]]]]} - {"a":[[[[[[]]]], [[[[]]]]]]} - {"a":[[[[[[]]], [[[]]]]]]} - {"a":[[[[[[]], [[]]]]]]} - {"a":[[[[[[], 123, []]]]]]}""", - # mixed elements in multiple columns - """{"a":[123, {"0": 123}], "b":1.0} - {"c": ["abc"], "b":1.1} - {"c": ["abc", []] }""", - ], -) -def test_json_nested_mixed_types_in_list(jsonl_string): - # utility function for this test: - # replace list elements with None if it has dict and non-dict (ignore None) - def _replace_in_list(list_to_replace, replace_items): - return [ - _replace_in_list(x, replace_items) - if isinstance(x, list) - else None - if x in replace_items - else x - for x in list_to_replace - ] - - def _replace_with_nulls(df, replace_items): - for col in df.columns: - if df[col].dtype == "object": - df[col] = df[col].apply( - lambda x: _replace_in_list(x, replace_items) - if isinstance(x, list) - else x - ) - return df - - # both json lines and json string tested. - json_string = "[" + jsonl_string.replace("\n", ",") + "]" - pdf = pd.read_json(StringIO(jsonl_string), orient="records", lines=True) - pdf2 = pd.read_json(StringIO(json_string), orient="records", lines=False) - assert_eq(pdf, pdf2) - # replace list elements with None if it has dict and non-dict - # in above test cases, these items are mixed with dict/list items - # so, replace them with None. 
- pdf = _replace_with_nulls(pdf, [123, "123", 12.3, "abc"]) - gdf = cudf.read_json( - StringIO(jsonl_string), - orient="records", - lines=True, - ) - gdf2 = cudf.read_json( - StringIO(json_string), - engine="cudf", - orient="records", - lines=False, - ) - if """[{"0": 123}, {}]""" not in jsonl_string: - # {} in pandas is represented as {"0": None} in cudf - assert_eq(gdf, pdf) - assert_eq(gdf2, pdf) - pa_table_pdf = pa.Table.from_pandas( - pdf, schema=gdf.to_arrow().schema, safe=False - ) - assert gdf.to_arrow().equals(pa_table_pdf) - assert gdf2.to_arrow().equals(pa_table_pdf) - - -@pytest.mark.parametrize( - "jsonl_string", - [ - # mixed type in list (in different order) - """{"a":[[{"0": 123}, {}], {"1": 321}], "b":1.0}""", - """{"a":[{"1": 321}, [{"0": 123}, {}], ], "b":1.0}""", - """{"a":[123, [{"0": 123}, {}], {"1": 321}], "b":1.0}""", - """{"a":[null, [{"0": 123}, {}], {"1": 321}], "b":1.0}""", - # mixed type in struct (in different order) - """{"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} - {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", - """{"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0} - {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0}""", - """{"a": {"b": {"0": 123}, "c": null}, "d":1.0} - {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} - {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", - """{"a": {"b": {"0": 123}, "c": 123}, "d":1.0} - {"a": {"b": {"0": 123}, "c": {"1": 321}}, "d":1.0} - {"a": {"b": {"0": 123}, "c": [123, 123]}, "d":1.0}""", - ], -) -def test_json_nested_mixed_types_error(jsonl_string): - # mixing list and struct should raise an exception - with pytest.raises(RuntimeError): - cudf.read_json( - StringIO(jsonl_string), - orient="records", - lines=True, - ) - - -@pytest.mark.parametrize("on_bad_lines", ["error", "recover", "abc"]) -def test_json_reader_on_bad_lines(on_bad_lines): - json_input = StringIO( - '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n' - ) - if on_bad_lines == "error": - with pytest.raises(RuntimeError): - cudf.read_json( - json_input, - lines=True, - orient="records", - on_bad_lines=on_bad_lines, - ) - elif on_bad_lines == "recover": - actual = cudf.read_json( - json_input, lines=True, orient="records", on_bad_lines=on_bad_lines - ) - expected = cudf.DataFrame( - {"a": [1, 2, None, 3], "b": [10, 11, None, 12]} - ) - assert_eq(actual, expected) - else: - with pytest.raises(TypeError): - cudf.read_json( - json_input, - lines=True, - orient="records", - on_bad_lines=on_bad_lines, - ) - - -def test_chunked_json_reader(): - df = cudf.DataFrame( - { - "a": ["aaaa"] * 9_00_00_00, - "b": list(range(0, 9_00_00_00)), - } - ) - buf = BytesIO() - df.to_json(buf, lines=True, orient="records", engine="cudf") - buf.seek(0) - df = df.to_pandas() - with cudf.option_context("io.json.low_memory", True): - gdf = cudf.read_json(buf, lines=True) - assert_eq(df, gdf) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py deleted file mode 100644 index 7d87fc73621..00000000000 --- a/python/cudf/cudf/tests/test_list.py +++ /dev/null @@ -1,952 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import functools -import operator - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf import NA -from cudf._lib.copying import get_element -from cudf.api.types import is_scalar -from cudf.core.column.column import column_empty -from cudf.testing import assert_eq -from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES - - -@pytest.mark.parametrize( - "data", - [ - [[]], - [[[]]], - [[0]], - [[0, 1]], - [[0, 1], [2, 3]], - [[[0, 1], [2]], [[3, 4]]], - [[None]], - [[[None]]], - [[None], None], - [[1, None], [1]], - [[1, None], None], - [[[1, None], None], None], - ], -) -def test_create_list_series(data): - expect = pd.Series(data) - got = cudf.Series(data) - assert_eq(expect, got) - assert isinstance(got[0], type(expect[0])) - assert isinstance(got.to_pandas()[0], type(expect[0])) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [[]]}, - {"a": [[None]]}, - {"a": [[1, 2, 3]]}, - {"a": [[1, 2, 3]], "b": [[2, 3, 4]]}, - {"a": [[1, 2, 3, None], [None]], "b": [[2, 3, 4], [5]], "c": None}, - {"a": [[1]], "b": [[1, 2, 3]]}, - pd.DataFrame({"a": [[1, 2, 3]]}), - ], -) -def test_df_list_dtypes(data): - expect = pd.DataFrame(data) - got = cudf.DataFrame(data) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [[]], - [[[]]], - [[0]], - [[0, 1]], - [[0, 1], [2, 3]], - [[[0, 1], [2]], [[3, 4]]], - [[[0, 1, None], None], None, [[3, 2, None], None]], - [[["a", "c", None], None], None, [["b", "d", None], None]], - ], -) -def test_leaves(data): - pa_array = pa.array(data) - while hasattr(pa_array, "flatten"): - pa_array = pa_array.flatten() - - expect = cudf.Series(pa_array) - got = cudf.Series(data).list.leaves - assert_eq( - expect, - got, - check_dtype=not isinstance(pa_array, pa.NullArray), - ) - - -def test_list_to_pandas_nullable_true(): - df = cudf.DataFrame({"a": cudf.Series([[1, 2, 3]])}) - with pytest.raises(NotImplementedError): - df.to_pandas(nullable=True) - - -def test_listdtype_hash(): - a = cudf.core.dtypes.ListDtype("int64") - b = cudf.core.dtypes.ListDtype("int64") - - assert hash(a) == hash(b) - - c = cudf.core.dtypes.ListDtype("int32") - - assert hash(a) != hash(c) - - -@pytest.fixture(params=["int", "float", "datetime", "timedelta"]) -def leaf_value(request): - if request.param == "int": - return np.int32(1) - elif request.param == "float": - return np.float64(1) - elif request.param == "datetime": - return pd.to_datetime("1900-01-01") - elif request.param == "timedelta": - return pd.to_timedelta("10d") - else: - raise ValueError("Unhandled data type") - - -@pytest.fixture(params=["list", "struct"]) -def list_or_struct(request, leaf_value): - if request.param == "list": - return [[leaf_value], [leaf_value]] - elif request.param == "struct": - return {"a": leaf_value, "b": [leaf_value], "c": {"d": [leaf_value]}} - else: - raise ValueError("Unhandled data type") - - -@pytest.fixture(params=["list", "struct"]) -def nested_list(request, list_or_struct, leaf_value): - if request.param == "list": - return [list_or_struct, list_or_struct] - elif request.param == "struct": - return [ - { - "a": list_or_struct, - "b": leaf_value, - "c": {"d": list_or_struct, "e": leaf_value}, - } - ] - else: - raise ValueError("Unhandled data type") - - -def test_list_dtype_explode(nested_list): - sr = cudf.Series([nested_list]) - assert sr.dtype.element_type == sr.explode().dtype - - -@pytest.mark.parametrize( - "data", - [ - [[]], - [[1, 2, 3], [4, 5]], - [[1, 2, 3], [], [4, 5]], - [[1, 2, 3], None, [4, 
5]], - [[None, None], [None]], - [[[[[[1, 2, 3]]]]]], - cudf.Series([[1, 2]]).iloc[0:0], - cudf.Series([None, [1, 2]]).iloc[0:1], - ], -) -def test_len(data): - gsr = cudf.Series(data) - psr = gsr.to_pandas() - - expect = psr.map(lambda x: len(x) if x is not None else None) - got = gsr.list.len() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - ("data", "idx"), - [ - ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[0, 1], [2], [1, 2]]), - ([[1, 2, 3], [3, 4, 5], [4, 5, 6]], [[1, 2, 0], [1, 0, 2], [0, 1, 2]]), - ([[1, 2, 3], []], [[0, 1], []]), - ([[1, 2, 3], [None]], [[0, 1], []]), - ([[1, None, 3], None], [[0, 1], []]), - ], -) -def test_take(data, idx): - ps = pd.Series(data) - gs = cudf.from_pandas(ps) - - expected = pd.Series(zip(ps, idx)).map( - lambda x: [x[0][i] for i in x[1]] if x[0] is not None else None - ) - got = gs.list.take(idx) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - ("invalid", "exception"), - [ - ([[0]], pytest.raises(ValueError, match="different size")), - ([1, 2, 3, 4], pytest.raises(ValueError, match="should be list type")), - ( - [["a", "b"], ["c"]], - pytest.raises( - TypeError, match="should be column of values of index types" - ), - ), - ( - [[[1], [0]], [[0]]], - pytest.raises( - TypeError, match="should be column of values of index types" - ), - ), - ([[0, 1], None], pytest.raises(ValueError, match="contains null")), - ], -) -def test_take_invalid(invalid, exception): - gs = cudf.Series([[0, 1], [2, 3]]) - with exception: - gs.list.take(invalid) - - -@pytest.mark.parametrize( - ("data", "expected"), - [ - ([[1, 1, 2, 2], [], None, [3, 4, 5]], [[1, 2], [], None, [3, 4, 5]]), - ( - [[1.233, np.nan, 1.234, 3.141, np.nan, 1.234]], - [[1.233, 1.234, np.nan, 3.141]], - ), # duplicate nans - ([[1, 1, 2, 2, None, None]], [[1, 2, None]]), # duplicate nulls - ( - [[1.233, np.nan, None, 1.234, 3.141, np.nan, 1.234, None]], - [[1.233, 1.234, np.nan, None, 3.141]], - ), # duplicate nans and nulls - ([[2, None, 1, None, 2]], [[1, 2, None]]), - ([[], []], [[], []]), - ([[], None], [[], None]), - ], -) -def test_unique(data, expected): - """ - Pandas de-duplicates nans and nulls respectively in Series.unique. 
- `expected` is setup to mimic such behavior - """ - gs = cudf.Series(data, nan_as_null=False) - - got = gs.list.unique() - expected = cudf.Series(expected, nan_as_null=False).list.sort_values() - - got = got.list.sort_values() - - assert_eq(expected, got) - - -def key_func_builder(x, na_position): - if x is None: - if na_position == "first": - return -1e8 - else: - return 1e8 - else: - return x - - -@pytest.mark.parametrize( - "data", - [ - [[4, 2, None, 9], [8, 8, 2], [2, 1]], - [[4, 2, None, 9], [8, 8, 2], None], - [[4, 2, None, 9], [], None], - ], -) -@pytest.mark.parametrize( - "index", - [ - None, - pd.Index(["a", "b", "c"]), - pd.MultiIndex.from_tuples( - [(0, "a"), (0, "b"), (1, "a")], names=["l0", "l1"] - ), - ], -) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_sort_values(data, index, ascending, na_position, ignore_index): - key_func = functools.partial(key_func_builder, na_position=na_position) - - ps = pd.Series(data, index=index) - gs = cudf.from_pandas(ps) - - expected = ps.apply( - lambda x: sorted(x, key=key_func, reverse=not ascending) - if x is not None - else None - ) - if ignore_index: - expected.reset_index(drop=True, inplace=True) - got = gs.list.sort_values( - ascending=ascending, na_position=na_position, ignore_index=ignore_index - ) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data, index, expect", - [ - ([[None, None], [None, None]], 0, [None, None]), - ([[1, 2], [3, 4]], 0, [1, 3]), - ([["a", "b"], ["c", "d"]], 1, ["b", "d"]), - ([[1, None], [None, 2]], 1, [None, 2]), - ([[[1, 2], [3, 4]], [[5, 6], [7, 8]]], 1, [[3, 4], [7, 8]]), - ], -) -def test_get(data, index, expect): - sr = cudf.Series(data) - expect = cudf.Series(expect) - got = sr.list.get(index) - - assert_eq(expect, got, check_dtype=not expect.isnull().all()) - - -@pytest.mark.parametrize( - "data", - [ - [{"k": "v1"}, {"k": "v2"}], - [[{"k": "v1", "b": "v2"}], [{"k": "v3", "b": "v4"}]], - [ - [{"k": "v1", "b": [{"c": 10, "d": "v5"}]}], - [{"k": "v3", "b": [{"c": 14, "d": "v6"}]}], - ], - ], -) -@pytest.mark.parametrize("index", [0, 1]) -def test_get_nested_struct_dtype_transfer(data, index): - sr = cudf.Series([data]) - expect = cudf.Series(data[index : index + 1]) - assert_eq(expect, sr.list.get(index)) - - -def test_get_nested_lists(): - sr = cudf.Series( - [ - [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [], [[3, 4], [7, 8]]], - [[], [[9, 10]], [[11, 12], [13, 14]]], - ] - ) - expect = cudf.Series([[[1, 2], [3, 4]], []]) - got = sr.list.get(0) - assert_eq(expect, got) - - -def test_get_default(): - sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) - - assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2)) - assert_eq(cudf.Series([cudf.NA, 5, 8]), sr.list.get(2, default=cudf.NA)) - assert_eq(cudf.Series([0, 5, 8]), sr.list.get(2, default=0)) - assert_eq(cudf.Series([0, 3, 7]), sr.list.get(-3, default=0)) - assert_eq(cudf.Series([2, 5, 9]), sr.list.get(-1)) - - string_sr = cudf.Series( - [["apple", "banana"], ["carrot", "daffodil", "elephant"]] - ) - assert_eq( - cudf.Series(["default", "elephant"]), - string_sr.list.get(2, default="default"), - ) - - sr_with_null = cudf.Series([[0, cudf.NA], [1]]) - assert_eq(cudf.Series([cudf.NA, 0]), sr_with_null.list.get(1, default=0)) - - sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) - assert_eq(cudf.Series([[3, 4], [7, 8]]), sr_nested.list.get(1)) - assert_eq(cudf.Series([[5, 6], cudf.NA]), 
sr_nested.list.get(2)) - assert_eq( - cudf.Series([[5, 6], [0, 0]]), sr_nested.list.get(2, default=[0, 0]) - ) - - -def test_get_ind_sequence(): - # test .list.get() when `index` is a sequence - sr = cudf.Series([[1, 2], [3, 4, 5], [6, 7, 8, 9]]) - assert_eq(cudf.Series([1, 4, 8]), sr.list.get([0, 1, 2])) - assert_eq(cudf.Series([1, 4, 8]), sr.list.get(cudf.Series([0, 1, 2]))) - assert_eq(cudf.Series([cudf.NA, 5, cudf.NA]), sr.list.get([2, 2, -5])) - assert_eq(cudf.Series([0, 5, 0]), sr.list.get([2, 2, -5], default=0)) - sr_nested = cudf.Series([[[1, 2], [3, 4], [5, 6]], [[5, 6], [7, 8]]]) - assert_eq(cudf.Series([[1, 2], [7, 8]]), sr_nested.list.get([0, 1])) - - -@pytest.mark.parametrize( - "data, scalar, expect", - [ - ( - [[1, 2, 3], []], - 1, - [True, False], - ), - ( - [[1, 2, 3], [], [3, 4, 5]], - 6, - [False, False, False], - ), - ( - [[1.0, 2.0, 3.0], None, []], - 2.0, - [True, None, False], - ), - ( - [[None, "b", "c"], [], ["b", "e", "f"]], - "b", - [True, False, True], - ), - ([[None, 2, 3], None, []], 1, [False, None, False]), - ( - [[None, "b", "c"], [], ["b", "e", "f"]], - "d", - [False, False, False], - ), - ], -) -def test_contains_scalar(data, scalar, expect): - sr = cudf.Series(data) - expect = cudf.Series(expect) - got = sr.list.contains(cudf.Scalar(scalar, sr.dtype.element_type)) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data, expect", - [ - ( - [[1, 2, 3], []], - [None, None], - ), - ( - [[1.0, 2.0, 3.0], None, []], - [None, None, None], - ), - ( - [[None, 2, 3], [], None], - [None, None, None], - ), - ( - [[1, 2, 3], [3, 4, 5]], - [None, None], - ), - ( - [[], [], []], - [None, None, None], - ), - ], -) -def test_contains_null_search_key(data, expect): - sr = cudf.Series(data) - expect = cudf.Series(expect, dtype="bool") - got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type)) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data, scalar", - [ - ( - [[9, 0, 2], [], [1, None, 0]], - "x", - ), - ( - [["z", "y", None], None, [None, "x"]], - 5, - ), - ], -) -def test_contains_invalid(data, scalar): - sr = cudf.Series(data) - with pytest.raises( - TypeError, - match="Type/Scale of search key does not " - "match list column element type.", - ): - sr.list.contains(scalar) - - -@pytest.mark.parametrize( - "data, search_key, expect", - [ - ( - [[1, 2, 3], [], [3, 4, 5]], - 3, - [2, -1, 0], - ), - ( - [[1.0, 2.0, 3.0], None, [2.0, 5.0]], - 2.0, - [1, None, 0], - ), - ( - [[None, "b", "c"], [], ["b", "e", "f"]], - "f", - [-1, -1, 2], - ), - ([[-5, None, 8], None, []], -5, [0, None, -1]), - ( - [[None, "x", None, "y"], ["z", "i", "j"]], - "y", - [3, -1], - ), - ( - [["h", "a", None], ["t", "g"]], - ["a", "b"], - [1, -1], - ), - ( - [None, ["h", "i"], ["p", "k", "z"]], - ["x", None, "z"], - [None, None, 2], - ), - ( - [["d", None, "e"], [None, "f"], []], - cudf.Scalar(cudf.NA, "O"), - [None, None, None], - ), - ( - [None, [10, 9, 8], [5, 8, None]], - cudf.Scalar(cudf.NA, "int64"), - [None, None, None], - ), - ], -) -def test_index(data, search_key, expect): - sr = cudf.Series(data) - expect = cudf.Series(expect, dtype="int32") - if is_scalar(search_key): - got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type)) - else: - got = sr.list.index( - cudf.Series(search_key, dtype=sr.dtype.element_type) - ) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data, search_key", - [ - ( - [[9, None, 8], [], [7, 6, 5]], - "c", - ), - ( - [["a", "b", "c"], None, [None, "d"]], - 2, - ), - ( - [["e", "s"], ["t", "w"]], - [5, 6], - ), 
- ], -) -def test_index_invalid_type(data, search_key): - sr = cudf.Series(data) - with pytest.raises( - TypeError, - match="Type/Scale of search key does not " - "match list column element type.", - ): - sr.list.index(search_key) - - -@pytest.mark.parametrize( - "data, search_key", - [ - ( - [[5, 8], [2, 6]], - [8, 2, 4], - ), - ( - [["h", "j"], ["p", None], ["t", "z"]], - ["j", "a"], - ), - ], -) -def test_index_invalid_length(data, search_key): - sr = cudf.Series(data) - with pytest.raises( - RuntimeError, - match="Number of search keys must match list column size.", - ): - sr.list.index(search_key) - - -@pytest.mark.parametrize( - "row", - [ - [[]], - [[1]], - [[1, 2]], - [[1, 2], [3, 4, 5]], - [[1, 2], [], [3, 4, 5]], - [[1, 2, None], [3, 4, 5]], - [[1, 2, None], None, [3, 4, 5]], - [[1, 2, None], None, [], [3, 4, 5]], - [[[1, 2], [3, 4]], [[5, 6, 7], [8, 9]]], - [[["a", "c", "de", None], None, ["fg"]], [["abc", "de"], None]], - ], -) -@pytest.mark.parametrize("dropna", [True, False]) -def test_concat_elements(row, dropna): - if any(x is None for x in row): - if dropna: - row = [x for x in row if x is not None] - result = functools.reduce(operator.add, row) - else: - result = None - else: - result = functools.reduce(operator.add, row) - - expect = pd.Series([result]) - got = cudf.Series([row]).list.concat(dropna=dropna) - assert_eq(expect, got) - - -def test_concat_elements_raise(): - s = cudf.Series([[1, 2, 3]]) # no nesting - with pytest.raises(ValueError): - s.list.concat() - - -def test_concatenate_rows_of_lists(): - pdf = pd.DataFrame({"val": [["a", "a"], ["b"], ["c"]]}) - gdf = cudf.from_pandas(pdf) - - expect = pdf["val"] + pdf["val"] - got = gdf["val"] + gdf["val"] - - assert_eq(expect, got) - - -def test_concatenate_list_with_nonlist(): - with pytest.raises(TypeError): - gdf1 = cudf.DataFrame({"A": [["a", "c"], ["b", "d"], ["c", "d"]]}) - gdf2 = cudf.DataFrame({"A": ["a", "b", "c"]}) - gdf1["A"] + gdf2["A"] - - -@pytest.mark.parametrize( - "data", - [ - [1], - [1, 2, 3], - [[1, 2, 3], [4, 5, 6]], - [NA], - [1, NA, 3], - [[1, NA, 3], [NA, 5, 6]], - ], -) -def test_list_getitem(data): - list_sr = cudf.Series([data]) - assert list_sr[0] == data - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3], - [[1, 2, 3], [4, 5, 6]], - ["a", "b", "c"], - [["a", "b", "c"], ["d", "e", "f"]], - [1.1, 2.2, 3.3], - [[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]], - [1, NA, 3], - [[1, NA, 3], [4, 5, NA]], - ["a", NA, "c"], - [["a", NA, "c"], ["d", "e", NA]], - [1.1, NA, 3.3], - [[1.1, NA, 3.3], [4.4, 5.5, NA]], - ], -) -def test_list_scalar_host_construction(data): - slr = cudf.Scalar(data) - assert slr.value == data - assert slr.device_value.value == data - - -@pytest.mark.parametrize( - "elem_type", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["str"] -) -@pytest.mark.parametrize("nesting_level", [1, 2, 3]) -def test_list_scalar_host_construction_null(elem_type, nesting_level): - dtype = cudf.ListDtype(elem_type) - for level in range(nesting_level - 1): - dtype = cudf.ListDtype(dtype) - - slr = cudf.Scalar(None, dtype=dtype) - assert slr.value is (cudf.NaT if slr.dtype.kind in "mM" else cudf.NA) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3], - [[1, 2, 3], [4, 5, 6]], - ["a", "b", "c"], - [["a", "b", "c"], ["d", "e", "f"]], - [1.1, 2.2, 3.3], - [[1.1, 2.2, 3.3], [4.4, 5.5, 6.6]], - [1, NA, 3], - [[1, NA, 3], [4, 5, NA]], - ["a", NA, "c"], - [["a", NA, "c"], ["d", "e", NA]], - [1.1, NA, 3.3], - [[1.1, NA, 3.3], [4.4, 5.5, NA]], - ], -) -def test_list_scalar_device_construction(data): - 
col = cudf.Series([data])._column - slr = get_element(col, 0) - assert slr.value == data - - -@pytest.mark.parametrize("nesting_level", [1, 2, 3]) -def test_list_scalar_device_construction_null(nesting_level): - data = [[]] - for i in range(nesting_level - 1): - data = [data] - - arrow_type = pa.infer_type(data) - arrow_arr = pa.array([None], type=arrow_type) - - col = cudf.Series(arrow_arr)._column - slr = get_element(col, 0) - - assert slr.value is cudf.NA - - -@pytest.mark.parametrize("input_obj", [[[1, NA, 3]], [[1, NA, 3], [4, 5, NA]]]) -def test_construction_series_with_nulls(input_obj): - expect = pa.array(input_obj, from_pandas=True) - got = cudf.Series(input_obj).to_arrow() - - assert expect == got - - -@pytest.mark.parametrize( - "data", - [ - {"a": [[]]}, - {"a": [[1, 2, None, 4]]}, - {"a": [["cat", None, "dog"]]}, - { - "a": [[1, 2, 3, None], [4, None, 5]], - "b": [None, ["fish", "bird"]], - "c": [[], []], - }, - {"a": [[1, 2, 3, None], [4, None, 5], None, [6, 7]]}, - ], -) -def test_serialize_list_columns(data): - df = cudf.DataFrame(data) - recreated = df.__class__.deserialize(*df.serialize()) - assert_eq(recreated, df) - - -@pytest.mark.parametrize( - "data,item", - [ - ( - # basic list into a list column - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - [0, 0, 0], - ), - ( - # nested list into nested list column - [ - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - ], - [[0, 0, 0], [0, 0, 0]], - ), - ( - # NA into a list column - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - NA, - ), - ( - # NA into nested list column - [ - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - ], - NA, - ), - ], -) -def test_listcol_setitem(data, item): - sr = cudf.Series(data) - - sr[1] = item - data[1] = item - expect = cudf.Series(data) - - assert_eq(expect, sr) - - -@pytest.mark.parametrize( - "data", - [ - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - [ - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - [[1, 2, 3], [4, 5, 6]], - ], - [[[1, 2, 3], [4, None, 6]], [], None, [[7, 8], [], None, [9]]], - [[1, 2, 3], [4, None, 6], [7, 8], [], None, [9]], - [[1.0, 2.0, 3.0], [4.0, None, 6.0], [7.0, 8.0], [], None, [9.0]], - ], -) -def test_listcol_as_string(data): - got = cudf.Series(data).astype("str") - expect = pd.Series(data).astype("str") - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data,item,error", - [ - ( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - [[1, 2, 3], [4, 5, 6]], - "list nesting level mismatch", - ), - ( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], - 0, - "Can not set 0 into ListColumn", - ), - ], -) -def test_listcol_setitem_error_cases(data, item, error): - sr = cudf.Series(data) - with pytest.raises(BaseException, match=error): - sr[1] = item - - -def test_listcol_setitem_retain_dtype(): - df = cudf.DataFrame( - {"a": cudf.Series([["a", "b"], []]), "b": [1, 2], "c": [123, 321]} - ) - df1 = df.head(0) - # Performing a setitem on `b` triggers a `column.column_empty_like` call - # which tries to create an empty ListColumn. 
- df1["b"] = df1["c"] - # Performing a copy to trigger a copy dtype which is obtained by accessing - # `ListColumn.children` that would have been corrupted in previous call - # prior to this fix: https://github.com/rapidsai/cudf/pull/10151/ - df2 = df1.copy() - assert df2["a"].dtype == df["a"].dtype - - -def test_list_astype(): - s = cudf.Series([[1, 2], [3, 4]]) - s2 = s.list.astype("float64") - assert s2.dtype == cudf.ListDtype("float64") - assert_eq(s.list.leaves.astype("float64"), s2.list.leaves) - - s = cudf.Series([[[1, 2], [3]], [[5, 6], None]]) - s2 = s.list.astype("string") - assert s2.dtype == cudf.ListDtype(cudf.ListDtype("string")) - assert_eq(s.list.leaves.astype("string"), s2.list.leaves) - - -def test_memory_usage(): - s1 = cudf.Series([[1, 2], [3, 4]]) - assert s1.memory_usage() == 44 - s2 = cudf.Series([[[[1, 2]]], [[[3, 4]]]]) - assert s2.memory_usage() == 68 - s3 = cudf.Series([[{"b": 1, "a": 10}, {"b": 2, "a": 100}]]) - assert s3.memory_usage() == 40 - - -@pytest.mark.parametrize( - "data, idx", - [ - ( - [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": NA}]], - 0, - ), - ( - [ - [ - {"f2": {"a": 100, "c": 90, "f2": 10}, "f1": "a"}, - {"f1": "sf12", "f2": NA}, - ] - ], - 0, - ), - ( - [[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]], - 0, - ), - ([[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]], 2), - ([[[{"a": 1, "b": 2, "c": 10}]]], 0), - ], -) -def test_nested_list_extract_host_scalars(data, idx): - series = cudf.Series(data) - - assert series[idx] == data[idx] - - -def test_list_iterate_error(): - s = cudf.Series([[[[1, 2]], [[2], [3]]], [[[2]]], [[[3]]]]) - with pytest.raises(TypeError): - iter(s.list) - - -def test_list_struct_list_memory_usage(): - df = cudf.DataFrame({"a": [[{"b": [1]}]]}) - assert df.memory_usage().sum() == 16 - - -def test_empty_nested_list_uninitialized_offsets_memory_usage(): - col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64"))) - nested_col = col.children[1] - empty_inner = type(nested_col)( - data=None, - size=nested_col.size, - dtype=nested_col.dtype, - mask=nested_col.mask, - offset=nested_col.offset, - null_count=nested_col.null_count, - children=( - column_empty(0, nested_col.children[0].dtype), - nested_col.children[1], - ), - ) - col_empty_offset = type(col)( - data=None, - size=col.size, - dtype=col.dtype, - mask=col.mask, - offset=col.offset, - null_count=col.null_count, - children=(column_empty(0, col.children[0].dtype), empty_inner), - ) - ser = cudf.Series._from_column(col_empty_offset) - assert ser.memory_usage() == 8 diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py deleted file mode 100644 index 790e84559a9..00000000000 --- a/python/cudf/cudf/tests/test_monotonic.py +++ /dev/null @@ -1,356 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
- -""" -Tests related to is_unique, is_monotonic_increasing & -is_monotonic_decreasing attributes -""" - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import Index, MultiIndex, Series -from cudf.core.index import CategoricalIndex, DatetimeIndex, RangeIndex -from cudf.testing import assert_eq - - -@pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)]) -def test_range_index(testrange): - index = RangeIndex( - start=testrange[0], stop=testrange[1], step=testrange[2] - ) - index_pd = pd.RangeIndex( - start=testrange[0], stop=testrange[1], step=testrange[2] - ) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - [1, 2, 3, 4], - [1, 2, 3, 4, None], - [1, 2, 3, 3, 4], - [10, 9, 8, 7], - [10, 9, 8, 8, 7], - ["c", "d", "e", "f"], - ["c", "d", "e", "e", "f"], - ["c", "d", "e", "f", None], - ["z", "y", "x", "r"], - ["z", "y", "x", "x", "r"], - ], -) -def test_generic_index(testlist): - index = Index(testlist) - index_pd = pd.Index(testlist) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - [1, 2, 3, 4, np.nan], - [10, 9, 8, np.nan, 7], - [10, 9, 8, 8, 7, np.nan], - ], -) -def test_float_index(testlist): - index_pd = pd.Index(testlist) - index = cudf.from_pandas(index_pd, nan_as_null=False) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - ["c", "d", "e", "f"], - ["c", "d", "e", "e", "f"], - ["z", "y", "x", "r"], - ["z", "y", "x", "x", "r"], - ], -) -def test_string_index(testlist): - index = cudf.Index(testlist) - index_pd = pd.Index(testlist) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", [["c", "d", "e", "f"], ["z", "y", "x", "r"]] -) -def test_categorical_index(testlist): - # Assuming unordered categorical data cannot be "monotonic" - raw_cat = pd.Categorical(testlist, ordered=True) - index = CategoricalIndex(raw_cat) - index_pd = pd.CategoricalIndex(raw_cat) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - [ - "2001-01-01 00:00:00", - "2001-02-03 08:00:00", - "2001-03-08 16:00:00", - "2001-04-11 00:00:00", - ], - [ - "2001-04-11 00:00:00", - "2001-03-08 16:00:00", - "2001-02-03 08:00:00", - "2001-01-01 00:00:00", - ], - [ - "2001-04-11 00:00:00", - "2001-02-03 08:00:00", - "2001-03-08 16:00:00", - "2001-01-01 00:00:00", - ], - [ - "2001-04-11 00:00:00", - "2001-01-01 00:00:00", - "2001-02-03 08:00:00", - "2001-03-08 16:00:00", - "2001-01-01 00:00:00", - ], - ], -) -def test_datetime_index(testlist): - index = DatetimeIndex(testlist) - index_pd = pd.DatetimeIndex(testlist) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == 
index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - [1, 2, 3, 4], - [1, 2, 3, 3, 4], - [10, 9, 8, 7], - [10, 9, 8, 8, 7], - ["c", "d", "e", "f"], - ["c", "d", "e", "e", "f"], - ["z", "y", "x", "r"], - ["z", "y", "x", "x", "r"], - ], -) -def test_series(testlist): - series = Series(testlist) - series_pd = pd.Series(testlist) - - assert series.is_unique == series_pd.is_unique - assert series.is_monotonic_increasing == series_pd.is_monotonic_increasing - assert series.is_monotonic_decreasing == series_pd.is_monotonic_decreasing - - -def test_multiindex(): - pdf = pd.DataFrame(np.random.rand(7, 5)) - pdf.index = pd.MultiIndex( - [ - ["a", "b", "c"], - ["house", "store", "forest"], - ["clouds", "clear", "storm"], - ["fire", "smoke", "clear"], - ], - [ - [0, 0, 0, 0, 1, 1, 2], - [1, 1, 1, 1, 0, 0, 2], - [0, 0, 2, 2, 2, 0, 1], - [0, 0, 0, 1, 2, 0, 1], - ], - ) - pdf.index.names = ["alpha", "location", "weather", "sign"] - gdf = cudf.from_pandas(pdf) - - assert pdf.index.is_unique == gdf.index.is_unique - assert ( - pdf.index.is_monotonic_increasing == gdf.index.is_monotonic_increasing - ) - assert ( - pdf.index.is_monotonic_decreasing == gdf.index.is_monotonic_decreasing - ) - - -@pytest.mark.parametrize( - "testarr", - [ - ( - [ - ["bar", "bar", "foo", "foo", "qux", "qux", "qux"], - ["one", "two", "one", "two", "one", "two", "two"], - ], - ["first", "second"], - ), - ( - [ - ["bar", "bar", "foo", "foo", "qux", "qux"], - ["one", "two", "one", "two", "one", "two"], - ], - ["first", "second"], - ), - ], -) -def test_multiindex_tuples(testarr): - tuples = list(zip(*testarr[0])) - - index = MultiIndex.from_tuples(tuples, names=testarr[1]) - index_pd = pd.MultiIndex.from_tuples(tuples, names=testarr[1]) - - assert index.is_unique == index_pd.is_unique - assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing - assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing - - -@pytest.mark.parametrize( - "testlist", - [ - [10, 9, 8, 8, 7], - [2.0, 5.0, 4.0, 3.0, 7.0], - ["b", "d", "e", "a", "c"], - ["frog", "cat", "bat", "dog"], - ], -) -@pytest.mark.parametrize("side", ["left", "right"]) -def test_get_slice_bound(testlist, side): - index = Index(testlist) - index_pd = pd.Index(testlist) - for label in testlist: - expect = index_pd.get_slice_bound(label, side) - got = index.get_slice_bound(label, side) - assert got == expect - - -@pytest.mark.parametrize("bounds", [(0, 10), (0, 1), (3, 4), (0, 0), (3, 3)]) -@pytest.mark.parametrize( - "indices", - [[-1, 0, 5, 10, 11], [-1, 0, 1, 2], [2, 3, 4, 5], [-1, 0, 1], [2, 3, 4]], -) -@pytest.mark.parametrize("side", ["left", "right"]) -def test_rangeindex_get_slice_bound_basic(bounds, indices, side): - start, stop = bounds - pd_index = pd.RangeIndex(start, stop) - cudf_index = RangeIndex(start, stop) - for idx in indices: - expect = pd_index.get_slice_bound(idx, side) - got = cudf_index.get_slice_bound(idx, side) - assert expect == got - - -@pytest.mark.parametrize( - "bounds", - [(3, 20, 5), (20, 3, -5), (20, 3, 5), (3, 20, -5), (0, 0, 2), (3, 3, 2)], -) -@pytest.mark.parametrize( - "label", - [3, 8, 13, 18, 20, 15, 10, 5, -1, 0, 19, 21, 6, 11, 17], -) -@pytest.mark.parametrize("side", ["left", "right"]) -def test_rangeindex_get_slice_bound_step(bounds, label, side): - start, stop, step = bounds - pd_index = pd.RangeIndex(start, stop, step) - cudf_index = RangeIndex(start, stop, step) - - expect = pd_index.get_slice_bound(label, 
side) - got = cudf_index.get_slice_bound(label, side) - assert expect == got - - -@pytest.mark.parametrize("label", [1, 3, 5, 7, 9, 11]) -@pytest.mark.parametrize("side", ["left", "right"]) -def test_get_slice_bound_missing(label, side): - mylist = [2, 4, 6, 8, 10] - index = Index(mylist) - index_pd = pd.Index(mylist) - - expect = index_pd.get_slice_bound(label, side) - got = index.get_slice_bound(label, side) - assert got == expect - - -@pytest.mark.parametrize("label", ["a", "c", "e", "g"]) -@pytest.mark.parametrize("side", ["left", "right"]) -def test_get_slice_bound_missing_str(label, side): - mylist = ["b", "d", "f"] - index = Index(mylist) - index_pd = pd.Index(mylist) - got = index.get_slice_bound(label, side) - expect = index_pd.get_slice_bound(label, side) - assert got == expect - - -testdata = [ - ( - Series(["2018-01-01", "2019-01-31", None], dtype="datetime64[ms]"), - False, - ), - (Series([1, 2, 3, None]), False), - (Series([None, 1, 2, 3]), False), - (Series(["a", "b", "c", None]), False), - (Series([None, "a", "b", "c"]), False), -] - - -@pytest.mark.parametrize("data, expected", testdata) -def test_is_monotonic_always_falls_for_null(data, expected): - assert_eq(expected, data.is_monotonic_increasing) - assert_eq(expected, data.is_monotonic_decreasing) - - -@pytest.mark.parametrize("box", [Series, Index]) -@pytest.mark.parametrize( - "value,na_like", - [ - [1, None], - [np.datetime64("2020-01-01", "ns"), np.datetime64("nat", "ns")], - ["s", None], - [1.0, np.nan], - ], - ids=repr, -) -def test_is_unique(box, value, na_like): - obj = box([value], nan_as_null=False) - assert obj.is_unique - - obj = box([value, value], nan_as_null=False) - assert not obj.is_unique - - obj = box([None, value], nan_as_null=False) - assert obj.is_unique - - obj = box([None, None, value], nan_as_null=False) - assert not obj.is_unique - - if na_like is not None: - obj = box([na_like, value], nan_as_null=False) - assert obj.is_unique - - obj = box([na_like, na_like], nan_as_null=False) - assert not obj.is_unique - - try: - if not np.isnat(na_like): - # pyarrow coerces nat to null - obj = box([None, na_like, value], nan_as_null=False) - assert obj.is_unique - except TypeError: - pass diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py deleted file mode 100644 index c41be3e4428..00000000000 --- a/python/cudf/cudf/tests/test_multiindex.py +++ /dev/null @@ -1,2189 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
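A minimal sketch (again assuming a working cudf install) of the levels/codes construction that the removed test_multiindex.py below checked against pandas; the arguments mirror those used in the deleted tests:

import pandas as pd
import cudf

levels = [["a", "b"], ["c", "d"]]
codes = [[0, 1], [1, 0]]

# Same constructor arguments as pandas; both spell out ("a", "d") and ("b", "c").
gmi = cudf.MultiIndex(levels=levels, codes=codes)
pmi = pd.MultiIndex(levels=levels, codes=codes)
assert gmi.to_pandas().equals(pmi)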
- -""" -Test related to MultiIndex -""" - -import datetime -import itertools -import operator -import pickle -import re -from contextlib import contextmanager -from io import BytesIO - -import cupy as cp -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.api.extensions import no_default -from cudf.core.column import as_column -from cudf.testing import assert_eq, assert_neq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - - -@contextmanager -def expect_pandas_performance_warning(idx): - with expect_warning_if( - (not isinstance(idx[0], tuple) and len(idx) > 2) - or (isinstance(idx[0], tuple) and len(idx[0]) > 2), - pd.errors.PerformanceWarning, - ): - yield - - -def test_multiindex_levels_codes_validation(): - levels = [["a", "b"], ["c", "d"]] - - # Codes not a sequence of sequences - assert_exceptions_equal( - lfunc=pd.MultiIndex, - rfunc=cudf.MultiIndex, - lfunc_args_and_kwargs=([levels, [0, 1]],), - rfunc_args_and_kwargs=([levels, [0, 1]],), - ) - - # Codes don't match levels - assert_exceptions_equal( - lfunc=pd.MultiIndex, - rfunc=cudf.MultiIndex, - lfunc_args_and_kwargs=([levels, [[0], [1], [1]]],), - rfunc_args_and_kwargs=([levels, [[0], [1], [1]]],), - ) - - # Largest code greater than number of levels - assert_exceptions_equal( - lfunc=pd.MultiIndex, - rfunc=cudf.MultiIndex, - lfunc_args_and_kwargs=([levels, [[0, 1], [0, 2]]],), - rfunc_args_and_kwargs=([levels, [[0, 1], [0, 2]]],), - ) - - # Unequal code lengths - assert_exceptions_equal( - lfunc=pd.MultiIndex, - rfunc=cudf.MultiIndex, - lfunc_args_and_kwargs=([levels, [[0, 1], [0]]],), - rfunc_args_and_kwargs=([levels, [[0, 1], [0]]],), - ) - # Didn't pass levels and codes - assert_exceptions_equal(lfunc=pd.MultiIndex, rfunc=cudf.MultiIndex) - - # Didn't pass non zero levels and codes - assert_exceptions_equal( - lfunc=pd.MultiIndex, - rfunc=cudf.MultiIndex, - lfunc_args_and_kwargs=([[], []],), - rfunc_args_and_kwargs=([[], []],), - ) - - -def test_multiindex_construction(): - levels = [["a", "b"], ["c", "d"]] - codes = [[0, 1], [1, 0]] - pmi = pd.MultiIndex(levels, codes) - mi = cudf.MultiIndex(levels, codes) - assert_eq(pmi, mi) - pmi = pd.MultiIndex(levels, codes) - mi = cudf.MultiIndex(levels=levels, codes=codes) - assert_eq(pmi, mi) - - -def test_multiindex_types(): - codes = [[0, 1], [1, 0]] - levels = [[0, 1], [2, 3]] - pmi = pd.MultiIndex(levels, codes) - mi = cudf.MultiIndex(levels, codes) - assert_eq(pmi, mi) - levels = [[1.2, 2.1], [1.3, 3.1]] - pmi = pd.MultiIndex(levels, codes) - mi = cudf.MultiIndex(levels, codes) - assert_eq(pmi, mi) - levels = [["a", "b"], ["c", "d"]] - pmi = pd.MultiIndex(levels, codes) - mi = cudf.MultiIndex(levels, codes) - assert_eq(pmi, mi) - - -def test_multiindex_df_assignment(): - pdf = pd.DataFrame({"x": [1, 2, 3]}) - gdf = cudf.from_pandas(pdf) - pdf.index = pd.MultiIndex([["a", "b"], ["c", "d"]], [[0, 1, 0], [1, 0, 1]]) - gdf.index = cudf.MultiIndex( - levels=[["a", "b"], ["c", "d"]], codes=[[0, 1, 0], [1, 0, 1]] - ) - assert_eq(pdf, gdf) - - -def test_multiindex_series_assignment(): - ps = pd.Series([1, 2, 3]) - gs = cudf.from_pandas(ps) - ps.index = pd.MultiIndex([["a", "b"], ["c", "d"]], [[0, 1, 0], [1, 0, 1]]) - gs.index = cudf.MultiIndex( - levels=[["a", "b"], ["c", "d"]], codes=[[0, 1, 0], [1, 0, 1]] - ) - assert_eq(ps, gs) - - -def test_multiindex_swaplevel(): - midx = cudf.MultiIndex( - levels=[ - ["lama", "cow", "falcon"], - ["speed", "weight", "length"], - ["first", "second"], - ], - codes=[ - 
[0, 0, 0, 1, 1, 1, 2, 2, 2], - [0, 1, 2, 0, 1, 2, 0, 1, 2], - [0, 0, 0, 0, 0, 0, 1, 1, 1], - ], - names=["Col1", "Col2", "Col3"], - ) - pd_midx = midx.to_pandas() - - assert_eq(pd_midx.swaplevel(-1, -2), midx.swaplevel(-1, -2)) - assert_eq(pd_midx.swaplevel(2, 1), midx.swaplevel(2, 1)) - assert_eq(midx.swaplevel(2, 1), midx.swaplevel(1, 2)) - assert_eq(pd_midx.swaplevel(0, 2), midx.swaplevel(0, 2)) - assert_eq(pd_midx.swaplevel(2, 0), midx.swaplevel(2, 0)) - assert_eq(midx.swaplevel(1, 1), midx.swaplevel(1, 1)) - - -def test_string_index(): - pdf = pd.DataFrame(np.random.rand(5, 5)) - gdf = cudf.from_pandas(pdf) - stringIndex = ["a", "b", "c", "d", "e"] - pdf.index = stringIndex - gdf.index = stringIndex - assert_eq(pdf, gdf) - stringIndex = np.array(["a", "b", "c", "d", "e"]) - pdf.index = stringIndex - gdf.index = stringIndex - assert_eq(pdf, gdf) - stringIndex = cudf.Index(["a", "b", "c", "d", "e"], name="name") - pdf.index = stringIndex.to_pandas() - gdf.index = stringIndex - assert_eq(pdf, gdf) - stringIndex = cudf.Index._from_column( - as_column(["a", "b", "c", "d", "e"]), name="name" - ) - pdf.index = stringIndex.to_pandas() - gdf.index = stringIndex - assert_eq(pdf, gdf) - - -def test_multiindex_row_shape(): - pdf = pd.DataFrame(np.random.rand(0, 5)) - gdf = cudf.from_pandas(pdf) - pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]]) - pdfIndex.names = ["alpha"] - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex, gdfIndex) - - assert_exceptions_equal( - lfunc=operator.setitem, - rfunc=operator.setitem, - lfunc_args_and_kwargs=([], {"a": pdf, "b": "index", "c": pdfIndex}), - rfunc_args_and_kwargs=([], {"a": gdf, "b": "index", "c": gdfIndex}), - ) - - -@pytest.fixture -def pdf(): - return pd.DataFrame(np.random.rand(7, 5)) - - -@pytest.fixture -def gdf(pdf): - return cudf.from_pandas(pdf) - - -@pytest.fixture -def pdfIndex(): - pdfIndex = pd.MultiIndex( - [ - ["a", "b", "c"], - ["house", "store", "forest"], - ["clouds", "clear", "storm"], - ["fire", "smoke", "clear"], - [ - np.datetime64("2001-01-01", "ns"), - np.datetime64("2002-01-01", "ns"), - np.datetime64("2003-01-01", "ns"), - ], - ], - [ - [0, 0, 0, 0, 1, 1, 2], - [1, 1, 1, 1, 0, 0, 2], - [0, 0, 2, 2, 2, 0, 1], - [0, 0, 0, 1, 2, 0, 1], - [1, 0, 1, 2, 0, 0, 1], - ], - ) - pdfIndex.names = ["alpha", "location", "weather", "sign", "timestamp"] - return pdfIndex - - -@pytest.fixture -def pdfIndexNulls(): - pdfIndex = pd.MultiIndex( - [ - ["a", "b", "c"], - ["house", "store", "forest"], - ["clouds", "clear", "storm"], - ], - [ - [0, 0, 0, -1, 1, 1, 2], - [1, -1, 1, 1, 0, 0, -1], - [-1, 0, 2, 2, 2, 0, 1], - ], - ) - pdfIndex.names = ["alpha", "location", "weather"] - return pdfIndex - - -def test_from_pandas(pdf, pdfIndex): - pdf.index = pdfIndex - gdf = cudf.from_pandas(pdf) - assert_eq(pdf, gdf) - - -def test_multiindex_transpose(pdf, pdfIndex): - pdf.index = pdfIndex - gdf = cudf.from_pandas(pdf) - assert_eq(pdf.transpose(), gdf.transpose()) - - -def test_from_pandas_series(): - pdf = pd.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} - ).set_index(["a", "b"]) - - result = cudf.from_pandas(pdf) - assert_eq(pdf, result) - - test_pdf = pdf["c"] - result = cudf.from_pandas(test_pdf) - assert_eq(test_pdf, result) - - -def test_series_multiindex(pdfIndex): - ps = pd.Series(np.random.rand(7)) - gs = cudf.from_pandas(ps) - ps.index = pdfIndex - gs.index = cudf.from_pandas(pdfIndex) - assert_eq(ps, gs) - - -def test_multiindex_take(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf.index = pdfIndex - 
gdf.index = gdfIndex - assert_eq(pdf.index.take([0]), gdf.index.take([0])) - assert_eq(pdf.index.take(np.array([0])), gdf.index.take(np.array([0]))) - from cudf import Series - - assert_eq(pdf.index.take(pd.Series([0])), gdf.index.take(Series([0]))) - assert_eq(pdf.index.take([0, 1]), gdf.index.take([0, 1])) - assert_eq( - pdf.index.take(np.array([0, 1])), gdf.index.take(np.array([0, 1])) - ) - assert_eq( - pdf.index.take(pd.Series([0, 1])), gdf.index.take(Series([0, 1])) - ) - - -def test_multiindex_getitem(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - assert_eq(pdf.index[0], gdf.index[0]) - - -@pytest.mark.parametrize( - "key_tuple", - [ - # return 2 rows, 0 remaining keys = dataframe with entire index - ("a", "store", "clouds", "fire"), - (("a", "store", "clouds", "fire"), slice(None)), - # return 2 rows, 1 remaining key = dataframe with n-k index columns - ("a", "store", "storm"), - (("a", "store", "storm"), slice(None)), - # return 2 rows, 2 remaining keys = dataframe with n-k index columns - ("a", "store"), - (("a", "store"), slice(None)), - # return 2 rows, n-1 remaining keys = dataframe with n-k index columns - ("a",), - "a", - "b", - "c", - (("a",), slice(None)), - # return 1 row, 0 remaining keys = dataframe with entire index - ("a", "store", "storm", "smoke"), - (("a", "store", "storm", "smoke"), slice(None)), - # return 1 row and 1 remaining key = series - ("c", "forest", "clear"), - (("c", "forest", "clear"), slice(None)), - ], -) -def test_multiindex_loc(pdf, gdf, pdfIndex, key_tuple): - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex, gdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - # The index is unsorted, which makes things slow but is fine for testing. 
- with expect_pandas_performance_warning(key_tuple): - expected = pdf.loc[key_tuple] - got = gdf.loc[key_tuple].sort_index() - assert_eq(expected.sort_index(), got) - - with cudf.option_context("mode.pandas_compatible", True): - got = gdf.loc[key_tuple] - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "indexer", - [ - (([1, 1], [0, 1]), slice(None)), - (([1, 1], [1, 0]), slice(None)), - ], -) -def test_multiindex_compatible_ordering(indexer): - df = pd.DataFrame( - {"a": [1, 1, 2, 3], "b": [1, 0, 1, 1], "c": [1, 2, 3, 4]} - ).set_index(["a", "b"]) - cdf = cudf.from_pandas(df) - expect = df.loc[indexer] - with cudf.option_context("mode.pandas_compatible", True): - actual = cdf.loc[indexer] - assert_eq(actual, expect) - - -@pytest.mark.parametrize( - "arg", - [ - slice(("a", "store"), ("b", "house")), - slice(None, ("b", "house")), - slice(("a", "store"), None), - slice(None), - ], -) -def test_multiindex_loc_slice(pdf, gdf, pdfIndex, arg): - gdf = cudf.from_pandas(pdf) - gdfIndex = cudf.from_pandas(pdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - assert_eq(pdf.loc[arg], gdf.loc[arg]) - - -def test_multiindex_loc_errors(pdf, gdf, pdfIndex): - gdf = cudf.from_pandas(pdf) - gdfIndex = cudf.from_pandas(pdfIndex) - gdf.index = gdfIndex - - with pytest.raises(KeyError): - gdf.loc[("a", "store", "clouds", "foo")] - with pytest.raises(IndexError): - gdf.loc[ - ("a", "store", "clouds", "fire", "x", "y") - ] # too many indexers - with pytest.raises(IndexError): - gdf.loc[slice(None, ("a", "store", "clouds", "fire", "x", "y"))] - - -def test_multiindex_loc_then_column(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex, gdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - # The index is unsorted, which makes things slow but is fine for testing. 
- with pytest.warns(pd.errors.PerformanceWarning): - expected = pdf.loc[("a", "store", "clouds", "fire"), :][0] - got = gdf.loc[("a", "store", "clouds", "fire"), :][0] - assert_eq(expected, got) - - -def test_multiindex_loc_rows_0(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - - assert_exceptions_equal( - lfunc=pdf.loc.__getitem__, - rfunc=gdf.loc.__getitem__, - lfunc_args_and_kwargs=([(("d",), slice(None, None, None))],), - rfunc_args_and_kwargs=([(("d",), slice(None, None, None))],), - ) - - -def test_multiindex_loc_rows_1_2_key(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - print(pdf.loc[("c", "forest"), :]) - print(gdf.loc[("c", "forest"), :].to_pandas()) - assert_eq(pdf.loc[("c", "forest"), :], gdf.loc[("c", "forest"), :]) - - -def test_multiindex_loc_rows_1_1_key(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - print(pdf.loc[("c",), :]) - print(gdf.loc[("c",), :].to_pandas()) - assert_eq(pdf.loc[("c",), :], gdf.loc[("c",), :]) - - -def test_multiindex_column_shape(): - pdf = pd.DataFrame(np.random.rand(5, 0)) - gdf = cudf.from_pandas(pdf) - pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]]) - pdfIndex.names = ["alpha"] - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex, gdfIndex) - - assert_exceptions_equal( - lfunc=operator.setitem, - rfunc=operator.setitem, - lfunc_args_and_kwargs=([], {"a": pdf, "b": "columns", "c": pdfIndex}), - rfunc_args_and_kwargs=([], {"a": gdf, "b": "columns", "c": gdfIndex}), - ) - - -@pytest.mark.parametrize( - "query", - [ - ("a", "store", "clouds", "fire"), - ("a", "store", "storm", "smoke"), - ("a", "store"), - ("b", "house"), - ("a", "store", "storm"), - ("a",), - ("c", "forest", "clear"), - ], -) -def test_multiindex_columns(pdf, gdf, pdfIndex, query): - pdf = pdf.T - gdf = cudf.from_pandas(pdf) - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex, gdfIndex) - pdf.columns = pdfIndex - gdf.columns = gdfIndex - # The index is unsorted, which makes things slow but is fine for testing. 
- with expect_pandas_performance_warning(query): - expected = pdf[query] - got = gdf[query] - assert_eq(expected, got) - - -def test_multiindex_from_tuples(): - arrays = [["a", "a", "b", "b"], ["house", "store", "house", "store"]] - tuples = list(zip(*arrays)) - pmi = pd.MultiIndex.from_tuples(tuples) - gmi = cudf.MultiIndex.from_tuples(tuples) - assert_eq(pmi, gmi) - - -def test_multiindex_from_dataframe(): - if not hasattr(pd.MultiIndex([[]], [[]]), "codes"): - pytest.skip() - pdf = pd.DataFrame( - [["a", "house"], ["a", "store"], ["b", "house"], ["b", "store"]] - ) - gdf = cudf.from_pandas(pdf) - pmi = pd.MultiIndex.from_frame(pdf, names=["alpha", "location"]) - gmi = cudf.MultiIndex.from_frame(gdf, names=["alpha", "location"]) - assert_eq(pmi, gmi) - - -@pytest.mark.parametrize( - "arrays", - [ - [["a", "a", "b", "b"], ["house", "store", "house", "store"]], - [["a", "n", "n"] * 1000, ["house", "store", "house", "store"]], - [ - ["a", "n", "n"], - ["house", "store", "house", "store", "store"] * 1000, - ], - [ - ["a", "a", "n"] * 50, - ["house", "store", "house", "store", "store"] * 100, - ], - ], -) -def test_multiindex_from_product(arrays): - pmi = pd.MultiIndex.from_product(arrays, names=["alpha", "location"]) - gmi = cudf.MultiIndex.from_product(arrays, names=["alpha", "location"]) - assert_eq(pmi, gmi) - - -def test_multiindex_index_and_columns(): - gdf = cudf.DataFrame() - gdf["x"] = np.random.randint(0, 5, 5) - gdf["y"] = np.random.randint(0, 5, 5) - pdf = gdf.to_pandas() - mi = cudf.MultiIndex( - levels=[[0, 1, 2], [3, 4]], - codes=[[0, 0, 1, 1, 2], [0, 1, 0, 1, 1]], - names=["x", "y"], - ) - gdf.index = mi - mc = cudf.MultiIndex( - levels=[["val"], ["mean", "min"]], codes=[[0, 0], [0, 1]] - ) - gdf.columns = mc - pdf.index = mi.to_pandas() - pdf.columns = mc.to_pandas() - assert_eq(pdf, gdf) - - -def test_multiindex_multiple_groupby(): - pdf = pd.DataFrame( - { - "a": [4, 17, 4, 9, 5], - "b": [1, 4, 4, 3, 2], - "x": np.random.normal(size=5), - } - ) - gdf = cudf.DataFrame.from_pandas(pdf) - pdg = pdf.groupby(["a", "b"], sort=True).sum() - gdg = gdf.groupby(["a", "b"], sort=True).sum() - assert_eq(pdg, gdg) - pdg = pdf.groupby(["a", "b"], sort=True).x.sum() - gdg = gdf.groupby(["a", "b"], sort=True).x.sum() - assert_eq(pdg, gdg) - - -@pytest.mark.parametrize( - "func", - [ - lambda df: df.groupby(["x", "y"], sort=True).z.sum(), - lambda df: df.groupby(["x", "y"], sort=True).sum(), - ], -) -def test_multi_column(func): - pdf = pd.DataFrame( - { - "x": np.random.randint(0, 5, size=1000), - "y": np.random.randint(0, 10, size=1000), - "z": np.random.normal(size=1000), - } - ) - gdf = cudf.DataFrame.from_pandas(pdf) - - a = func(pdf) - b = func(gdf) - - assert_eq(a, b) - - -def test_multiindex_equality(): - # mi made from groupby - # mi made manually to be identical - # are they equal? - gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} - ) - mi1 = gdf.groupby(["x", "y"], sort=True).mean().index - mi2 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_eq(mi1, mi2) - - # mi made from two groupbys, are they equal? - mi2 = gdf.groupby(["x", "y"], sort=True).max().index - assert_eq(mi1, mi2) - - # mi made manually twice are they equal? 
- mi1 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - mi2 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_eq(mi1, mi2) - - # mi made from different groupbys are they not equal? - mi1 = gdf.groupby(["x", "y"]).mean().index - mi2 = gdf.groupby(["x", "z"]).mean().index - assert_neq(mi1, mi2) - - # mi made from different manuals are they not equal? - mi1 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - mi2 = cudf.MultiIndex( - levels=[[0, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_neq(mi1, mi2) - - -def test_multiindex_equals(): - # mi made from groupby - # mi made manually to be identical - # are they equal? - gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} - ) - mi1 = gdf.groupby(["x", "y"], sort=True).mean().index - mi2 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_eq(mi1.equals(mi2), True) - - # mi made from two groupbys, are they equal? - mi2 = gdf.groupby(["x", "y"], sort=True).max().index - assert_eq(mi1.equals(mi2), True) - - # mi made manually twice are they equal? - mi1 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - mi2 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_eq(mi1.equals(mi2), True) - - # mi made from different groupbys are they not equal? - mi1 = gdf.groupby(["x", "y"], sort=True).mean().index - mi2 = gdf.groupby(["x", "z"], sort=True).mean().index - assert_eq(mi1.equals(mi2), False) - - # mi made from different manuals are they not equal? 
- mi1 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - mi2 = cudf.MultiIndex( - levels=[[0, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - assert_eq(mi1.equals(mi2), False) - - -@pytest.mark.parametrize( - "data", - [ - { - "Date": [ - "2020-08-27", - "2020-08-28", - "2020-08-31", - "2020-08-27", - "2020-08-28", - "2020-08-31", - "2020-08-27", - "2020-08-28", - "2020-08-31", - ], - "Close": [ - 3400.00, - 3401.80, - 3450.96, - 226.58, - 228.91, - 225.53, - 505.13, - 525.91, - 534.98, - ], - "Symbol": [ - "AMZN", - "AMZN", - "AMZN", - "MSFT", - "MSFT", - "MSFT", - "NVDA", - "NVDA", - "NVDA", - ], - } - ], -) -@pytest.mark.parametrize("names", [["X", "Y"]]) -def test_multiindex_copy_sem(data, names): - """Test semantic equality for MultiIndex.copy""" - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - gdf = gdf.groupby(["Date", "Symbol"], sort=True).mean() - pdf = pdf.groupby(["Date", "Symbol"], sort=True).mean() - - gmi = gdf.index - gmi_copy = gmi.copy(names=names) - - pmi = pdf.index - pmi_copy = pmi.copy(names=names) - - for glv, plv in zip(gmi_copy.levels, pmi_copy.levels): - assert all(glv.values_host == plv.values) - for gval, pval in zip(gmi.codes, pmi.codes): - assert_eq(gval, pval) - assert_eq(gmi_copy.names, pmi_copy.names) - - # Test same behavior when used on DataFrame - gdf.index = gmi_copy - pdf.index = pmi_copy - assert repr(gdf) == repr(pdf) - - -@pytest.mark.parametrize( - "data", - [ - { - "Date": [ - "2020-08-27", - "2020-08-28", - "2020-08-31", - "2020-08-27", - "2020-08-28", - "2020-08-31", - "2020-08-27", - "2020-08-28", - "2020-08-31", - ], - "Close": [ - 3400.00, - 3401.80, - 3450.96, - 226.58, - 228.91, - 225.53, - 505.13, - 525.91, - 534.98, - ], - "Symbol": [ - "AMZN", - "AMZN", - "AMZN", - "MSFT", - "MSFT", - "MSFT", - "NVDA", - "NVDA", - "NVDA", - ], - }, - cudf.MultiIndex( - levels=[[1001, 1002], [2001, 2002]], - codes=[[1, 1, 0, 0], [0, 1, 0, 1]], - names=["col1", "col2"], - ), - ], -) -@pytest.mark.parametrize("copy_on_write", [True, False]) -@pytest.mark.parametrize("deep", [True, False]) -def test_multiindex_copy_deep(data, copy_on_write, deep): - """Test memory identity for deep copy - Case1: Constructed from GroupBy, StringColumns - Case2: Constructed from MultiIndex, NumericColumns - """ - original_cow_setting = cudf.get_option("copy_on_write") - cudf.set_option("copy_on_write", copy_on_write) - - if isinstance(data, dict): - import operator - from functools import reduce - - gdf = cudf.DataFrame(data) - mi1 = gdf.groupby(["Date", "Symbol"]).mean().index - mi2 = mi1.copy(deep=deep) - - lchildren = [col.children for col in mi1._columns] - rchildren = [col.children for col in mi2._columns] - - # Flatten - lchildren = reduce(operator.add, lchildren) - rchildren = reduce(operator.add, rchildren) - - lptrs = [child.base_data.get_ptr(mode="read") for child in lchildren] - rptrs = [child.base_data.get_ptr(mode="read") for child in rchildren] - - assert all((x == y) for x, y in zip(lptrs, rptrs)) - - elif isinstance(data, cudf.MultiIndex): - same_ref = (not deep) or ( - cudf.get_option("copy_on_write") and not deep - ) - mi1 = data - mi2 = mi1.copy(deep=deep) - - # Assert ._levels identity - lptrs = [ - lv._column.base_data.get_ptr(mode="read") for lv in mi1._levels - ] - rptrs = [ - lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels - ] - - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) - - # Assert ._codes 
identity - lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes] - rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes] - - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) - - # Assert ._data identity - lptrs = [d.base_data.get_ptr(mode="read") for d in mi1._columns] - rptrs = [d.base_data.get_ptr(mode="read") for d in mi2._columns] - - assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) - cudf.set_option("copy_on_write", original_cow_setting) - - -@pytest.mark.parametrize( - "iloc_rows", - [ - 0, - 1, - slice(None, 0), - slice(None, 1), - slice(0, 1), - slice(1, 2), - slice(0, 2), - slice(0, None), - slice(1, None), - ], -) -@pytest.mark.parametrize( - "iloc_columns", - [ - 0, - 1, - slice(None, 0), - slice(None, 1), - slice(0, 1), - slice(1, 2), - slice(0, 2), - slice(0, None), - slice(1, None), - ], -) -def test_multiindex_iloc(pdf, gdf, pdfIndex, iloc_rows, iloc_columns): - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex, gdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - presult = pdf.iloc[iloc_rows, iloc_columns] - gresult = gdf.iloc[iloc_rows, iloc_columns] - if isinstance(gresult, cudf.DataFrame): - assert_eq( - presult, gresult, check_index_type=False, check_column_type=False - ) - else: - assert_eq(presult, gresult, check_index_type=False, check_dtype=False) - - -def test_multiindex_iloc_scalar(): - arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] - tuples = list(zip(*arrays)) - idx = cudf.MultiIndex.from_tuples(tuples) - gdf = cudf.DataFrame( - {"first": cp.random.rand(4), "second": cp.random.rand(4)} - ) - gdf.index = idx - - pdf = gdf.to_pandas() - assert_eq(pdf.iloc[3], gdf.iloc[3]) - - -@pytest.mark.parametrize( - "iloc_rows", - [ - 0, - 1, - slice(None, 0), - slice(None, 1), - slice(0, 1), - slice(1, 2), - slice(0, 2), - slice(0, None), - slice(1, None), - ], -) -@pytest.mark.parametrize( - "iloc_columns", - [ - 0, - 1, - slice(None, 0), - slice(None, 1), - slice(0, 1), - slice(1, 2), - slice(0, 2), - slice(0, None), - slice(1, None), - ], -) -def test_multicolumn_iloc(pdf, gdf, pdfIndex, iloc_rows, iloc_columns): - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex, gdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - pdf = pdf.T - gdf = gdf.T - presult = pdf.iloc[iloc_rows, iloc_columns] - gresult = gdf.iloc[iloc_rows, iloc_columns] - if hasattr(gresult, "name") and isinstance(gresult.name, tuple): - name = gresult.name[len(gresult.name) - 1] - if isinstance(name, str) and "cudf" in name: - gresult.name = name - if isinstance(presult, pd.DataFrame): - assert_eq( - presult, gresult, check_index_type=False, check_column_type=False - ) - else: - assert_eq(presult, gresult, check_index_type=False, check_dtype=False) - - -def test_multicolumn_item(): - gdf = cudf.DataFrame( - {"x": np.arange(10), "y": np.arange(10), "z": np.arange(10)} - ) - gdg = gdf.groupby(["x", "y"]).min() - gdgT = gdg.T - pdgT = gdgT.to_pandas() - assert_eq(gdgT[(0, 0)], pdgT[(0, 0)]) - - -def test_multiindex_to_frame(pdfIndex, pdfIndexNulls): - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex.to_frame(), gdfIndex.to_frame()) - - gdfIndex = cudf.from_pandas(pdfIndexNulls) - assert_eq( - pdfIndexNulls.to_frame().fillna("nan"), - gdfIndex.to_frame().fillna("nan"), - ) - - -def test_multiindex_groupby_to_frame(): - gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} - ) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"], sort=True).count() - pdg = pdf.groupby(["x", "y"], 
sort=True).count() - assert_eq(pdg.index.to_frame(), gdg.index.to_frame()) - - -def test_multiindex_reset_index(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - assert_eq(pdf.reset_index(), gdf.reset_index()) - - -def test_multiindex_groupby_reset_index(): - gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [0, 1, 0, 1, 0]} - ) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"], sort=True).sum() - pdg = pdf.groupby(["x", "y"], sort=True).sum() - assert_eq(pdg.reset_index(), gdg.reset_index()) - - -def test_multicolumn_reset_index(): - gdf = cudf.DataFrame({"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5]}) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["x"], sort=True).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x"], sort=True).agg({"y": ["count", "mean"]}) - assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x"], sort=True).agg({"y": ["count"]}) - pdg = pdf.groupby(["x"], sort=True).agg({"y": ["count"]}) - assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x"], sort=True).agg({"y": "count"}) - pdg = pdf.groupby(["x"], sort=True).agg({"y": "count"}) - assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - - -def test_multiindex_multicolumn_reset_index(): - gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [1, 2, 3, 4, 5]} - ) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"], sort=True).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x", "y"], sort=True).agg({"y": ["count", "mean"]}) - assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - gdg = gdf.groupby(["x", "z"], sort=True).agg({"y": ["count", "mean"]}) - pdg = pdf.groupby(["x", "z"], sort=True).agg({"y": ["count", "mean"]}) - assert_eq(pdg.reset_index(), gdg.reset_index(), check_dtype=False) - - -def test_groupby_multiindex_columns_from_pandas(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - assert_eq(gdf, pdf) - assert_eq(gdf.T, pdf.T) - - -def test_multiindex_rows_with_wildcard(pdf, gdf, pdfIndex): - gdfIndex = cudf.from_pandas(pdfIndex) - pdf.index = pdfIndex - gdf.index = gdfIndex - # The index is unsorted, which makes things slow but is fine for testing. 
- with pytest.warns(pd.errors.PerformanceWarning): - assert_eq( - pdf.loc[("a",), :].sort_index(), gdf.loc[("a",), :].sort_index() - ) - assert_eq( - pdf.loc[(("a"), ("store")), :].sort_index(), - gdf.loc[(("a"), ("store")), :].sort_index(), - ) - assert_eq( - pdf.loc[(("a"), ("store"), ("storm")), :].sort_index(), - gdf.loc[(("a"), ("store"), ("storm")), :].sort_index(), - ) - assert_eq( - pdf.loc[(("a"), ("store"), ("storm"), ("smoke")), :].sort_index(), - gdf.loc[(("a"), ("store"), ("storm"), ("smoke")), :].sort_index(), - ) - assert_eq( - pdf.loc[(slice(None), "store"), :].sort_index(), - gdf.loc[(slice(None), "store"), :].sort_index(), - ) - assert_eq( - pdf.loc[(slice(None), slice(None), "storm"), :].sort_index(), - gdf.loc[(slice(None), slice(None), "storm"), :].sort_index(), - ) - assert_eq( - pdf.loc[ - (slice(None), slice(None), slice(None), "smoke"), : - ].sort_index(), - gdf.loc[ - (slice(None), slice(None), slice(None), "smoke"), : - ].sort_index(), - ) - - -def test_multiindex_multicolumn_zero_row_slice(): - gdf = cudf.DataFrame( - {"x": [1, 5, 3, 4, 1], "y": [1, 1, 2, 2, 5], "z": [1, 2, 3, 4, 5]} - ) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["x", "y"]).agg({"z": ["count"]}).iloc[:0] - pdg = pdf.groupby(["x", "y"]).agg({"z": ["count"]}).iloc[:0] - assert_eq(pdg, gdg, check_dtype=False) - - -def test_multicolumn_loc(pdf, pdfIndex): - pdf = pdf.T - pdf.columns = pdfIndex - gdf = cudf.from_pandas(pdf) - assert_eq(pdf.loc[:, "a"], gdf.loc[:, "a"]) - assert_eq(pdf.loc[:, ("a", "store")], gdf.loc[:, ("a", "store")]) - assert_eq(pdf.loc[:, "a":"b"], gdf.loc[:, "a":"b"]) - assert_eq(pdf.loc[:, ["a", "b"]], gdf.loc[:, ["a", "b"]]) - - -@pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/43351", -) -def test_multicolumn_set_item(pdf, pdfIndex): - pdf = pdf.T - pdf.columns = pdfIndex - gdf = cudf.from_pandas(pdf) - pdf["d"] = [1, 2, 3, 4, 5] - gdf["d"] = [1, 2, 3, 4, 5] - assert_eq(pdf, gdf) - - -def test_multiindex_iter_error(): - midx = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - - with pytest.raises( - TypeError, - match=re.escape( - f"{midx.__class__.__name__} object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." 
- ), - ): - iter(midx) - - -def test_multiindex_values(): - midx = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - - result = midx.values - - assert isinstance(result, cp.ndarray) - np.testing.assert_array_equal( - result.get(), np.array([[1, 1], [1, 5], [3, 2], [4, 2], [5, 1]]) - ) - - -def test_multiindex_values_host(): - midx = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - pmidx = midx.to_pandas() - - assert_eq(midx.values_host, pmidx.values) - - -def test_multiindex_to_numpy(): - midx = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ) - pmidx = midx.to_pandas() - - assert_eq(midx.to_numpy(), pmidx.to_numpy()) - - -@pytest.mark.parametrize( - "gdi, fill_value, expected", - [ - ( - cudf.MultiIndex( - levels=[[1, 3, 4, None], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - 5, - cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - ), - ( - cudf.MultiIndex( - levels=[[1, 3, 4, None], [1, None, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - 100, - cudf.MultiIndex( - levels=[[1, 3, 4, 100], [1, 100, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - ), - ( - cudf.MultiIndex( - levels=[["a", "b", "c", None], ["1", None, "5"]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - "100", - cudf.MultiIndex( - levels=[["a", "b", "c", "100"], ["1", "100", "5"]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - ), - ], -) -def test_multiindex_fillna(gdi, fill_value, expected): - assert_eq(expected, gdi.fillna(fill_value)) - - -@pytest.mark.parametrize( - "pdi", - [ - pd.MultiIndex( - levels=[[], [], []], - codes=[[], [], []], - names=["one", "two", "three"], - ), - pd.MultiIndex.from_tuples( - list( - zip( - *[ - [ - "bar", - "bar", - "baz", - "baz", - "foo", - "foo", - "qux", - "qux", - ], - [ - "one", - "two", - "one", - "two", - "one", - "two", - "one", - "two", - ], - ] - ) - ) - ), - ], -) -def test_multiindex_empty(pdi): - gdi = cudf.from_pandas(pdi) - - assert_eq(pdi.empty, gdi.empty) - - -@pytest.mark.parametrize( - "pdi", - [ - pd.MultiIndex( - levels=[[], [], []], - codes=[[], [], []], - names=["one", "two", "three"], - ), - pd.MultiIndex.from_tuples( - list( - zip( - *[ - [ - "bar", - "bar", - "baz", - "baz", - "foo", - "foo", - "qux", - "qux", - ], - [ - "one", - "two", - "one", - "two", - "one", - "two", - "one", - "two", - ], - ] - ) - ) - ), - ], -) -def test_multiindex_size(pdi): - gdi = cudf.from_pandas(pdi) - - assert_eq(pdi.size, gdi.size) - - -@pytest.mark.parametrize( - "level", - [ - [], - "alpha", - "location", - "weather", - 0, - 1, - [0, 1], - -1, - [-1, -2], - [-1, "weather"], - ], -) -def test_multiindex_droplevel_simple(pdfIndex, level): - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex.droplevel(level), gdfIndex.droplevel(level)) - - -@pytest.mark.parametrize( - "level", - itertools.chain( - *( - itertools.combinations( - ("alpha", "location", "weather", "sign", "timestamp"), r - ) - for r in range(5) - ) - ), -) -def test_multiindex_droplevel_name(pdfIndex, level): - level = list(level) - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex.droplevel(level), gdfIndex.droplevel(level)) - - -@pytest.mark.parametrize( - "level", - 
itertools.chain(*(itertools.combinations(range(5), r) for r in range(5))), -) -def test_multiindex_droplevel_index(pdfIndex, level): - level = list(level) - gdfIndex = cudf.from_pandas(pdfIndex) - assert_eq(pdfIndex.droplevel(level), gdfIndex.droplevel(level)) - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("return_indexer", [True, False]) -@pytest.mark.parametrize( - "pmidx", - [ - pd.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - pd.MultiIndex.from_product( - [["bar", "baz", "foo", "qux"], ["one", "two"]], - names=["first", "second"], - ), - pd.MultiIndex( - levels=[[], [], []], - codes=[[], [], []], - names=["one", "two", "three"], - ), - pd.MultiIndex.from_tuples( - list( - zip( - *[ - [ - "bar", - "bar", - "baz", - "baz", - "foo", - "foo", - "qux", - "qux", - ], - [ - "one", - "two", - "one", - "two", - "one", - "two", - "one", - "two", - ], - ] - ) - ) - ), - ], -) -def test_multiindex_sort_values(pmidx, ascending, return_indexer): - pmidx = pmidx - midx = cudf.from_pandas(pmidx) - - expected = pmidx.sort_values( - ascending=ascending, return_indexer=return_indexer - ) - actual = midx.sort_values( - ascending=ascending, return_indexer=return_indexer - ) - - if return_indexer: - expected_indexer = expected[1] - actual_indexer = actual[1] - - assert_eq(expected_indexer, actual_indexer) - - expected = expected[0] - actual = actual[0] - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "pdi", - [ - pd.MultiIndex( - levels=[[1, 3.0, 4, 5], [1, 2.3, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - pd.MultiIndex( - levels=[[1, 3, 4, -10], [1, 11, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - pd.MultiIndex( - levels=[["a", "b", "c", "100"], ["1", "100", "5"]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - pytest.param( - pd.MultiIndex( - levels=[[None, "b", "c", "a"], ["1", None, "5"]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", "y"], - ), - marks=[ - pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/35584" - ) - ], - ), - ], -) -@pytest.mark.parametrize("ascending", [True, False]) -def test_multiindex_argsort(pdi, ascending): - gdi = cudf.from_pandas(pdi) - - if not ascending: - expected = pdi.argsort()[::-1] - else: - expected = pdi.argsort() - - actual = gdi.argsort(ascending=ascending) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx", [pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]])] -) -@pytest.mark.parametrize( - "names", [[None, None], ["a", None], ["new name", "another name"]] -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_set_names(idx, names, inplace): - pi = idx.copy() - gi = cudf.from_pandas(idx) - - expected = pi.set_names(names=names, inplace=inplace) - actual = gi.set_names(names=names, inplace=inplace) - - if inplace: - expected, actual = pi, gi - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]] - ), - pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], - names=[1, 0, 2], - ), - ], -) -@pytest.mark.parametrize( - "level, names", - [ - (0, "abc"), - (1, "xyz"), - ([2, 1], ["a", "b"]), - ([0, 1], ["aa", "bb"]), - (None, ["a", "b", "c"]), - (None, ["a", None, "c"]), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def 
test_multiindex_set_names_default_and_int_names( - idx, level, names, inplace -): - pi = idx.copy() - gi = cudf.from_pandas(idx) - - expected = pi.set_names(names=names, level=level, inplace=inplace) - actual = gi.set_names(names=names, level=level, inplace=inplace) - - if inplace: - expected, actual = pi, gi - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]], - names=["one", None, "three"], - ), - ], -) -@pytest.mark.parametrize( - "level, names", - [ - ([None], "abc"), - (["three", "one"], ["a", "b"]), - (["three", 1], ["a", "b"]), - ([0, "three", 1], ["a", "b", "z"]), - (["one", 1, "three"], ["a", "b", "z"]), - (["one", None, "three"], ["a", "b", "z"]), - ([2, 1], ["a", "b"]), - (1, "xyz"), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_set_names_string_names(idx, level, names, inplace): - pi = idx.copy() - gi = cudf.from_pandas(idx) - - expected = pi.set_names(names=names, level=level, inplace=inplace) - actual = gi.set_names(names=names, level=level, inplace=inplace) - - if inplace: - expected, actual = pi, gi - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "level, names", [(1, ["a"]), (None, "a"), ([1, 2], ["a"]), (None, ["a"])] -) -def test_multiindex_set_names_error(level, names): - pi = pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019], ["aab", "bcd"]] - ) - gi = cudf.from_pandas(pi) - - assert_exceptions_equal( - lfunc=pi.set_names, - rfunc=gi.set_names, - lfunc_args_and_kwargs=([], {"names": names, "level": level}), - rfunc_args_and_kwargs=([], {"names": names, "level": level}), - ) - - -@pytest.mark.parametrize( - "idx", - [ - pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]), - pd.MultiIndex.from_product( - [["python", "cobra"], [2018, 2019]], names=["old name", None] - ), - ], -) -@pytest.mark.parametrize( - "names", - [ - [None, None], - ["a", None], - ["new name", "another name"], - [1, None], - [2, 3], - [42, "name"], - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_multiindex_rename(idx, names, inplace): - pi = idx.copy() - gi = cudf.from_pandas(idx) - - expected = pi.rename(names=names, inplace=inplace) - actual = gi.rename(names=names, inplace=inplace) - - if inplace: - expected, actual = pi, gi - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "names", ["plain string", 123, ["str"], ["l1", "l2", "l3"]] -) -def test_multiindex_rename_error(names): - pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]]) - gi = cudf.from_pandas(pi) - - assert_exceptions_equal( - lfunc=pi.rename, - rfunc=gi.rename, - lfunc_args_and_kwargs=([], {"names": names}), - rfunc_args_and_kwargs=([], {"names": names}), - ) - - -@pytest.mark.parametrize( - "key", - [0, 1, [], [0, 1], slice(None), slice(0, 0), slice(0, 1), slice(0, 2)], -) -def test_multiindex_indexing(key): - gi = cudf.MultiIndex.from_frame( - cudf.DataFrame({"a": [1, 2, 3], "b": [True, False, False]}) - ) - pi = gi.to_pandas() - - assert_eq(gi[key], pi[key], exact=False) - - -def test_multiindex_duplicate_names(): - gi = cudf.MultiIndex( - levels=[["a", "b"], ["b", "a"]], - codes=[[0, 0], [0, 1]], - names=["a", "a"], - ) - pi = pd.MultiIndex( - levels=[["a", "b"], ["b", "a"]], - codes=[[0, 0], [0, 1]], - names=["a", "a"], - ) - - assert_eq(gi, pi) - - -def test_difference(): - midx = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - names=["x", 
"y"], - ) - midx2 = cudf.MultiIndex( - levels=[[1, 3, 4, 5], [1, 2, 5]], - codes=[[0, 0, 1, 2, 3, 3], [0, 2, 1, 1, 0, 2]], - names=["x", "y"], - ) - - expected = midx2.to_pandas().difference(midx.to_pandas()) - actual = midx2.difference(midx) - assert isinstance(actual, cudf.MultiIndex) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx1, idx2", - [ - ( - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], ["Red", "Blue", "Red", "Blue"]], - names=["a", "b"], - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 4], ["Red", "Green", "Red", "Green"]], - names=["x", "y"], - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 4], [0.2, 0.4, 1.4, 10], [3, 3, 2, 4]] - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - [(2, 6, 12)], - ), - ], -) -@pytest.mark.parametrize("sort", [None, False]) -def test_union_mulitIndex(idx1, idx2, sort): - expected = idx1.union(idx2, sort=sort) - - idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.MultiIndex) else idx1 - idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.MultiIndex) else idx2 - - actual = idx1.union(idx2, sort=sort) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "idx1, idx2", - [ - ( - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ), - pd.MultiIndex.from_arrays( - [[1, 3, 2, 2], ["Red", "Green", "Red", "Green"]] - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], ["Red", "Blue", "Red", "Blue"]], - names=["a", "b"], - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 4], ["Red", "Green", "Red", "Green"]], - names=["x", "y"], - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - pd.MultiIndex.from_arrays( - [[3, 3, 2, 4], [0.2, 0.4, 1.4, 10], [3, 3, 2, 4]] - ), - ), - ( - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - ), - ), - ], -) -@pytest.mark.parametrize("sort", [None, False]) -def test_intersection_mulitIndex(idx1, idx2, sort): - expected = idx1.intersection(idx2, sort=sort) - - idx1 = cudf.from_pandas(idx1) - idx2 = cudf.from_pandas(idx2) - - actual = idx1.intersection(idx2, sort=sort) - assert_eq(expected, actual, exact=False) - - -@pytest.mark.parametrize( - "names", - [ - ["a", "b", "c"], - [None, None, None], - ["aa", "aa", "aa"], - ["bb", "aa", "aa"], - None, - ], -) -def test_pickle_roundtrip_multiindex(names): - df = cudf.DataFrame( - { - "one": [1, 2, 3], - "two": [True, False, True], - "three": ["ab", "cd", "ef"], - "four": [0.2, 0.1, -10.2], - } - ) - expected_df = df.set_index(["one", "two", "three"]) - expected_df.index.names = names - local_file = BytesIO() - - pickle.dump(expected_df, local_file) - local_file.seek(0) - actual_df = pickle.load(local_file) - assert_eq(expected_df, actual_df) - - -@pytest.mark.parametrize( - "pidx", - [ - pd.MultiIndex.from_arrays( - [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - pd.MultiIndex.from_arrays( - [[1.0, 2, 3, 4], [5, 6, 7.8, 10], [11, 12, 12, 
13]], - ), - ], -) -@pytest.mark.parametrize( - "func", - [ - "is_numeric", - "is_boolean", - "is_integer", - "is_floating", - "is_object", - "is_categorical", - "is_interval", - ], -) -def test_multiindex_type_methods(pidx, func): - gidx = cudf.from_pandas(pidx) - - with pytest.warns(FutureWarning): - expected = getattr(pidx, func)() - - with pytest.warns(FutureWarning): - actual = getattr(gidx, func)() - - if func == "is_object": - assert_eq(False, actual) - else: - assert_eq(expected, actual) - - -def test_multiindex_index_single_row(): - arrays = [["a", "a", "b", "b"], [1, 2, 3, 4]] - tuples = list(zip(*arrays)) - idx = cudf.MultiIndex.from_tuples(tuples) - gdf = cudf.DataFrame( - {"first": cp.random.rand(4), "second": cp.random.rand(4)} - ) - gdf.index = idx - pdf = gdf.to_pandas() - assert_eq(pdf.loc[("b", 3)], gdf.loc[("b", 3)]) - - -def test_multiindex_levels(): - gidx = cudf.MultiIndex.from_product( - [range(3), ["one", "two"]], names=["first", "second"] - ) - pidx = gidx.to_pandas() - - assert_eq(gidx.levels[0], pidx.levels[0]) - assert_eq(gidx.levels[1], pidx.levels[1]) - - -def test_multiindex_empty_slice_pandas_compatibility(): - expected = pd.MultiIndex.from_tuples([("a", "b")])[:0] - with cudf.option_context("mode.pandas_compatible", True): - actual = cudf.from_pandas(expected) - assert_eq(expected, actual, exact=False) - - -@pytest.mark.parametrize( - "levels", - itertools.chain.from_iterable( - itertools.permutations(range(3), n) for n in range(1, 4) - ), - ids=str, -) -def test_multiindex_sort_index_partial(levels): - df = pd.DataFrame( - { - "a": [3, 3, 3, 1, 1, 1, 2, 2], - "b": [4, 2, 7, -1, 11, -2, 7, 7], - "c": [4, 4, 2, 3, 3, 3, 1, 1], - "val": [1, 2, 3, 4, 5, 6, 7, 8], - } - ).set_index(["a", "b", "c"]) - cdf = cudf.from_pandas(df) - - expect = df.sort_index(level=levels, sort_remaining=True) - got = cdf.sort_index(level=levels, sort_remaining=True) - assert_eq(expect, got) - - -def test_multiindex_to_series_error(): - midx = cudf.MultiIndex.from_tuples([("a", "b")]) - with pytest.raises(NotImplementedError): - midx.to_series() - - -@pytest.mark.parametrize( - "pidx", - [ - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "b", "c"], - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - names=["a", "a", "a"], - ), - pd.MultiIndex.from_arrays( - [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], - ), - ], -) -@pytest.mark.parametrize( - "name", [None, no_default, ["x", "y", "z"], ["rapids", "rapids", "rapids"]] -) -@pytest.mark.parametrize("allow_duplicates", [True, False]) -@pytest.mark.parametrize("index", [True, False]) -def test_multiindex_to_frame_allow_duplicates( - pidx, name, allow_duplicates, index -): - gidx = cudf.from_pandas(pidx) - - if name is None or ( - ( - len(pidx.names) != len(set(pidx.names)) - and not all(x is None for x in pidx.names) - ) - and not allow_duplicates - and name is no_default - ): - assert_exceptions_equal( - pidx.to_frame, - gidx.to_frame, - lfunc_args_and_kwargs=( - [], - { - "index": index, - "name": name, - "allow_duplicates": allow_duplicates, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "index": index, - "name": name, - "allow_duplicates": allow_duplicates, - }, - ), - ) - else: - if ( - len(pidx.names) != len(set(pidx.names)) - and not all(x is None for x in pidx.names) - and not isinstance(name, list) - ) or (isinstance(name, list) and len(name) != len(set(name))): - # cudf doesn't have the ability to construct dataframes - # with duplicate column 
names - with pytest.raises(ValueError): - gidx.to_frame( - index=index, - name=name, - allow_duplicates=allow_duplicates, - ) - else: - expected = pidx.to_frame( - index=index, name=name, allow_duplicates=allow_duplicates - ) - actual = gidx.to_frame( - index=index, name=name, allow_duplicates=allow_duplicates - ) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("bad", ["foo", ["foo"]]) -def test_multiindex_set_names_validation(bad): - mi = cudf.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1)]) - with pytest.raises(ValueError): - mi.names = bad - - -def test_multiindex_values_pandas_compatible(): - midx = cudf.MultiIndex.from_tuples([(10, 12), (8, 9), (3, 4)]) - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - midx.values - - -def test_multiindex_dtype_error(): - midx = cudf.MultiIndex.from_tuples([(10, 12), (8, 9), (3, 4)]) - with pytest.raises(TypeError): - cudf.Index(midx, dtype="int64") - with pytest.raises(TypeError): - cudf.Index(midx.to_pandas(), dtype="int64") - - -def test_multiindex_codes(): - midx = cudf.MultiIndex.from_tuples( - [("a", "b"), ("a", "c"), ("b", "c")], names=["A", "Z"] - ) - - for p_array, g_array in zip(midx.to_pandas().codes, midx.codes): - assert_eq(p_array, g_array) - - -def test_multiindex_union_error(): - midx = cudf.MultiIndex.from_tuples([(10, 12), (8, 9), (3, 4)]) - pidx = midx.to_pandas() - - assert_exceptions_equal( - midx.union, - pidx.union, - lfunc_args_and_kwargs=(["a"],), - rfunc_args_and_kwargs=(["b"],), - ) - - -@pytest.mark.parametrize("idx_get", [(0, 0), (0, 1), (1, 0), (1, 1)]) -@pytest.mark.parametrize("cols_get", [0, 1, [0, 1], [1, 0], [1], [0]]) -def test_multiindex_loc_scalar(idx_get, cols_get): - idx = cudf.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1)]) - df = cudf.DataFrame({0: range(4), 1: range(10, 50, 10)}, index=idx) - pdf = df.to_pandas() - - actual = df.loc[idx_get, cols_get] - expected = pdf.loc[idx_get, cols_get] - - assert_eq(actual, expected) - - -def test_multiindex_eq_other_multiindex(): - idx = cudf.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1)]) - result = idx == idx - expected = np.array([True, True]) - assert_eq(result, expected) - - -@pytest.fixture( - params=[ - "from_product", - "from_tuples", - "from_arrays", - "init", - ] -) -def midx(request): - if request.param == "from_product": - return cudf.MultiIndex.from_product([[0, 1], [1, 0]]) - elif request.param == "from_tuples": - return cudf.MultiIndex.from_tuples([(0, 1), (0, 0), (1, 1), (1, 0)]) - elif request.param == "from_arrays": - return cudf.MultiIndex.from_arrays([[0, 0, 1, 1], [1, 0, 1, 0]]) - elif request.param == "init": - return cudf.MultiIndex( - levels=[[0, 1], [0, 1]], codes=[[0, 0, 1, 1], [1, 0, 1, 0]] - ) - else: - raise NotImplementedError(f"{request.param} not implemented") - - -def test_multindex_constructor_levels_always_indexes(midx): - assert_eq(midx.levels[0], cudf.Index([0, 1])) - assert_eq(midx.levels[1], cudf.Index([0, 1])) - - -@pytest.mark.parametrize( - "array", - [ - list, - tuple, - np.array, - cp.array, - pd.Index, - cudf.Index, - pd.Series, - cudf.Series, - ], -) -def test_multiindex_from_arrays(array): - pd_data = [[0, 0, 1, 1], [1, 0, 1, 0]] - cudf_data = [array(lst) for lst in pd_data] - result = pd.MultiIndex.from_arrays(pd_data) - expected = cudf.MultiIndex.from_arrays(cudf_data) - assert_eq(result, expected) - - -@pytest.mark.parametrize("arg", ["foo", ["foo"]]) -def test_multiindex_from_arrays_wrong_arg(arg): - with 
pytest.raises(TypeError): - cudf.MultiIndex.from_arrays(arg) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - ], -) -def test_index_to_pandas_arrow_type_nullable_raises(scalar): - pa_array = pa.array([scalar, None]) - midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]]) - with pytest.raises(ValueError): - midx.to_pandas(nullable=True, arrow_type=True) - - -@pytest.mark.parametrize( - "scalar", - [1, 1.0, "a", datetime.datetime(2020, 1, 1), datetime.timedelta(1)], -) -def test_index_to_pandas_arrow_type(scalar): - pa_array = pa.array([scalar, None]) - midx = cudf.MultiIndex(levels=[pa_array], codes=[[0]]) - result = midx.to_pandas(arrow_type=True) - expected = pd.MultiIndex( - levels=[pd.arrays.ArrowExtensionArray(pa_array)], codes=[[0]] - ) - pd.testing.assert_index_equal(result, expected) - - -def test_multi_index_contains_hashable(): - gidx = cudf.MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3])) - pidx = gidx.to_pandas() - - assert_exceptions_equal( - lambda: [] in gidx, - lambda: [] in pidx, - lfunc_args_and_kwargs=((),), - rfunc_args_and_kwargs=((),), - ) - - -@pytest.mark.parametrize("array", [[1, 2], [1, None], [None, None]]) -@pytest.mark.parametrize("dropna", [True, False]) -def test_nunique(array, dropna): - arrays = [array, [3, 4]] - gidx = cudf.MultiIndex.from_arrays(arrays) - pidx = pd.MultiIndex.from_arrays(arrays) - result = gidx.nunique(dropna=dropna) - expected = pidx.nunique(dropna=dropna) - assert result == expected - - -def test_bool_raises(): - assert_exceptions_equal( - lfunc=bool, - rfunc=bool, - lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]], - rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]], - ) - - -def test_unique_level(): - pd_mi = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 3, 2]]) - cudf_mi = cudf.MultiIndex.from_pandas(pd_mi) - - result = pd_mi.unique(level=1) - expected = cudf_mi.unique(level=1) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "idx", [pd.Index, pd.CategoricalIndex, pd.DatetimeIndex, pd.TimedeltaIndex] -) -def test_from_arrays_infer_names(idx): - arrays = [idx([1], name="foo"), idx([2], name="bar")] - expected = pd.MultiIndex.from_arrays(arrays) - result = cudf.MultiIndex.from_arrays(arrays) - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_mvc.py b/python/cudf/cudf/tests/test_mvc.py deleted file mode 100644 index 055bc5757b3..00000000000 --- a/python/cudf/cudf/tests/test_mvc.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-import subprocess -import sys - -import pytest -from packaging import version - -IS_CUDA_11 = False -IS_CUDA_12 = False -try: - from ptxcompiler.patch import safe_get_versions -except ModuleNotFoundError: - from cudf.utils._ptxcompiler import safe_get_versions - -# do not test cuda 12 if pynvjitlink isn't present -HAVE_PYNVJITLINK = False -try: - import numba - import pynvjitlink # noqa: F401 - - HAVE_PYNVJITLINK = version.parse(numba.__version__) >= version.parse( - "0.58" - ) -except ModuleNotFoundError: - pass - - -versions = safe_get_versions() -driver_version, runtime_version = versions - -if (11, 0) <= driver_version < (12, 0): - IS_CUDA_11 = True -if (12, 0) <= driver_version < (13, 0): - IS_CUDA_12 = True - - -TEST_BODY = """ -@numba.cuda.jit -def test_kernel(x): - id = numba.cuda.grid(1) - if id < len(x): - x[id] += 1 - -s = cudf.Series([1, 2, 3]) -with _CUDFNumbaConfig(): - test_kernel.forall(len(s))(s) -""" - -CUDA_11_TEST = ( - """ -import numba.cuda -import cudf -from cudf.utils._numba import _CUDFNumbaConfig, patch_numba_linker_cuda_11 - - -patch_numba_linker_cuda_11() -""" - + TEST_BODY -) - - -CUDA_12_TEST = ( - """ -import numba.cuda -import cudf -from cudf.utils._numba import _CUDFNumbaConfig -from pynvjitlink.patch import ( - patch_numba_linker as patch_numba_linker_pynvjitlink, -) - -patch_numba_linker_pynvjitlink() -""" - + TEST_BODY -) - - -@pytest.mark.parametrize( - "test", - [ - pytest.param( - CUDA_11_TEST, - marks=pytest.mark.skipif( - not IS_CUDA_11, - reason="Minor Version Compatibility test for CUDA 11", - ), - ), - pytest.param( - CUDA_12_TEST, - marks=pytest.mark.skipif( - not IS_CUDA_12 or not HAVE_PYNVJITLINK, - reason="Minor Version Compatibility test for CUDA 12", - ), - ), - ], -) -def test_numba_mvc(test): - cp = subprocess.run( - [sys.executable, "-c", test], - capture_output=True, - cwd="/", - ) - - assert cp.returncode == 0 diff --git a/python/cudf/cudf/tests/test_no_cuinit.py b/python/cudf/cudf/tests/test_no_cuinit.py deleted file mode 100644 index 45d812fe9a2..00000000000 --- a/python/cudf/cudf/tests/test_no_cuinit.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -import os -import subprocess -import sys -from shutil import which - -import pytest - -GDB_COMMANDS = """ -set confirm off -set breakpoint pending on -break cuInit -run -exit -""" - - -@pytest.fixture(scope="module") -def cuda_gdb(request): - gdb = which("cuda-gdb") - if gdb is None: - request.applymarker( - pytest.mark.xfail(reason="No cuda-gdb found, can't detect cuInit"), - ) - return gdb - else: - output = subprocess.run( - [gdb, "--version"], capture_output=True, text=True, cwd="/" - ) - if output.returncode != 0: - request.applymarker( - pytest.mark.xfail( - reason=( - "cuda-gdb not working on this platform, " - f"can't detect cuInit: {output.stderr}" - ) - ), - ) - return gdb - - -def test_cudf_import_no_cuinit(cuda_gdb): - # When RAPIDS_NO_INITIALIZE is set, importing cudf should _not_ - # create a CUDA context (i.e. cuInit should not be called). - # Intercepting the call to cuInit programmatically is tricky since - # the way it is resolved from dynamic libraries by - # cuda-python/numba/cupy is multitudinous (see discussion at - # https://github.com/rapidsai/cudf/pull/12361 which does this, but - # needs provide hooks that override dlsym, cuGetProcAddress, and - # cuInit. 
- # Instead, we just run under GDB and see if we hit a breakpoint - env = os.environ.copy() - env["RAPIDS_NO_INITIALIZE"] = "1" - output = subprocess.run( - [ - cuda_gdb, - "-x", - "-", - "--args", - sys.executable, - "-c", - "import cudf", - ], - input=GDB_COMMANDS, - env=env, - capture_output=True, - text=True, - cwd="/", - ) - - cuInit_called = output.stdout.find("in cuInit ()") - print("Command output:\n") - print("*** STDOUT ***") - print(output.stdout) - print("*** STDERR ***") - print(output.stderr) - assert output.returncode == 0 - assert cuInit_called < 0 - - -def test_cudf_create_series_cuinit(cuda_gdb): - # This tests that our gdb scripting correctly identifies cuInit - # when it definitely should have been called. - env = os.environ.copy() - env["RAPIDS_NO_INITIALIZE"] = "1" - output = subprocess.run( - [ - cuda_gdb, - "-x", - "-", - "--args", - sys.executable, - "-c", - "import cudf; cudf.Series([1])", - ], - input=GDB_COMMANDS, - env=env, - capture_output=True, - text=True, - cwd="/", - ) - - cuInit_called = output.stdout.find("in cuInit ()") - print("Command output:\n") - print("*** STDOUT ***") - print(output.stdout) - print("*** STDERR ***") - print(output.stderr) - assert output.returncode == 0 - assert cuInit_called >= 0 diff --git a/python/cudf/cudf/tests/test_no_device.py b/python/cudf/cudf/tests/test_no_device.py deleted file mode 100644 index 722762b2d0c..00000000000 --- a/python/cudf/cudf/tests/test_no_device.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -import os -import subprocess - - -def test_cudf_import_no_device(): - env = os.environ.copy() - env["CUDA_VISIBLE_DEVICES"] = "-1" - output = subprocess.run( - ["python", "-c", "import cudf"], - env=env, - capture_output=True, - text=True, - cwd="/", - ) - assert output.returncode == 0 diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py deleted file mode 100644 index b1a2f081cd2..00000000000 --- a/python/cudf/cudf/tests/test_numerical.py +++ /dev/null @@ -1,428 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_GE_220 -from cudf.testing import assert_eq -from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if -from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes - - -def test_can_cast_safely_same_kind(): - # 'i' -> 'i' - data = cudf.Series([1, 2, 3], dtype="int32")._column - to_dtype = np.dtype("int64") - - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 3], dtype="int64")._column - to_dtype = np.dtype("int32") - - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 2**31], dtype="int64")._column - assert not data.can_cast_safely(to_dtype) - - # 'u' -> 'u' - data = cudf.Series([1, 2, 3], dtype="uint32")._column - to_dtype = np.dtype("uint64") - - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 3], dtype="uint64")._column - to_dtype = np.dtype("uint32") - - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 2**33], dtype="uint64")._column - assert not data.can_cast_safely(to_dtype) - - # 'f' -> 'f' - data = cudf.Series([np.inf, 1.0], dtype="float64")._column - to_dtype = np.dtype("float32") - assert data.can_cast_safely(to_dtype) - - data = cudf.Series( - [float(np.finfo("float32").max) * 2, 1.0], dtype="float64" - )._column - to_dtype = np.dtype("float32") - assert not data.can_cast_safely(to_dtype) - - -def test_can_cast_safely_mixed_kind(): - data = cudf.Series([1, 2, 3], dtype="int32")._column - to_dtype = np.dtype("float32") - assert data.can_cast_safely(to_dtype) - - # too big to fit into f32 exactly - data = cudf.Series([1, 2, 2**24 + 1], dtype="int32")._column - assert not data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 3], dtype="uint32")._column - to_dtype = np.dtype("float32") - assert data.can_cast_safely(to_dtype) - - # too big to fit into f32 exactly - data = cudf.Series([1, 2, 2**24 + 1], dtype="uint32")._column - assert not data.can_cast_safely(to_dtype) - - to_dtype = np.dtype("float64") - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1.0, 2.0, 3.0], dtype="float32")._column - to_dtype = np.dtype("int32") - assert data.can_cast_safely(to_dtype) - - # not integer float - data = cudf.Series([1.0, 2.0, 3.5], dtype="float32")._column - assert not data.can_cast_safely(to_dtype) - - data = cudf.Series([10.0, 11.0, 2000.0], dtype="float64")._column - assert data.can_cast_safely(to_dtype) - - # float out of int range - data = cudf.Series([1.0, 2.0, 1.0 * (2**31)], dtype="float32")._column - assert not data.can_cast_safely(to_dtype) - - # negative signed integers casting to unsigned integers - data = cudf.Series([-1, 0, 1], dtype="int32")._column - to_dtype = np.dtype("uint32") - assert not data.can_cast_safely(to_dtype) - - -def test_to_pandas_nullable_integer(): - gsr_not_null = cudf.Series([1, 2, 3]) - gsr_has_null = cudf.Series([1, 2, None]) - - psr_not_null = pd.Series([1, 2, 3], dtype="int64") - psr_has_null = pd.Series([1, 2, None], dtype="Int64") - - assert_eq(gsr_not_null.to_pandas(), psr_not_null) - assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) - - -def test_to_pandas_nullable_bool(): - gsr_not_null = cudf.Series([True, False, True]) - gsr_has_null = cudf.Series([True, False, None]) - - psr_not_null = pd.Series([True, False, True], dtype="bool") - psr_has_null = pd.Series([True, False, None], dtype="boolean") - - assert_eq(gsr_not_null.to_pandas(), psr_not_null) - assert_eq(gsr_has_null.to_pandas(nullable=True), psr_has_null) - - -def 
test_can_cast_safely_has_nulls(): - data = cudf.Series([1, 2, 3, None], dtype="float32")._column - to_dtype = np.dtype("int64") - - assert data.can_cast_safely(to_dtype) - - data = cudf.Series([1, 2, 3.1, None], dtype="float32")._column - assert not data.can_cast_safely(to_dtype) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3], - (1.0, 2.0, 3.0), - [float("nan"), None], - np.array([1, 2.0, -3, float("nan")]), - pd.Series(["123", "2.0"]), - pd.Series(["1.0", "2.", "-.3", "1e6"]), - pd.Series( - ["1", "2", "3"], - dtype=pd.CategoricalDtype(categories=["1", "2", "3"]), - ), - pd.Series( - ["1.0", "2.0", "3.0"], - dtype=pd.CategoricalDtype(categories=["1.0", "2.0", "3.0"]), - ), - # Categories with nulls - pd.Series([1, 2, 3], dtype=pd.CategoricalDtype(categories=[1, 2])), - pd.Series( - [5.0, 6.0], dtype=pd.CategoricalDtype(categories=[5.0, 6.0]) - ), - pd.Series( - ["2020-08-01 08:00:00", "1960-08-01 08:00:00"], - dtype=np.dtype(" 0: - assert pyarrow_tbl.equals(gdf.to_arrow()) - else: - assert_eq(pyarrow_tbl.to_pandas(), gdf) - - -@pytest.mark.parametrize("columns", [None, ["lvl1_struct"], ["lvl1_list"]]) -def test_skip_rows_for_nested_types(columns, list_struct_buff): - with pytest.raises( - RuntimeError, match="skip_rows is not supported by nested column" - ): - cudf.read_orc( - list_struct_buff, - columns=columns, - use_index=True, - skiprows=5, - ) - - -def test_pyspark_struct(datadir): - path = datadir / "TestOrcFile.testPySparkStruct.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path) - - assert_eq(pdf, gdf) - - -def gen_map_buff(size): - from string import ascii_letters as al - - from pyarrow import orc - - rd = random.Random(1) - np.random.seed(seed=1) - - buff = BytesIO() - - lvl1_map = pa.array( - [ - rd.choice( - [ - None, - { - rd.choice(al): rd.choice( - [None, np.random.randint(1, 1500)] - ), - }, - ] - ) - for _ in range(size) - ], - type=pa.map_(pa.string(), pa.int64()), - ) - lvl2_map = pa.array( - [ - rd.choice( - [ - None, - *( - { - rd.choice(al): rd.choice( - [ - None, - [ - rd.choice( - [None, np.random.randint(1, 1500)] - ) - for _ in range(5) - ], - ] - ) - } - for _ in range(2) - ), - ] - ) - for _ in range(size) - ], - type=pa.map_(pa.string(), pa.list_(pa.int64())), - ) - lvl2_struct_map = pa.array( - [ - rd.choice( - [ - None, - *( - { - rd.choice(al): rd.choice( - [ - None, - { - "a": rd.choice( - [None, np.random.randint(1, 1500)] - ), - "b": rd.choice( - [None, np.random.randint(1, 1500)] - ), - }, - ] - ) - } - for _ in range(2) - ), - ] - ) - for _ in range(size) - ], - type=pa.map_( - pa.string(), pa.struct({"a": pa.int64(), "b": pa.int64()}) - ), - ) - - pa_table = pa.Table.from_arrays( - [lvl1_map, lvl2_map, lvl2_struct_map], - ["lvl1_map", "lvl2_map", "lvl2_struct_map"], - ) - - orc.write_table( - pa_table, buff, stripe_size=1024, compression="UNCOMPRESSED" - ) - - return buff - - -map_buff = gen_map_buff(size=100000) - - -@pytest.mark.parametrize( - "columns", - [None, ["lvl1_map", "lvl2_struct_map"], ["lvl2_struct_map", "lvl2_map"]], -) -@pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100000]) -@pytest.mark.parametrize("use_index", [True, False]) -def test_map_type_read(columns, num_rows, use_index): - from pyarrow import orc - - tbl = orc.read_table(map_buff) - - lvl1_map = ( - tbl["lvl1_map"] - .combine_chunks() - .view(pa.list_(pa.struct({"key": pa.string(), "value": pa.int64()}))) - ) - lvl2_map = ( - tbl["lvl2_map"] - .combine_chunks() - .view( - pa.list_( - pa.struct({"key": pa.string(), "value": pa.list_(pa.int64())}) 
- ) - ) - ) - lvl2_struct_map = ( - tbl["lvl2_struct_map"] - .combine_chunks() - .view( - pa.list_( - pa.struct( - { - "key": pa.string(), - "value": pa.struct({"a": pa.int64(), "b": pa.int64()}), - } - ) - ) - ) - ) - - expected_tbl = pa.table( - { - "lvl1_map": lvl1_map, - "lvl2_map": lvl2_map, - "lvl2_struct_map": lvl2_struct_map, - } - ) - gdf = cudf.read_orc( - map_buff, columns=columns, num_rows=num_rows, use_index=use_index - ) - - expected_tbl = ( - expected_tbl[:num_rows] - if columns is None - else expected_tbl.select(columns)[:num_rows] - ) - - if num_rows > 0: - assert expected_tbl.equals(gdf.to_arrow()) - else: - assert_eq(expected_tbl.to_pandas(), gdf) - - -def test_orc_reader_decimal(datadir): - path = datadir / "TestOrcFile.decimal.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path) - - assert_eq(pdf, gdf) - - -# This test case validates the issue raised in #8665, -# please check the issue for more details. -def test_orc_timestamp_read(datadir): - path = datadir / "TestOrcFile.timestamp.issue.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path) - - assert_eq(pdf, gdf) - - -def dec(num): - return decimal.Decimal(str(num)) - - -@pytest.mark.parametrize( - "data", - [ - # basic + nested strings - { - "lls": [[["a"], ["bb"]] * 5 for i in range(12345)], - "lls2": [[["ccc", "dddd"]] * 6 for i in range(12345)], - "ls_dict": [["X"] * 7 for i in range(12345)], - "ls_direct": [[str(i)] * 9 for i in range(12345)], - "li": [[i] * 11 for i in range(12345)], - "lf": [[i * 0.5] * 13 for i in range(12345)], - "ld": [[dec(i / 2)] * 15 for i in range(12345)], - }, - # with nulls - { - "ls": [ - [str(i) if i % 5 else None, str(2 * i)] if i % 2 else None - for i in range(12345) - ], - "li": [[i, i * i, i % 2] if i % 3 else None for i in range(12345)], - "ld": [ - [dec(i), dec(i / 2) if i % 7 else None] if i % 5 else None - for i in range(12345) - ], - }, - # with empty elements - { - "ls": [ - [str(i), str(2 * i)] if i % 2 else [] for i in range(12345) - ], - "lls": [ - [[str(i), str(2 * i)]] if i % 2 else [[], []] - for i in range(12345) - ], - "li": [[i, i * i, i % 2] if i % 3 else [] for i in range(12345)], - "lli": [ - [[i], [i * i], [i % 2]] if i % 3 else [[]] - for i in range(12345) - ], - "ld": [ - [dec(i), dec(i / 2)] if i % 5 else [] for i in range(12345) - ], - }, - # variable list lengths - { - "ls": [[str(i)] * i for i in range(123)], - "li": [[i, i * i] * i for i in range(123)], - "ld": [[dec(i), dec(i / 2)] * i for i in range(123)], - }, - # many child elements (more that max_stripe_rows) - {"li": [[i] * 1100 for i in range(11000)]}, - ], -) -def test_orc_writer_lists(data): - pdf_in = pd.DataFrame(data) - - buffer = BytesIO() - cudf.from_pandas(pdf_in).to_orc( - buffer, stripe_size_rows=2048, row_index_stride=512 - ) - - pdf_out = pd.read_orc(buffer) - assert_eq(pdf_out, pdf_in) - - -def test_chunked_orc_writer_lists(): - num_rows = 12345 - pdf_in = pd.DataFrame( - { - "ls": [[str(i), str(2 * i)] for i in range(num_rows)], - "ld": [[dec(i / 2)] * 5 for i in range(num_rows)], - } - ) - - gdf = cudf.from_pandas(pdf_in) - expect = pd.concat([pdf_in, pdf_in]).reset_index(drop=True) - - buffer = BytesIO() - writer = ORCWriter(buffer) - writer.write_table(gdf) - writer.write_table(gdf) - writer.close() - - got = pd.read_orc(buffer) - assert_eq(expect, got) - - -def test_writer_timestamp_stream_size(datadir, tmpdir): - pdf_fname = datadir / "TestOrcFile.largeTimestamps.orc" - gdf_fname = tmpdir.join("gdf.orc") - - expect = pd.read_orc(pdf_fname) - 
cudf.from_pandas(expect).to_orc(gdf_fname.strpath) - got = pd.read_orc(gdf_fname) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "fname", - [ - "TestOrcFile.NoIndStrm.StructWithNoNulls.orc", - "TestOrcFile.NoIndStrm.StructAndIntWithNulls.orc", - "TestOrcFile.NoIndStrm.StructAndIntWithNulls.TwoStripes.orc", - "TestOrcFile.NoIndStrm.IntWithNulls.orc", - ], -) -def test_no_row_group_index_orc_read(datadir, fname): - from pyarrow import orc - - fpath = datadir / fname - - expect = orc.ORCFile(fpath).read() - got = cudf.read_orc(fpath) - - assert expect.equals(got.to_arrow()) - - -def test_names_in_struct_dtype_nesting(datadir): - from pyarrow import orc - - fname = datadir / "TestOrcFile.NestedStructDataFrame.orc" - - expect = orc.ORCFile(fname).read() - got = cudf.read_orc(fname) - - # test dataframes - assert expect.equals(got.to_arrow()) - - edf = cudf.DataFrame(expect.to_pandas()) - # test schema - assert edf.dtypes.equals(got.dtypes) - - -def test_writer_lists_structs(list_struct_buff): - from pyarrow import orc - - df_in = cudf.read_orc(list_struct_buff) - - buff = BytesIO() - df_in.to_orc(buff) - - pyarrow_tbl = orc.ORCFile(buff).read() - - assert pyarrow_tbl.equals(df_in.to_arrow()) - - -@pytest.mark.parametrize( - "data", - [ - { - "with_pd": [ - [i if i % 3 else None] if i < 9999 or i > 20001 else None - for i in range(21000) - ], - "no_pd": [ - [i if i % 3 else None] if i < 9999 or i > 20001 else [] - for i in range(21000) - ], - }, - ], -) -def test_orc_writer_lists_empty_rg(data): - pdf_in = pd.DataFrame(data) - buffer = BytesIO() - cudf_in = cudf.from_pandas(pdf_in) - - cudf_in.to_orc(buffer) - - df = cudf.read_orc(buffer) - assert_eq(df, cudf_in) - - pdf_out = pd.read_orc(buffer) - assert_eq(pdf_in, pdf_out) - - -def test_statistics_sum_overflow(): - maxint64 = np.iinfo(np.int64).max - minint64 = np.iinfo(np.int64).min - - buff = BytesIO() - df = pd.DataFrame( - {"a": [maxint64, 1], "b": [minint64, -1], "c": [minint64, 1]} - ) - df.to_orc(buff) - - file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) - assert file_stats[0]["a"].get("sum") is None - assert file_stats[0]["b"].get("sum") is None - assert file_stats[0]["c"].get("sum") == minint64 + 1 - - assert stripe_stats[0]["a"].get("sum") is None - assert stripe_stats[0]["b"].get("sum") is None - assert stripe_stats[0]["c"].get("sum") == minint64 + 1 - - -def test_empty_statistics(): - from pyarrow import orc - - buff = BytesIO() - pa_table = pa.Table.from_arrays( - [ - pa.array([None], type=pa.int64()), - pa.array([None], type=pa.float64()), - pa.array([None], type=pa.string()), - pa.array([None], type=pa.decimal128(11, 2)), - pa.array([None], type=pa.timestamp("ns")), - pa.array([None], type=pa.date64()), - pa.array([None], type=pa.bool_()), - pa.array([None], type=pa.binary()), - pa.array([1], type=pa.int64()), - ], - ["a", "b", "c", "d", "e", "f", "g", "h", "i"], - ) - orc.write_table(pa_table, buff) - - got = cudf.io.orc.read_orc_statistics([buff]) - - # Check for both file and stripe stats - for stats in got: - # Similar expected stats for the first 6 columns in this case - for col_name in ascii_lowercase[:6]: - assert stats[0][col_name].number_of_values == 0 - assert stats[0][col_name].has_null is True - assert stats[0][col_name].get("minimum") is None - assert stats[0][col_name].get("maximum") is None - for col_name in ascii_lowercase[:3]: - assert stats[0][col_name].get("sum") == 0 - # Sum for decimal column is a string - assert stats[0]["d"].get("sum") == "0" - - assert 
stats[0]["g"].number_of_values == 0 - assert stats[0]["g"].has_null is True - assert stats[0]["g"].get("true_count") == 0 - assert stats[0]["g"].get("false_count") == 0 - - assert stats[0]["h"].number_of_values == 0 - assert stats[0]["h"].has_null is True - assert stats[0]["h"].get("sum") == 0 - - assert stats[0]["i"].number_of_values == 1 - assert stats[0]["i"].has_null is False - assert stats[0]["i"].get("minimum") == 1 - assert stats[0]["i"].get("maximum") == 1 - assert stats[0]["i"].get("sum") == 1 - - -@pytest.mark.parametrize( - "equivalent_columns", - [ - (["lvl1_struct.a", "lvl1_struct.b"], ["lvl1_struct"]), - (["lvl1_struct", "lvl1_struct.a"], ["lvl1_struct"]), - (["lvl1_struct.a", "lvl1_struct"], ["lvl1_struct"]), - (["lvl1_struct.b", "lvl1_struct.a"], ["lvl1_struct.b", "lvl1_struct"]), - (["lvl2_struct.lvl1_struct", "lvl2_struct"], ["lvl2_struct"]), - ( - ["lvl2_struct.a", "lvl2_struct.lvl1_struct.c", "lvl2_struct"], - ["lvl2_struct"], - ), - ], -) -def test_select_nested(list_struct_buff, equivalent_columns): - # The two column selections should be equivalent - df_cols1 = cudf.read_orc(list_struct_buff, columns=equivalent_columns[0]) - df_cols2 = cudf.read_orc(list_struct_buff, columns=equivalent_columns[1]) - assert_eq(df_cols1, df_cols2) - - -def test_orc_writer_rle_stream_size(datadir, tmpdir): - from pyarrow import orc - - original = datadir / "TestOrcFile.int16.rle.size.orc" - reencoded = tmpdir.join("int16_map.orc") - - df = cudf.read_orc(original) - df.to_orc(reencoded) - - # Segfaults when RLE stream sizes don't account for varint length - pa_out = orc.ORCFile(reencoded).read() - assert df.to_arrow().equals(pa_out) - - -def test_empty_columns(): - buffer = BytesIO() - # string and decimal columns have additional steps that need to be skipped - expected = cudf.DataFrame( - { - "string": cudf.Series([], dtype="str"), - "decimal": cudf.Series([], dtype=cudf.Decimal64Dtype(10, 1)), - } - ) - expected.to_orc(buffer, compression="snappy") - - got_df = cudf.read_orc(buffer) - assert_eq(expected, got_df) - - -def test_orc_reader_zstd_compression(list_struct_buff): - from pyarrow import orc - - expected = cudf.read_orc(list_struct_buff) - # save with ZSTD compression - buffer = BytesIO() - pyarrow_tbl = orc.ORCFile(list_struct_buff).read() - writer = orc.ORCWriter(buffer, compression="zstd") - writer.write(pyarrow_tbl) - writer.close() - try: - got = cudf.read_orc(buffer) - assert_eq(expected, got) - except RuntimeError: - pytest.mark.xfail(reason="zstd support is not enabled") - - -def test_writer_protobuf_large_rowindexentry(): - s = [ - "Length of the two strings needs to add up to at least ~120", - "So that the encoded statistics are larger than 128 bytes", - ] * 5001 # generate more than 10K rows to have two row groups - df = cudf.DataFrame({"s1": s}) - - buff = BytesIO() - df.to_orc(buff) - - got = cudf.read_orc(buff) - assert_frame_equal(df, got) - - -@pytest.mark.parametrize("compression", ["ZLIB", "ZSTD"]) -def test_orc_writer_nvcomp(compression): - expected = cudf.datasets.randomdata( - nrows=12345, dtypes={"a": int, "b": str, "c": float}, seed=1 - ) - - buff = BytesIO() - try: - expected.to_orc(buff, compression=compression) - except RuntimeError: - pytest.mark.xfail(reason="Newer nvCOMP version is required") - else: - got = pd.read_orc(buff) - assert_eq(expected, got) - - -def run_orc_columns_and_index_param(index_obj, index, columns): - buffer = BytesIO() - df = cudf.DataFrame( - {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=index_obj - ) - df.to_orc(buffer, 
index=index) - - expected = pd.read_orc(buffer, columns=columns) - got = cudf.read_orc(buffer, columns=columns) - - assert_eq(expected, got, check_index_type=True) - - -@pytest.mark.parametrize("index_obj", [None, [10, 11, 12], ["x", "y", "z"]]) -@pytest.mark.parametrize("index", [True, False, None]) -@pytest.mark.parametrize( - "columns", - [ - None, - pytest.param( - [], - marks=pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Bug in older version of pandas", - ), - ), - ], -) -def test_orc_columns_and_index_param(index_obj, index, columns): - run_orc_columns_and_index_param(index_obj, index, columns) - - -@pytest.mark.parametrize( - "columns,index,index_obj", - [ - ( - ["a", "b"], - True, - None, - ), - ( - ["a", "b"], - True, - [10, 11, 12], - ), - ( - ["a", "b"], - True, - ["x", "y", "z"], - ), - ( - ["a", "b"], - None, - [10, 11, 12], - ), - ( - ["a", "b"], - None, - ["x", "y", "z"], - ), - ], -) -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12026") -def test_orc_columns_and_index_param_read_index(index_obj, index, columns): - run_orc_columns_and_index_param(index_obj, index, columns) - - -@pytest.mark.parametrize( - "columns,index,index_obj", - [ - (["a", "b"], False, None), - (["a", "b"], False, [10, 11, 12]), - (["a", "b"], False, ["x", "y", "z"]), - (["a", "b"], None, None), - ], -) -def test_orc_columns_and_index_param_no_read_index(index_obj, index, columns): - run_orc_columns_and_index_param(index_obj, index, columns) - - -@pytest.mark.parametrize( - "df_data,cols_as_map_type,expected_data", - [ - ( - {"a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]]}, - ["a"], - {"a": [[(10, 20)], [(1, 21)]]}, - ), - ( - { - "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - "b": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - }, - ["b"], - { - "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - "b": [[(10, 20)], [(1, 21)]], - }, - ), - ( - { - "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - "b": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - "c": [ - [{"a": {"a": 10}, "b": 20}], - [{"a": {"a": 12}, "b": 21}], - ], - }, - ["b", "c"], - { - "a": [[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]], - "b": [[(10, 20)], [(1, 21)]], - "c": [[({"a": 10}, 20)], [({"a": 12}, 21)]], - }, - ), - ], -) -def test_orc_writer_cols_as_map_type(df_data, cols_as_map_type, expected_data): - df = cudf.DataFrame(df_data) - buffer = BytesIO() - df.to_orc(buffer, cols_as_map_type=cols_as_map_type) - - got = pd.read_orc(buffer) - expected = pd.DataFrame(expected_data) - - assert_eq(got, expected) - - -def test_orc_writer_cols_as_map_type_error(): - df = cudf.DataFrame( - {"a": cudf.Series([[{"a": 10, "b": 20}], [{"a": 1, "b": 21}]])} - ) - buffer = BytesIO() - with pytest.raises( - TypeError, match="cols_as_map_type must be a list of column names." - ): - df.to_orc(buffer, cols_as_map_type=1) - - -@pytest.fixture -def negative_timestamp_df(): - return cudf.DataFrame( - { - "a": [ - pd.Timestamp("1969-12-31 23:59:59.000123"), - pd.Timestamp("1969-12-31 23:59:58.000999"), - pd.Timestamp("1969-12-31 23:59:58.001001"), - pd.Timestamp("1839-12-24 03:58:56.000826"), - ] - } - ) - - -@pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) -def test_orc_reader_negative_timestamp(negative_timestamp_df, engine): - buffer = BytesIO() - negative_timestamp_df.to_orc(buffer) - - # We warn the user that this function will fall back to the CPU for reading - # when the engine is pyarrow. 
- with expect_warning_if(engine == "pyarrow", UserWarning): - got = cudf.read_orc(buffer, engine=engine) - - assert_eq(negative_timestamp_df, got, check_dtype=False) - - -def test_orc_writer_negative_timestamp(negative_timestamp_df): - from pyarrow import orc - - buffer = BytesIO() - negative_timestamp_df.to_orc(buffer) - - assert_eq(negative_timestamp_df, pd.read_orc(buffer), check_dtype=False) - assert_eq( - negative_timestamp_df, orc.ORCFile(buffer).read(), check_dtype=False - ) - - -@pytest.mark.skip( - reason="Bug specific to rockylinux8: https://github.com/rapidsai/cudf/issues/15802", -) -def test_orc_reader_apache_negative_timestamp(datadir): - path = datadir / "TestOrcFile.apache_timestamp.orc" - - pdf = pd.read_orc(path) - gdf = cudf.read_orc(path) - - assert_eq(pdf, gdf) - - -def test_statistics_string_sum(): - strings = ["a string", "another string!"] - buff = BytesIO() - df = cudf.DataFrame({"str": strings}) - df.to_orc(buff) - - file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) - assert_eq(file_stats[0]["str"].get("sum"), sum(len(s) for s in strings)) - - -@pytest.mark.parametrize( - "fname", - [ - "TestOrcFile.Hive.OneEmptyMap.orc", - "TestOrcFile.Hive.OneEmptyList.orc", - "TestOrcFile.Hive.OneNullStruct.orc", - "TestOrcFile.Hive.EmptyListStripe.orc", - "TestOrcFile.Hive.NullStructStripe.orc", - "TestOrcFile.Hive.AllNulls.orc", - ], -) -def test_reader_empty_stripe(datadir, fname): - path = datadir / fname - - expected = pd.read_orc(path) - got = cudf.read_orc(path) - assert_eq(expected, got) - - -# needs enough data for multiple row groups -@pytest.mark.parametrize("data", [["*"] * 10001, ["**", None] * 5001]) -def test_reader_row_index_order(data): - expected = cudf.DataFrame({"str": data}, dtype="string") - - buffer = BytesIO() - expected.to_pandas().to_orc(buffer) - got = cudf.read_orc(buffer) - assert_eq(expected, got) - - -# Test the corner case where empty blocks are compressed -# Decompressed data size is zero, even though compressed data size is non-zero -# For more information see https://github.com/rapidsai/cudf/issues/13608 -def test_orc_reader_empty_decomp_data(datadir): - path = datadir / "TestOrcFile.Spark.EmptyDecompData.orc" - - expect = pd.read_orc(path) - got = cudf.read_orc(path) - - assert_eq(expect, got) - - -def test_orc_reader_empty_deeply_nested_level(datadir): - # Test the case where top level struct has nulls, but the nested struct is - # not nullable. In this case there is no data in the second level, but we - # still need to pass the parent null mask to the third level. 
- path = datadir / "TestOrcFile.Spark.NestedNotNullableStruct.orc" - - expect = pd.read_orc(path) - got = cudf.read_orc(path) - - assert_eq(expect, got) - - -def test_orc_chunked_writer_stripe_size(datadir): - from pyarrow import orc - - df = cudf.DataFrame({"col": gen_rand_series("int", 100000)}) - - buffer = BytesIO() - writer = ORCWriter(buffer, stripe_size_bytes=64 * 1024) - writer.write_table(df) - writer.close() - - orc_file = orc.ORCFile(buffer) - assert_eq(orc_file.nstripes, 10) - - buffer = BytesIO() - writer = ORCWriter(buffer, stripe_size_rows=20000) - writer.write_table(df) - writer.close() - - orc_file = orc.ORCFile(buffer) - assert_eq(orc_file.nstripes, 5) - - -def test_reader_lz4(): - from pyarrow import orc - - pdf = pd.DataFrame({"ints": [1, 2] * 5001}) - pa_table = pa.Table.from_pandas(pdf) - - buffer = BytesIO() - writer = orc.ORCWriter(buffer, compression="LZ4") - writer.write(pa_table) - writer.close() - - got = cudf.read_orc(buffer) - assert_eq(pdf, got) - - -def test_writer_lz4(): - gdf = cudf.DataFrame({"ints": [1, 2] * 5001}) - - buffer = BytesIO() - gdf.to_orc(buffer, compression="LZ4") - - got = pd.read_orc(buffer) - assert_eq(gdf, got) - - -def test_row_group_alignment(datadir): - path = datadir / "TestOrcFile.MapManyNulls.parquet" - - expected = cudf.read_parquet(path) - - buffer = BytesIO() - expected.to_orc(buffer) - - got = cudf.read_orc(buffer) - - assert_eq(expected, got) diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py deleted file mode 100644 index ad78621c5fa..00000000000 --- a/python/cudf/cudf/tests/test_pack.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
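The deleted test_pack.py module below exercises cudf's contiguous pack/unpack round trip. A minimal sketch of that round trip, assuming the private cudf._lib.copying helpers that the tests themselves import (data and column names are illustrative only):

import numpy as np

from cudf import DataFrame
from cudf._lib.copying import pack, unpack  # private API used by these tests
from cudf.testing import assert_eq

# Build a small frame, pack it into a contiguous buffer, then unpack it back.
df = DataFrame(
    {"keys": np.arange(10, dtype=np.float64), "vals": np.random.random(10)}
)
packed = pack(df)            # serialize the columns into one contiguous allocation
roundtripped = unpack(packed)

assert_eq(roundtripped, df)  # contents survive the round trip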
- -import pickle -import sys - -import numpy as np -import pandas as pd - -from cudf import DataFrame, Index, Series -from cudf._lib.copying import pack, unpack -from cudf.testing import assert_eq - - -def test_sizeof_packed_dataframe(): - np.random.seed(0) - df = DataFrame() - nelem = 1000 - df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = np.random.random(nelem) - packed = pack(df) - - nbytes = hkeys.nbytes + hvals.nbytes - sizeof = sys.getsizeof(packed) - assert sizeof < nbytes - - serialized_nbytes = len( - pickle.dumps(packed, protocol=pickle.HIGHEST_PROTOCOL) - ) - - # assert at least sizeof bytes were serialized - assert serialized_nbytes >= sizeof - - -def check_packed_equality(df): - # basic - assert_packed_frame_equality(df) - # sliced - assert_packed_frame_equality(df[:-1]) - assert_packed_frame_equality(df[1:]) - assert_packed_frame_equality(df[2:-2]) - # sorted - sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, Index) - assert_packed_frame_equality(sortvaldf) - - -def assert_packed_frame_equality(df): - pdf = df.to_pandas() - - packed = pack(df) - del df - unpacked = unpack(packed) - - assert_eq(unpacked, pdf) - - -def test_packed_dataframe_equality_numeric(): - np.random.seed(0) - - df = DataFrame() - nelem = 10 - df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) - - check_packed_equality(df) - - -def test_packed_dataframe_equality_categorical(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) - df["vals"] = np.random.random(len(df)) - - check_packed_equality(df) - - -def test_packed_dataframe_equality_list(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) - - check_packed_equality(df) - - -def test_packed_dataframe_equality_struct(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) - df["vals"] = np.random.random(len(df)) - - check_packed_equality(df) - - -def check_packed_unique_pointers(df): - # basic - assert_packed_frame_unique_pointers(df) - # sliced - assert_packed_frame_unique_pointers(df[:-1]) - assert_packed_frame_unique_pointers(df[1:]) - assert_packed_frame_unique_pointers(df[2:-2]) - # sorted - sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, Index) - assert_packed_frame_unique_pointers(sortvaldf) - - -def assert_packed_frame_unique_pointers(df): - unpacked = unpack(pack(df)) - - for col in df: - if df._data[col].data: - assert df._data[col].data.get_ptr(mode="read") != unpacked._data[ - col - ].data.get_ptr(mode="read") - - -def test_packed_dataframe_unique_pointers_numeric(): - np.random.seed(0) - - df = DataFrame() - nelem = 10 - df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) - - check_packed_unique_pointers(df) - - -def test_packed_dataframe_unique_pointers_categorical(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) - df["vals"] = np.random.random(len(df)) - - check_packed_unique_pointers(df) - - -def test_packed_dataframe_unique_pointers_list(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) - - check_packed_unique_pointers(df) - - -def 
test_packed_dataframe_unique_pointers_struct(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) - df["vals"] = np.random.random(len(df)) - - check_packed_unique_pointers(df) - - -def check_packed_pickled_equality(df): - # basic - assert_packed_frame_picklable(df) - # sliced - assert_packed_frame_picklable(df[:-1]) - assert_packed_frame_picklable(df[1:]) - assert_packed_frame_picklable(df[2:-2]) - # sorted - sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, Index) - assert_packed_frame_picklable(sortvaldf) - # out-of-band - buffers = [] - serialbytes = pickle.dumps( - pack(df), protocol=5, buffer_callback=buffers.append - ) - for b in buffers: - assert isinstance(b, pickle.PickleBuffer) - loaded = unpack(pickle.loads(serialbytes, buffers=buffers)) - assert_eq(loaded, df) - - -def assert_packed_frame_picklable(df): - serialbytes = pickle.dumps(pack(df)) - loaded = unpack(pickle.loads(serialbytes)) - assert_eq(loaded, df) - - -def test_pickle_packed_dataframe_numeric(): - np.random.seed(0) - - df = DataFrame() - nelem = 10 - df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) - - check_packed_pickled_equality(df) - - -def test_pickle_packed_dataframe_categorical(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) - df["vals"] = np.random.random(len(df)) - - check_packed_pickled_equality(df) - - -def test_pickle_packed_dataframe_list(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) - - check_packed_pickled_equality(df) - - -def test_pickle_packed_dataframe_struct(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) - df["vals"] = np.random.random(len(df)) - - check_packed_pickled_equality(df) - - -def check_packed_serialized_equality(df): - # basic - assert_packed_frame_serializable(df) - # sliced - assert_packed_frame_serializable(df[:-1]) - assert_packed_frame_serializable(df[1:]) - assert_packed_frame_serializable(df[2:-2]) - # sorted - sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, Index) - assert_packed_frame_serializable(sortvaldf) - - -def assert_packed_frame_serializable(df): - packed = pack(df) - header, frames = packed.serialize() - loaded = unpack(packed.deserialize(header, frames)) - assert_eq(loaded, df) - - -def test_serialize_packed_dataframe_numeric(): - np.random.seed(0) - - df = DataFrame() - nelem = 10 - df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) - - check_packed_serialized_equality(df) - - -def test_serialize_packed_dataframe_categorical(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) - df["vals"] = np.random.random(len(df)) - - check_packed_serialized_equality(df) - - -def test_serialize_packed_dataframe_list(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) - - check_packed_serialized_equality(df) - - -def test_serialize_packed_dataframe_struct(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = Series( - list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) - ) - df["vals"] = np.random.random(len(df)) - - 
check_packed_serialized_equality(df) diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py deleted file mode 100644 index 5782437e394..00000000000 --- a/python/cudf/cudf/tests/test_pandas_interop.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd - -import cudf -from cudf import DataFrame -from cudf.testing import assert_eq - - -def test_to_pandas(): - df = DataFrame() - df["a"] = np.arange(5, dtype=np.int32) - df["b"] = np.arange(10, 15, dtype=np.float64) - df["c"] = np.array([True, False, None, True, True]) - - pdf = df.to_pandas() - - assert tuple(df.columns) == tuple(pdf.columns) - - assert df["a"].dtype == pdf["a"].dtype - assert df["b"].dtype == pdf["b"].dtype - - # Notice, the dtype differ when Pandas and cudf boolean series - # contains None/NaN - assert df["c"].dtype == np.bool_ - assert pdf["c"].dtype == np.object_ - - assert len(df["a"]) == len(pdf["a"]) - assert len(df["b"]) == len(pdf["b"]) - assert len(df["c"]) == len(pdf["c"]) - - -def test_from_pandas(): - pdf = pd.DataFrame() - pdf["a"] = np.arange(10, dtype=np.int32) - pdf["b"] = np.arange(10, 20, dtype=np.float64) - - df = DataFrame.from_pandas(pdf) - - assert tuple(df.columns) == tuple(pdf.columns) - - assert df["a"].dtype == pdf["a"].dtype - assert df["b"].dtype == pdf["b"].dtype - - assert len(df["a"]) == len(pdf["a"]) - assert len(df["b"]) == len(pdf["b"]) - - -def test_from_pandas_ex1(): - pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - df = DataFrame.from_pandas(pdf) - - assert tuple(df.columns) == tuple(pdf.columns) - assert np.all(df["a"].to_numpy() == pdf["a"]) - matches = df["b"].to_numpy(na_value=np.nan) == pdf["b"] - # the 3d element is False due to (nan == nan) == False - assert np.all(matches == [True, True, False, True]) - assert np.isnan(df["b"].to_numpy(na_value=np.nan)[2]) - assert np.isnan(pdf["b"][2]) - - -def test_from_pandas_with_index(): - pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]}) - pdf = pdf.set_index(np.asarray([4, 3, 2, 1])) - df = DataFrame.from_pandas(pdf) - - # Check columns - assert_eq(df.a, pdf.a) - assert_eq(df.b, pdf.b) - # Check index - assert_eq(df.index.values, pdf.index.values) - # Check again using pandas testing tool on frames - assert_eq(df, pdf) - - -def test_from_pandas_rangeindex(): - idx1 = pd.RangeIndex(start=0, stop=4, step=1, name="myindex") - idx2 = cudf.from_pandas(idx1) - - # Check index - assert_eq(idx1.values, idx2.values) - assert idx1.name == idx2.name - - -def test_from_pandas_rangeindex_step(): - expected = pd.RangeIndex(start=0, stop=8, step=2, name="myindex") - actual = cudf.from_pandas(expected) - - assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py deleted file mode 100644 index 7f1b0b1cd46..00000000000 --- a/python/cudf/cudf/tests/test_parquet.py +++ /dev/null @@ -1,4238 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
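Most tests in the deleted test_parquet.py below follow the same write-with-pandas, read-with-cudf round trip. A minimal sketch of that pattern, with illustrative data (the buffer, frame, and column names are not taken from the tests):

from io import BytesIO

import pandas as pd

import cudf
from cudf.testing import assert_eq

# Write Parquet with pandas (PyArrow engine), read it back with cudf,
# and compare the two frames.
buffer = BytesIO()
pdf = pd.DataFrame({"a": range(5), "b": ["v", "w", "x", "y", "z"]})
pdf.to_parquet(buffer, engine="pyarrow")

gdf = cudf.read_parquet(buffer)
assert_eq(pdf.reset_index(drop=True), gdf.reset_index(drop=True))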
- -import datetime -import glob -import hashlib -import math -import os -import pathlib -import random -import string -from contextlib import contextmanager -from io import BytesIO -from string import ascii_letters - -import cupy -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest -from fsspec.core import get_fs_token_paths -from packaging import version -from pyarrow import parquet as pq - -import cudf -from cudf._lib.parquet import read_parquet_chunked -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.io.parquet import ( - ParquetDatasetWriter, - ParquetWriter, - merge_parquet_filemetadata, -) -from cudf.testing import assert_eq, dataset_generator as dg -from cudf.testing._utils import TIMEDELTA_TYPES, set_random_null_mask_inplace - - -@contextmanager -def _hide_pyarrow_parquet_cpu_warnings(engine): - if engine == "pyarrow": - with pytest.warns( - UserWarning, - match="Using CPU via PyArrow to read Parquet dataset. This option " - "is both inefficient and unstable!", - ): - yield - else: - yield - - -@pytest.fixture(scope="module") -def datadir(datadir): - return datadir / "parquet" - - -@pytest.fixture(params=[1, 5, 10, 100000]) -def simple_pdf(request): - types = [ - "bool", - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - # "uint32", pandas promotes uint32 to int64 - # https://issues.apache.org/jira/browse/ARROW-9215 - "uint64", - "float32", - "float64", - ] - nrows = request.param - - # Create a pandas dataframe with random data of mixed types - test_pdf = pd.DataFrame( - { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) - for typ in types - }, - # Need to ensure that this index is not a RangeIndex to get the - # expected round-tripping behavior from Parquet reader/writer. - index=pd.Index(list(range(nrows))), - ) - # Delete the name of the column index, and rename the row index - test_pdf.columns.name = None - test_pdf.index.name = "test_index" - - return test_pdf - - -@pytest.fixture -def simple_gdf(simple_pdf): - return cudf.DataFrame.from_pandas(simple_pdf) - - -def build_pdf(num_columns, day_resolution_timestamps): - types = [ - "bool", - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - # "uint32", pandas promotes uint32 to int64 - # https://issues.apache.org/jira/browse/ARROW-9215 - "uint64", - "float32", - "float64", - "datetime64[ms]", - "datetime64[us]", - "str", - ] - nrows = num_columns.param - - # Create a pandas dataframe with random data of mixed types - test_pdf = pd.DataFrame( - { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) - for typ in types - }, - # Need to ensure that this index is not a RangeIndex to get the - # expected round-tripping behavior from Parquet reader/writer. - index=pd.Index(list(range(nrows))), - ) - # Delete the name of the column index, and rename the row index - test_pdf.columns.name = None - test_pdf.index.name = "test_index" - - # make datetime64's a little more interesting by increasing the range of - # dates note that pandas will convert these to ns timestamps, so care is - # taken to avoid overflowing a ns timestamp. There is also the ability to - # request timestamps be whole days only via `day_resolution_timestamps`. 
- for t in [ - { - "name": "datetime64[ms]", - "nsDivisor": 1000000, - "dayModulus": 86400000, - }, - { - "name": "datetime64[us]", - "nsDivisor": 1000, - "dayModulus": 86400000000, - }, - ]: - data = [ - np.random.randint(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) - for i in range(nrows) - ] - if day_resolution_timestamps: - data = [int(d / t["dayModulus"]) * t["dayModulus"] for d in data] - test_pdf["col_" + t["name"]] = pd.Series( - np.asarray(data, dtype=t["name"]) - ) - - # Create non-numeric categorical data otherwise parquet may typecast it - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] - test_pdf["col_category"] = pd.Series(data, dtype="category") - - # Create non-numeric str data - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] - test_pdf["col_str"] = pd.Series(data, dtype="str") - - return test_pdf - - -@pytest.fixture(params=[0, 1, 10, 10000]) -def pdf(request): - return build_pdf(request, False) - - -@pytest.fixture(params=[0, 1, 10, 10000]) -def pdf_day_timestamps(request): - return build_pdf(request, True) - - -@pytest.fixture -def gdf(pdf): - return cudf.DataFrame.from_pandas(pdf) - - -@pytest.fixture -def gdf_day_timestamps(pdf_day_timestamps): - return cudf.DataFrame.from_pandas(pdf_day_timestamps) - - -@pytest.fixture(params=["snappy", "gzip", "brotli", None, np.str_("snappy")]) -def parquet_file(request, tmp_path_factory, pdf): - fname = tmp_path_factory.mktemp("parquet") / ( - str(request.param) + "_test.parquet" - ) - pdf.to_parquet(fname, engine="pyarrow", compression=request.param) - return fname - - -@pytest.fixture(scope="module") -def rdg_seed(): - return int(os.environ.get("TEST_CUDF_RDG_SEED", "42")) - - -def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64): - test_pdf = pd.DataFrame( - [list(range(ncolumns * i, ncolumns * (i + 1))) for i in range(nrows)], - columns=pd.Index(["foo"], name="bar"), - # Need to ensure that this index is not a RangeIndex to get the - # expected round-tripping behavior from Parquet reader/writer. 
- index=pd.Index(list(range(nrows))), - ) - test_pdf.columns.name = None - - if nvalids: - # Randomly but reproducibly mark subset of rows as invalid - random.seed(1337) - mask = random.sample(range(nrows), nvalids) - test_pdf[test_pdf.index.isin(mask)] = np.nan - if dtype: - test_pdf = test_pdf.astype(dtype) - - return test_pdf - - -@pytest.fixture -def parquet_path_or_buf(datadir): - fname = datadir / "spark_timestamp.snappy.parquet" - try: - with open(fname, "rb") as f: - buffer = BytesIO(f.read()) - except Exception as excpr: - if type(excpr).__name__ == "FileNotFoundError": - pytest.skip(".parquet file is not found") - else: - print(type(excpr).__name__) - - def _make_parquet_path_or_buf(src): - if src == "filepath": - return str(fname) - if src == "pathobj": - return fname - if src == "bytes_io": - return buffer - if src == "bytes": - return buffer.getvalue() - if src == "url": - return fname.as_uri() - - raise ValueError("Invalid source type") - - yield _make_parquet_path_or_buf - - -@pytest.fixture(scope="module") -def large_int64_gdf(): - return cudf.DataFrame.from_pandas(pd.DataFrame({"col": range(0, 1 << 20)})) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["pyarrow", "cudf"]) -@pytest.mark.parametrize( - "columns", - [ - ["col_int8"], - ["col_category"], - ["col_int32", "col_float32"], - ["col_int16", "col_float64", "col_int8"], - None, - ], -) -def test_parquet_reader_basic(parquet_file, columns, engine): - expect = pd.read_parquet(parquet_file, columns=columns) - got = cudf.read_parquet(parquet_file, engine=engine, columns=columns) - - # PANDAS returns category objects whereas cuDF returns hashes - if engine == "cudf": - if "col_category" in expect.columns: - expect = expect.drop(columns=["col_category"]) - if "col_category" in got.columns: - got = got.drop(columns=["col_category"]) - - assert_eq(expect, got) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.parametrize("engine", ["cudf"]) -def test_parquet_reader_empty_pandas_dataframe(tmpdir, engine): - df = pd.DataFrame() - fname = tmpdir.join("test_pq_reader_empty_pandas_dataframe.parquet") - df.to_parquet(fname) - assert os.path.exists(fname) - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname, engine=engine) - expect = expect.reset_index(drop=True) - got = got.reset_index(drop=True) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("has_null", [False, True]) -def test_parquet_reader_strings(tmpdir, has_null): - df = pd.DataFrame( - [(1, "aaa", 9.0), (2, "bbb", 8.0), (3, "ccc", 7.0)], - columns=pd.Index(list("abc")), - ) - if has_null: - df.at[1, "b"] = None - fname = tmpdir.join("test_pq_reader_strings.parquet") - df.to_parquet(fname) - assert os.path.exists(fname) - - gdf = cudf.read_parquet(fname, engine="cudf") - - assert gdf["b"].dtype == np.dtype("object") - assert_eq(gdf["b"], df["b"]) - - -@pytest.mark.parametrize("columns", [None, ["b"]]) -@pytest.mark.parametrize("index_col", ["b", "Nameless", None]) -def test_parquet_reader_index_col(tmpdir, index_col, columns): - df = pd.DataFrame({"a": range(3), "b": range(3, 6), "c": range(6, 9)}) - - if index_col is None: - # No index column - df.reset_index(drop=True, inplace=True) - elif index_col == "Nameless": - # Index column but no name - df.set_index("a", inplace=True) - df.index.name = None - else: - # Index column as normal - df.set_index(index_col, inplace=True) - - fname = tmpdir.join("test_pq_reader_index_col.parquet") - - # PANDAS' PyArrow backend always writes the index unless 
disabled - df.to_parquet(fname, index=(index_col is not None)) - assert os.path.exists(fname) - - pdf = pd.read_parquet(fname, columns=columns) - gdf = cudf.read_parquet(fname, engine="cudf", columns=columns) - - assert_eq(pdf, gdf, check_categorical=False) - - -@pytest.mark.parametrize("pandas_compat", [True, False]) -@pytest.mark.parametrize( - "columns", [["a"], ["d"], ["a", "b"], ["a", "d"], None] -) -def test_parquet_reader_pandas_metadata(tmpdir, columns, pandas_compat): - df = pd.DataFrame( - { - "a": range(6, 9), - "b": range(3, 6), - "c": range(6, 9), - "d": ["abc", "def", "xyz"], - } - ) - df.set_index("b", inplace=True) - - fname = tmpdir.join("test_pq_reader_pandas_metadata.parquet") - df.to_parquet(fname) - assert os.path.exists(fname) - - # PANDAS `read_parquet()` and PyArrow `read_pandas()` always includes index - # Instead, directly use PyArrow to optionally omit the index - expect = pa.parquet.read_table( - fname, columns=columns, use_pandas_metadata=pandas_compat - ).to_pandas() - got = cudf.read_parquet( - fname, columns=columns, use_pandas_metadata=pandas_compat - ) - - if pandas_compat or columns is None or "b" in columns: - assert got.index.name == "b" - else: - assert got.index.name is None - assert_eq(expect, got, check_categorical=False) - - -@pytest.mark.parametrize("pandas_compat", [True, False]) -@pytest.mark.parametrize("as_bytes", [True, False]) -def test_parquet_range_index_pandas_metadata(tmpdir, pandas_compat, as_bytes): - df = pd.DataFrame( - {"a": range(6, 9), "b": ["abc", "def", "xyz"]}, - index=pd.RangeIndex(3, 6, 1, name="c"), - ) - - fname = tmpdir.join("test_parquet_range_index_pandas_metadata") - df.to_parquet(fname) - assert os.path.exists(fname) - - # PANDAS `read_parquet()` and PyArrow `read_pandas()` always includes index - # Instead, directly use PyArrow to optionally omit the index - expect = pa.parquet.read_table( - fname, use_pandas_metadata=pandas_compat - ).to_pandas() - if as_bytes: - # Make sure we can handle RangeIndex parsing - # in pandas when the input is `bytes` - with open(fname, "rb") as f: - got = cudf.read_parquet( - f.read(), use_pandas_metadata=pandas_compat - ) - else: - got = cudf.read_parquet(fname, use_pandas_metadata=pandas_compat) - - assert_eq(expect, got) - - -def test_parquet_read_metadata(tmpdir, pdf): - if len(pdf) > 100: - pytest.skip("Skipping long setup test") - - def num_row_groups(rows, group_size): - return max(1, (rows + (group_size - 1)) // group_size) - - fname = tmpdir.join("metadata.parquet") - row_group_size = 5 - pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size) - - ( - num_rows, - row_groups, - col_names, - num_columns, - _, # rowgroup_metadata - ) = cudf.io.read_parquet_metadata(fname) - - assert num_columns == len(pdf.columns) - assert num_rows == len(pdf.index) - assert row_groups == num_row_groups(num_rows, row_group_size) - for a, b in zip(col_names, pdf.columns): - assert a == b - - -def test_parquet_read_filtered(tmpdir, rdg_seed): - # Generate data - fname = tmpdir.join("filtered.parquet") - dg.generate( - fname, - dg.Parameters( - num_rows=2048, - column_parameters=[ - dg.ColumnParameters( - cardinality=40, - null_frequency=0.05, - generator=lambda: [ - "".join( - random.sample( - string.ascii_letters, random.randint(4, 8) - ) - ) - for _ in range(40) - ], - is_sorted=False, - ), - dg.ColumnParameters( - 40, - 0.2, - lambda: np.random.default_rng().integers(0, 100, size=40), - True, - ), - ], - seed=rdg_seed, - ), - format={"name": "parquet", "row_group_size": 64}, - ) 
- - # Get dataframes to compare - df = cudf.read_parquet(fname) - df_filtered = cudf.read_parquet(fname, filters=[("1", ">", 60)]) - # PyArrow's read_table function does row-group-level filtering in addition - # to applying given filters once the table has been read into memory. - # Because of this, we aren't using PyArrow as a reference for testing our - # row-group selection method since the only way to only select row groups - # with PyArrow is with the method we use and intend to test. - tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)]) - - assert_eq(cudf.io.read_parquet_metadata(fname)[1], 2048 / 64) - print(len(df_filtered)) - print(len(tbl_filtered)) - assert len(df_filtered) < len(df) - assert len(tbl_filtered) <= len(df_filtered) - - -def test_parquet_read_filtered_everything(tmpdir): - # Generate data - fname = tmpdir.join("filtered_everything.parquet") - df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) - df.to_parquet(fname, row_group_size=2) - - # Check filter - df_filtered = cudf.read_parquet(fname, filters=[("x", "==", 12)]) - assert_eq(len(df_filtered), 0) - assert_eq(df_filtered["x"].dtype, "int64") - assert_eq(df_filtered["y"].dtype, "object") - - -def test_parquet_read_filtered_multiple_files(tmpdir): - # Generate data - fname_0 = tmpdir.join("filtered_multiple_files_0.parquet") - df = pd.DataFrame({"x": range(10), "y": list("aabbccddee")}) - df.to_parquet(fname_0, row_group_size=2) - fname_1 = tmpdir.join("filtered_multiple_files_1.parquet") - df = pd.DataFrame({"x": range(10), "y": list("aaccccddee")}) - df.to_parquet(fname_1, row_group_size=2) - fname_2 = tmpdir.join("filtered_multiple_files_2.parquet") - df = pd.DataFrame( - {"x": [0, 1, 9, 9, 4, 5, 6, 7, 8, 9], "y": list("aabbzzddee")} - ) - df.to_parquet(fname_2, row_group_size=2) - - # Check filter - filtered_df = cudf.read_parquet( - [fname_0, fname_1, fname_2], filters=[("x", "==", 2)] - ) - assert_eq( - filtered_df, - cudf.DataFrame({"x": [2, 2], "y": list("bc")}, index=[2, 2]), - ) - - -@pytest.mark.parametrize( - "predicate,expected_len", - [ - ([[("x", "==", 0)], [("z", "==", 0)]], 2), - ([("x", "==", 0), ("z", "==", 0)], 0), - ([("x", "==", 0), ("z", "!=", 0)], 1), - ([("y", "==", "c"), ("x", ">", 8)], 0), - ([("y", "==", "c"), ("x", ">=", 5)], 1), - ([[("y", "==", "c")], [("x", "<", 3)]], 5), - ([[("x", "not in", (0, 9)), ("z", "not in", (4, 5))]], 6), - ([[("y", "==", "c")], [("x", "in", (0, 9)), ("z", "in", (0, 9))]], 4), - ([[("x", "==", 0)], [("x", "==", 1)], [("x", "==", 2)]], 3), - ([[("x", "==", 0), ("z", "==", 9), ("y", "==", "a")]], 1), - ], -) -def test_parquet_read_filtered_complex_predicate( - tmpdir, predicate, expected_len -): - # Generate data - fname = tmpdir.join("filtered_complex_predicate.parquet") - df = pd.DataFrame( - { - "x": range(10), - "y": list("aabbccddee"), - "z": reversed(range(10)), - } - ) - df.to_parquet(fname, row_group_size=2) - - # Check filters - df_filtered = cudf.read_parquet(fname, filters=predicate) - assert_eq(cudf.io.read_parquet_metadata(fname)[1], 10 / 2) - assert_eq(len(df_filtered), expected_len) - - -@pytest.mark.parametrize("row_group_size", [1, 5, 100]) -def test_parquet_read_row_groups(tmpdir, pdf, row_group_size): - if len(pdf) > 100: - pytest.skip("Skipping long setup test") - - if "col_category" in pdf.columns: - pdf = pdf.drop(columns=["col_category"]) - fname = tmpdir.join("row_group.parquet") - pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size) - - num_rows, row_groups, col_names, _, _ = 
cudf.io.read_parquet_metadata( - fname - ) - - gdf = [cudf.read_parquet(fname, row_groups=[i]) for i in range(row_groups)] - gdf = cudf.concat(gdf) - assert_eq(pdf.reset_index(drop=True), gdf.reset_index(drop=True)) - - # first half rows come from the first source, rest from the second - gdf = cudf.read_parquet( - [fname, fname], - row_groups=[ - list(range(row_groups // 2)), - list(range(row_groups // 2, row_groups)), - ], - ) - assert_eq(pdf.reset_index(drop=True), gdf.reset_index(drop=True)) - - -@pytest.mark.parametrize("row_group_size", [1, 5, 100]) -def test_parquet_read_row_groups_non_contiguous(tmpdir, pdf, row_group_size): - if len(pdf) > 100: - pytest.skip("Skipping long setup test") - - fname = tmpdir.join("row_group.parquet") - pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size) - - num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata( - fname - ) - - # alternate rows between the two sources - gdf = cudf.read_parquet( - [fname, fname], - row_groups=[ - list(range(0, row_groups, 2)), - list(range(1, row_groups, 2)), - ], - ) - - ref_df = [ - cudf.read_parquet(fname, row_groups=i) - for i in list(range(0, row_groups, 2)) + list(range(1, row_groups, 2)) - ] - ref_df = cudf.concat(ref_df) - - assert_eq(ref_df, gdf) - - -def test_parquet_reader_spark_timestamps(datadir): - fname = datadir / "spark_timestamp.snappy.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -def test_parquet_reader_spark_decimals(datadir): - fname = datadir / "spark_decimal.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("columns", [["a"], ["b", "a"], None]) -def test_parquet_reader_decimal128(datadir, columns): - fname = datadir / "nested_decimal128_file.parquet" - got = cudf.read_parquet(fname, columns=columns) - expect = cudf.read_parquet(fname, columns=columns) - - assert_eq(expect, got) - - -def test_parquet_reader_microsecond_timestamps(datadir): - fname = datadir / "usec_timestamp.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -def test_parquet_reader_mixedcompression(datadir): - fname = datadir / "mixed_compression.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -def test_parquet_reader_select_columns(datadir): - fname = datadir / "nested_column_map.parquet" - - expect = cudf.read_parquet(fname).to_pandas()[["value"]] - got = cudf.read_parquet(fname, columns=["value"]) - - assert_eq(expect, got) - - -def test_parquet_reader_invalids(tmpdir): - test_pdf = make_pdf(nrows=1000, nvalids=1000 // 4, dtype="Int64") - - fname = tmpdir.join("invalids.parquet") - test_pdf.to_parquet(fname, engine="pyarrow") - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got.to_pandas(nullable=True)) - - -def test_parquet_reader_filenotfound(tmpdir): - with pytest.raises(FileNotFoundError): - cudf.read_parquet("TestMissingFile.parquet") - - with pytest.raises(FileNotFoundError): - cudf.read_parquet(tmpdir.mkdir("cudf_parquet")) - - -def test_parquet_reader_local_filepath(): - fname = "~/TestLocalFile.parquet" - if not os.path.isfile(fname): - pytest.skip("Local .parquet file is not found") - - cudf.read_parquet(fname) - - -@pytest.mark.parametrize( - "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] -) -def test_parquet_reader_filepath_or_buffer(parquet_path_or_buf, src): - 
expect = pd.read_parquet(parquet_path_or_buf("filepath")) - got = cudf.read_parquet(parquet_path_or_buf(src)) - - assert_eq(expect, got) - - -def test_parquet_reader_file_types(parquet_path_or_buf): - expect = cudf.read_parquet(parquet_path_or_buf("filepath")) - fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath")) - - # Pass open fsspec file - with fs.open(paths[0], mode="rb") as fil: - got1 = cudf.read_parquet(fil) - assert_eq(expect, got1) - - # Pass path only - got2 = cudf.read_parquet(paths[0]) - assert_eq(expect, got2) - - -def create_parquet_source(df, src_type, fname): - if src_type == "filepath": - df.to_parquet(fname, engine="pyarrow") - return str(fname) - if src_type == "pathobj": - df.to_parquet(fname, engine="pyarrow") - return fname - if src_type == "bytes_io": - buffer = BytesIO() - df.to_parquet(buffer, engine="pyarrow") - return buffer - if src_type == "bytes": - buffer = BytesIO() - df.to_parquet(buffer, engine="pyarrow") - return buffer.getvalue() - if src_type == "url": - df.to_parquet(fname, engine="pyarrow") - return pathlib.Path(fname).as_uri() - - -@pytest.mark.parametrize( - "src", ["filepath", "pathobj", "bytes_io", "bytes", "url"] -) -def test_parquet_reader_multiple_files(tmpdir, src): - test_pdf1 = make_pdf(nrows=1000, nvalids=1000 // 2, dtype="float64") - test_pdf2 = make_pdf(nrows=500, dtype="float64") - expect = pd.concat([test_pdf1, test_pdf2]) - - src1 = create_parquet_source(test_pdf1, src, tmpdir.join("multi1.parquet")) - src2 = create_parquet_source(test_pdf2, src, tmpdir.join("multi2.parquet")) - got = cudf.read_parquet([src1, src2]) - - assert_eq(expect, got) - - -def test_parquet_reader_reordered_columns(tmpdir): - src = pd.DataFrame( - {"name": ["cow", None, "duck", "fish", None], "id": [0, 1, 2, 3, 4]} - ) - fname = tmpdir.join("test_parquet_reader_reordered_columns.parquet") - src.to_parquet(fname) - assert os.path.exists(fname) - expect = pd.DataFrame( - {"id": [0, 1, 2, 3, 4], "name": ["cow", None, "duck", "fish", None]} - ) - got = cudf.read_parquet(fname, columns=["id", "name"]) - assert_eq(expect, got, check_dtype=False) - - -def test_parquet_reader_reordered_columns_mixed(tmpdir): - src = pd.DataFrame( - { - "name": ["cow", None, "duck", "fish", None], - "list0": [ - [[1, 2], [3, 4]], - None, - [[5, 6], None], - [[1]], - [[5], [6, None, 8]], - ], - "id": [0, 1, 2, 3, 4], - "list1": [ - [[1, 2], [3, 4]], - [[0, 0]], - [[5, 6], [10, 12]], - [[1]], - [[5], [6, 8]], - ], - } - ) - fname = tmpdir.join("test_parquet_reader_reordered_columns.parquet") - src.to_parquet(fname) - assert os.path.exists(fname) - expect = pd.DataFrame( - { - "list1": [ - [[1, 2], [3, 4]], - [[0, 0]], - [[5, 6], [10, 12]], - [[1]], - [[5], [6, 8]], - ], - "id": [0, 1, 2, 3, 4], - "list0": [ - [[1, 2], [3, 4]], - None, - [[5, 6], None], - [[1]], - [[5], [6, None, 8]], - ], - "name": ["cow", None, "duck", "fish", None], - } - ) - got = cudf.read_parquet(fname, columns=["list1", "id", "list0", "name"]) - assert_eq(expect, got, check_dtype=False) - - -def test_parquet_reader_list_basic(tmpdir): - expect = pd.DataFrame({"a": [[[1, 2], [3, 4]], None, [[5, 6], None]]}) - fname = tmpdir.join("test_parquet_reader_list_basic.parquet") - expect.to_parquet(fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert_eq(expect, got) - - -def test_parquet_reader_list_table(tmpdir): - expect = pd.DataFrame( - { - "a": [[[1, 2], [3, 4]], None, [[5, 6], None]], - "b": [[None, None], None, [None, None]], - "c": [[[1, 2, 3]], [[None]], [[], None]], - "d": 
[[[]], [[None]], [[1, 2, 3], None]], - "e": [[["cows"]], [["dogs"]], [["cats", "birds", "owls"], None]], - } - ) - fname = tmpdir.join("test_parquet_reader_list_table.parquet") - expect.to_parquet(fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert pa.Table.from_pandas(expect).equals(got.to_arrow()) - - -def int_gen(first_val, i): - """ - Returns an integer based on an absolute index and a starting value. Used - as input to `list_gen`. - """ - return int(i + first_val) - - -strings = [ - "cats", - "dogs", - "cows", - "birds", - "fish", - "sheep", - "owls", - "bears", - "ants", -] - - -def string_gen(first_val, i): - """ - Returns a string based on an absolute index and a starting value. Used as - input to `list_gen`. - """ - return strings[int_gen(first_val, i) % len(strings)] - - -def list_row_gen( - gen, first_val, list_size, lists_per_row, include_validity=False -): - """ - Generate a single row for a List<List<>> column based on input parameters. - - Parameters - ---------- - gen : A callable which generates an individual leaf element based on an - absolute index. - first_val : Generate the column as if it had started at 'first_val' - instead of 0. - list_size : Size of each generated list. - lists_per_row : Number of lists to generate per row. - include_validity : Whether or not to include nulls as part of the - column. If true, it will add a selection of nulls at both the - topmost row level and at the leaf level. - - Returns - ------- - The generated list column. 
- """ - - def L(list_size, first_val): - return [ - (gen(first_val, i) if i % 2 == 0 else None) - if include_validity - else (gen(first_val, i)) - for i in range(list_size) - ] - - def R(first_val, lists_per_row, list_size): - return [ - L(list_size, first_val + (list_size * i)) - for i in range(lists_per_row) - ] - - return [ - ( - R( - lists_per_row * list_size * i, - lists_per_row, - list_size, - ) - if i % 2 == 0 - else None - ) - if include_validity - else R( - lists_per_row * list_size * i, - lists_per_row, - list_size, - ) - for i in range(num_rows) - ] - - -def test_parquet_reader_list_large(tmpdir): - expect = pd.DataFrame({"a": list_gen(int_gen, 256, 80, 50)}) - fname = tmpdir.join("test_parquet_reader_list_large.parquet") - expect.to_parquet(fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert_eq(expect, got, check_dtype=False) - - -def test_parquet_reader_list_validity(tmpdir): - expect = pd.DataFrame( - {"a": list_gen(int_gen, 256, 80, 50, include_validity=True)} - ) - fname = tmpdir.join("test_parquet_reader_list_validity.parquet") - expect.to_parquet(fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert_eq(expect, got, check_dtype=False) - - -def test_parquet_reader_list_large_mixed(tmpdir): - expect = pd.DataFrame( - { - "a": list_gen(string_gen, 128, 80, 50), - "b": list_gen(int_gen, 128, 80, 50), - "c": list_gen(int_gen, 128, 80, 50, include_validity=True), - "d": list_gen(string_gen, 128, 80, 50, include_validity=True), - } - ) - fname = tmpdir.join("test_parquet_reader_list_large_mixed.parquet") - expect.to_parquet(fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert pa.Table.from_pandas(expect).equals(got.to_arrow()) - - -def test_parquet_reader_list_large_multi_rowgroup(tmpdir): - # > 40 row groups - num_rows = 100000 - num_docs = num_rows / 2 - num_categories = 1_000 - row_group_size = 1000 - - cupy.random.seed(0) - - # generate a random pairing of doc: category - documents = cudf.DataFrame( - { - "document_id": cupy.random.randint(num_docs, size=num_rows), - "category_id": cupy.random.randint(num_categories, size=num_rows), - } - ) - - # group categories by document_id to create a list column - expect = documents.groupby("document_id").agg({"category_id": ["collect"]}) - expect.columns = expect.columns.get_level_values(0) - expect.reset_index(inplace=True) - - # round trip the dataframe to/from parquet - fname = tmpdir.join( - "test_parquet_reader_list_large_multi_rowgroup.parquet" - ) - expect.to_pandas().to_parquet(fname, row_group_size=row_group_size) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir): - # 25 row groups - num_rows = 25000 - row_group_size = 1000 - - expect = cudf.DataFrame( - {"a": list_gen(int_gen, num_rows, 3, 2, include_validity=True)} - ) - - # round trip the dataframe to/from parquet - fname = tmpdir.join( - "test_parquet_reader_list_large_multi_rowgroup_nulls.parquet" - ) - expect.to_pandas().to_parquet(fname, row_group_size=row_group_size) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert_eq(expect, got) - - -def struct_gen(gen, skip_rows, num_rows, include_validity=False): - """ - Generate a struct column based on input parameters. - - Parameters - ---------- - gen : A array of callables which generate an individual row based on an - absolute index. - skip_rows : Generate the column as if it had started at 'skip_rows' - instead of 0. 
The intent here is to emulate the skip_rows - parameter of the parquet reader. - num_fields : Number of fields in the struct. - include_validity : Whether or not to include nulls as part of the - column. If true, it will add a selection of nulls at both the - field level and at the value level. - - Returns - ------- - The generated struct column. - """ - - def R(first_val, num_fields): - return { - "col" + str(f): ( - gen[f](first_val, first_val) if f % 4 != 0 else None - ) - if include_validity - else (gen[f](first_val, first_val)) - for f in range(len(gen)) - } - - return [ - (R((i + skip_rows), len(gen)) if (i + skip_rows) % 4 != 0 else None) - if include_validity - else R((i + skip_rows), len(gen)) - for i in range(num_rows) - ] - - -@pytest.mark.parametrize( - "data", - [ - # struct - [ - {"a": 1, "b": 2}, - {"a": 10, "b": 20}, - {"a": None, "b": 22}, - {"a": None, "b": None}, - {"a": 15, "b": None}, - ], - # struct-of-list - [ - {"a": 1, "b": 2, "c": [1, 2, 3]}, - {"a": 10, "b": 20, "c": [4, 5]}, - {"a": None, "b": 22, "c": [6]}, - {"a": None, "b": None, "c": None}, - {"a": 15, "b": None, "c": [-1, -2]}, - None, - {"a": 100, "b": 200, "c": [-10, None, -20]}, - ], - # list-of-struct - [ - [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], - None, - [{"a": 10, "b": 20}], - [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], - ], - # struct-of-struct - [ - {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, - {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, - {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, - {"a": 7, "b": None, "c": 8}, - {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, - None, - {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, - ], - ], -) -def test_parquet_reader_struct_basic(tmpdir, data): - expect = pa.Table.from_pydict({"struct": data}) - fname = tmpdir.join("test_parquet_reader_struct_basic.parquet") - pa.parquet.write_table(expect, fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert expect.equals(got.to_arrow()) - - -def select_columns_params(): - dfs = [ - # struct - ( - [ - {"a": 1, "b": 2}, - {"a": 10, "b": 20}, - {"a": None, "b": 22}, - {"a": None, "b": None}, - {"a": 15, "b": None}, - ], - [["struct"], ["struct.a"], ["struct.b"], ["c"]], - ), - # struct-of-list - ( - [ - {"a": 1, "b": 2, "c": [1, 2, 3]}, - {"a": 10, "b": 20, "c": [4, 5]}, - {"a": None, "b": 22, "c": [6]}, - {"a": None, "b": None, "c": None}, - {"a": 15, "b": None, "c": [-1, -2]}, - None, - {"a": 100, "b": 200, "c": [-10, None, -20]}, - ], - [ - ["struct"], - ["struct.c"], - ["struct.c.list"], - ["struct.c.list.item"], - ["struct.b", "struct.c"], - ["struct.b", "struct.d", "struct.c"], - ], - ), - # list-of-struct - ( - [ - [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], - None, - [{"a": 10, "b": 20}], - [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], - ], - [ - ["struct"], - ["struct.list"], - ["struct.list.item"], - ["struct.list.item.a", "struct.list.item.b"], - ["struct.list.item.c"], - ], - ), - # struct with "." 
in field names - ( - [ - {"a.b": 1, "b.a": 2}, - {"a.b": 10, "b.a": 20}, - {"a.b": None, "b.a": 22}, - {"a.b": None, "b.a": None}, - {"a.b": 15, "b.a": None}, - ], - [["struct"], ["struct.a"], ["struct.b.a"]], - ), - ] - for df_col_pair in dfs: - for cols in df_col_pair[1]: - yield df_col_pair[0], cols - - -@pytest.mark.parametrize("data, columns", select_columns_params()) -def test_parquet_reader_struct_select_columns(tmpdir, data, columns): - table = pa.Table.from_pydict({"struct": data}) - buff = BytesIO() - - pa.parquet.write_table(table, buff) - - expect = pq.ParquetFile(buff).read(columns=columns) - got = cudf.read_parquet(buff, columns=columns) - assert expect.equals(got.to_arrow()) - - -def test_parquet_reader_struct_los_large(tmpdir): - num_rows = 256 - list_size = 64 - data = [ - struct_gen([string_gen, int_gen, string_gen], 0, list_size, False) - if i % 2 == 0 - else None - for i in range(num_rows) - ] - expect = pa.Table.from_pydict({"los": data}) - fname = tmpdir.join("test_parquet_reader_struct_los_large.parquet") - pa.parquet.write_table(expect, fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert expect.equals(got.to_arrow()) - - -@pytest.mark.parametrize( - "params", [[3, 4, 32, False], [3, 4, 32, True], [100, 25, 256, True]] -) -def test_parquet_reader_struct_sol_table(tmpdir, params): - # Struct<List<List>> - lists_per_row = params[0] - list_size = params[1] - num_rows = params[2] - include_validity = params[3] - - def list_gen_wrapped(x, y): - return list_row_gen( - int_gen, x * list_size * lists_per_row, list_size, lists_per_row - ) - - def string_list_gen_wrapped(x, y): - return list_row_gen( - string_gen, - x * list_size * lists_per_row, - list_size, - lists_per_row, - include_validity, - ) - - data = struct_gen( - [int_gen, string_gen, list_gen_wrapped, string_list_gen_wrapped], - 0, - num_rows, - include_validity, - ) - expect = pa.Table.from_pydict({"sol": data}) - fname = tmpdir.join("test_parquet_reader_struct_sol_table.parquet") - pa.parquet.write_table(expect, fname) - assert os.path.exists(fname) - got = cudf.read_parquet(fname) - assert expect.equals(got.to_arrow()) - - -def test_parquet_reader_v2(tmpdir, simple_pdf): - pdf_fname = tmpdir.join("pdfv2.parquet") - simple_pdf.to_parquet(pdf_fname, data_page_version="2.0") - assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) - - cudf.from_pandas(simple_pdf).to_parquet(pdf_fname, header_version="2.0") - assert_eq(cudf.read_parquet(pdf_fname), simple_pdf) - - -def test_parquet_delta_byte_array(datadir): - fname = datadir / "delta_byte_arr.parquet" - assert_eq(cudf.read_parquet(fname), pd.read_parquet(fname)) - - -# values chosen to exercise: -# 1 - header only, no bitpacked values -# 2 - one bitpacked value -# 23 - one partially filled miniblock -# 32 - almost full miniblock -# 33 - one full miniblock -# 34 - one full miniblock plus one value in new miniblock -# 128 - almost full block -# 129 - one full block -# 130 - one full block plus one value in new block -# 1000 - multiple blocks -def delta_num_rows(): - return [1, 2, 23, 32, 33, 34, 128, 129, 130, 1000] - - -@pytest.mark.parametrize("nrows", delta_num_rows()) -@pytest.mark.parametrize("add_nulls", [True, False]) -@pytest.mark.parametrize( - "dtype", - [ - "int8", - "int16", - "int32", - "int64", - ], -) -def test_delta_binary(nrows, add_nulls, dtype, tmpdir): - null_frequency = 0.25 if add_nulls else 0 - - # Create a pandas dataframe with random data of mixed types - arrow_table = dg.rand_dataframe( - dtypes_meta=[ - { - "dtype": dtype, - 
"null_frequency": null_frequency, - "cardinality": nrows, - }, - ], - rows=nrows, - seed=0, - use_threads=False, - ) - # Roundabout conversion to pandas to preserve nulls/data types - cudf_table = cudf.DataFrame.from_arrow(arrow_table) - test_pdf = cudf_table.to_pandas(nullable=True) - pdf_fname = tmpdir.join("pdfv2.parquet") - test_pdf.to_parquet( - pdf_fname, - version="2.6", - column_encoding="DELTA_BINARY_PACKED", - data_page_version="2.0", - data_page_size=64 * 1024, - engine="pyarrow", - use_dictionary=False, - ) - cdf = cudf.read_parquet(pdf_fname) - pcdf = cudf.from_pandas(test_pdf) - assert_eq(cdf, pcdf) - - # Write back out with cudf and make sure pyarrow can read it - cudf_fname = tmpdir.join("cudfv2.parquet") - pcdf.to_parquet( - cudf_fname, - compression=None, - header_version="2.0", - use_dictionary=False, - ) - - cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) - assert_eq(cdf2, cdf) - - -@pytest.mark.parametrize("nrows", delta_num_rows()) -@pytest.mark.parametrize("add_nulls", [True, False]) -@pytest.mark.parametrize("max_string_length", [12, 48, 96, 128]) -@pytest.mark.parametrize( - "str_encoding", ["DELTA_BYTE_ARRAY", "DELTA_LENGTH_BYTE_ARRAY"] -) -def test_delta_byte_array_roundtrip( - nrows, add_nulls, max_string_length, str_encoding, tmpdir -): - null_frequency = 0.25 if add_nulls else 0 - - # Create a pandas dataframe with random data of mixed lengths - test_pdf = dg.rand_dataframe( - dtypes_meta=[ - { - "dtype": "str", - "null_frequency": null_frequency, - "cardinality": nrows, - "max_string_length": max_string_length, - }, - ], - rows=nrows, - seed=0, - use_threads=False, - ).to_pandas() - - pdf_fname = tmpdir.join("pdfdeltaba.parquet") - test_pdf.to_parquet( - pdf_fname, - version="2.6", - column_encoding=str_encoding, - data_page_version="2.0", - data_page_size=64 * 1024, - engine="pyarrow", - use_dictionary=False, - ) - cdf = cudf.read_parquet(pdf_fname) - pcdf = cudf.from_pandas(test_pdf) - assert_eq(cdf, pcdf) - - # Write back out with cudf and make sure pyarrow can read it - cudf_fname = tmpdir.join("cdfdeltaba.parquet") - pcdf.to_parquet( - cudf_fname, - compression="snappy", - header_version="2.0", - use_dictionary=False, - ) - cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) - assert_eq(cdf2, cdf) - - -@pytest.mark.parametrize("nrows", delta_num_rows()) -@pytest.mark.parametrize("add_nulls", [True, False]) -@pytest.mark.parametrize( - "str_encoding", ["DELTA_BYTE_ARRAY", "DELTA_LENGTH_BYTE_ARRAY"] -) -def test_delta_struct_list(tmpdir, nrows, add_nulls, str_encoding): - # Struct> - lists_per_row = 3 - list_size = 4 - num_rows = nrows - include_validity = add_nulls - - def list_gen_wrapped(x, y): - return list_row_gen( - int_gen, x * list_size * lists_per_row, list_size, lists_per_row - ) - - def string_list_gen_wrapped(x, y): - return list_row_gen( - string_gen, - x * list_size * lists_per_row, - list_size, - lists_per_row, - include_validity, - ) - - data = struct_gen( - [int_gen, string_gen, list_gen_wrapped, string_list_gen_wrapped], - 0, - num_rows, - include_validity, - ) - test_pdf = pa.Table.from_pydict({"sol": data}).to_pandas() - pdf_fname = tmpdir.join("pdfdeltaba.parquet") - test_pdf.to_parquet( - pdf_fname, - version="2.6", - column_encoding={ - "sol.col0": "DELTA_BINARY_PACKED", - "sol.col1": str_encoding, - "sol.col2.list.element.list.element": "DELTA_BINARY_PACKED", - "sol.col3.list.element.list.element": str_encoding, - }, - data_page_version="2.0", - data_page_size=64 * 1024, - engine="pyarrow", - use_dictionary=False, - ) - # 
sanity check to verify file is written properly - assert_eq(test_pdf, pd.read_parquet(pdf_fname)) - cdf = cudf.read_parquet(pdf_fname) - pcdf = cudf.from_pandas(test_pdf) - assert_eq(cdf, pcdf) - - # Write back out with cudf and make sure pyarrow can read it - cudf_fname = tmpdir.join("cdfdeltaba.parquet") - pcdf.to_parquet( - cudf_fname, - compression="snappy", - header_version="2.0", - use_dictionary=False, - ) - cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname)) - assert_eq(cdf2, cdf) - - -@pytest.mark.parametrize( - "data", - [ - # Structs - { - "being": [ - None, - {"human?": True, "Deets": {"Name": "Carrot", "Age": 27}}, - {"human?": None, "Deets": {"Name": "Angua", "Age": 25}}, - {"human?": False, "Deets": {"Name": "Cheery", "Age": 31}}, - {"human?": False, "Deets": None}, - {"human?": None, "Deets": {"Name": "Mr", "Age": None}}, - ] - }, - # List of Structs - { - "family": [ - [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}], - [ - {"human?": None, "deets": {"weight": 5.3, "age": 25}}, - {"human?": False, "deets": {"weight": 8.0, "age": 31}}, - {"human?": False, "deets": None}, - ], - [], - [{"human?": None, "deets": {"weight": 6.9, "age": None}}], - ] - }, - # Struct of Lists - { - "Real estate records": [ - None, - { - "Status": "NRI", - "Ownerships": { - "land_unit": [None, 2, None], - "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]], - }, - }, - { - "Status": None, - "Ownerships": { - "land_unit": [4, 5], - "flats": [[7, 8], []], - }, - }, - { - "Status": "RI", - "Ownerships": {"land_unit": None, "flats": [[]]}, - }, - {"Status": "RI", "Ownerships": None}, - { - "Status": None, - "Ownerships": { - "land_unit": [7, 8, 9], - "flats": [[], [], []], - }, - }, - ] - }, - ], -) -def test_parquet_reader_nested_v2(tmpdir, data): - expect = pd.DataFrame(data) - pdf_fname = tmpdir.join("pdfv2.parquet") - expect.to_parquet(pdf_fname, data_page_version="2.0") - assert_eq(cudf.read_parquet(pdf_fname), expect) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_parquet_writer_cpu_pyarrow( - tmpdir, pdf_day_timestamps, gdf_day_timestamps -): - pdf_fname = tmpdir.join("pdf.parquet") - gdf_fname = tmpdir.join("gdf.parquet") - - if len(pdf_day_timestamps) == 0: - pdf_day_timestamps = pdf_day_timestamps.reset_index(drop=True) - gdf_day_timestamps = pdf_day_timestamps.reset_index(drop=True) - - pdf_day_timestamps.to_parquet(pdf_fname.strpath) - gdf_day_timestamps.to_parquet(gdf_fname.strpath, engine="pyarrow") - - assert os.path.exists(pdf_fname) - assert os.path.exists(gdf_fname) - - expect = pa.parquet.read_pandas(pdf_fname) - got = pa.parquet.read_pandas(gdf_fname) - - assert_eq(expect, got) - - def clone_field(table, name, datatype): - f = table.schema.field(name) - return pa.field(f.name, datatype, f.nullable, f.metadata) - - # Pandas uses a datetime64[ns] while we use a datetime64[ms] - for t in [expect, got]: - for t_col in ["col_datetime64[ms]", "col_datetime64[us]"]: - idx = t.schema.get_field_index(t_col) - field = clone_field(t, t_col, pa.timestamp("ms")) - t = t.set_column(idx, field, t.column(idx).cast(field.type)) - t = t.replace_schema_metadata() - - assert_eq(expect, got) - - -@pytest.mark.filterwarnings("ignore:Using CPU") -def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf): - gdf_fname = tmpdir.join("gdf.parquet") - - if len(pdf) == 0: - pdf = pdf.reset_index(drop=True) - gdf = gdf.reset_index(drop=True) - - if "col_category" in pdf.columns: - pdf = pdf.drop(columns=["col_category"]) - if "col_category" in gdf.columns: - gdf = 
gdf.drop(columns=["col_category"]) - - assert_eq(pdf, gdf) - - # Write out the gdf using the GPU accelerated writer with INT96 timestamps - gdf.to_parquet( - gdf_fname.strpath, - index=None, - int96_timestamps=True, - ) - - assert os.path.exists(gdf_fname) - - expect = pdf - got = pd.read_parquet(gdf_fname) - - # verify INT96 timestamps were converted back to the same data. - assert_eq(expect, got, check_categorical=False, check_dtype=False) - - -def test_multifile_parquet_folder(tmpdir): - test_pdf1 = make_pdf(nrows=10, nvalids=10 // 2, dtype="float64") - test_pdf2 = make_pdf(nrows=20, dtype="float64") - expect = pd.concat([test_pdf1, test_pdf2]) - - tmpdir.mkdir("multi_part") - - create_parquet_source( - test_pdf1, "filepath", tmpdir.join("multi_part/multi1.parquet") - ) - create_parquet_source( - test_pdf2, "filepath", tmpdir.join("multi_part/multi2.parquet") - ) - - got1 = cudf.read_parquet(tmpdir.join("multi_part/*.parquet")) - assert_eq(expect, got1) - - got2 = cudf.read_parquet(tmpdir.join("multi_part")) - assert_eq(expect, got2) - - -# Validates the metadata return path of the parquet writer -def test_parquet_writer_return_metadata(tmpdir, simple_gdf): - gdf_fname = tmpdir.join("data1.parquet") - - # Write out the gdf using the GPU accelerated writer - df_metadata = simple_gdf.to_parquet( - gdf_fname.strpath, index=None, metadata_file_path="test/data1.parquet" - ) - # Verify that we got a valid parquet signature in the initial metadata blob - assert df_metadata.tobytes()[0:4] == b"PAR1" - - df_metadata_list1 = [df_metadata] - df_metadata_list2 = [df_metadata, df_metadata] - merged_metadata1 = merge_parquet_filemetadata(df_metadata_list1) - merged_metadata2 = merge_parquet_filemetadata(df_metadata_list2) - - # Verify that we got a valid parquet signature in the final metadata blob - assert merged_metadata1.tobytes()[0:4] == b"PAR1" - assert merged_metadata2.tobytes()[0:4] == b"PAR1" - - # Make sure aggregation is combining metadata correctly - fmd1 = pa.parquet.ParquetFile(BytesIO(merged_metadata1.tobytes())).metadata - fmd2 = pa.parquet.ParquetFile(BytesIO(merged_metadata2.tobytes())).metadata - assert fmd2.num_columns == fmd1.num_columns - assert fmd2.num_rows == 2 * fmd1.num_rows - assert fmd2.num_row_groups == 2 * fmd1.num_row_groups - - -# Validates the integrity of the GPU accelerated parquet writer. 
-def test_parquet_writer_gpu_none_index(tmpdir, simple_pdf, simple_gdf): - gdf_fname = tmpdir.join("gdf.parquet") - pdf_fname = tmpdir.join("pdf.parquet") - - assert_eq(simple_pdf, simple_gdf) - - # Write out the gdf using the GPU accelerated writer - simple_gdf.to_parquet(gdf_fname.strpath, index=None) - simple_pdf.to_parquet(pdf_fname.strpath, index=None) - - assert os.path.exists(gdf_fname) - assert os.path.exists(pdf_fname) - - expect = pd.read_parquet(pdf_fname) - got = pd.read_parquet(gdf_fname) - - assert_eq(expect, got, check_categorical=False) - - -def test_parquet_writer_gpu_true_index(tmpdir, simple_pdf, simple_gdf): - gdf_fname = tmpdir.join("gdf.parquet") - pdf_fname = tmpdir.join("pdf.parquet") - - assert_eq(simple_pdf, simple_gdf) - - # Write out the gdf using the GPU accelerated writer - simple_gdf.to_parquet(gdf_fname.strpath, index=True) - simple_pdf.to_parquet(pdf_fname.strpath, index=True) - - assert os.path.exists(gdf_fname) - assert os.path.exists(pdf_fname) - - expect = pd.read_parquet(pdf_fname) - got = pd.read_parquet(gdf_fname) - - assert_eq(expect, got, check_categorical=False) - - -def test_parquet_writer_gpu_false_index(tmpdir, simple_pdf, simple_gdf): - gdf_fname = tmpdir.join("gdf.parquet") - pdf_fname = tmpdir.join("pdf.parquet") - - assert_eq(simple_pdf, simple_gdf) - - # Write out the gdf using the GPU accelerated writer - simple_gdf.to_parquet(gdf_fname.strpath, index=False) - simple_pdf.to_parquet(pdf_fname.strpath, index=False) - - assert os.path.exists(gdf_fname) - assert os.path.exists(pdf_fname) - - expect = pd.read_parquet(pdf_fname) - got = pd.read_parquet(gdf_fname) - - assert_eq(expect, got, check_categorical=False) - - -def test_parquet_writer_gpu_multi_index(tmpdir, simple_pdf, simple_gdf): - gdf_fname = tmpdir.join("gdf.parquet") - pdf_fname = tmpdir.join("pdf.parquet") - - simple_pdf = simple_pdf.set_index(["col_bool", "col_int8"]) - simple_gdf = simple_gdf.set_index(["col_bool", "col_int8"]) - - assert_eq(simple_pdf, simple_gdf) - - print("PDF Index Type: " + str(type(simple_pdf.index))) - print("GDF Index Type: " + str(type(simple_gdf.index))) - - # Write out the gdf using the GPU accelerated writer - simple_gdf.to_parquet(gdf_fname.strpath, index=None) - simple_pdf.to_parquet(pdf_fname.strpath, index=None) - - assert os.path.exists(gdf_fname) - assert os.path.exists(pdf_fname) - - expect = pd.read_parquet(pdf_fname) - got = pd.read_parquet(gdf_fname) - - assert_eq(expect, got, check_categorical=False) - - -def test_parquet_writer_gpu_chunked(tmpdir, simple_pdf, simple_gdf): - gdf_fname = tmpdir.join("gdf.parquet") - - writer = ParquetWriter(gdf_fname) - writer.write_table(simple_gdf) - writer.write_table(simple_gdf) - writer.close() - - assert_eq(pd.read_parquet(gdf_fname), pd.concat([simple_pdf, simple_pdf])) - - -def test_parquet_writer_gpu_chunked_context(tmpdir, simple_pdf, simple_gdf): - gdf_fname = tmpdir.join("gdf.parquet") - - with ParquetWriter(gdf_fname) as writer: - writer.write_table(simple_gdf) - writer.write_table(simple_gdf) - - got = pd.read_parquet(gdf_fname) - expect = pd.concat([simple_pdf, simple_pdf]) - assert_eq(got, expect) - - -def test_parquet_write_bytes_io(simple_gdf): - output = BytesIO() - simple_gdf.to_parquet(output) - assert_eq(cudf.read_parquet(output), simple_gdf) - - -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_writer_bytes_io(simple_gdf, store_schema): - output = BytesIO() - - writer = ParquetWriter(output, store_schema=store_schema) - writer.write_table(simple_gdf) - 
writer.write_table(simple_gdf) - writer.close() - - assert_eq(cudf.read_parquet(output), cudf.concat([simple_gdf, simple_gdf])) - - -@pytest.mark.parametrize( - "row_group_size_kwargs", - [ - {"row_group_size_bytes": 4 * 1024}, - {"row_group_size_rows": 5000}, - ], -) -def test_parquet_writer_row_group_size(tmpdir, row_group_size_kwargs): - # Check that row_group_size options are exposed in Python - # See https://github.com/rapidsai/cudf/issues/10978 - - size = 20000 - gdf = cudf.DataFrame({"a": range(size), "b": [1] * size}) - - fname = tmpdir.join("gdf.parquet") - with ParquetWriter(fname, **row_group_size_kwargs) as writer: - writer.write_table(gdf) - - # Simple check for multiple row-groups - nrows, nrow_groups, columns, _, _ = cudf.io.parquet.read_parquet_metadata( - fname - ) - assert nrows == size - assert nrow_groups > 1 - assert columns == ["a", "b"] - - # Know the specific row-group count for row_group_size_rows - if "row_group_size_rows" in row_group_size_kwargs: - assert ( - nrow_groups == size // row_group_size_kwargs["row_group_size_rows"] - ) - - assert_eq(cudf.read_parquet(fname), gdf) - - -def test_parquet_writer_column_index(tmpdir): - # Simple test for presence of indices. validity is checked - # in libcudf tests. - # Write 2 files, one with column index set, one without. - # Make sure the former is larger in size. - - size = 20000 - gdf = cudf.DataFrame({"a": range(size), "b": [1] * size}) - - fname = tmpdir.join("gdf.parquet") - with ParquetWriter(fname, statistics="ROWGROUP") as writer: - writer.write_table(gdf) - s1 = os.path.getsize(fname) - - fname = tmpdir.join("gdfi.parquet") - with ParquetWriter(fname, statistics="COLUMN") as writer: - writer.write_table(gdf) - s2 = os.path.getsize(fname) - assert s2 > s1 - - -@pytest.mark.parametrize( - "max_page_size_kwargs", - [ - {"max_page_size_bytes": 4 * 1024}, - {"max_page_size_rows": 5000}, - ], -) -def test_parquet_writer_max_page_size(tmpdir, max_page_size_kwargs): - # Check that max_page_size options are exposed in Python - # Since we don't have access to page metadata, instead check that - # file written with more pages will be slightly larger - - size = 20000 - gdf = cudf.DataFrame({"a": range(size), "b": [1] * size}) - - fname = tmpdir.join("gdf.parquet") - with ParquetWriter(fname, **max_page_size_kwargs) as writer: - writer.write_table(gdf) - s1 = os.path.getsize(fname) - - assert_eq(cudf.read_parquet(fname), gdf) - - fname = tmpdir.join("gdf0.parquet") - with ParquetWriter(fname) as writer: - writer.write_table(gdf) - s2 = os.path.getsize(fname) - - assert_eq(cudf.read_parquet(fname), gdf) - assert s1 > s2 - - -@pytest.mark.parametrize("use_dict", [False, True]) -@pytest.mark.parametrize("max_dict_size", [0, 1048576]) -def test_parquet_writer_dictionary_setting(use_dict, max_dict_size): - # Simple test for checking the validity of dictionary encoding setting - # and behavior of ParquetWriter in cudf. - # Write a table with repetitive data with varying dictionary settings. - # Make sure the written columns are dictionary-encoded accordingly. 
- - # Table with repetitive data - table = cudf.DataFrame( - { - "int32": cudf.Series([1024] * 1024, dtype="int64"), - } - ) - - # Write to Parquet using ParquetWriter - buffer = BytesIO() - writer = ParquetWriter( - buffer, - use_dictionary=use_dict, - max_dictionary_size=max_dict_size, - ) - writer.write_table(table) - writer.close() - - # Read encodings from parquet file - got = pq.ParquetFile(buffer) - encodings = got.metadata.row_group(0).column(0).encodings - - # Check for `PLAIN_DICTIONARY` encoding if dictionary encoding enabled - # and dictionary page limit > 0 - if use_dict is True and max_dict_size > 0: - assert "PLAIN_DICTIONARY" in encodings - else: - assert "PLAIN_DICTIONARY" not in encodings - - -@pytest.mark.parametrize("filename", ["myfile.parquet", None]) -@pytest.mark.parametrize("cols", [["b"], ["c", "b"]]) -def test_parquet_partitioned(tmpdir_factory, cols, filename): - # Checks that write_to_dataset is wrapping to_parquet - # as expected - gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) - pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) - size = 100 - pdf = pd.DataFrame( - { - "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), - } - ) - pdf.to_parquet(pdf_dir, index=False, partition_cols=cols) - gdf = cudf.from_pandas(pdf) - gdf.to_parquet( - gdf_dir, index=False, partition_cols=cols, partition_file_name=filename - ) - - # Read back with pandas to compare - expect_pd = pd.read_parquet(pdf_dir) - got_pd = pd.read_parquet(gdf_dir) - assert_eq(expect_pd, got_pd) - - # Check that cudf and pd return the same read - got_cudf = cudf.read_parquet(gdf_dir) - if isinstance(got_pd["c"].dtype, pd.CategoricalDtype): - # Work-around for pandas bug: - # https://github.com/pandas-dev/pandas/issues/53345 - got_pd["c"] = got_pd["c"].astype( - pd.CategoricalDtype( - categories=got_pd["c"].dtype.categories.astype("int64"), - ordered=got_pd["c"].dtype.ordered, - ) - ) - assert_eq(got_pd, got_cudf) - - # If filename is specified, check that it is correct - if filename: - for _, _, files in os.walk(gdf_dir): - for fn in files: - assert fn == filename - - -@pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}]) -def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): - # Checks that write_to_dataset is wrapping to_parquet - # as expected - pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) - size = 100 - pdf = pd.DataFrame( - { - "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), - } - ) - pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"]) - - with pytest.raises(NotImplementedError): - cudf.read_parquet(pdf_dir, **kwargs) - - -@pytest.mark.parametrize("return_meta", [True, False]) -def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta): - pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) - gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) - - df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) - df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) - - cw = ParquetDatasetWriter(gdf_dir, partition_cols=["a"], index=False) - cw.write_table(df1) - cw.write_table(df2) - meta_byte_array = cw.close(return_metadata=return_meta) - pdf = cudf.concat([df1, df2]).to_pandas() - pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"]) - - if return_meta: - fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata - assert fmd.num_rows == len(pdf) - 
assert fmd.num_row_groups == 4 - files = { - os.path.join(directory, files[0]) - for directory, _, files in os.walk(gdf_dir) - if files - } - meta_files = { - os.path.join(gdf_dir, fmd.row_group(i).column(c).file_path) - for i in range(fmd.num_row_groups) - for c in range(fmd.row_group(i).num_columns) - } - assert files == meta_files - - # Read back with pandas to compare - expect_pd = pd.read_parquet(pdf_dir) - got_pd = pd.read_parquet(gdf_dir) - assert_eq(expect_pd, got_pd) - - # Check that cudf and pd return the same read - got_cudf = cudf.read_parquet(gdf_dir) - - # Work-around for pandas bug: - # https://github.com/pandas-dev/pandas/issues/53345 - got_pd["a"] = got_pd["a"].astype( - pd.CategoricalDtype( - categories=got_pd["a"].dtype.categories.astype("int64"), - ordered=got_pd["a"].dtype.ordered, - ) - ) - assert_eq(got_pd, got_cudf) - - -@pytest.mark.parametrize( - "max_file_size,max_file_size_in_bytes", - [("500KB", 500000), ("MB", 1000000)], -) -def test_parquet_writer_chunked_max_file_size( - tmpdir_factory, max_file_size, max_file_size_in_bytes -): - pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) - gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) - - df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1] * 10000, "b": range(0, 50000)}) - df2 = cudf.DataFrame( - {"a": [1, 3, 3, 1, 3] * 10000, "b": range(50000, 100000)} - ) - - cw = ParquetDatasetWriter( - gdf_dir, - partition_cols=["a"], - max_file_size=max_file_size, - file_name_prefix="sample", - ) - cw.write_table(df1) - cw.write_table(df2) - cw.close() - pdf = cudf.concat([df1, df2]).to_pandas() - pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"]) - - expect_pd = pd.read_parquet(pdf_dir) - got_pd = pd.read_parquet(gdf_dir) - - assert_eq( - expect_pd.sort_values(["b"]).reset_index(drop=True), - got_pd.sort_values(["b"]).reset_index(drop=True), - ) - - # Check that cudf and pd return the same read - got_cudf = cudf.read_parquet(gdf_dir) - - # Work-around for pandas bug: - # https://github.com/pandas-dev/pandas/issues/53345 - got_pd["a"] = got_pd["a"].astype( - pd.CategoricalDtype( - categories=got_pd["a"].dtype.categories.astype("int64"), - ordered=got_pd["a"].dtype.ordered, - ) - ) - assert_eq( - got_pd.sort_values(["b"]).reset_index(drop=True), - got_cudf.sort_values(["b"]).reset_index(drop=True), - ) - - all_files = glob.glob(gdf_dir + "/**/*.parquet", recursive=True) - for each_file in all_files: - # Validate file sizes with some extra 1000 - # bytes buffer to spare - assert os.path.getsize(each_file) <= ( - max_file_size_in_bytes - ), "File exceeded max_file_size" - - -def test_parquet_writer_chunked_max_file_size_error(): - with pytest.raises( - ValueError, - match="file_name_prefix cannot be None if max_file_size is passed", - ): - ParquetDatasetWriter("sample", partition_cols=["a"], max_file_size=100) - - -def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): - pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) - gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) - - df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]}) - df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]}) - - with ParquetDatasetWriter( - gdf_dir, partition_cols=["a"], index=False - ) as cw: - cw.write_table(df1) - cw.write_table(df2) - - pdf = cudf.concat([df1, df2]).to_pandas() - pdf.to_parquet(pdf_dir, index=False, partition_cols=["a"]) - - # Read back with pandas to compare - expect_pd = pd.read_parquet(pdf_dir) - got_pd = pd.read_parquet(gdf_dir) - assert_eq(expect_pd, got_pd) - - # Check that cudf and pd return the 
same read - got_cudf = cudf.read_parquet(gdf_dir) - - # Work-around for pandas bug: - # https://github.com/pandas-dev/pandas/issues/53345 - got_pd["a"] = got_pd["a"].astype( - pd.CategoricalDtype( - categories=got_pd["a"].dtype.categories.astype("int64"), - ordered=got_pd["a"].dtype.ordered, - ) - ) - assert_eq(got_pd, got_cudf) - - -@pytest.mark.parametrize("cols", [None, ["b"]]) -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): - dir1 = tmpdir_factory.mktemp("dir1") - dir2 = tmpdir_factory.mktemp("dir2") - if cols is None: - dir1 = dir1.join("file.pq") - dir2 = dir2.join("file.pq") - dir1 = str(dir1) - dir2 = str(dir2) - - size = 100 - gdf = cudf.DataFrame( - { - "a": np.arange(0, stop=size), - "b": np.random.choice(np.arange(4), size=size), - } - ) - gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) - cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols) - - # Read back with cudf - expect = cudf.read_parquet(dir1) - got = cudf.read_parquet(dir2) - assert_eq(expect, got) - - gdf = cudf.DataFrame( - { - "a": cudf.Series([1, 2, 3]), - "b": cudf.Series([1, 2, 3]), - "c": cudf.Series(["a", "b", "c"], dtype="category"), - } - ) - with pytest.raises(ValueError): - gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) - - -@pytest.mark.parametrize( - "pfilters", - [[("b", "==", "b")], [("b", "==", "a"), ("c", "==", 1)]], -) -@pytest.mark.parametrize("selection", ["directory", "files", "row-groups"]) -@pytest.mark.parametrize("use_cat", [True, False]) -def test_read_parquet_partitioned_filtered( - tmpdir, pfilters, selection, use_cat -): - rng = np.random.default_rng(2) - path = str(tmpdir) - size = 100 - df = cudf.DataFrame( - { - "a": np.arange(0, stop=size, dtype="int64"), - "b": rng.choice(list("abcd"), size=size), - "c": rng.choice(np.arange(4), size=size), - } - ) - df.to_parquet(path, partition_cols=["c", "b"]) - - if selection == "files": - # Pass in a list of paths - fs = get_fs_token_paths(path)[0] - read_path = fs.find(path) - row_groups = None - elif selection == "row-groups": - # Pass in a list of paths AND row-group ids - fs = get_fs_token_paths(path)[0] - read_path = fs.find(path) - row_groups = [[0] for p in read_path] - else: - # Pass in a directory path - # (row-group selection not allowed in this case) - read_path = path - row_groups = None - - # Filter on partitioned columns - expect = pd.read_parquet(read_path, filters=pfilters) - got = cudf.read_parquet( - read_path, - filters=pfilters, - row_groups=row_groups, - categorical_partitions=use_cat, - ) - expect["b"] = expect["b"].astype(str) - expect["c"] = expect["c"].astype(int) - if use_cat: - assert got.dtypes["b"] == "category" - assert got.dtypes["c"] == "category" - got["b"] = got["b"].astype(str) - got["c"] = got["c"].astype(int) - else: - # Check that we didn't get categorical - # columns, but convert back to categorical - # for comparison with pandas - assert got.dtypes["b"] == "object" - assert got.dtypes["c"] == "int" - assert_eq(expect, got) - - # Filter on non-partitioned column - filters = [("a", "==", 10)] - got = cudf.read_parquet(read_path, filters=filters) - expect = pd.read_parquet(read_path, filters=filters) - - # Filter on both kinds of columns - filters = [[("a", "==", 10)], [("c", "==", 1)]] - got = cudf.read_parquet(read_path, filters=filters) - expect = pd.read_parquet(read_path, filters=filters) - - # Work-around for pandas bug: - # https://github.com/pandas-dev/pandas/issues/53345 - 
expect["c"] = expect["c"].astype( - pd.CategoricalDtype( - categories=expect["c"].dtype.categories.astype("int64"), - ordered=expect["c"].dtype.ordered, - ) - ) - assert_eq(expect, got) - - -def test_parquet_writer_chunked_metadata(tmpdir, simple_pdf, simple_gdf): - gdf_fname = tmpdir.join("gdf.parquet") - test_path = "test/path" - - writer = ParquetWriter(gdf_fname) - writer.write_table(simple_gdf) - writer.write_table(simple_gdf) - meta_byte_array = writer.close(metadata_file_path=test_path) - fmd = pq.ParquetFile(BytesIO(meta_byte_array)).metadata - - assert fmd.num_rows == 2 * len(simple_gdf) - assert fmd.num_row_groups == 2 - - for r in range(fmd.num_row_groups): - for c in range(fmd.num_columns): - assert fmd.row_group(r).column(c).file_path == test_path - - -def test_write_read_cudf(tmpdir, pdf): - file_path = tmpdir.join("cudf.parquet") - if "col_category" in pdf.columns: - pdf = pdf.drop(columns=["col_category"]) - - gdf = cudf.from_pandas(pdf) - gdf.to_parquet(file_path) - gdf = cudf.read_parquet(file_path) - - assert_eq(gdf, pdf, check_index_type=not pdf.empty) - - -def test_write_cudf_read_pandas_pyarrow(tmpdir, pdf): - cudf_path = tmpdir.join("cudf.parquet") - pandas_path = tmpdir.join("pandas.parquet") - - if "col_category" in pdf.columns: - pdf = pdf.drop(columns=["col_category"]) - - df = cudf.from_pandas(pdf) - - df.to_parquet(cudf_path) - pdf.to_parquet(pandas_path) - - cudf_res = pd.read_parquet(cudf_path) - pd_res = pd.read_parquet(pandas_path) - - assert_eq(pd_res, cudf_res, check_index_type=not pdf.empty) - - cudf_res = pa.parquet.read_table( - cudf_path, use_pandas_metadata=True - ).to_pandas() - pd_res = pa.parquet.read_table( - pandas_path, use_pandas_metadata=True - ).to_pandas() - - assert_eq(cudf_res, pd_res, check_index_type=not pdf.empty) - - -def test_parquet_writer_criteo(tmpdir): - # To run this test, download the day 0 of criteo dataset from - # http://labs.criteo.com/2013/12/download-terabyte-click-logs/ - # and place the uncompressed dataset in the home directory - fname = os.path.expanduser("~/day_0") - if not os.path.isfile(fname): - pytest.skip("Local criteo day 0 tsv file is not found") - - cudf_path = tmpdir.join("cudf.parquet") - - cont_names = ["I" + str(x) for x in range(1, 14)] - cat_names = ["C" + str(x) for x in range(1, 27)] - cols = ["label"] + cont_names + cat_names - - df = cudf.read_csv(fname, sep="\t", names=cols, byte_range=(0, 1000000000)) - df = df.drop(columns=cont_names) - - df.to_parquet(cudf_path) - - -def test_trailing_nans(datadir, tmpdir): - fname = "trailing_nans.parquet" - file_path = datadir / fname - cu_df = cudf.read_parquet(file_path) - - tmp_file_path = tmpdir.join(fname) - cu_df.to_parquet(tmp_file_path) - - pd.read_parquet(tmp_file_path) - - -def test_parquet_writer_sliced(tmpdir): - cudf_path = tmpdir.join("cudf.parquet") - - df = pd.DataFrame() - df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - df = cudf.from_pandas(df) - - df_select = df.iloc[1:3] - - df_select.to_parquet(cudf_path) - assert_eq(cudf.read_parquet(cudf_path), df_select) - - -def test_parquet_writer_list_basic(tmpdir): - expect = pd.DataFrame({"a": [[[1, 2], [3, 4]], None, [[5, 6], None]]}) - fname = tmpdir.join("test_parquet_writer_list_basic.parquet") - - gdf = cudf.from_pandas(expect) - - gdf.to_parquet(fname) - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - assert_eq(expect, got) - - -def test_parquet_writer_list_large(tmpdir): - expect = pd.DataFrame({"a": list_gen(int_gen, 256, 80, 50)}) - fname = 
tmpdir.join("test_parquet_writer_list_large.parquet") - - gdf = cudf.from_pandas(expect) - - gdf.to_parquet(fname) - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - assert_eq(expect, got) - - -def test_parquet_writer_list_large_mixed(tmpdir): - expect = pd.DataFrame( - { - "a": list_gen(string_gen, 128, 80, 50), - "b": list_gen(int_gen, 128, 80, 50), - "c": list_gen(int_gen, 128, 80, 50, include_validity=True), - "d": list_gen(string_gen, 128, 80, 50, include_validity=True), - } - ) - fname = tmpdir.join("test_parquet_writer_list_large_mixed.parquet") - gdf = cudf.from_pandas(expect) - - gdf.to_parquet(fname) - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - assert_eq(expect, got) - - -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_writer_list_chunked(tmpdir, store_schema): - if store_schema and version.parse(pa.__version__) < version.parse( - "15.0.0" - ): - pytest.skip("https://github.com/apache/arrow/pull/37792") - table1 = cudf.DataFrame( - { - "a": list_gen(string_gen, 128, 80, 50), - "b": list_gen(int_gen, 128, 80, 50), - "c": list_gen(int_gen, 128, 80, 50, include_validity=True), - "d": list_gen(string_gen, 128, 80, 50, include_validity=True), - } - ) - table2 = cudf.DataFrame( - { - "a": list_gen(string_gen, 128, 80, 50), - "b": list_gen(int_gen, 128, 80, 50), - "c": list_gen(int_gen, 128, 80, 50, include_validity=True), - "d": list_gen(string_gen, 128, 80, 50, include_validity=True), - } - ) - fname = tmpdir.join("test_parquet_writer_list_chunked.parquet") - expect = cudf.concat([table1, table2]) - expect = expect.reset_index(drop=True) - - writer = ParquetWriter(fname, store_schema=store_schema) - writer.write_table(table1) - writer.write_table(table2) - writer.close() - - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - assert_eq(expect, got) - - -@pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) -def test_parquet_nullable_boolean(tmpdir, engine): - pandas_path = tmpdir.join("pandas_bools.parquet") - - pdf = pd.DataFrame( - { - "a": pd.Series( - [True, False, None, True, False], dtype=pd.BooleanDtype() - ) - } - ) - expected_gdf = cudf.DataFrame({"a": [True, False, None, True, False]}) - - pdf.to_parquet(pandas_path) - with _hide_pyarrow_parquet_cpu_warnings(engine): - actual_gdf = cudf.read_parquet(pandas_path, engine=engine) - - assert_eq(actual_gdf, expected_gdf) - - -def run_parquet_index(pdf, index): - pandas_buffer = BytesIO() - cudf_buffer = BytesIO() - - gdf = cudf.from_pandas(pdf) - - pdf.to_parquet(pandas_buffer, index=index) - gdf.to_parquet(cudf_buffer, index=index) - - expected = pd.read_parquet(cudf_buffer) - actual = cudf.read_parquet(pandas_buffer) - - assert_eq(expected, actual, check_index_type=True) - - expected = pd.read_parquet(pandas_buffer) - actual = cudf.read_parquet(cudf_buffer) - - assert_eq( - expected, - actual, - check_index_type=True, - ) - - -@pytest.mark.parametrize( - "pdf", - [ - pd.DataFrame(index=[1, 2, 3]), - pd.DataFrame({"a": [1, 2, 3]}, index=[0.43534, 345, 0.34534]), - pd.DataFrame( - {"b": [11, 22, 33], "c": ["a", "b", "c"]}, - index=pd.Index(["a", "b", "c"], name="custom name"), - ), - pd.DataFrame( - {"a": [10, 11, 12], "b": [99, 88, 77]}, - index=pd.RangeIndex(12, 17, 2), - ), - pd.DataFrame( - {"b": [99, 88, 77]}, - index=pd.RangeIndex(22, 27, 2, name="hello index"), - ), - pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")), - pd.DataFrame( - {"a": ["a", "bb", "cc"], "b": [10, 21, 32]}, - index=pd.MultiIndex.from_tuples([[1, 2], [10, 11], 
[15, 16]]), - ), - pd.DataFrame( - {"a": ["a", "bb", "cc"], "b": [10, 21, 32]}, - index=pd.MultiIndex.from_tuples( - [[1, 2], [10, 11], [15, 16]], names=["first", "second"] - ), - ), - ], -) -@pytest.mark.parametrize("index", [None, True, False]) -def test_parquet_index(pdf, index): - run_parquet_index(pdf, index) - - -@pytest.mark.parametrize( - "index", - [ - pytest.param( - None, - marks=pytest.mark.xfail( - reason="https://github.com/apache/arrow/issues/40743" - ), - ), - True, - ], -) -def test_parquet_index_empty(index): - pdf = pd.DataFrame(index=pd.RangeIndex(0, 10, 1)) - run_parquet_index(pdf, index) - - -def test_parquet_no_index_empty(): - pdf = pd.DataFrame(index=pd.RangeIndex(0, 10, 1)) - run_parquet_index(pdf, index=False) - - -@pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) -def test_parquet_allnull_str(tmpdir, engine): - pandas_path = tmpdir.join("pandas_allnulls.parquet") - - pdf = pd.DataFrame( - {"a": pd.Series([None, None, None, None, None], dtype="str")} - ) - expected_gdf = cudf.DataFrame( - {"a": cudf.Series([None, None, None, None, None], dtype="str")} - ) - - pdf.to_parquet(pandas_path) - with _hide_pyarrow_parquet_cpu_warnings(engine): - actual_gdf = cudf.read_parquet(pandas_path, engine=engine) - - assert_eq(actual_gdf, expected_gdf) - - -def normalized_equals(value1, value2): - if value1 is pd.NA or value1 is pd.NaT: - value1 = None - if value2 is pd.NA or value2 is pd.NaT: - value2 = None - if isinstance(value1, np.datetime64): - value1 = pd.Timestamp(value1).to_pydatetime() - if isinstance(value2, np.datetime64): - value2 = pd.Timestamp(value2).to_pydatetime() - if isinstance(value1, pd.Timestamp): - value1 = value1.to_pydatetime() - if isinstance(value2, pd.Timestamp): - value2 = value2.to_pydatetime() - if isinstance(value1, datetime.datetime): - value1 = value1.replace(tzinfo=None) - if isinstance(value2, datetime.datetime): - value2 = value2.replace(tzinfo=None) - if isinstance(value1, pd.Timedelta): - unit = "ms" if value1.unit == "s" else value1.unit - value2 = pd.Timedelta(value2, unit=unit) - - # if one is datetime then both values are datetimes now - if isinstance(value1, datetime.datetime): - return value1 == value2 - - # Compare integers with floats now - if isinstance(value1, float) or isinstance(value2, float): - return math.isclose(value1, value2) - - return value1 == value2 - - -@pytest.mark.parametrize("add_nulls", [True, False]) -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema): - if store_schema and version.parse(pa.__version__) < version.parse( - "15.0.0" - ): - pytest.skip("https://github.com/apache/arrow/pull/37792") - file_path = tmpdir.join("cudf.parquet") - if "col_category" in pdf.columns: - pdf = pdf.drop(columns=["col_category", "col_bool"]) - - if not add_nulls: - # Timedelta types convert NaT to None when reading from parquet into - # pandas which interferes with series.max()/min() - for t in TIMEDELTA_TYPES: - pdf["col_" + t] = pd.Series(np.arange(len(pdf.index))).astype(t) - # pyarrow can't read values with non-zero nanoseconds - pdf["col_timedelta64[ns]"] = pdf["col_timedelta64[ns]"] * 1000 - - gdf = cudf.from_pandas(pdf) - if add_nulls: - for col in gdf: - set_random_null_mask_inplace(gdf[col]) - gdf.to_parquet(file_path, index=False, store_schema=store_schema) - - # Read back from pyarrow - pq_file = pq.ParquetFile(file_path) - # verify each row group's statistics - for rg in range(0, pq_file.num_row_groups): - pd_slice = 
pq_file.read_row_group(rg).to_pandas() - - # statistics are per-column. So need to verify independently - for i, col in enumerate(pd_slice): - stats = pq_file.metadata.row_group(rg).column(i).statistics - - actual_min = pd_slice[col].min() - stats_min = stats.min - assert normalized_equals(actual_min, stats_min) - - actual_max = pd_slice[col].max() - stats_max = stats.max - assert normalized_equals(actual_max, stats_max) - - assert stats.null_count == pd_slice[col].isna().sum() - assert stats.num_values == pd_slice[col].count() - - -def test_parquet_writer_list_statistics(tmpdir): - df = pd.DataFrame( - { - "a": list_gen(string_gen, 128, 80, 50), - "b": list_gen(int_gen, 128, 80, 50), - "c": list_gen(int_gen, 128, 80, 50, include_validity=True), - "d": list_gen(string_gen, 128, 80, 50, include_validity=True), - } - ) - fname = tmpdir.join("test_parquet_writer_list_statistics.parquet") - gdf = cudf.from_pandas(df) - - gdf.to_parquet(fname) - assert os.path.exists(fname) - - # Read back from pyarrow - pq_file = pq.ParquetFile(fname) - # verify each row group's statistics - for rg in range(0, pq_file.num_row_groups): - pd_slice = pq_file.read_row_group(rg).to_pandas() - - # statistics are per-column. So need to verify independently - for i, col in enumerate(pd_slice): - stats = pq_file.metadata.row_group(rg).column(i).statistics - - actual_min = pd_slice[col].explode().explode().dropna().min() - stats_min = stats.min - assert normalized_equals(actual_min, stats_min) - - actual_max = pd_slice[col].explode().explode().dropna().max() - stats_max = stats.max - assert normalized_equals(actual_max, stats_max) - - -@pytest.mark.parametrize( - "data", - [ - # Structs - { - "being": [ - None, - {"human?": True, "Deets": {"Name": "Carrot", "Age": 27}}, - {"human?": None, "Deets": {"Name": "Angua", "Age": 25}}, - {"human?": False, "Deets": {"Name": "Cheery", "Age": 31}}, - {"human?": False, "Deets": None}, - {"human?": None, "Deets": {"Name": "Mr", "Age": None}}, - ] - }, - # List of Structs - { - "family": [ - [None, {"human?": True, "deets": {"weight": 2.4, "age": 27}}], - [ - {"human?": None, "deets": {"weight": 5.3, "age": 25}}, - {"human?": False, "deets": {"weight": 8.0, "age": 31}}, - {"human?": False, "deets": None}, - ], - [], - [{"human?": None, "deets": {"weight": 6.9, "age": None}}], - ] - }, - # Struct of Lists - { - "Real estate records": [ - None, - { - "Status": "NRI", - "Ownerships": { - "land_unit": [None, 2, None], - "flats": [[1, 2, 3], [], [4, 5], [], [0, 6, 0]], - }, - }, - { - "Status": None, - "Ownerships": { - "land_unit": [4, 5], - "flats": [[7, 8], []], - }, - }, - { - "Status": "RI", - "Ownerships": {"land_unit": None, "flats": [[]]}, - }, - {"Status": "RI", "Ownerships": None}, - { - "Status": None, - "Ownerships": { - "land_unit": [7, 8, 9], - "flats": [[], [], []], - }, - }, - ] - }, - ], -) -def test_parquet_writer_nested(tmpdir, data): - expect = pd.DataFrame(data) - gdf = cudf.from_pandas(expect) - - fname = tmpdir.join("test_parquet_writer_nested.parquet") - gdf.to_parquet(fname) - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -@pytest.mark.parametrize("data", [[1, 2, 3], [0.00, 0.01, None, 0.5]]) -def test_parquet_writer_decimal(decimal_type, data): - gdf = cudf.DataFrame({"val": data}) - - gdf["dec_val"] = gdf["val"].astype(decimal_type(7, 2)) - - buff = BytesIO() - gdf.to_parquet(buff) - - got = 
pd.read_parquet(buff, dtype_backend="numpy_nullable") - assert_eq(gdf["val"].to_pandas(nullable=True), got["val"]) - assert_eq(gdf["dec_val"].to_pandas(), got["dec_val"]) - - -def test_parquet_writer_column_validation(): - cudf_parquet = BytesIO() - pandas_parquet = BytesIO() - df = cudf.DataFrame({1: [1, 2, 3], "a": ["a", "b", "c"]}) - pdf = df.to_pandas() - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.warns(UserWarning): - df.to_parquet(cudf_parquet) - - with pytest.warns(UserWarning): - pdf.to_parquet(pandas_parquet) - - assert_eq( - pd.read_parquet(cudf_parquet), - cudf.read_parquet(pandas_parquet), - ) - assert_eq( - cudf.read_parquet(cudf_parquet), - pd.read_parquet(pandas_parquet), - ) - - with cudf.option_context("mode.pandas_compatible", False): - with pytest.raises(ValueError): - df.to_parquet(cudf_parquet) - - -def test_parquet_writer_nulls_pandas_read(tmpdir, pdf): - if "col_bool" in pdf.columns: - pdf.drop(columns="col_bool", inplace=True) - if "col_category" in pdf.columns: - pdf.drop(columns="col_category", inplace=True) - gdf = cudf.from_pandas(pdf) - - num_rows = len(gdf) - - if num_rows > 0: - for col in gdf.columns: - gdf[col][random.randint(0, num_rows - 1)] = None - - fname = tmpdir.join("test_parquet_writer_nulls_pandas_read.parquet") - gdf.to_parquet(fname) - assert os.path.exists(fname) - - got = pd.read_parquet(fname) - nullable = num_rows > 0 - - if nullable: - gdf = gdf.drop(columns="col_datetime64[ms]") - gdf = gdf.drop(columns="col_datetime64[us]") - got = got.drop(columns="col_datetime64[ms]") - got = got.drop(columns="col_datetime64[us]") - - assert_eq(gdf.to_pandas(nullable=nullable), got) - - -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -def test_parquet_decimal_precision(tmpdir, decimal_type): - df = cudf.DataFrame({"val": ["3.5", "4.2"]}).astype(decimal_type(5, 2)) - assert df.val.dtype.precision == 5 - - fname = tmpdir.join("decimal_test.parquet") - df.to_parquet(fname) - df = cudf.read_parquet(fname) - assert df.val.dtype.precision == 5 - - -def test_parquet_decimal_precision_empty(tmpdir): - df = ( - cudf.DataFrame({"val": ["3.5", "4.2"]}) - .astype(cudf.Decimal64Dtype(5, 2)) - .iloc[:0] - ) - assert df.val.dtype.precision == 5 - - fname = tmpdir.join("decimal_test.parquet") - df.to_parquet(fname) - df = cudf.read_parquet(fname) - assert df.val.dtype.precision == 5 - - -def test_parquet_reader_brotli(datadir): - fname = datadir / "brotli_int16.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname).to_pandas(nullable=True) - - assert_eq(expect, got) - - -def test_parquet_reader_one_level_list(datadir): - fname = datadir / "one_level_list.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -def test_parquet_reader_binary_decimal(datadir): - fname = datadir / "binary_decimal.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname).to_pandas() - - assert_eq(expect, got) - - -def test_parquet_reader_fixed_bin(datadir): - fname = datadir / "fixed_len_byte_array.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -def test_parquet_reader_fixed_len_with_dict(tmpdir): - def flba(i): - hasher = hashlib.sha256() - hasher.update(i.to_bytes(4, "little")) - return hasher.digest() - - # use pyarrow to write table of fixed_len_byte_array - num_rows = 200 - data = pa.array([flba(i) for i in range(num_rows)], 
type=pa.binary(32)) - padf = pa.Table.from_arrays([data], names=["flba"]) - padf_fname = tmpdir.join("padf.parquet") - pq.write_table(padf, padf_fname, use_dictionary=True) - - expect = pd.read_parquet(padf_fname) - got = cudf.read_parquet(padf_fname) - assert_eq(expect, got) - - -def test_parquet_flba_round_trip(tmpdir): - def flba(i): - hasher = hashlib.sha256() - hasher.update(i.to_bytes(4, "little")) - return hasher.digest() - - # use pyarrow to write table of fixed_len_byte_array - num_rows = 200 - data = pa.array([flba(i) for i in range(num_rows)], type=pa.binary(32)) - padf = pa.Table.from_arrays([data], names=["flba"]) - padf_fname = tmpdir.join("padf.parquet") - pq.write_table(padf, padf_fname) - - # round trip data with cudf - cdf = cudf.read_parquet(padf_fname) - cdf_fname = tmpdir.join("cdf.parquet") - cdf.to_parquet(cdf_fname, column_type_length={"flba": 32}) - - # now read back in with pyarrow to test it was written properly by cudf - padf2 = pq.read_table(padf_fname) - padf3 = pq.read_table(cdf_fname) - assert_eq(padf2, padf3) - assert_eq(padf2.schema[0].type, padf3.schema[0].type) - - -@pytest.mark.parametrize( - "encoding", - [ - "PLAIN", - "DICTIONARY", - "DELTA_BINARY_PACKED", - "BYTE_STREAM_SPLIT", - "USE_DEFAULT", - ], -) -def test_per_column_options(tmpdir, encoding): - pdf = pd.DataFrame({"ilist": [[1, 2, 3, 1, 2, 3]], "i1": [1]}) - cdf = cudf.from_pandas(pdf) - fname = tmpdir.join("ilist.parquet") - cdf.to_parquet( - fname, - column_encoding={"ilist.list.element": encoding}, - compression="SNAPPY", - skip_compression={"ilist.list.element"}, - ) - # DICTIONARY and USE_DEFAULT should both result in a PLAIN_DICTIONARY encoding in parquet - encoding_name = ( - "PLAIN_DICTIONARY" - if encoding == "DICTIONARY" or encoding == "USE_DEFAULT" - else encoding - ) - pf = pq.ParquetFile(fname) - fmd = pf.metadata - assert encoding_name in fmd.row_group(0).column(0).encodings - assert fmd.row_group(0).column(0).compression == "UNCOMPRESSED" - assert fmd.row_group(0).column(1).compression == "SNAPPY" - - -@pytest.mark.parametrize( - "encoding", - ["DELTA_LENGTH_BYTE_ARRAY", "DELTA_BYTE_ARRAY"], -) -def test_per_column_options_string_col(tmpdir, encoding): - pdf = pd.DataFrame({"s": ["a string"], "i1": [1]}) - cdf = cudf.from_pandas(pdf) - fname = tmpdir.join("strcol.parquet") - cdf.to_parquet( - fname, - column_encoding={"s": encoding}, - compression="SNAPPY", - ) - pf = pq.ParquetFile(fname) - fmd = pf.metadata - assert encoding in fmd.row_group(0).column(0).encodings - - -@pytest.mark.skipif( - version.parse(pa.__version__) < version.parse("16.0.0"), - reason="https://github.com/apache/arrow/pull/39748", -) -@pytest.mark.parametrize( - "num_rows", - [200, 10000], -) -def test_parquet_bss_round_trip(tmpdir, num_rows): - def flba(i): - hasher = hashlib.sha256() - hasher.update(i.to_bytes(4, "little")) - return hasher.digest() - - # use pyarrow to write table of types that support BYTE_STREAM_SPLIT encoding - rows_per_rowgroup = 5000 - fixed_data = pa.array( - [flba(i) for i in range(num_rows)], type=pa.binary(32) - ) - i32_data = pa.array(list(range(num_rows)), type=pa.int32()) - i64_data = pa.array(list(range(num_rows)), type=pa.int64()) - f32_data = pa.array([float(i) for i in range(num_rows)], type=pa.float32()) - f64_data = pa.array([float(i) for i in range(num_rows)], type=pa.float64()) - padf = pa.Table.from_arrays( - [fixed_data, i32_data, i64_data, f32_data, f64_data], - names=["flba", "i32", "i64", "f32", "f64"], - ) - padf_fname = tmpdir.join("padf.parquet") - 
pq.write_table( - padf, - padf_fname, - column_encoding="BYTE_STREAM_SPLIT", - use_dictionary=False, - row_group_size=rows_per_rowgroup, - ) - - # round trip data with cudf - cdf = cudf.read_parquet(padf_fname) - cdf_fname = tmpdir.join("cdf.parquet") - cdf.to_parquet( - cdf_fname, - column_type_length={"flba": 32}, - column_encoding={ - "flba": "BYTE_STREAM_SPLIT", - "i32": "BYTE_STREAM_SPLIT", - "i64": "BYTE_STREAM_SPLIT", - "f32": "BYTE_STREAM_SPLIT", - "f64": "BYTE_STREAM_SPLIT", - }, - row_group_size_rows=rows_per_rowgroup, - ) - - # now read back in with pyarrow to test it was written properly by cudf - padf2 = pq.read_table(padf_fname) - padf3 = pq.read_table(cdf_fname) - assert_eq(padf2, padf3) - assert_eq(padf2.schema[0].type, padf3.schema[0].type) - - -def test_parquet_reader_rle_boolean(datadir): - fname = datadir / "rle_boolean_encoding.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got) - - -# testing a specific bug-fix/edge case. -# specifically: int a parquet file containing a particular way of representing -# a list column in a schema, the cudf reader was confusing -# nesting information between a list column and a subsequent -# string column, ultimately causing a crash. -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Older versions of pandas do not have DataFrame.map()", -) -def test_parquet_reader_one_level_list2(datadir): - # we are reading in a file containing binary types, but cudf returns - # those as strings. so we have to massage the pandas data to get - # them to compare correctly. - def postprocess(val): - if isinstance(val, bytes): - return val.decode() - elif isinstance(val, np.ndarray): - return np.array([v.decode() for v in val]) - else: - return val - - fname = datadir / "one_level_list2.parquet" - - expect = pd.read_parquet(fname) - expect = expect.map(postprocess) - got = cudf.read_parquet(fname) - - assert_eq(expect, got, check_dtype=False) - - -# testing a specific bug-fix/edge case. 
-# specifically: in a parquet file containing a particular way of representing -# a list column in a schema, the cudf reader was confusing -# nesting information and building a list of list of int instead -# of a list of int -def test_parquet_reader_one_level_list3(datadir): - fname = datadir / "one_level_list3.parquet" - - expect = pd.read_parquet(fname) - got = cudf.read_parquet(fname) - - assert_eq(expect, got, check_dtype=True) - - -@pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000]) -@pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000]) -def test_to_parquet_row_group_size( - tmpdir, large_int64_gdf, size_bytes, size_rows -): - fname = tmpdir.join("row_group_size.parquet") - large_int64_gdf.to_parquet( - fname, row_group_size_bytes=size_bytes, row_group_size_rows=size_rows - ) - - num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata( - fname - ) - # 8 bytes per row, as the column is int64 - expected_num_rows = max( - math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes) - ) - assert expected_num_rows == row_groups - - -@pytest.mark.parametrize("size_rows", [500_000, 100_000, 10_000]) -def test_parquet_row_group_metadata(tmpdir, large_int64_gdf, size_rows): - fname = tmpdir.join("row_group_size.parquet") - large_int64_gdf.to_parquet(fname, row_group_size_rows=size_rows) - - # read file metadata from parquet - ( - num_rows, - row_groups, - _, # col_names - _, # num_columns - row_group_metadata, - ) = cudf.io.read_parquet_metadata(fname) - - # length(RowGroupsMetaData) == number of row groups - assert len(row_group_metadata) == row_groups - # sum of rows in row groups == total rows - assert num_rows == sum( - [row_group["num_rows"] for row_group in row_group_metadata] - ) - - -def test_parquet_reader_decimal_columns(): - df = cudf.DataFrame( - { - "col1": cudf.Series([1, 2, 3], dtype=cudf.Decimal64Dtype(10, 2)), - "col2": [10, 11, 12], - "col3": [12, 13, 14], - "col4": ["a", "b", "c"], - } - ) - buffer = BytesIO() - df.to_parquet(buffer) - - actual = cudf.read_parquet(buffer, columns=["col3", "col2", "col1"]) - expected = pd.read_parquet(buffer, columns=["col3", "col2", "col1"]) - - assert_eq(actual, expected) - - -def test_parquet_reader_zstd_compression(datadir): - fname = datadir / "spark_zstd.parquet" - try: - df = cudf.read_parquet(fname) - pdf = pd.read_parquet(fname) - assert_eq(df, pdf) - except RuntimeError: - pytest.mark.xfail(reason="zstd support is not enabled") - - -def test_read_parquet_multiple_files(tmpdir): - df_1_path = tmpdir / "df_1.parquet" - df_2_path = tmpdir / "df_2.parquet" - df_1 = cudf.DataFrame({"id": range(100), "a": [1] * 100}) - df_1.to_parquet(df_1_path) - - df_2 = cudf.DataFrame({"id": range(200, 2200), "a": [2] * 2000}) - df_2.to_parquet(df_2_path) - - expected = pd.read_parquet([df_1_path, df_2_path]) - actual = cudf.read_parquet([df_1_path, df_2_path]) - assert_eq(expected, actual) - - expected = pd.read_parquet([df_2_path, df_1_path]) - actual = cudf.read_parquet([df_2_path, df_1_path]) - assert_eq(expected, actual) - - -@pytest.mark.parametrize("index", [True, False, None]) -@pytest.mark.parametrize("columns", [None, [], ["b", "a"]]) -def test_parquet_columns_and_index_param(index, columns): - buffer = BytesIO() - df = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - df.to_parquet(buffer, index=index) - - expected = pd.read_parquet(buffer, columns=columns) - got = cudf.read_parquet(buffer, columns=columns) - - assert_eq(expected, got, check_index_type=True) - - 
-@pytest.mark.parametrize("columns", [None, ["b", "a"]]) -def test_parquet_columns_and_range_index(columns): - buffer = BytesIO() - df = cudf.DataFrame( - {"a": [1, 2, 3], "b": ["a", "b", "c"]}, index=pd.RangeIndex(2, 5) - ) - df.to_parquet(buffer) - - expected = pd.read_parquet(buffer, columns=columns) - got = cudf.read_parquet(buffer, columns=columns) - - assert_eq(expected, got, check_index_type=True) - - -def test_parquet_nested_struct_list(): - buffer = BytesIO() - data = { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - }, - "StreamId": "12345678", - "Duration": 10, - "Offset": 12, - "Resource": [{"Name": "ZoneName", "Value": "RAPIDS"}], - } - } - df = cudf.DataFrame({"a": cudf.Series(data)}) - - df.to_parquet(buffer) - expected = pd.read_parquet(buffer) - actual = cudf.read_parquet(buffer) - assert_eq(expected, actual) - assert_eq(actual.a.dtype, df.a.dtype) - - -def test_parquet_writer_zstd(): - size = 12345 - expected = cudf.DataFrame( - { - "a": np.arange(0, stop=size, dtype="float64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), - } - ) - - buff = BytesIO() - try: - expected.to_parquet(buff, compression="ZSTD") - except RuntimeError: - pytest.mark.xfail(reason="Newer nvCOMP version is required") - else: - got = pd.read_parquet(buff) - assert_eq(expected, got) - - -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_writer_time_delta_physical_type(store_schema): - df = cudf.DataFrame( - { - "s": cudf.Series([1], dtype="timedelta64[s]"), - "ms": cudf.Series([2], dtype="timedelta64[ms]"), - "us": cudf.Series([3], dtype="timedelta64[us]"), - # 4K because Pandas/pyarrow don't support non-zero nanoseconds - # in Parquet files - "ns": cudf.Series([4000], dtype="timedelta64[ns]"), - } - ) - buffer = BytesIO() - df.to_parquet(buffer, store_schema=store_schema) - - got = pd.read_parquet(buffer) - - if store_schema: - expected = pd.DataFrame( - { - "s": ["0 days 00:00:01"], - "ms": ["0 days 00:00:00.002000"], - "us": ["0 days 00:00:00.000003"], - "ns": ["0 days 00:00:00.000004"], - }, - dtype="str", - ) - else: - expected = pd.DataFrame( - { - "s": ["00:00:01"], - "ms": ["00:00:00.002000"], - "us": ["00:00:00.000003"], - "ns": ["00:00:00.000004"], - }, - dtype="str", - ) - assert_eq(got.astype("str"), expected) - - -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_roundtrip_time_delta(store_schema): - num_rows = 12345 - df = cudf.DataFrame( - { - "s": cudf.Series( - random.sample(range(0, 200000), num_rows), - dtype="timedelta64[s]", - ), - "ms": cudf.Series( - random.sample(range(0, 200000), num_rows), - dtype="timedelta64[ms]", - ), - "us": cudf.Series( - random.sample(range(0, 200000), num_rows), - dtype="timedelta64[us]", - ), - "ns": cudf.Series( - random.sample(range(0, 200000), num_rows), - dtype="timedelta64[ns]", - ), - } - ) - buffer = BytesIO() - df.to_parquet(buffer, store_schema=store_schema) - # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]` - assert_eq(df, cudf.read_parquet(buffer), check_dtype=False) - if store_schema: - assert_eq(df, pd.read_parquet(buffer)) - - -def test_parquet_reader_malformed_file(datadir): - fname = datadir / "nested-unsigned-malformed.parquet" - - # expect a failure when reading the whole file - with pytest.raises(RuntimeError): - cudf.read_parquet(fname) - - -def test_parquet_reader_unsupported_page_encoding(datadir): - fname = datadir / "delta_encoding.parquet" 
- - # expect a failure when reading the whole file - with pytest.raises(RuntimeError): - cudf.read_parquet(fname) - - -def test_parquet_reader_detect_bad_dictionary(datadir): - fname = datadir / "bad_dict.parquet" - - # expect a failure when reading the whole file - with pytest.raises(RuntimeError): - cudf.read_parquet(fname) - - -@pytest.mark.parametrize("data", [{"a": [1, 2, 3, 4]}, {"b": [1, None, 2, 3]}]) -@pytest.mark.parametrize("force_nullable_schema", [True, False]) -def test_parquet_writer_schema_nullability(data, force_nullable_schema): - df = cudf.DataFrame(data) - file_obj = BytesIO() - - df.to_parquet(file_obj, force_nullable_schema=force_nullable_schema) - - assert pa.parquet.read_schema(file_obj).field(0).nullable == ( - force_nullable_schema or df.isnull().any().any() - ) - - -def test_parquet_read_filter_and_project(): - # Filter on columns that are not included - # in the current column projection - - with BytesIO() as buffer: - # Write parquet data - df = cudf.DataFrame( - { - "a": [1, 2, 3, 4, 5] * 10, - "b": [0, 1, 2, 3, 4] * 10, - "c": range(50), - "d": [6, 7] * 25, - "e": [8, 9] * 25, - } - ) - df.to_parquet(buffer) - - # Read back with filter and projection - columns = ["b"] - filters = [[("a", "==", 5), ("c", ">", 20)]] - got = cudf.read_parquet(buffer, columns=columns, filters=filters) - - # Check result - expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True) - assert_eq(got, expected) - - -def test_parquet_reader_multiindex(): - expected = pd.DataFrame( - {"A": [1, 2, 3]}, - index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]), - ) - file_obj = BytesIO() - expected.to_parquet(file_obj, engine="pyarrow") - with pytest.warns(UserWarning): - actual = cudf.read_parquet(file_obj, engine="pyarrow") - assert_eq(actual, expected) - - -def test_parquet_reader_engine_error(): - with pytest.raises(ValueError): - cudf.read_parquet(BytesIO(), engine="abc") - - -def test_reader_lz4(): - pdf = pd.DataFrame({"ints": [1, 2] * 5001}) - - buffer = BytesIO() - pdf.to_parquet(buffer, compression="LZ4") - - got = cudf.read_parquet(buffer) - assert_eq(pdf, got) - - -def test_writer_lz4(): - gdf = cudf.DataFrame({"ints": [1, 2] * 5001}) - - buffer = BytesIO() - gdf.to_parquet(buffer, compression="LZ4") - - got = pd.read_parquet(buffer) - assert_eq(gdf, got) - - -def test_parquet_reader_zstd_huff_tables(datadir): - # Ensure that this zstd-compressed file does not overrun buffers. The - # problem was fixed in nvcomp 3.0.6. - # See https://github.com/rapidsai/cudf/issues/15096 - fname = datadir / "zstd_huff_tables_bug.parquet" - - expected = pa.parquet.read_table(fname).to_pandas() - actual = cudf.read_parquet(fname) - assert_eq(actual, expected) - - -def test_parquet_reader_roundtrip_with_arrow_schema(): - # Ensure that the nested types are faithfully being roundtripped - # across Parquet with arrow schema which is used to faithfully - # round trip duration types (timedelta64) across Parquet read and write. 
- pdf = pd.DataFrame( - { - "s": pd.Series([None, None, None], dtype="timedelta64[s]"), - "ms": pd.Series([1234, None, 32442], dtype="timedelta64[ms]"), - "us": pd.Series([None, 3456, None], dtype="timedelta64[us]"), - "ns": pd.Series([1234, 3456, 32442], dtype="timedelta64[ns]"), - "duration_list": list( - [ - [ - datetime.timedelta(minutes=7, seconds=4), - datetime.timedelta(minutes=7), - ], - [ - None, - None, - ], - [ - datetime.timedelta(minutes=7, seconds=4), - None, - ], - ] - ), - "int64": pd.Series([1234, 123, 4123], dtype="int64"), - "list": list([[1, 2], [1, 2], [1, 2]]), - "datetime": pd.Series([1234, 123, 4123], dtype="datetime64[ms]"), - "map": pd.Series(["cat", "dog", "lion"]).map( - {"cat": "kitten", "dog": "puppy", "lion": "cub"} - ), - } - ) - - # Write parquet with arrow for now (to write arrow:schema) - buffer = BytesIO() - pdf.to_parquet(buffer, engine="pyarrow") - - # Read parquet with arrow schema - got = cudf.read_parquet(buffer) - # Convert to cudf table for an apple to apple comparison - expected = cudf.from_pandas(pdf) - - # Check results for reader with schema - assert_eq(expected, got) - - # Reset buffer - buffer = BytesIO() - - # Write to buffer with cudf - expected.to_parquet(buffer, store_schema=True) - - # Read parquet with arrow schema - got = cudf.read_parquet(buffer) - # Convert to cudf table for an apple to apple comparison - expected = cudf.from_pandas(pdf) - - -@pytest.mark.parametrize( - "data", - [ - # struct - [ - {"a": 1, "b": 2}, - {"a": 10, "b": 20}, - {"a": None, "b": 22}, - {"a": None, "b": None}, - {"a": 15, "b": None}, - ], - # struct-of-list - [ - {"a": 1, "b": 2, "c": [1, 2, 3]}, - {"a": 10, "b": 20, "c": [4, 5]}, - {"a": None, "b": 22, "c": [6]}, - {"a": None, "b": None, "c": None}, - {"a": 15, "b": None, "c": [-1, -2]}, - None, - {"a": 100, "b": 200, "c": [-10, None, -20]}, - ], - # list-of-struct - [ - [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], - None, - [{"a": 10, "b": 20}], - [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], - ], - # struct-of-struct - [ - {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, - {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, - {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, - {"a": 7, "b": None, "c": 8}, - {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, - None, - {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, - ], - # struct-with-mixed-types - [ - { - "struct": { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4), - "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], - } - } - } - ], - ], -) -def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data): - # Ensure that the structs with duration types are faithfully being - # roundtripped across Parquet with arrow schema - pdf = pd.DataFrame({"struct": pd.Series(data)}) - - buffer = BytesIO() - pdf.to_parquet(buffer, engine="pyarrow") - - # Read parquet with arrow schema - got = cudf.read_parquet(buffer) - # Convert to cudf table for an apple to apple comparison - expected = cudf.from_pandas(pdf) - - # Check results - assert_eq(expected, got) - - # Reset buffer - buffer = BytesIO() - - # Write to buffer with cudf - expected.to_parquet(buffer, store_schema=True) - - # Read parquet with arrow schema - got = cudf.read_parquet(buffer) - # 
Convert to cudf table for an apple to apple comparison - expected = cudf.from_pandas(pdf) - - # Check results - assert_eq(expected, got) - - -@pytest.mark.parametrize("index", [None, True, False]) -@pytest.mark.skipif( - version.parse(pa.__version__) < version.parse("15.0.0"), - reason="https://github.com/apache/arrow/pull/37792", -) -def test_parquet_writer_roundtrip_with_arrow_schema(index): - # Ensure that the concrete and nested types are faithfully being roundtripped - # across Parquet with arrow schema - expected = cudf.DataFrame( - { - "s": cudf.Series([None, None, None], dtype="timedelta64[s]"), - "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"), - "duration_list": list( - [ - [ - datetime.timedelta(minutes=7, seconds=4), - datetime.timedelta(minutes=7), - ], - [ - None, - None, - ], - [ - datetime.timedelta(minutes=7, seconds=4), - None, - ], - ] - ), - "int64": cudf.Series([-1234, 123, 4123], dtype="int64"), - "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"), - "list": list([[1, 2], [1, 2], [1, 2]]), - "bool": cudf.Series([True, None, False], dtype=bool), - "fixed32": cudf.Series([0.00, 1.0, None]).astype( - cudf.Decimal32Dtype(7, 2) - ), - "fixed64": cudf.Series([0.00, 1.0, None]).astype( - cudf.Decimal64Dtype(7, 2) - ), - "fixed128": cudf.Series([0.00, 1.0, None]).astype( - cudf.Decimal128Dtype(7, 2) - ), - "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), - "map": cudf.Series(["cat", "dog", "lion"]).map( - {"cat": "kitten", "dog": "puppy", "lion": "cub"} - ), - } - ) - - # Write to Parquet with arrow schema for faithful roundtrip - buffer = BytesIO() - expected.to_parquet(buffer, store_schema=True, index=index) - - # Convert decimal types to d128 - expected = expected.astype({"fixed32": cudf.Decimal128Dtype(9, 2)}) - expected = expected.astype({"fixed64": cudf.Decimal128Dtype(18, 2)}) - - # Read parquet with pyarrow, pandas and cudf readers - got = cudf.DataFrame.from_arrow(pq.read_table(buffer)) - got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer)) - got3 = cudf.read_parquet(buffer) - - # drop the index column for comparison: __index_level_0__ - if index: - got.drop(columns="__index_level_0__", inplace=True) - got2.drop(columns="__index_level_0__", inplace=True) - - # Check results - assert_eq(expected, got) - assert_eq(expected, got2) - assert_eq(expected, got3) - - -def test_parquet_writer_int96_timestamps_and_arrow_schema(): - df = cudf.DataFrame( - { - "timestamp": cudf.Series( - [1234, 123, 4123], dtype="datetime64[ms]" - ), - } - ) - - # Output buffer - buffer = BytesIO() - - # Writing out parquet with both INT96 timestamps and arrow_schema - # enabled should throw an exception. 
- with pytest.raises(RuntimeError): - df.to_parquet(buffer, int96_timestamps=True, store_schema=True) - - -@pytest.mark.parametrize( - "data", - [ - # struct - [ - {"a": 1, "b": 2}, - {"a": 10, "b": 20}, - {"a": None, "b": 22}, - {"a": None, "b": None}, - {"a": 15, "b": None}, - ], - # struct-of-list - [ - {"a": 1, "b": 2, "c": [1, 2, 3]}, - {"a": 10, "b": 20, "c": [4, 5]}, - {"a": None, "b": 22, "c": [6]}, - {"a": None, "b": None, "c": None}, - {"a": 15, "b": None, "c": [-1, -2]}, - None, - {"a": 100, "b": 200, "c": [-10, None, -20]}, - ], - # list-of-struct - [ - [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}], - None, - [{"a": 10, "b": 20}], - [{"a": 100, "b": 200}, {"a": None, "b": 300}, None], - ], - # struct-of-struct - [ - {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2}, - {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4}, - {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6}, - {"a": 7, "b": None, "c": 8}, - {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None}, - None, - {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10}, - ], - # struct-with-mixed-types - [ - { - "struct": { - "payload": { - "Domain": { - "Name": "abc", - "Id": {"Name": "host", "Value": "127.0.0.8"}, - "Duration": datetime.timedelta(minutes=12), - }, - "StreamId": "12345678", - "Duration": datetime.timedelta(minutes=4), - "Offset": None, - "Resource": [ - { - "Name": "ZoneName", - "Value": "RAPIDS", - "Duration": datetime.timedelta(seconds=1), - } - ], - } - } - } - ], - ], -) -@pytest.mark.parametrize("index", [None, True, False]) -@pytest.mark.skipif( - version.parse(pa.__version__) < version.parse("15.0.0"), - reason="https://github.com/apache/arrow/pull/37792", -) -def test_parquet_writer_roundtrip_structs_with_arrow_schema( - tmpdir, data, index -): - # Ensure that the structs are faithfully being roundtripped across - # Parquet with arrow schema - pa_expected = pa.Table.from_pydict({"struct": data}) - - expected = cudf.DataFrame.from_arrow(pa_expected) - - # Write expected data frame to Parquet with arrow schema - buffer = BytesIO() - expected.to_parquet(buffer, store_schema=True, index=index) - - # Read Parquet with pyarrow - pa_got = pq.read_table(buffer) - - # drop the index column for comparison: __index_level_0__ - if index: - pa_got = pa_got.drop(columns="__index_level_0__") - - # Check results - assert_eq(pa_expected, pa_got) - - # Convert to cuDF table and also read Parquet with cuDF reader - got = cudf.DataFrame.from_arrow(pa_got) - got2 = cudf.read_parquet(buffer) - - # Check results - assert_eq(expected, got) - assert_eq(expected, got2) - - -@pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000]) -@pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000]) -@pytest.mark.parametrize("use_pandas_metadata", [True, False]) -@pytest.mark.parametrize("row_groups", [[[0]], None, [[0, 1]]]) -def test_parquet_chunked_reader( - chunk_read_limit, pass_read_limit, use_pandas_metadata, row_groups -): - df = pd.DataFrame( - {"a": [1, 2, 3, 4] * 1000000, "b": ["av", "qw", "hi", "xyz"] * 1000000} - ) - buffer = BytesIO() - df.to_parquet(buffer) - actual = read_parquet_chunked( - [buffer], - chunk_read_limit=chunk_read_limit, - pass_read_limit=pass_read_limit, - use_pandas_metadata=use_pandas_metadata, - row_groups=row_groups, - ) - expected = cudf.read_parquet( - buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups - ) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "nrows,skip_rows", - [ - (0, 0), - (1000, 0), - (0, 
1000), - (1000, 10000), - ], -) -def test_parquet_reader_nrows_skiprows(nrows, skip_rows): - df = pd.DataFrame( - {"a": [1, 2, 3, 4] * 100000, "b": ["av", "qw", "hi", "xyz"] * 100000} - ) - expected = df[skip_rows : skip_rows + nrows] - buffer = BytesIO() - df.to_parquet(buffer) - got = cudf.read_parquet(buffer, nrows=nrows, skip_rows=skip_rows) - assert_eq(expected, got) - - -def test_parquet_reader_pandas_compatibility(): - df = pd.DataFrame( - {"a": [1, 2, 3, 4] * 10000, "b": ["av", "qw", "hi", "xyz"] * 10000} - ) - buffer = BytesIO() - df.to_parquet(buffer) - with cudf.option_context("io.parquet.low_memory", True): - expected = cudf.read_parquet(buffer) - assert_eq(expected, df) - - -@pytest.mark.parametrize("store_schema", [True, False]) -def test_parquet_reader_with_mismatched_tables(store_schema): - # cuDF tables with mixed types - df1 = cudf.DataFrame( - { - "i32": cudf.Series([None, None, None], dtype="int32"), - "i64": cudf.Series([1234, 467, 123], dtype="int64"), - "list": list([[1, 2], None, [None, 6]]), - "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), - "str": ["vfd", None, "ghu"], - "d_list": list( - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - [None, pd.Timedelta(minutes=3)], - [pd.Timedelta(minutes=8), None], - ] - ), - } - ) - - df2 = cudf.DataFrame( - { - "str": ["abc", "def", "ghi"], - "i64": cudf.Series([None, 65, 98], dtype="int64"), - "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"), - "list": list([[7, 8], [9, 10], [11, 12]]), - "d_list": list( - [ - [pd.Timedelta(minutes=4), None], - None, - [pd.Timedelta(minutes=6), None], - ] - ), - } - ) - - # IO buffers - buf1 = BytesIO() - buf2 = BytesIO() - - # Write Parquet with and without arrow schema - df1.to_parquet(buf1, store_schema=store_schema) - df2.to_parquet(buf2, store_schema=store_schema) - - # Read mismatched Parquet files - got = cudf.read_parquet( - [buf1, buf2], - columns=["list", "d_list", "str"], - filters=[("i64", ">", 20)], - allow_mismatched_pq_schemas=True, - ) - - # Construct the expected table - expected = cudf.concat( - [ - df1[df1["i64"] > 20][["list", "d_list", "str"]], - df2[df2["i64"] > 20][["list", "d_list", "str"]], - ] - ).reset_index(drop=True) - - # Read with chunked reader (filter columns not supported) - got_chunked = read_parquet_chunked( - [buf1, buf2], - columns=["list", "d_list", "str"], - chunk_read_limit=240, - pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) - - # Construct the expected table without filter columns - expected_chunked = cudf.concat( - [df1[["list", "d_list", "str"]], df2[["list", "d_list", "str"]]] - ).reset_index(drop=True) - - # Check results - assert_eq(expected, got) - assert_eq(expected_chunked, got_chunked) - - -def test_parquet_reader_with_mismatched_structs(): - data1 = [ - { - "a": 1, - "b": { - "a_a": 10, - "b_b": {"b_b_b": 1, "b_b_a": 2}, - }, - "c": 2, - }, - { - "a": 3, - "b": {"b_a": 30, "b_b": {"b_b_a": 210}}, - "c": 4, - }, - {"a": 5, "b": {"b_a": 50, "b_b": None}, "c": 6}, - {"a": 7, "b": None, "c": 8}, - {"a": 5, "b": {"b_a": None, "b_b": None}, "c": None}, - ] - - data2 = [ - {"a": 1, "b": {"b_b": {"b_b_a": None}}}, - {"a": 5, "b": {"b_b": None}}, - {"a": 7, "b": {"b_b": {"b_b_b": 1, "b_b_a": 0}}}, - {"a": None, "b": {"b_b": None}}, - None, - ] - - # cuDF tables from struct data - df1 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data1})) - df2 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data2})) - - # Buffers - buf1 = BytesIO() - buf2 = BytesIO() - - # Write to 
parquet - df1.to_parquet(buf1) - df2.to_parquet(buf2) - - # Read the struct.b.inner_b.inner_inner_a column from parquet - got = cudf.read_parquet( - [buf1, buf2], - columns=["struct.b.b_b.b_b_a"], - allow_mismatched_pq_schemas=True, - ) - got = ( - cudf.Series(got["struct"]) - .struct.field("b") - .struct.field("b_b") - .struct.field("b_b_a") - ) - - # Read with chunked reader - got_chunked = read_parquet_chunked( - [buf1, buf2], - columns=["struct.b.b_b.b_b_a"], - chunk_read_limit=240, - pass_read_limit=240, - allow_mismatched_pq_schemas=True, - ) - got_chunked = ( - cudf.Series(got_chunked["struct"]) - .struct.field("b") - .struct.field("b_b") - .struct.field("b_b_a") - ) - - # Construct the expected series - expected = cudf.concat( - [ - cudf.Series(df1["struct"]) - .struct.field("b") - .struct.field("b_b") - .struct.field("b_b_a"), - cudf.Series(df2["struct"]) - .struct.field("b") - .struct.field("b_b") - .struct.field("b_b_a"), - ] - ).reset_index(drop=True) - - # Check results - assert_eq(expected, got) - assert_eq(expected, got_chunked) - - -def test_parquet_reader_with_mismatched_schemas_error(): - df1 = cudf.DataFrame( - { - "millis": cudf.Series([123, 3454, 123], dtype="timedelta64[ms]"), - "i64": cudf.Series([123, 3454, 123], dtype="int64"), - "i32": cudf.Series([123, 3454, 123], dtype="int32"), - } - ) - df2 = cudf.DataFrame( - { - "i64": cudf.Series([123, 3454, 123], dtype="int64"), - "millis": cudf.Series([123, 3454, 123], dtype="timedelta64[ms]"), - } - ) - - buf1 = BytesIO() - buf2 = BytesIO() - - df1.to_parquet(buf1, store_schema=True) - df2.to_parquet(buf2, store_schema=False) - - with pytest.raises( - ValueError, - match="Encountered mismatching SchemaElement properties for a column in the selected path", - ): - cudf.read_parquet( - [buf1, buf2], columns=["millis"], allow_mismatched_pq_schemas=True - ) - - data1 = [ - {"a": 1, "b": {"b_a": 1, "b_b": 6}}, - {"a": 3, "b": {"b_a": None, "b_b": 2}}, - ] - data2 = [ - {"b": {"b_a": 1}, "c": "str"}, - {"b": {"b_a": None}, "c": None}, - ] - - # cuDF tables from struct data - df1 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data1})) - df2 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data2})) - - # Buffers - buf1 = BytesIO() - buf2 = BytesIO() - - # Write to parquet - df1.to_parquet(buf1) - df2.to_parquet(buf2) - - with pytest.raises( - IndexError, - match="Encountered mismatching number of children for a column in the selected path", - ): - cudf.read_parquet( - [buf1, buf2], - columns=["struct.b"], - allow_mismatched_pq_schemas=True, - ) - - with pytest.raises( - IndexError, - match="Encountered mismatching schema tree depths across data sources", - ): - cudf.read_parquet( - [buf1, buf2], - columns=["struct.b.b_b"], - allow_mismatched_pq_schemas=True, - ) - - -def test_parquet_reader_mismatched_nullability(): - # Ensure that we can faithfully read the tables with mismatched nullabilities - df1 = cudf.DataFrame( - { - "timedelta": cudf.Series([12, 54, 1231], dtype="timedelta64[ms]"), - "duration_list": list( - [ - [ - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - None, - [pd.Timedelta(minutes=8), None], - ], - None, - ], - None, - [ - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], - [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], - ] - ], - ] - ), - "int64": cudf.Series([1234, None, 4123], dtype="int64"), - "int32": cudf.Series([1234, 123, 4123], dtype="int32"), - "list": list([[1, 2], [1, 2], [1, 2]]), - "datetime": 
cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"), - "string": cudf.Series(["kitten", "puppy", "cub"]), - } - ) - - df2 = cudf.DataFrame( - { - "timedelta": cudf.Series( - [None, None, None], dtype="timedelta64[ms]" - ), - "duration_list": list( - [ - [ - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - [pd.Timedelta(minutes=8), pd.Timedelta(minutes=1)], - ], - ], - [ - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], - [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], - ] - ], - [ - [ - [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)], - [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)], - [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)], - ] - ], - ] - ), - "int64": cudf.Series([1234, 123, 4123], dtype="int64"), - "int32": cudf.Series([1234, None, 4123], dtype="int32"), - "list": list([[1, 2], None, [1, 2]]), - "datetime": cudf.Series( - [1234, None, 4123], dtype="datetime64[ms]" - ), - "string": cudf.Series(["kitten", None, "cub"]), - } - ) - - # Write tables to parquet with arrow schema for compatibility for duration column(s) - fname1 = BytesIO() - df1.to_parquet(fname1, store_schema=True) - fname2 = BytesIO() - df2.to_parquet(fname2, store_schema=True) - - # Read tables back with cudf and arrow in either order and compare - assert_eq( - cudf.read_parquet([fname1, fname2]), - cudf.concat([df1, df2]).reset_index(drop=True), - ) - assert_eq( - cudf.read_parquet([fname2, fname1]), - cudf.concat([df2, df1]).reset_index(drop=True), - ) - - -def test_parquet_reader_mismatched_nullability_structs(tmpdir): - data1 = [ - { - "a": "a", - "b": { - "b_a": 10, - "b_b": {"b_b_b": 1, "b_b_a": 12}, - }, - "c": [1, 2], - }, - { - "a": "b", - "b": { - "b_a": 30, - "b_b": {"b_b_b": 2, "b_b_a": 2}, - }, - "c": [3, 4], - }, - { - "a": "c", - "b": { - "b_a": 50, - "b_b": {"b_b_b": 4, "b_b_a": 5}, - }, - "c": [5, 6], - }, - { - "a": "d", - "b": { - "b_a": 135, - "b_b": {"b_b_b": 12, "b_b_a": 32}, - }, - "c": [7, 8], - }, - { - "a": "e", - "b": { - "b_a": 1, - "b_b": {"b_b_b": 1, "b_b_a": 5}, - }, - "c": [9, 10], - }, - { - "a": "f", - "b": { - "b_a": 32, - "b_b": {"b_b_b": 1, "b_b_a": 6}, - }, - "c": [11, 12], - }, - ] - - data2 = [ - { - "a": "g", - "b": { - "b_a": 10, - "b_b": {"b_b_b": None, "b_b_a": 2}, - }, - "c": None, - }, - {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]}, - {"a": "j", "b": None, "c": [8, 10]}, - {"a": None, "b": {"b_a": None, "b_b": None}, "c": None}, - None, - { - "a": None, - "b": {"b_a": None, "b_b": {"b_b_b": 1}}, - "c": [18, 19], - }, - {"a": None, "b": None, "c": None}, - ] - - pa_table1 = pa.Table.from_pydict({"struct": data1}) - df1 = cudf.DataFrame.from_arrow(pa_table1) - - pa_table2 = pa.Table.from_pydict({"struct": data2}) - df2 = cudf.DataFrame.from_arrow(pa_table2) - - # Write tables to parquet - buf1 = BytesIO() - df1.to_parquet(buf1) - buf2 = BytesIO() - df2.to_parquet(buf2) - - # Read tables back with cudf and compare with expected. - assert_eq( - cudf.read_parquet([buf1, buf2]), - cudf.concat([df1, df2]).reset_index(drop=True), - ) - assert_eq( - cudf.read_parquet([buf2, buf1]), - cudf.concat([df2, df1]).reset_index(drop=True), - ) diff --git a/python/cudf/cudf/tests/test_performance_tracking.py b/python/cudf/cudf/tests/test_performance_tracking.py deleted file mode 100644 index e886b77af3f..00000000000 --- a/python/cudf/cudf/tests/test_performance_tracking.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -from io import StringIO - -import pytest - -import rmm.mr -import rmm.statistics - -import cudf -from cudf.utils.performance_tracking import ( - get_memory_records, - print_memory_report, -) - - -@pytest.fixture -def rmm_reset(): - """Fixture to reset the RMM resource before and after the test""" - mr = rmm.mr.get_current_device_resource() - try: - rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) - yield - finally: - rmm.mr.set_current_device_resource(mr) - - -def test_memory_profiling(rmm_reset): - df1 = cudf.DataFrame({"a": [1, 2, 3]}) - assert len(get_memory_records()) == 0 - - rmm.statistics.enable_statistics() - cudf.set_option("memory_profiling", True) - - df1.merge(df1) - - assert len(get_memory_records()) > 0 - - out = StringIO() - print_memory_report(file=out) - assert "DataFrame.merge" in out.getvalue() diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py deleted file mode 100644 index 0f13a9e173a..00000000000 --- a/python/cudf/cudf/tests/test_pickling.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import pickle - -import numpy as np -import pandas as pd -import pytest - -from cudf import DataFrame, Index, RangeIndex, Series -from cudf.core.buffer import as_buffer -from cudf.testing import assert_eq - -pytestmark = pytest.mark.spilling - - -def check_serialization(df): - # basic - assert_frame_picklable(df) - # sliced - assert_frame_picklable(df[:-1]) - assert_frame_picklable(df[1:]) - assert_frame_picklable(df[2:-2]) - # sorted - sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, (Index, RangeIndex)) - assert_frame_picklable(sortvaldf) - # out-of-band - buffers = [] - serialbytes = pickle.dumps(df, protocol=5, buffer_callback=buffers.append) - for b in buffers: - assert isinstance(b, pickle.PickleBuffer) - loaded = pickle.loads(serialbytes, buffers=buffers) - assert_eq(loaded, df) - - -def assert_frame_picklable(df): - serialbytes = pickle.dumps(df) - loaded = pickle.loads(serialbytes) - assert_eq(loaded, df) - - -def test_pickle_dataframe_numeric(): - np.random.seed(0) - df = DataFrame() - nelem = 10 - df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) - - check_serialization(df) - - -def test_pickle_dataframe_categorical(): - np.random.seed(0) - - df = DataFrame() - df["keys"] = pd.Categorical( - ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] - ) - df["vals"] = np.random.random(len(df)) - - check_serialization(df) - - -def test_memory_usage_dataframe(): - np.random.seed(0) - df = DataFrame() - nelem = 1000 - df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = np.random.random(nelem) - - nbytes = hkeys.nbytes + hvals.nbytes - sizeof = df.memory_usage().sum() - assert sizeof >= nbytes - - serialized_nbytes = len(pickle.dumps(df, protocol=pickle.HIGHEST_PROTOCOL)) - - # assert at least sizeof bytes were serialized - assert serialized_nbytes >= sizeof - - -def test_pickle_index(): - nelem = 10 - idx = Index(np.arange(nelem), name="a") - pickled = pickle.dumps(idx) - out = pickle.loads(pickled) - assert (idx == out).all() - - -def test_pickle_buffer(): - arr = np.arange(10).view("|u1") - buf = as_buffer(arr) - assert buf.size == arr.nbytes - pickled = pickle.dumps(buf) - unpacked = pickle.loads(pickled) - # Check that unpacked capacity equals buf.size - assert unpacked.size == arr.nbytes - - -@pytest.mark.parametrize("named", [True, False]) -def test_pickle_series(named): - np.random.seed(0) - if 
named: - ser = Series(np.random.random(10), name="a") - else: - ser = Series(np.random.random(10)) - - pickled = pickle.dumps(ser) - out = pickle.loads(pickled) - assert (ser == out).all() - - -@pytest.mark.parametrize( - "slices", - [ - slice(None, None, None), - slice(1, 3, 1), - slice(0, 3, 1), - slice(3, 5, 1), - slice(10, 12, 1), - ], -) -def test_pickle_categorical_column(slices): - sr = Series(["a", "b", None, "a", "c", "b"]).astype("category") - sliced_sr = sr.iloc[slices] - input_col = sliced_sr._column - - pickled = pickle.dumps(input_col) - out = pickle.loads(pickled) - - assert_eq(Series._from_column(out), Series._from_column(input_col)) - - -@pytest.mark.parametrize( - "slices", - [ - slice(None, None, None), - slice(1, 3, 1), - slice(0, 3, 1), - slice(3, 5, 1), - slice(10, 12, 1), - ], -) -def test_pickle_string_column(slices): - sr = Series(["a", "b", None, "a", "c", "b"]) - sliced_sr = sr.iloc[slices] - input_col = sliced_sr._column - - pickled = pickle.dumps(input_col) - out = pickle.loads(pickled) - - assert_eq(Series._from_column(out), Series._from_column(input_col)) diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py deleted file mode 100644 index 7d8303df0c3..00000000000 --- a/python/cudf/cudf/tests/test_quantiles.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import re - -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal - - -def test_single_q(): - q = 0.5 - - pdf = pd.DataFrame({"a": [4, 24, 13, 8, 7]}) - gdf = cudf.from_pandas(pdf) - - pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantile(q, interpolation="nearest", method="table") - - assert_eq(pdf_q, gdf_q, check_index_type=False) - - -def test_with_index(): - q = [0, 0.5, 1] - - pdf = pd.DataFrame({"a": [7, 4, 4, 9, 13]}, index=[0, 4, 3, 2, 7]) - gdf = cudf.from_pandas(pdf) - - pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantile(q, interpolation="nearest", method="table") - - assert_eq(pdf_q, gdf_q, check_index_type=False) - - -def test_with_multiindex(): - q = [0, 0.5, 1] - - pdf = pd.DataFrame( - { - "index_1": [3, 1, 9, 7, 5], - "index_2": [2, 4, 3, 5, 1], - "a": [8, 4, 2, 3, 8], - } - ) - pdf.set_index(["index_1", "index_2"], inplace=True) - - gdf = cudf.from_pandas(pdf) - - pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantile(q, interpolation="nearest", method="table") - - assert_eq(pdf_q, gdf_q, check_index_type=False) - - -@pytest.mark.parametrize("q", [2, [1, 2, 3]]) -def test_quantile_range_error(q): - ps = pd.Series([1, 2, 3]) - gs = cudf.from_pandas(ps) - assert_exceptions_equal( - lfunc=ps.quantile, - rfunc=gs.quantile, - lfunc_args_and_kwargs=([q],), - rfunc_args_and_kwargs=([q],), - ) - - -def test_quantile_q_type(): - gs = cudf.Series([1, 2, 3]) - with pytest.raises( - TypeError, - match=re.escape( - "q must be a scalar or array-like, got " - ), - ): - gs.quantile(cudf.DataFrame()) - - -@pytest.mark.parametrize( - "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] -) -def test_quantile_type_int_float(interpolation): - data = [1, 3, 4] - psr = pd.Series(data) - gsr = cudf.Series(data) - - expected = psr.quantile(0.5, interpolation=interpolation) - actual = gsr.quantile(0.5, interpolation=interpolation) - - assert expected == actual - assert type(expected) == type(actual) diff --git a/python/cudf/cudf/tests/test_query.py 
b/python/cudf/cudf/tests/test_query.py deleted file mode 100644 index b12209fd3b9..00000000000 --- a/python/cudf/cudf/tests/test_query.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - - -import datetime -import inspect -from itertools import product - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import DataFrame -from cudf.testing import assert_eq -from cudf.utils import queryutils - -_params_query_parser = [] -_params_query_parser.append(("a > @b", ("a", "__CUDF_ENVREF__b"))) -_params_query_parser.append(("(a + b) <= @c", ("a", "b", "__CUDF_ENVREF__c"))) -_params_query_parser.append(("a > b if a > 0 else b > a", ("a", "b"))) - - -@pytest.mark.parametrize("text,expect_args", _params_query_parser) -def test_query_parser(text, expect_args): - info = queryutils.query_parser(text) - fn = queryutils.query_builder(info, "myfoo") - assert callable(fn) - argspec = inspect.getfullargspec(fn) - assert tuple(argspec.args) == tuple(expect_args) - - -params_query_data = list(product([1, 2, 7, 8, 9, 16, 100, 129], range(2))) -params_query_fn = [ - (lambda a, b: a < b, "a < b"), - (lambda a, b: a * 2 >= b, "a * 2 >= b"), - (lambda a, b: 2 * (a + b) > (a + b) / 2, "2 * (a + b) > (a + b) / 2"), -] -nulls = [True, False] - - -@pytest.mark.parametrize( - "data,fn,nulls", product(params_query_data, params_query_fn, nulls) -) -def test_query(data, fn, nulls): - # prepare - nelem, seed = data - expect_fn, query_expr = fn - np.random.seed(seed) - pdf = pd.DataFrame() - pdf["a"] = np.arange(nelem) - pdf["b"] = np.random.random(nelem) * nelem - if nulls: - pdf.loc[::2, "a"] = None - gdf = cudf.from_pandas(pdf) - assert_eq(pdf.query(query_expr), gdf.query(query_expr)) - - -params_query_env_fn = [ - (lambda a, b, c, d: a * c > b + d, "a * @c > b + @d"), - ( - lambda a, b, c, d: ((a / c) < d) | ((b**c) > d), - "((a / @c) < @d) | ((b ** @c) > @d)", - ), -] - - -@pytest.mark.parametrize( - "data,fn", product(params_query_data, params_query_env_fn) -) -def test_query_ref_env(data, fn): - # prepare - nelem, seed = data - expect_fn, query_expr = fn - np.random.seed(seed) - df = DataFrame() - df["a"] = aa = np.arange(nelem) - df["b"] = bb = np.random.random(nelem) * nelem - c = 2.3 - d = 1.2 - # udt - expect_mask = expect_fn(aa, bb, c, d) - print(expect_mask) - df2 = df.query(query_expr) - # check - assert len(df2) == np.count_nonzero(expect_mask) - np.testing.assert_array_almost_equal(df2["a"].to_numpy(), aa[expect_mask]) - np.testing.assert_array_almost_equal(df2["b"].to_numpy(), bb[expect_mask]) - - -def test_query_env_changing(): - df = DataFrame() - df["a"] = aa = np.arange(100) - expr = "a < @c" - # first attempt - c = 10 - got = df.query(expr) - np.testing.assert_array_equal(aa[aa < c], got["a"].to_numpy()) - # change env - c = 50 - got = df.query(expr) - np.testing.assert_array_equal(aa[aa < c], got["a"].to_numpy()) - - -def test_query_local_dict(): - df = DataFrame() - df["a"] = aa = np.arange(100) - expr = "a < @val" - - got = df.query(expr, local_dict={"val": 10}) - np.testing.assert_array_equal(aa[aa < 10], got["a"].to_numpy()) - - # test for datetime - df = DataFrame() - data = np.array(["2018-10-07", "2018-10-08"], dtype="datetime64") - df["datetimes"] = data - search_date = datetime.datetime.strptime("2018-10-08", "%Y-%m-%d") - expr = "datetimes==@search_date" - - got = df.query(expr, local_dict={"search_date": search_date}) - np.testing.assert_array_equal(data[1], got["datetimes"].to_numpy()) - - -def test_query_splitted_combine(): - 
np.random.seed(0) - df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} - ) - gdf = DataFrame.from_pandas(df) - - # Split the GDF - s1 = gdf[:5] - s2 = gdf[5:] - - # Do the query - expr = "x > 2" - q1 = s1.query(expr) - q2 = s2.query(expr) - # Combine - got = cudf.concat([q1, q2]).to_pandas() - - # Should equal to just querying the original GDF - expect = gdf.query(expr).to_pandas() - assert_eq(got, expect, check_index_type=True) - - -def test_query_empty_frames(): - empty_pdf = pd.DataFrame({"a": [], "b": []}) - empty_gdf = DataFrame.from_pandas(empty_pdf) - # Do the query - expr = "a > 2" - got = empty_gdf.query(expr).to_pandas() - expect = empty_pdf.query(expr) - - # assert equal results - assert_eq(got, expect) - - -@pytest.mark.parametrize(("a_val", "b_val", "c_val"), [(4, 3, 15)]) -@pytest.mark.parametrize("index", ["a", ["a", "b"]]) -@pytest.mark.parametrize( - "query", - [ - "a < @a_val", - "a < @a_val and b > @b_val", - "(a < @a_val and b >@b_val) or c >@c_val", - ], -) -def test_query_with_index_name(index, query, a_val, b_val, c_val): - pdf = pd.DataFrame( - { - "a": [1, None, 3, 4, 5], - "b": [5, 4, 3, 2, 1], - "c": [12, 15, 17, 19, 27], - } - ) - pdf.set_index(index) - - gdf = DataFrame.from_pandas(pdf) - - out = gdf.query(query) - expect = pdf.query(query) - - assert_eq(out, expect) - - -@pytest.mark.parametrize(("a_val", "b_val", "c_val"), [(4, 3, 15)]) -@pytest.mark.parametrize( - "query", - [ - "index < @a_val", - "index < @a_val and b > @b_val", - "(index < @a_val and b >@b_val) or c >@c_val", - ], -) -def test_query_with_index_keyword(query, a_val, b_val, c_val): - pdf = pd.DataFrame( - { - "a": [1, None, 3, 4, 5], - "b": [5, 4, 3, 2, 1], - "c": [12, 15, 17, 19, 27], - } - ) - pdf.set_index("a") - - gdf = DataFrame.from_pandas(pdf) - - out = gdf.query(query) - expect = pdf.query(query) - - assert_eq(out, expect) - - -@pytest.mark.parametrize( - "data, query", - [ - # Only need to test the dtypes that pandas - # supports but that we do not - (["a", "b", "c"], "data == 'a'"), - ], -) -def test_query_unsupported_dtypes(data, query): - gdf = cudf.DataFrame({"data": data}) - - # make sure the query works in pandas - pdf = gdf.to_pandas() - pdf_result = pdf.query(query) - - expect = pd.DataFrame({"data": ["a"]}) - assert_eq(expect, pdf_result) - - # but fails in cuDF - with pytest.raises(TypeError): - gdf.query(query) diff --git a/python/cudf/cudf/tests/test_query_mask.py b/python/cudf/cudf/tests/test_query_mask.py deleted file mode 100644 index 9372681187d..00000000000 --- a/python/cudf/cudf/tests/test_query_mask.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - -_data = [ - {"a": [0, 1.0, 2.0, None, np.nan, None, 3, 5]}, - {"a": [0, 1.0, 2.0, None, 3, np.nan, None, 4]}, - {"a": [0, 1.0, 2.0, None, 3, np.nan, None, 4, None, 9]}, -] -_queries = [ - "a == 3", - # "a != 3", # incompatible with pandas - "a < 3", - "a <= 3", - "a < 3", - "a >= 3", -] - - -@pytest.mark.parametrize("data", _data) -@pytest.mark.parametrize("query", _queries) -def test_mask_0(data, query): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - pdf_q_res = pdf.query(query) - gdf_q_res = gdf.query(query) - - assert_eq(pdf_q_res, gdf_q_res) - - -@pytest.mark.parametrize("data", _data) -@pytest.mark.parametrize("nan_as_null", [False, True]) -@pytest.mark.parametrize("query", _queries) -def test_mask_1(data, nan_as_null, query): - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame.from_pandas(pdf, nan_as_null=nan_as_null) - - pdf_q_res = pdf.query(query) - gdf_q_res = gdf.query(query) - - assert_eq(pdf_q_res, gdf_q_res) - - -@pytest.mark.parametrize("data", _data) -@pytest.mark.parametrize("query", _queries) -def test_mask_2(data, query): - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - pdf_q_res = pdf.query(query) - gdf_q_res = gdf.query(query) - - assert_eq(pdf_q_res, gdf_q_res) - - -@pytest.mark.parametrize("data", _data) -@pytest.mark.parametrize("query", _queries) -def test_dataframe_initializer(data, query): - pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) - - pdf_q_res = pdf.query(query) - gdf_q_res = gdf.query(query) - - assert_eq(pdf_q_res, gdf_q_res) diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py deleted file mode 100644 index 4c1d8ce92ae..00000000000 --- a/python/cudf/cudf/tests/test_rank.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from itertools import chain, combinations_with_replacement, product - -import numpy as np -import pandas as pd -import pytest - -from cudf import DataFrame -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal - - -@pytest.fixture -def pdf(): - return pd.DataFrame( - { - "col1": np.array([5, 4, 3, 5, 8, 5, 2, 1, 6, 6]), - "col2": np.array( - [5, 4, np.nan, 5, 8, 5, np.inf, np.nan, 6, -np.inf] - ), - }, - index=np.array([5, 4, 3, 2, 1, 6, 7, 8, 9, 10]), - ) - - -@pytest.mark.parametrize("dtype", ["O", "f8", "i4"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) -@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) -@pytest.mark.parametrize("pct", [True, False]) -@pytest.mark.parametrize("numeric_only", [True, False]) -def test_rank_all_arguments( - pdf, dtype, ascending, method, na_option, pct, numeric_only -): - if method == "first" and dtype == "O": - # not supported by pandas - return - - pdf = pdf.copy(deep=True) # for parallel pytest - if numeric_only: - pdf["str"] = np.array( - ["a", "b", "c", "d", "e", "1", "2", "3", "4", "5"] - ) - gdf = DataFrame.from_pandas(pdf) - - kwargs = { - "method": method, - "na_option": na_option, - "ascending": ascending, - "pct": pct, - "numeric_only": numeric_only, - } - - # Series - assert_eq(gdf["col1"].rank(**kwargs), pdf["col1"].rank(**kwargs)) - assert_eq(gdf["col2"].rank(**kwargs), pdf["col2"].rank(**kwargs)) - if numeric_only: - assert_exceptions_equal( - lfunc=pdf["str"].rank, - rfunc=gdf["str"].rank, - lfunc_args_and_kwargs=( - [], - kwargs, - ), - rfunc_args_and_kwargs=( - [], - kwargs, - ), - ) - - actual = gdf.rank(**kwargs) - expected = pdf.rank(**kwargs) - - assert_eq(expected, actual) - - -def test_rank_error_arguments(pdf): - gdf = DataFrame.from_pandas(pdf) - - assert_exceptions_equal( - lfunc=pdf["col1"].rank, - rfunc=gdf["col1"].rank, - lfunc_args_and_kwargs=( - [], - { - "method": "randomname", - "na_option": "keep", - "ascending": True, - "pct": True, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "method": "randomname", - "na_option": "keep", - "ascending": True, - "pct": True, - }, - ), - ) - - assert_exceptions_equal( - lfunc=pdf["col1"].rank, - rfunc=gdf["col1"].rank, - lfunc_args_and_kwargs=( - [], - { - "method": "first", - "na_option": "randomname", - "ascending": True, - "pct": True, - }, - ), - rfunc_args_and_kwargs=( - [], - { - "method": "first", - "na_option": "randomname", - "ascending": True, - "pct": True, - }, - ), - ) - - -sort_group_args = [ - np.full((3,), np.nan), - 100 * np.random.random(10), - np.full((3,), np.inf), - np.full((3,), -np.inf), -] -sort_dtype_args = [np.int32, np.int64, np.float32, np.float64] - - -@pytest.mark.filterwarnings("ignore:invalid value encountered in cast") -@pytest.mark.parametrize( - "elem,dtype", - list( - product( - combinations_with_replacement(sort_group_args, 4), - sort_dtype_args, - ) - ), -) -def test_series_rank_combinations(elem, dtype): - np.random.seed(0) - aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(dtype) - gdf = DataFrame() - df = pd.DataFrame() - gdf["a"] = aa - df["a"] = aa - ranked_gs = gdf["a"].rank(method="first") - ranked_ps = df["a"].rank(method="first") - # Check - assert_eq(ranked_ps, ranked_gs) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py deleted file mode 100644 index f276f394cd0..00000000000 --- a/python/cudf/cudf/tests/test_reductions.py +++ 
/dev/null @@ -1,416 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - - -from decimal import Decimal -from itertools import product - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import Series -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing import _utils as utils, assert_eq -from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if, gen_rand - -params_dtype = NUMERIC_TYPES - -params_sizes = [1, 2, 3, 127, 128, 129, 200, 10000] - -params = list(product(params_dtype, params_sizes)) - - -@pytest.mark.parametrize("dtype,nelem", params) -def test_sum(dtype, nelem): - dtype = cudf.dtype(dtype).type - data = gen_rand(dtype, nelem) - sr = Series(data) - - got = sr.sum() - expect = data.sum() - significant = 4 if dtype == np.float32 else 6 - np.testing.assert_approx_equal(expect, got, significant=significant) - - -def test_sum_string(): - s = Series(["Hello", "there", "World"]) - - got = s.sum() - expected = "HellothereWorld" - - assert got == expected - - s = Series(["Hello", None, "World"]) - - got = s.sum() - expected = "HelloWorld" - - assert got == expected - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(6, 3), - Decimal64Dtype(10, 6), - Decimal64Dtype(16, 7), - Decimal32Dtype(6, 3), - Decimal128Dtype(20, 7), - ], -) -@pytest.mark.parametrize("nelem", params_sizes) -def test_sum_decimal(dtype, nelem): - np.random.seed(0) - data = [str(x) for x in gen_rand("int64", nelem) / 100] - - expected = pd.Series([Decimal(x) for x in data]).sum() - got = cudf.Series(data).astype(dtype).sum() - - assert_eq(expected, got) - - -@pytest.mark.parametrize("dtype,nelem", params) -def test_product(dtype, nelem): - np.random.seed(0) - dtype = cudf.dtype(dtype).type - if cudf.dtype(dtype).kind in {"u", "i"}: - data = np.ones(nelem, dtype=dtype) - # Set at most 30 items to [0..2) to keep the value within 2^32 - for _ in range(30): - data[np.random.randint(low=0, high=nelem, size=1)] = ( - np.random.uniform() * 2 - ) - else: - data = gen_rand(dtype, nelem) - - sr = Series(data) - - got = sr.product() - expect = pd.Series(data).product() - significant = 4 if dtype == np.float32 else 6 - np.testing.assert_approx_equal(expect, got, significant=significant) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(6, 2), - Decimal64Dtype(8, 4), - Decimal64Dtype(10, 5), - Decimal32Dtype(6, 2), - Decimal128Dtype(20, 5), - ], -) -def test_product_decimal(dtype): - np.random.seed(0) - data = [str(x) for x in gen_rand("int8", 3) / 10] - - expected = pd.Series([Decimal(x) for x in data]).product() - got = cudf.Series(data).astype(dtype).product() - - assert_eq(expected, got) - - -accuracy_for_dtype = {np.float64: 6, np.float32: 5} - - -@pytest.mark.parametrize("dtype,nelem", params) -def test_sum_of_squares(dtype, nelem): - dtype = cudf.dtype(dtype).type - data = gen_rand(dtype, nelem) - sr = Series(data) - df = cudf.DataFrame(sr) - - got = (sr**2).sum() - got_df = (df**2).sum() - expect = (data**2).sum() - - if cudf.dtype(dtype).kind in {"u", "i"}: - if 0 <= expect <= np.iinfo(dtype).max: - np.testing.assert_array_almost_equal(expect, got) - np.testing.assert_array_almost_equal(expect, got_df.iloc[0]) - else: - print("overflow, passing") - else: - np.testing.assert_approx_equal( - expect, got, significant=accuracy_for_dtype[dtype] - ) - np.testing.assert_approx_equal( - expect, got_df.iloc[0], 
significant=accuracy_for_dtype[dtype] - ) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(6, 2), - Decimal64Dtype(8, 4), - Decimal64Dtype(10, 5), - Decimal128Dtype(20, 7), - Decimal32Dtype(6, 2), - ], -) -def test_sum_of_squares_decimal(dtype): - np.random.seed(0) - data = [str(x) for x in gen_rand("int8", 3) / 10] - - expected = pd.Series([Decimal(x) for x in data]).pow(2).sum() - got = (cudf.Series(data).astype(dtype) ** 2).sum() - - assert_eq(expected, got) - - -@pytest.mark.parametrize("dtype,nelem", params) -def test_min(dtype, nelem): - dtype = cudf.dtype(dtype).type - data = gen_rand(dtype, nelem) - sr = Series(data) - - got = sr.min() - expect = dtype(data.min()) - - assert expect == got - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(6, 3), - Decimal64Dtype(10, 6), - Decimal64Dtype(16, 7), - Decimal32Dtype(6, 3), - Decimal128Dtype(20, 7), - ], -) -@pytest.mark.parametrize("nelem", params_sizes) -def test_min_decimal(dtype, nelem): - np.random.seed(0) - data = [str(x) for x in gen_rand("int64", nelem) / 100] - - expected = pd.Series([Decimal(x) for x in data]).min() - got = cudf.Series(data).astype(dtype).min() - - assert_eq(expected, got) - - -@pytest.mark.parametrize("dtype,nelem", params) -def test_max(dtype, nelem): - dtype = cudf.dtype(dtype).type - data = gen_rand(dtype, nelem) - sr = Series(data) - - got = sr.max() - expect = dtype(data.max()) - - assert expect == got - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(6, 3), - Decimal64Dtype(10, 6), - Decimal64Dtype(16, 7), - Decimal32Dtype(6, 3), - Decimal128Dtype(20, 7), - ], -) -@pytest.mark.parametrize("nelem", params_sizes) -def test_max_decimal(dtype, nelem): - np.random.seed(0) - data = [str(x) for x in gen_rand("int64", nelem) / 100] - - expected = pd.Series([Decimal(x) for x in data]).max() - got = cudf.Series(data).astype(dtype).max() - - assert_eq(expected, got) - - -@pytest.mark.parametrize("nelem", params_sizes) -def test_sum_masked(nelem): - dtype = np.float64 - data = gen_rand(dtype, nelem) - - mask = utils.random_bitmask(nelem) - bitmask = utils.expand_bits_to_bytes(mask)[:nelem] - null_count = utils.count_zero(bitmask) - - sr = Series.from_masked_array(data, mask, null_count) - - got = sr.sum() - res_mask = np.asarray(bitmask, dtype=np.bool_)[: data.size] - expect = data[res_mask].sum() - - significant = 4 if dtype == np.float32 else 6 - np.testing.assert_approx_equal(expect, got, significant=significant) - - -def test_sum_boolean(): - s = Series(np.arange(100000)) - got = (s > 1).sum() - expect = 99998 - - assert expect == got - - -def test_date_minmax(): - np_data = np.random.normal(size=10**3) - gdf_data = Series(np_data) - - np_casted = np_data.astype("datetime64[ms]") - gdf_casted = gdf_data.astype("datetime64[ms]") - - np_min = np_casted.min() - gdf_min = gdf_casted.min() - assert np_min == gdf_min - - np_max = np_casted.max() - gdf_max = gdf_casted.max() - assert np_max == gdf_max - - -@pytest.mark.parametrize( - "op", - ["sum", "product", "var", "kurt", "kurtosis", "skew"], -) -def test_datetime_unsupported_reductions(op): - gsr = cudf.Series([1, 2, 3, None], dtype="datetime64[ns]") - psr = gsr.to_pandas() - - utils.assert_exceptions_equal( - lfunc=getattr(psr, op), - rfunc=getattr(gsr, op), - ) - - -@pytest.mark.parametrize("op", ["product", "var", "kurt", "kurtosis", "skew"]) -def test_timedelta_unsupported_reductions(op): - gsr = cudf.Series([1, 2, 3, None], dtype="timedelta64[ns]") - psr = gsr.to_pandas() - - utils.assert_exceptions_equal( - 
lfunc=getattr(psr, op), - rfunc=getattr(gsr, op), - ) - - -@pytest.mark.parametrize("op", ["sum", "product", "std", "var"]) -def test_categorical_reductions(op): - gsr = cudf.Series([1, 2, 3, None], dtype="category") - psr = gsr.to_pandas() - - utils.assert_exceptions_equal(getattr(psr, op), getattr(gsr, op)) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": [10, 11, 12]}, - {"a": [1, 0, 3], "b": [10, 11, 12]}, - {"a": [1, 2, 3], "b": [10, 11, None]}, - { - "a": [], - }, - {}, - ], -) -@pytest.mark.parametrize("op", ["all", "any"]) -def test_any_all_axis_none(data, op): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - expected = getattr(pdf, op)(axis=None) - actual = getattr(gdf, op)(axis=None) - - assert expected == actual - - -@pytest.mark.parametrize( - "op", - [ - "sum", - "product", - "std", - "var", - "kurt", - "kurtosis", - "skew", - "min", - "max", - "mean", - "median", - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warning not given on older versions of pandas", -) -def test_reductions_axis_none_warning(op): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]}) - pdf = df.to_pandas() - with expect_warning_if( - op in {"sum", "product", "std", "var"}, - FutureWarning, - ): - actual = getattr(df, op)(axis=None) - with expect_warning_if( - op in {"sum", "product", "std", "var"}, - FutureWarning, - ): - expected = getattr(pdf, op)(axis=None) - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "op", - [ - "sum", - "product", - "std", - "var", - "kurt", - "kurtosis", - "skew", - "min", - "max", - "mean", - "median", - ], -) -def test_dataframe_reduction_no_args(op): - df = cudf.DataFrame({"a": range(10), "b": range(10)}) - pdf = df.to_pandas() - result = getattr(df, op)() - expected = getattr(pdf, op)() - assert_eq(result, expected) - - -def test_reduction_column_multiindex(): - idx = cudf.MultiIndex.from_tuples( - [("a", 1), ("a", 2)], names=["foo", "bar"] - ) - df = cudf.DataFrame(np.array([[1, 3], [2, 4]]), columns=idx) - result = df.mean() - expected = df.to_pandas().mean() - assert_eq(result, expected) - - -@pytest.mark.parametrize("op", ["sum", "product"]) -def test_dtype_deprecated(op): - ser = cudf.Series(range(5)) - with pytest.warns(FutureWarning): - result = getattr(ser, op)(dtype=np.dtype(np.int8)) - assert isinstance(result, np.int8) - - -@pytest.mark.parametrize( - "columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")] -) -def test_dataframe_axis_0_preserve_column_type_in_index(columns): - pd_df = pd.DataFrame([[1, 2]], columns=columns) - cudf_df = cudf.DataFrame.from_pandas(pd_df) - result = cudf_df.sum(axis=0) - expected = pd_df.sum(axis=0) - assert_eq(result, expected, check_index_type=True) diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py deleted file mode 100644 index 3a8928297c0..00000000000 --- a/python/cudf/cudf/tests/test_replace.py +++ /dev/null @@ -1,1394 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import operator -import re -from decimal import Decimal - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing import assert_eq -from cudf.testing._utils import ( - INTEGER_TYPES, - NUMERIC_TYPES, - assert_exceptions_equal, - expect_warning_if, -) - - -@pytest.mark.parametrize( - "gsr", - [ - cudf.Series([5, 1, 2, 3, None, 243, None, 4]), - cudf.Series(["one", "two", "three", None, "one"], dtype="category"), - cudf.Series(list(range(400)) + [None]), - ], -) -@pytest.mark.parametrize( - "to_replace,value", - [ - (0, 5), - ("one", "two"), - ("one", "five"), - ("abc", "hello"), - ([0, 1], [5, 6]), - ([22, 323, 27, 0], -1), - ([1, 2, 3], cudf.Series([10, 11, 12])), - (cudf.Series([1, 2, 3]), None), - ({1: 10, 2: 22}, None), - (np.inf, 4), - ], -) -def test_series_replace_all(gsr, to_replace, value): - psr = gsr.to_pandas() - - gd_to_replace = to_replace - if isinstance(to_replace, cudf.Series): - pd_to_replace = to_replace.to_pandas() - else: - pd_to_replace = to_replace - - gd_value = value - if isinstance(value, cudf.Series): - pd_value = value.to_pandas() - else: - pd_value = value - - expect_warn = ( - isinstance(gsr.dtype, cudf.CategoricalDtype) - and isinstance(gd_to_replace, str) - and gd_to_replace == "one" - ) - with expect_warning_if(expect_warn): - actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) - with expect_warning_if(expect_warn and PANDAS_GE_220): - if pd_value is None: - # TODO: Remove this workaround once cudf - # introduces `no_default` values - expected = psr.replace(to_replace=pd_to_replace) - else: - expected = psr.replace(to_replace=pd_to_replace, value=pd_value) - - assert_eq( - expected.sort_values().reset_index(drop=True), - actual.sort_values().reset_index(drop=True), - ) - - -def test_series_replace(): - a1 = np.array([0, 1, 2, 3, 4]) - - # Numerical - a2 = np.array([5, 1, 2, 3, 4]) - sr1 = cudf.Series(a1) - sr2 = sr1.replace(0, 5) - assert_eq(a2, sr2.to_numpy()) - - # Categorical - psr3 = pd.Series(["one", "two", "three"], dtype="category") - with expect_warning_if(PANDAS_GE_220, FutureWarning): - psr4 = psr3.replace("one", "two") - sr3 = cudf.from_pandas(psr3) - with pytest.warns(FutureWarning): - sr4 = sr3.replace("one", "two") - assert_eq( - psr4.sort_values().reset_index(drop=True), - sr4.sort_values().reset_index(drop=True), - ) - with expect_warning_if(PANDAS_GE_220, FutureWarning): - psr5 = psr3.replace("one", "five") - with pytest.warns(FutureWarning): - sr5 = sr3.replace("one", "five") - - assert_eq(psr5, sr5) - - # List input - a6 = np.array([5, 6, 2, 3, 4]) - sr6 = sr1.replace([0, 1], [5, 6]) - assert_eq(a6, sr6.to_numpy()) - - with pytest.raises(TypeError): - sr1.replace([0, 1], [5.5, 6.5]) - - # Series input - a8 = np.array([5, 5, 5, 3, 4]) - sr8 = sr1.replace(sr1[:3].to_numpy(), 5) - assert_eq(a8, sr8.to_numpy()) - - # large input containing null - sr9 = cudf.Series(list(range(400)) + [None]) - sr10 = sr9.replace([22, 323, 27, 0], None) - assert sr10.null_count == 5 - assert len(sr10.dropna().to_numpy()) == (401 - 5) - - sr11 = sr9.replace([22, 323, 27, 0], -1) - assert sr11.null_count == 1 - assert len(sr11.dropna().to_numpy()) == (401 - 1) - - # large input not containing nulls - sr9 = sr9.fillna(-11) - sr12 = sr9.replace([22, 323, 27, 0], None) - assert sr12.null_count == 4 - assert 
len(sr12.dropna().to_numpy()) == (401 - 4) - - sr13 = sr9.replace([22, 323, 27, 0], -1) - assert sr13.null_count == 0 - assert len(sr13.to_numpy()) == 401 - - -def test_series_replace_with_nulls(): - a1 = np.array([0, 1, 2, 3, 4]) - - # Numerical - a2 = np.array([-10, 1, 2, 3, 4]) - sr1 = cudf.Series(a1) - sr2 = sr1.replace(0, None).fillna(-10) - assert_eq(a2, sr2.to_numpy()) - - # List input - a6 = np.array([-10, 6, 2, 3, 4]) - sr6 = sr1.replace([0, 1], [None, 6]).fillna(-10) - assert_eq(a6, sr6.to_numpy()) - - sr1 = cudf.Series([0, 1, 2, 3, 4, None]) - with pytest.raises(TypeError): - sr1.replace([0, 1], [5.5, 6.5]).fillna(-10) - - # Series input - a8 = np.array([-10, -10, -10, 3, 4, -10]) - sr8 = sr1.replace(cudf.Series([-10] * 3, index=sr1[:3]), None).fillna(-10) - assert_eq(a8, sr8.to_numpy()) - - a9 = np.array([-10, 6, 2, 3, 4, -10]) - sr9 = sr1.replace([0, 1], [None, 6]).fillna(-10) - assert_eq(a9, sr9.to_numpy()) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning introduced in pandas-2.2.0", -) -@pytest.mark.parametrize( - "df", - [ - cudf.DataFrame( - { - "a": [0, 1, None, 2, 3], - "b": [3, 2, 2, 3, None], - "c": ["abc", "def", ".", None, None], - } - ), - cudf.DataFrame( - { - "a": ["one", "two", None, "three"], - "b": ["one", None, "two", "three"], - }, - dtype="category", - ), - cudf.DataFrame( - { - "col one": [None, 10, 11, None, 1000, 500, 600], - "col two": ["abc", "def", "ghi", None, "pp", None, "a"], - "a": [0.324, 0.234, 324.342, 23.32, 9.9, None, None], - } - ), - ], -) -@pytest.mark.parametrize( - "to_replace,value", - [ - (0, 4), - ([0, 1], [4, 5]), - ([0, 1], 4), - ({"a": 0, "b": 0}, {"a": 4, "b": 5}), - ({"a": 0}, {"a": 4}), - ("abc", "---"), - ([".", "gh"], "hi"), - ([".", "def"], ["_", None]), - ({"c": 0}, {"a": 4, "b": 5}), - ({"a": 2}, {"c": "a"}), - ("two", "three"), - ([1, 2], pd.Series([10, 11])), - (pd.Series([10, 11], index=[3, 2]), None), - ( - pd.Series(["a+", "+c", "p", "---"], index=["abc", "gh", "l", "z"]), - None, - ), - ( - pd.Series([10, 11], index=[3, 2]), - {"a": [-10, -30], "l": [-111, -222]}, - ), - (pd.Series([10, 11], index=[3, 2]), 555), - ( - pd.Series([10, 11], index=["a", "b"]), - pd.Series([555, 1111], index=["a", "b"]), - ), - ({"a": "2", "b": "3", "zzz": "hi"}, None), - ({"a": 2, "b": 3, "zzz": "hi"}, 324353), - ( - {"a": 2, "b": 3, "zzz": "hi"}, - pd.Series([5, 6, 10], index=["a", "b", "col one"]), - ), - ], -) -def test_dataframe_replace(df, to_replace, value): - gdf = df - pdf = gdf.to_pandas() - - pd_value = value - if isinstance(value, pd.Series): - gd_value = cudf.from_pandas(value) - else: - gd_value = value - - pd_to_replace = to_replace - if isinstance(to_replace, pd.Series): - gd_to_replace = cudf.from_pandas(to_replace) - else: - gd_to_replace = to_replace - - can_warn = ( - isinstance(df["a"].dtype, cudf.CategoricalDtype) - and isinstance(to_replace, str) - and to_replace == "two" - and isinstance(value, str) - and value == "three" - ) - with expect_warning_if(can_warn): - if pd_value is None: - expected = pdf.replace(to_replace=pd_to_replace) - else: - expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) - with expect_warning_if(can_warn): - actual = gdf.replace(to_replace=gd_to_replace, value=gd_value) - - expected_sorted = expected.sort_values(by=list(expected.columns), axis=0) - actual_sorted = actual.sort_values(by=list(actual.columns), axis=0) - - assert_eq(expected_sorted, actual_sorted) - - -def test_dataframe_replace_with_nulls(): - # numerical - pdf1 = 
pd.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, 3]}) - gdf1 = cudf.from_pandas(pdf1) - pdf2 = pdf1.replace(0, 4) - gdf2 = gdf1.replace(0, None).fillna(4) - assert_eq(gdf2, pdf2) - - # list input - pdf6 = pdf1.replace([0, 1], [4, 5]) - gdf6 = gdf1.replace([0, 1], [4, None]).fillna(5) - assert_eq(gdf6, pdf6) - - pdf7 = pdf1.replace([0, 1], 4) - gdf7 = gdf1.replace([0, 1], None).fillna(4) - assert_eq(gdf7, pdf7) - - # dict input: - pdf8 = pdf1.replace({"a": 0, "b": 0}, {"a": 4, "b": 5}) - gdf8 = gdf1.replace({"a": 0, "b": 0}, {"a": None, "b": 5}).fillna(4) - assert_eq(gdf8, pdf8) - - gdf1 = cudf.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, None]}) - gdf9 = gdf1.replace([0, 1], [4, 5]).fillna(3) - assert_eq(gdf9, pdf6) - - -@pytest.mark.parametrize( - "psr", - [ - pd.Series([0, 1, None, 2, None], dtype=pd.Int8Dtype()), - pd.Series([0, 1, np.nan, 2, np.nan]), - ], -) -@pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("fill_value", [10, pd.Series([10, 20, 30, 40, 50])]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_series_fillna_numerical(psr, data_dtype, fill_value, inplace): - test_psr = psr.copy(deep=True) - # TODO: These tests should use Pandas' nullable int type - # when we support a recent enough version of Pandas - # https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html - if np.dtype(data_dtype).kind not in ("f") and test_psr.dtype.kind == "i": - test_psr = test_psr.astype( - cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[np.dtype(data_dtype)] - ) - - gsr = cudf.from_pandas(test_psr) - - if isinstance(fill_value, pd.Series): - fill_value_cudf = cudf.from_pandas(fill_value) - else: - fill_value_cudf = fill_value - - expected = test_psr.fillna(fill_value, inplace=inplace) - actual = gsr.fillna(fill_value_cudf, inplace=inplace) - - if inplace: - expected = test_psr - actual = gsr - - # TODO: Remove check_dtype when we have support - # to compare with pandas nullable dtypes - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data", - [ - [1, None, None, 2, 3, 4], - [None, None, 1, 2, None, 3, 4], - [1, 2, None, 3, 4, None, None], - [0] + [None] * 14, - [None] * 14 + [0], - ], -) -@pytest.mark.parametrize("container", [pd.Series, pd.DataFrame]) -@pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("method", ["ffill", "bfill"]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_method_numerical(data, container, data_dtype, method, inplace): - if container == pd.DataFrame: - data = {"a": data, "b": data, "c": data} - - pdata = container(data) - - if np.dtype(data_dtype).kind not in ("f"): - data_dtype = cudf.utils.dtypes.np_dtypes_to_pandas_dtypes[ - np.dtype(data_dtype) - ] - pdata = pdata.astype(data_dtype) - - # Explicitly using nans_as_nulls=True - gdata = cudf.from_pandas(pdata, nan_as_null=True) - - with pytest.warns(FutureWarning): - expected = pdata.fillna(method=method, inplace=inplace) - with pytest.warns(FutureWarning): - actual = gdata.fillna(method=method, inplace=inplace) - - if inplace: - expected = pdata - actual = gdata - - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "gsr_data", - [ - cudf.Series(["2.34", "5.2", "7.47", None, "92.29", None]).astype( - Decimal64Dtype(7, 2) - ), - cudf.Series(["-74.56", None, "-23.73", "34.55", "2.89", None]).astype( - Decimal32Dtype(7, 2) - ), 
- cudf.Series( - ["85.955", np.nan, "-3.243", np.nan, "29.492", np.nan] - ).astype(Decimal64Dtype(8, 3)), - cudf.Series( - ["2.964", None, "57.432", "-989.330", None, "56.444"] - ).astype(Decimal64Dtype(8, 3)), - cudf.Series( - [np.nan, "55.2498", np.nan, "-5.2965", "-28.9423", np.nan] - ).astype(Decimal64Dtype(10, 4)), - cudf.Series( - ["2.964", None, "54347.432", "-989.330", None, "56.444"] - ).astype(Decimal128Dtype(20, 7)), - ], -) -@pytest.mark.parametrize( - "fill_value", - [ - 42, - -123, - Decimal("8.2"), - Decimal("-12.87"), - cudf.Series([None, -854, 9533, -274, -845, 7924], dtype="int32"), - cudf.Series(["-53.5", "13.4", "-64.3", None, "42.42", None]).astype( - Decimal64Dtype(7, 2) - ), - cudf.Series( - ["57.45", np.nan, np.nan, "686.49", "-55.5", "73.24"], - ).astype(Decimal64Dtype(7, 2)), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_decimal(gsr_data, fill_value, inplace): - gsr = gsr_data.copy(deep=True) - psr = gsr.to_pandas() - - if isinstance(fill_value, cudf.Series): - p_fill_value = fill_value.to_pandas() - else: - p_fill_value = fill_value - - expected = psr.fillna(p_fill_value, inplace=inplace) - got = gsr.fillna(fill_value, inplace=inplace) - - assert_eq(expected, got, check_dtype=False) - - -@pytest.mark.parametrize( - "psr_data", - [ - pd.Series(["a", "b", "a", None, "c", None], dtype="category"), - pd.Series( - ["a", "b", "a", None, "c", None], - dtype="category", - index=["q", "r", "z", "a", "b", "c"], - ), - pd.Series( - ["a", "b", "a", None, "c", None], - dtype="category", - index=["x", "t", "p", "q", "r", "z"], - ), - pd.Series(["a", "b", "a", np.nan, "c", np.nan], dtype="category"), - pd.Series( - [None, None, None, None, None, None, "a", "b", "c"], - dtype="category", - ), - ], -) -@pytest.mark.parametrize( - "fill_value", - [ - "c", - pd.Series(["c", "c", "c", "c", "c", "a"], dtype="category"), - pd.Series( - ["a", "b", "a", None, "c", None], - dtype="category", - index=["x", "t", "p", "q", "r", "z"], - ), - pd.Series( - ["a", "b", "a", None, "c", None], - dtype="category", - index=["q", "r", "z", "a", "b", "c"], - ), - pd.Series(["a", "b", "a", None, "c", None], dtype="category"), - pd.Series(["a", "b", "a", np.nan, "c", np.nan], dtype="category"), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_categorical(psr_data, fill_value, inplace): - psr = psr_data.copy(deep=True) - gsr = cudf.from_pandas(psr) - - if isinstance(fill_value, pd.Series): - fill_value_cudf = cudf.from_pandas(fill_value) - else: - fill_value_cudf = fill_value - - if ( - isinstance(fill_value_cudf, cudf.Series) - and gsr.dtype != fill_value_cudf.dtype - ): - assert_exceptions_equal( - lfunc=psr.fillna, - rfunc=gsr.fillna, - lfunc_args_and_kwargs=([fill_value], {"inplace": inplace}), - rfunc_args_and_kwargs=([fill_value_cudf], {"inplace": inplace}), - ) - else: - expected = psr.fillna(fill_value, inplace=inplace) - got = gsr.fillna(fill_value_cudf, inplace=inplace) - - if inplace: - expected = psr - got = gsr - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "psr_data", - [ - pd.Series( - pd.date_range( - "2010-01-01", - "2020-01-10", - freq="1YE" if PANDAS_GE_220 else "1y", - ) - ), - pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), - pd.Series( - [ - None, - None, - None, - None, - None, - None, - "2011-10-10", - "2010-01-01", - "2010-01-02", - "2010-01-04", - "2010-11-01", - ], - dtype="datetime64[ns]", - ), - pd.Series( - [ - None, - None, - None, - None, - None, - None, - "2011-10-10", - 
"2010-01-01", - "2010-01-02", - "2010-01-04", - "2010-11-01", - ], - dtype="datetime64[ns]", - index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], - ), - ], -) -@pytest.mark.parametrize( - "fill_value", - [ - pd.Timestamp("2010-01-02"), - pd.Series( - pd.date_range( - "2010-01-01", - "2020-01-10", - freq="1YE" if PANDAS_GE_220 else "1y", - ) - ) - + pd.Timedelta("1d"), - pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"), - pd.Series( - [ - None, - None, - None, - None, - None, - None, - "2011-10-10", - "2010-01-01", - "2010-01-02", - "2010-01-04", - "2010-11-01", - ], - dtype="datetime64[ns]", - ), - pd.Series( - [ - None, - None, - None, - None, - None, - None, - "2011-10-10", - "2010-01-01", - "2010-01-02", - "2010-01-04", - "2010-11-01", - ], - dtype="datetime64[ns]", - index=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], - ), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_datetime(psr_data, fill_value, inplace): - psr = psr_data.copy(deep=True) - gsr = cudf.from_pandas(psr) - - if isinstance(fill_value, pd.Series): - fill_value_cudf = cudf.from_pandas(fill_value) - else: - fill_value_cudf = fill_value - - expected = psr.fillna(fill_value, inplace=inplace) - got = gsr.fillna(fill_value_cudf, inplace=inplace) - - if inplace: - got = gsr - expected = psr - - assert_eq(expected, got) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data", - [ - # Categorical - pd.Categorical([1, 2, None, None, 3, 4]), - pd.Categorical([None, None, 1, None, 3, 4]), - pd.Categorical([1, 2, None, 3, 4, None, None]), - pd.Categorical(["1", "20", None, None, "3", "40"]), - pd.Categorical([None, None, "10", None, "30", "4"]), - pd.Categorical(["1", "20", None, "30", "4", None, None]), - # Datetime - np.array( - [ - "2020-01-01 08:00:00", - "2020-01-01 09:00:00", - None, - "2020-01-01 10:00:00", - None, - "2020-01-01 10:00:00", - ], - dtype="datetime64[ns]", - ), - np.array( - [ - None, - None, - "2020-01-01 09:00:00", - "2020-01-01 10:00:00", - None, - "2020-01-01 10:00:00", - ], - dtype="datetime64[ns]", - ), - np.array( - [ - "2020-01-01 09:00:00", - None, - None, - "2020-01-01 10:00:00", - None, - None, - ], - dtype="datetime64[ns]", - ), - # Timedelta - np.array( - [10, 100, 1000, None, None, 10, 100, 1000], dtype="datetime64[ns]" - ), - np.array( - [None, None, 10, None, 1000, 100, 10], dtype="datetime64[ns]" - ), - np.array( - [10, 100, None, None, 1000, None, None], dtype="datetime64[ns]" - ), - # String - np.array( - ["10", "100", "1000", None, None, "10", "100", "1000"], - dtype="object", - ), - np.array( - [None, None, "1000", None, "10", "100", "10"], dtype="object" - ), - np.array( - ["10", "100", None, None, "1000", None, None], dtype="object" - ), - ], -) -@pytest.mark.parametrize("container", [pd.Series, pd.DataFrame]) -@pytest.mark.parametrize("method", ["ffill", "bfill"]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_method_fixed_width_non_num(data, container, method, inplace): - if container == pd.DataFrame: - data = {"a": data, "b": data, "c": data} - - pdata = container(data) - - # Explicitly using nans_as_nulls=True - gdata = cudf.from_pandas(pdata, nan_as_null=True) - - with pytest.warns(FutureWarning): - expected = pdata.fillna(method=method, inplace=inplace) - with pytest.warns(FutureWarning): - actual = gdata.fillna(method=method, inplace=inplace) - - if inplace: - expected = 
pdata - actual = gdata - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]}), - pd.DataFrame( - {"a": [1, 2, None], "b": [None, None, 5]}, index=["a", "p", "z"] - ), - pd.DataFrame({"a": [1, 2, 3]}), - ], -) -@pytest.mark.parametrize( - "value", - [ - 10, - pd.Series([10, 20, 30]), - pd.Series([3, 4, 5]), - pd.Series([10, 20, 30], index=["z", "a", "p"]), - {"a": 5, "b": pd.Series([3, 4, 5])}, - {"a": 5001}, - {"b": pd.Series([11, 22, 33], index=["a", "p", "z"])}, - {"a": 5, "b": pd.Series([3, 4, 5], index=["a", "p", "z"])}, - {"c": 100}, - np.nan, - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_dataframe(df, value, inplace): - pdf = df.copy(deep=True) - gdf = cudf.from_pandas(pdf) - - fill_value_pd = value - if isinstance(fill_value_pd, (pd.Series, pd.DataFrame)): - fill_value_cudf = cudf.from_pandas(fill_value_pd) - elif isinstance(fill_value_pd, dict): - fill_value_cudf = {} - for key in fill_value_pd: - temp_val = fill_value_pd[key] - if isinstance(temp_val, pd.Series): - temp_val = cudf.from_pandas(temp_val) - fill_value_cudf[key] = temp_val - else: - fill_value_cudf = value - - expect = pdf.fillna(fill_value_pd, inplace=inplace) - got = gdf.fillna(fill_value_cudf, inplace=inplace) - - if inplace: - got = gdf - expect = pdf - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "ps_data", - [ - pd.Series(["a", "b", "c", "d"]), - pd.Series([None] * 4, dtype="object"), - pd.Series(["z", None, "z", None]), - pd.Series(["x", "y", None, None, None]), - pd.Series([None, None, None, "i", "P"]), - ], -) -@pytest.mark.parametrize( - "fill_value", - [ - "a", - pd.Series(["a", "b", "c", "d"]), - pd.Series(["z", None, "z", None]), - pd.Series([None] * 4, dtype="object"), - pd.Series(["x", "y", None, None, None]), - pd.Series([None, None, None, "i", "P"]), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_fillna_string(ps_data, fill_value, inplace): - psr = ps_data.copy(deep=True) - gsr = cudf.from_pandas(psr) - - if isinstance(fill_value, pd.Series): - fill_value_cudf = cudf.from_pandas(fill_value) - else: - fill_value_cudf = fill_value - - expected = psr.fillna(fill_value, inplace=inplace) - got = gsr.fillna(fill_value_cudf, inplace=inplace) - - if inplace: - expected = psr - got = gsr - - assert_eq(expected, got) - - -@pytest.mark.parametrize("data_dtype", INTEGER_TYPES) -def test_series_fillna_invalid_dtype(data_dtype): - gdf = cudf.Series([1, 2, None, 3], dtype=data_dtype) - fill_value = 2.5 - msg = ( - f"Cannot safely cast non-equivalent" - f" {type(fill_value).__name__} to {gdf.dtype.type.__name__}" - ) - with pytest.raises(TypeError, match=msg): - gdf.fillna(fill_value) - - -@pytest.mark.parametrize("data_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("fill_value", [100, 100.0, 128.5]) -@pytest.mark.parametrize("op", [operator.gt, operator.eq, operator.lt]) -def test_series_where(data_dtype, fill_value, op): - psr = pd.Series(list(range(10)), dtype=data_dtype) - sr = cudf.from_pandas(psr) - - try: - scalar_fits = sr.dtype.type(fill_value) == fill_value - except OverflowError: - scalar_fits = False - - if not scalar_fits: - with pytest.raises(TypeError): - sr.where(op(sr, 0), fill_value) - else: - # Cast back to original dtype as pandas automatically upcasts - expect = psr.where(op(psr, 0), fill_value) - got = sr.where(op(sr, 0), fill_value) - # pandas returns 'float16' dtype, which is not supported in cudf - assert_eq( - expect, - got, - 
check_dtype=expect.dtype.kind not in ("f"), - ) - - -@pytest.mark.parametrize("fill_value", [100, 100.0, 100.5]) -def test_series_with_nulls_where(fill_value): - psr = pd.Series([None] * 3 + list(range(5))) - sr = cudf.from_pandas(psr) - - expect = psr.where(psr > 0, fill_value) - got = sr.where(sr > 0, fill_value) - assert_eq(expect, got) - - expect = psr.where(psr < 0, fill_value) - got = sr.where(sr < 0, fill_value) - assert_eq(expect, got) - - expect = psr.where(psr == 0, fill_value) - got = sr.where(sr == 0, fill_value) - assert_eq(expect, got) - - -@pytest.mark.parametrize("fill_value", [[888, 999]]) -def test_dataframe_with_nulls_where_with_scalars(fill_value): - pdf = pd.DataFrame( - { - "A": [-1, 2, -3, None, 5, 6, -7, 0], - "B": [4, -2, 3, None, 7, 6, 8, 0], - } - ) - gdf = cudf.from_pandas(pdf) - - expect = pdf.where(pdf % 3 == 0, fill_value) - got = gdf.where(gdf % 3 == 0, fill_value) - - assert_eq(expect, got) - - -def test_dataframe_with_different_types(): - # Testing for int and float - pdf = pd.DataFrame( - {"A": [111, 22, 31, 410, 56], "B": [-10.12, 121.2, 45.7, 98.4, 87.6]} - ) - gdf = cudf.from_pandas(pdf) - expect = pdf.where(pdf > 50, -pdf) - got = gdf.where(gdf > 50, -gdf) - - assert_eq(expect, got) - - # Testing for string - pdf = pd.DataFrame({"A": ["a", "bc", "cde", "fghi"]}) - gdf = cudf.from_pandas(pdf) - pdf_mask = pd.DataFrame({"A": [True, False, True, False]}) - gdf_mask = cudf.from_pandas(pdf_mask) - expect = pdf.where(pdf_mask, ["cudf"]) - got = gdf.where(gdf_mask, ["cudf"]) - - assert_eq(expect, got) - - # Testing for categoriacal - pdf = pd.DataFrame({"A": ["a", "b", "b", "c"]}) - pdf["A"] = pdf["A"].astype("category") - gdf = cudf.from_pandas(pdf) - expect = pdf.where(pdf_mask, "c") - got = gdf.where(gdf_mask, ["c"]) - - assert_eq(expect, got) - - -def test_dataframe_where_with_different_options(): - pdf = pd.DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]}) - gdf = cudf.from_pandas(pdf) - - # numpy array - boolean_mask = np.array([[False, True], [True, False], [False, True]]) - - expect = pdf.where(boolean_mask, -pdf) - got = gdf.where(boolean_mask, -gdf) - - assert_eq(expect, got) - - # with single scalar - expect = pdf.where(boolean_mask, 8) - got = gdf.where(boolean_mask, 8) - - assert_eq(expect, got) - - # with multi scalar - expect = pdf.where(boolean_mask, [8, 9]) - got = gdf.where(boolean_mask, [8, 9]) - - assert_eq(expect, got) - - -def test_series_multiple_times_with_nulls(): - sr = cudf.Series([1, 2, 3, None]) - expected = cudf.Series([None, None, None, None], dtype=np.int64) - - for i in range(3): - got = sr.replace([1, 2, 3], None) - assert_eq(expected, got) - # BUG: #2695 - # The following series will acquire a chunk of memory and update with - # values, but these values may still linger even after the memory - # gets released. This memory space might get used for replace in - # subsequent calls and the memory used for mask may have junk values. - # So, if it is not updated properly, the result would be wrong. - # So, this will help verify that scenario. 
- cudf.Series([1, 1, 1, None]) - - -@pytest.mark.parametrize("series_dtype", NUMERIC_TYPES) -@pytest.mark.parametrize( - "replacement", [128, 128.0, 128.5, 32769, 32769.0, 32769.5] -) -def test_numeric_series_replace_dtype(series_dtype, replacement): - psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype) - sr = cudf.from_pandas(psr) - - numpy_replacement = np.array(replacement).astype(sr.dtype)[()] - can_replace = numpy_replacement == replacement - - # Both Scalar - if not can_replace: - with pytest.raises(TypeError): - sr.replace(1, replacement) - else: - expect = psr.replace(1, replacement).astype(psr.dtype) - got = sr.replace(1, replacement) - assert_eq(expect, got) - - # to_replace is a list, replacement is a scalar - if not can_replace: - with pytest.raises(TypeError): - sr.replace([2, 3], replacement) - else: - expect = psr.replace([2, 3], replacement).astype(psr.dtype) - got = sr.replace([2, 3], replacement) - assert_eq(expect, got) - - # If to_replace is a scalar and replacement is a list - with pytest.raises(TypeError): - sr.replace(0, [replacement, 2]) - - # Both list of unequal length - with pytest.raises(ValueError): - sr.replace([0, 1], [replacement]) - - # Both lists of equal length - if ( - np.dtype(type(replacement)).kind == "f" and sr.dtype.kind in {"i", "u"} - ) or (not can_replace): - with pytest.raises(TypeError): - sr.replace([2, 3], [replacement, replacement]) - else: - expect = psr.replace([2, 3], [replacement, replacement]).astype( - psr.dtype - ) - got = sr.replace([2, 3], [replacement, replacement]) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "pframe, replace_args", - [ - ( - pd.Series([5, 1, 2, 3, 4]), - {"to_replace": 5, "value": 0, "inplace": True}, - ), - ( - pd.Series([5, 1, 2, 3, 4]), - {"to_replace": {5: 0, 3: -5}, "inplace": True}, - ), - (pd.Series([5, 1, 2, 3, 4]), {}), - pytest.param( - pd.Series(["one", "two", "three"], dtype="category"), - {"to_replace": "one", "value": "two", "inplace": True}, - marks=pytest.mark.xfail( - condition=PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/43232" - "https://github.com/pandas-dev/pandas/issues/53358", - ), - ), - ( - pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9]}), - {"to_replace": 5, "value": 0, "inplace": True}, - ), - ( - pd.Series([1, 2, 3, 45]), - { - "to_replace": np.array([]).astype(int), - "value": 77, - "inplace": True, - }, - ), - ( - pd.Series([1, 2, 3, 45]), - { - "to_replace": np.array([]).astype(int), - "value": 77, - "inplace": False, - }, - ), - ( - pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}), - {"to_replace": {"a": 2}, "value": {"a": -33}, "inplace": True}, - ), - ( - pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}), - { - "to_replace": {"a": [2, 5]}, - "value": {"a": [9, 10]}, - "inplace": True, - }, - ), - ( - pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}), - {"to_replace": [], "value": [], "inplace": True}, - ), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warning not given on older versions of pandas", -) -def test_replace_inplace(pframe, replace_args): - gpu_frame = cudf.from_pandas(pframe) - pandas_frame = pframe.copy() - - gpu_copy = gpu_frame.copy() - cpu_copy = pandas_frame.copy() - - assert_eq(gpu_frame, pandas_frame) - assert_eq(gpu_copy, cpu_copy) - with expect_warning_if(len(replace_args) == 0): - gpu_frame.replace(**replace_args) - with expect_warning_if(len(replace_args) == 0): - pandas_frame.replace(**replace_args) - assert_eq(gpu_frame, pandas_frame) - 
assert_eq(gpu_copy, cpu_copy) - - -def test_replace_df_error(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]}) - gdf = cudf.from_pandas(pdf) - - assert_exceptions_equal( - lfunc=pdf.replace, - rfunc=gdf.replace, - lfunc_args_and_kwargs=([], {"to_replace": -1, "value": []}), - rfunc_args_and_kwargs=([], {"to_replace": -1, "value": []}), - ) - - -@pytest.mark.parametrize( - ("lower", "upper"), - [ - ([2, 7.4], [4, 7.9]), - ([2, 7.4], None), - ( - None, - [4, 7.9], - ), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_clip(lower, upper, inplace): - pdf = pd.DataFrame( - {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]} - ) - gdf = cudf.from_pandas(pdf) - - got = gdf.clip(lower=lower, upper=upper, inplace=inplace) - expect = pdf.clip(lower=lower, upper=upper, axis=1) - - if inplace is True: - assert_eq(expect, gdf) - else: - assert_eq(expect, got) - - -@pytest.mark.parametrize( - ("lower", "upper"), - [("b", "d"), ("b", None), (None, "c"), (None, None)], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dataframe_category_clip(lower, upper, inplace): - data = ["a", "b", "c", "d", "e"] - pdf = pd.DataFrame({"a": data}) - gdf = cudf.from_pandas(pdf) - gdf["a"] = gdf["a"].astype("category") - - expect = pdf.clip(lower=lower, upper=upper) - got = gdf.clip(lower=lower, upper=upper, inplace=inplace) - - if inplace is True: - assert_eq(expect, gdf.astype("str")) - else: - assert_eq(expect, got.astype("str")) - - -@pytest.mark.parametrize( - ("lower", "upper"), - [([2, 7.4], [4, 7.9, "d"]), ([2, 7.4, "a"], [4, 7.9, "d"])], -) -def test_dataframe_exceptions_for_clip(lower, upper): - gdf = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5], "b": [7.1, 7.24, 7.5, 7.8, 8.11]} - ) - - with pytest.raises(ValueError): - gdf.clip(lower=lower, upper=upper) - - -@pytest.mark.parametrize( - ("data", "lower", "upper"), - [ - ([1, 2, 3, 4, 5], 2, 4), - ([1, 2, 3, 4, 5], 2, None), - ([1, 2, 3, 4, 5], None, 4), - ([1, 2, 3, 4, 5], None, None), - ([1, 2, 3, 4, 5], 4, 2), - ([1.0, 2.0, 3.0, 4.0, 5.0], 4, 2), - (pd.Series([1, 2, 3, 4, 5], dtype="int32"), 4, 2), - (["a", "b", "c", "d", "e"], "b", "d"), - (["a", "b", "c", "d", "e"], "b", None), - (["a", "b", "c", "d", "e"], None, "d"), - (["a", "b", "c", "d", "e"], "d", "b"), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_series_clip(data, lower, upper, inplace): - psr = pd.Series(data) - gsr = cudf.from_pandas(psr) - - expect = psr.clip(lower=lower, upper=upper) - got = gsr.clip(lower=lower, upper=upper, inplace=inplace) - - if inplace is True: - assert_eq(expect, gsr) - else: - assert_eq(expect, got) - - -def test_series_exceptions_for_clip(): - with pytest.raises(ValueError): - cudf.Series([1, 2, 3, 4]).clip([1, 2], [2, 3]) - - with pytest.raises(NotImplementedError): - cudf.Series([1, 2, 3, 4]).clip(1, 2, axis=0) - - -@pytest.mark.parametrize( - "data", [[1, 2.0, 3, 4, None, 1, None, 10, None], ["a", "b", "c"]] -) -@pytest.mark.parametrize( - "index", - [ - None, - [1, 2, 3], - ["a", "b", "z"], - ["a", "b", "c", "d", "e", "f", "g", "l", "m"], - ], -) -@pytest.mark.parametrize("value", [[1, 2, 3, 4, None, 1, None, 10, None]]) -def test_series_fillna(data, index, value): - psr = pd.Series( - data, - index=index if index is not None and len(index) == len(data) else None, - ) - gsr = cudf.Series( - data, - index=index if index is not None and len(index) == len(data) else None, - ) - - expect = psr.fillna(pd.Series(value)) - got = gsr.fillna(cudf.Series(value)) - assert_eq(expect, got) - - -def 
test_series_fillna_error(): - psr = pd.Series([1, 2, None, 3, None]) - gsr = cudf.from_pandas(psr) - - assert_exceptions_equal( - psr.fillna, - gsr.fillna, - ([pd.DataFrame({"a": [1, 2, 3]})],), - ([cudf.DataFrame({"a": [1, 2, 3]})],), - ) - - -def test_series_replace_errors(): - gsr = cudf.Series([1, 2, None, 3, None]) - psr = gsr.to_pandas() - - with pytest.raises( - TypeError, - match=re.escape( - "to_replace and value should be of same types," - "got to_replace dtype: int64 and " - "value dtype: object" - ), - ): - gsr.replace(1, "a") - - gsr = cudf.Series(["a", "b", "c"]) - with pytest.raises( - TypeError, - match=re.escape( - "to_replace and value should be of same types," - "got to_replace dtype: int64 and " - "value dtype: object" - ), - ): - gsr.replace([1, 2], ["a", "b"]) - - assert_exceptions_equal( - psr.replace, - gsr.replace, - ([{"a": 1}, 1],), - ([{"a": 1}, 1],), - ) - - assert_exceptions_equal( - lfunc=psr.replace, - rfunc=gsr.replace, - lfunc_args_and_kwargs=([[1, 2], [1]],), - rfunc_args_and_kwargs=([[1, 2], [1]],), - ) - - assert_exceptions_equal( - lfunc=psr.replace, - rfunc=gsr.replace, - lfunc_args_and_kwargs=([object(), [1]],), - rfunc_args_and_kwargs=([object(), [1]],), - ) - - assert_exceptions_equal( - lfunc=psr.replace, - rfunc=gsr.replace, - lfunc_args_and_kwargs=([{"a": 1}, object()],), - rfunc_args_and_kwargs=([{"a": 1}, object()],), - ) - - -@pytest.mark.parametrize( - "gsr,old,new,expected", - [ - ( - cudf.Series(["a", "b", "c", None]), - None, - "a", - cudf.Series(["a", "b", "c", "a"]), - ), - ( - cudf.Series(["a", "b", "c", None]), - [None, "a", "a"], - ["c", "b", "d"], - cudf.Series(["d", "b", "c", "c"]), - ), - ( - cudf.Series(["a", "b", "c", None]), - [None, "a"], - ["b", None], - cudf.Series([None, "b", "c", "b"]), - ), - ( - cudf.Series(["a", "b", "c", None]), - [None, None], - [None, None], - cudf.Series(["a", "b", "c", None]), - ), - (cudf.Series([1, 2, None, 3]), None, 10, cudf.Series([1, 2, 10, 3])), - ( - cudf.Series([1, 2, None, 3]), - [None, 1, 1], - [3, 2, 4], - cudf.Series([4, 2, 3, 3]), - ), - ( - cudf.Series([1, 2, None, 3]), - [None, 1], - [2, None], - cudf.Series([None, 2, 2, 3]), - ), - ( - cudf.Series(["a", "q", "t", None], dtype="category"), - None, - "z", - cudf.Series(["a", "q", "t", "z"], dtype="category"), - ), - ( - cudf.Series(["a", "q", "t", None], dtype="category"), - [None, "a", "q"], - ["z", None, None], - cudf.Series([None, None, "t", "z"], dtype="category"), - ), - ( - cudf.Series(["a", None, "t", None], dtype="category"), - [None, "t"], - ["p", None], - cudf.Series(["a", "p", None, "p"], dtype="category"), - ), - ], -) -def test_replace_nulls(gsr, old, new, expected): - with expect_warning_if(isinstance(gsr.dtype, cudf.CategoricalDtype)): - actual = gsr.replace(old, new) - assert_eq( - expected.sort_values().reset_index(drop=True), - actual.sort_values().reset_index(drop=True), - ) - - -def test_fillna_columns_multiindex(): - columns = pd.MultiIndex.from_tuples([("a", "b"), ("d", "e")]) - pdf = pd.DataFrame( - {"0": [1, 2, None, 3, None], "1": [None, None, None, None, 4]} - ) - pdf.columns = columns - gdf = cudf.from_pandas(pdf) - - expected = pdf.fillna(10) - actual = gdf.fillna(10) - - assert_eq(expected, actual) - - -def test_fillna_nan_and_null(): - ser = cudf.Series(pa.array([float("nan"), None, 1.1]), nan_as_null=False) - result = ser.fillna(2.2) - expected = cudf.Series([2.2, 2.2, 1.1]) - assert_eq(result, expected) - - -def test_replace_with_index_objects(): - result = cudf.Series([1, 2]).replace(cudf.Index([1]), 
cudf.Index([2])) - expected = pd.Series([1, 2]).replace(pd.Index([1]), pd.Index([2])) - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py deleted file mode 100644 index 95e19fae501..00000000000 --- a/python/cudf/cudf/tests/test_repr.py +++ /dev/null @@ -1,1501 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import textwrap - -import cupy as cp -import numpy as np -import pandas as pd -import pytest -from hypothesis import given, settings, strategies as st - -import cudf -from cudf.testing import _utils as utils -from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes - -repr_categories = [ - "uint16", - "int64", - "float64", - "str", - "category", - "datetime64[ns]", -] - - -@pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [0, 5, 10]) -def test_null_series(nrows, dtype): - size = 5 - sr = cudf.Series(np.random.randint(1, 9, size)).astype(dtype) - sr[np.random.choice([False, True], size=size)] = None - if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: - ps = pd.Series( - sr._column.data_array_view(mode="read").copy_to_host(), - dtype=np_dtypes_to_pandas_dtypes.get( - cudf.dtype(dtype), cudf.dtype(dtype) - ), - ) - ps[sr.isnull().to_pandas()] = pd.NA - else: - ps = sr.to_pandas() - - pd.options.display.max_rows = int(nrows) - psrepr = repr(ps).replace("NaN", "").replace("None", "") - if "UInt" in psrepr: - psrepr = psrepr.replace("UInt", "uint") - elif "Int" in psrepr: - psrepr = psrepr.replace("Int", "int") - assert psrepr.split() == repr(sr).split() - pd.reset_option("display.max_rows") - - -dtype_categories = [ - "float32", - "float64", - "datetime64[ns]", - "str", - "category", -] - - -@pytest.mark.parametrize("ncols", [1, 2, 3, 4, 5, 10]) -def test_null_dataframe(ncols): - size = 20 - gdf = cudf.DataFrame() - for idx, dtype in enumerate(dtype_categories): - sr = cudf.Series(np.random.randint(0, 128, size)).astype(dtype) - sr[np.random.choice([False, True], size=size)] = None - gdf[dtype] = sr - pdf = gdf.to_pandas() - pd.options.display.max_columns = int(ncols) - pdf_repr = repr(pdf).replace("NaN", "").replace("None", "") - assert pdf_repr.split() == repr(gdf).split() - pd.reset_option("display.max_columns") - - -@pytest.mark.parametrize("dtype", repr_categories) -@pytest.mark.parametrize("nrows", [None, 0, 1, 2, 9, 10, 11, 19, 20, 21]) -def test_full_series(nrows, dtype): - size = 20 - ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype) - sr = cudf.from_pandas(ps) - pd.options.display.max_rows = nrows - assert repr(ps) == repr(sr) - pd.reset_option("display.max_rows") - - -@pytest.mark.parametrize("nrows", [5, 10, 15]) -@pytest.mark.parametrize("ncols", [5, 10, 15]) -@pytest.mark.parametrize("size", [20, 21]) -@pytest.mark.parametrize("dtype", repr_categories) -def test_full_dataframe_20(dtype, size, nrows, ncols): - pdf = pd.DataFrame( - {idx: np.random.randint(0, 100, size) for idx in range(size)} - ).astype(dtype) - gdf = cudf.from_pandas(pdf) - - with pd.option_context( - "display.max_rows", int(nrows), "display.max_columns", int(ncols) - ): - assert repr(pdf) == repr(gdf) - assert pdf._repr_html_() == gdf._repr_html_() - assert pdf._repr_latex_() == gdf._repr_latex_() - - -@given( - st.lists( - st.integers(-9223372036854775808, 9223372036854775807), - min_size=1, - max_size=10000, - ) -) -@settings(deadline=None) -def test_integer_dataframe(x): - gdf = cudf.DataFrame({"x": x}) - pdf = gdf.to_pandas() - pd.options.display.max_columns = 1 - 
assert repr(gdf) == repr(pdf) - assert repr(gdf.T) == repr(pdf.T) - pd.reset_option("display.max_columns") - - -@given( - st.lists( - st.integers(-9223372036854775808, 9223372036854775807), max_size=10000 - ) -) -@settings(deadline=None) -def test_integer_series(x): - sr = cudf.Series(x, dtype=int) - ps = pd.Series(data=x, dtype=int) - - assert repr(sr) == repr(ps) - - -@given(st.lists(st.floats())) -@settings(deadline=None) -def test_float_dataframe(x): - gdf = cudf.DataFrame({"x": cudf.Series(x, dtype=float, nan_as_null=False)}) - pdf = gdf.to_pandas() - assert repr(gdf) == repr(pdf) - - -@given(st.lists(st.floats())) -@settings(deadline=None) -def test_float_series(x): - sr = cudf.Series(x, dtype=float, nan_as_null=False) - ps = pd.Series(data=x, dtype=float) - assert repr(sr) == repr(ps) - - -@pytest.fixture -def mixed_pdf(): - pdf = pd.DataFrame() - pdf["Integer"] = np.array([2345, 11987, 9027, 9027]) - pdf["Date"] = np.array( - ["18/04/1995", "14/07/1994", "07/06/2006", "16/09/2005"] - ) - pdf["Float"] = np.array([9.001, 8.343, 6, 2.781]) - pdf["Integer2"] = np.array([2345, 106, 2088, 789277]) - pdf["Category"] = np.array(["M", "F", "F", "F"]) - pdf["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - pdf["Boolean"] = np.array([True, False, True, False]) - return pdf - - -@pytest.fixture -def mixed_gdf(mixed_pdf): - return cudf.from_pandas(mixed_pdf) - - -def test_mixed_dataframe(mixed_pdf, mixed_gdf): - assert repr(mixed_gdf) == repr(mixed_pdf) - - -def test_mixed_series(mixed_pdf, mixed_gdf): - for col in mixed_gdf.columns: - assert repr(mixed_gdf[col]) == repr(mixed_pdf[col]) - - -def test_MI(): - gdf = cudf.DataFrame( - { - "a": np.random.randint(0, 4, 10), - "b": np.random.randint(0, 4, 10), - "c": np.random.randint(0, 4, 10), - } - ) - levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]] - codes = [ - [0, 0, 0, 0, 1, 1, 2, 2, 3, 3], - [0, 1, 2, 3, 0, 1, 2, 3, 0, 1], - [0, 1, 0, 1, 0, 1, 0, 1, 0, 1], - ] - pd.options.display.max_rows = 999 - pd.options.display.max_columns = 0 - gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes)) - pdf = gdf.to_pandas() - assert repr(gdf) == repr(pdf) - assert repr(gdf.index) == repr(pdf.index) - assert repr(gdf.T) == repr(pdf.T) - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") - - -@pytest.mark.parametrize("nrows", [0, 1, 3, 5, 10]) -@pytest.mark.parametrize("ncols", [0, 1, 2, 3]) -def test_groupby_MI(nrows, ncols): - gdf = cudf.DataFrame( - {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} - ) - pdf = gdf.to_pandas() - gdg = gdf.groupby(["a", "b"], sort=True).count() - pdg = pdf.groupby(["a", "b"], sort=True).count() - pd.options.display.max_rows = nrows - pd.options.display.max_columns = ncols - assert repr(gdg) == repr(pdg) - assert repr(gdg.index) == repr(pdg.index) - assert repr(gdg.T) == repr(pdg.T) - pd.reset_option("display.max_rows") - pd.reset_option("display.max_columns") - - -@pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) -@pytest.mark.parametrize("length", [0, 1, 10, 100, 1000]) -def test_generic_index(length, dtype): - psr = pd.Series( - range(length), - index=np.random.randint(0, high=100, size=length).astype(dtype), - dtype="float64" if length == 0 else None, - ) - gsr = cudf.Series.from_pandas(psr) - - assert repr(psr.index) == repr(gsr.index) - - -@pytest.mark.parametrize( - "gdf", - [ - cudf.DataFrame({"a": range(10000)}), - cudf.DataFrame({"a": range(10000), "b": range(10000)}), - cudf.DataFrame({"a": range(20), "b": range(20)}), - 
cudf.DataFrame( - { - "a": range(20), - "b": range(20), - "c": ["abc", "def", "xyz", "def", "pqr"] * 4, - } - ), - cudf.DataFrame(index=[1, 2, 3]), - cudf.DataFrame(index=range(10000)), - cudf.DataFrame(columns=["a", "b", "c", "d"]), - cudf.DataFrame(columns=["a"], index=range(10000)), - cudf.DataFrame(columns=["a", "col2", "...col n"], index=range(10000)), - cudf.DataFrame(index=cudf.Series(range(10000)).astype("str")), - cudf.DataFrame( - columns=["a", "b", "c", "d"], - index=cudf.Series(range(10000)).astype("str"), - ), - ], -) -@pytest.mark.parametrize( - "slice", - [ - slice(2500, 5000), - slice(2500, 2501), - slice(5000), - slice(1, 10), - slice(10, 20), - slice(15, 2400), - ], -) -@pytest.mark.parametrize("max_seq_items", [1, 10, 60, 10000, None]) -@pytest.mark.parametrize("max_rows", [1, 10, 60, 10000, None]) -def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): - pd.options.display.max_seq_items = max_seq_items - pd.options.display.max_rows = max_rows - pdf = gdf.to_pandas() - - sliced_gdf = gdf[slice] - sliced_pdf = pdf[slice] - - expected_repr = repr(sliced_pdf).replace("None", "") - actual_repr = repr(sliced_gdf) - - assert expected_repr == actual_repr - pd.reset_option("display.max_rows") - pd.reset_option("display.max_seq_items") - - -@pytest.mark.parametrize( - "index,expected_repr", - [ - ( - cudf.Index([1, 2, 3, None]), - "Index([1, 2, 3, ], dtype='int64')", - ), - ( - cudf.Index([None, 2.2, 3.324342, None]), - "Index([, 2.2, 3.324342, ], dtype='float64')", - ), - ( - cudf.Index([None, None, None], name="hello"), - "Index([, , ], dtype='object', name='hello')", - ), - ( - cudf.Index([None, None, None], dtype="float", name="hello"), - "Index([, , ], dtype='float64', name='hello')", - ), - ( - cudf.Index([None], dtype="float64", name="hello"), - "Index([], dtype='float64', name='hello')", - ), - ( - cudf.Index([None], dtype="int8", name="hello"), - "Index([], dtype='int8', name='hello')", - ), - ( - cudf.Index([None] * 50, dtype="object"), - "Index([, , , , , , , , , " - ", , ,\n , , , , , , , " - ", , , , ,\n , , , , " - ", , , , , , , ,\n , " - ", , , , , , , , , , " - ",\n , ],\n dtype='object')", - ), - ( - cudf.Index([None] * 20, dtype="uint32"), - "Index([, , , , , , , , " - ",\n , , , , , , , , " - ",\n , ],\n dtype='uint32')", - ), - ( - cudf.Index( - [None, 111, 22, 33, None, 23, 34, 2343, None], dtype="int16" - ), - "Index([, 111, 22, 33, , 23, 34, 2343, ], " - "dtype='int16')", - ), - ( - cudf.Index([1, 2, 3, None], dtype="category"), - "CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], " - "ordered=False, dtype='category')", - ), - ( - cudf.Index([None, None], dtype="category"), - "CategoricalIndex([, ], categories=[], ordered=False, " - "dtype='category')", - ), - ( - cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ns]")), - "DatetimeIndex([1970-01-01 00:00:00.000000010, " - "1970-01-01 00:00:00.000000020," - "\n 1970-01-01 00:00:00.000000030, NaT],\n " - "dtype='datetime64[ns]')", - ), - ( - cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[s]")), - "DatetimeIndex([1970-01-01 00:00:10, " - "1970-01-01 00:00:20, 1970-01-01 00:00:30,\n" - " NaT],\n dtype='datetime64[s]')", - ), - ( - cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[us]")), - "DatetimeIndex([1970-01-01 00:00:00.000010, " - "1970-01-01 00:00:00.000020,\n " - "1970-01-01 00:00:00.000030, NaT],\n " - "dtype='datetime64[us]')", - ), - ( - cudf.Index(np.array([10, 20, 30, None], dtype="datetime64[ms]")), - "DatetimeIndex([1970-01-01 00:00:00.010, " - "1970-01-01 
00:00:00.020,\n " - "1970-01-01 00:00:00.030, NaT],\n " - "dtype='datetime64[ms]')", - ), - ( - cudf.Index(np.array([None] * 10, dtype="datetime64[ms]")), - "DatetimeIndex([NaT, NaT, NaT, NaT, NaT, NaT, NaT, NaT, " - "NaT, NaT], dtype='datetime64[ms]')", - ), - ], -) -def test_generic_index_null(index, expected_repr): - actual_repr = repr(index) - - assert expected_repr == actual_repr - - -@pytest.mark.parametrize( - "df,pandas_special_case", - [ - (pd.DataFrame({"a": [1, 2, 3]}, index=[10, 20, None]), False), - ( - pd.DataFrame( - { - "a": [1, None, 3], - "string_col": ["hello", "world", "rapids"], - }, - index=[None, "a", "b"], - ), - True, - ), - (pd.DataFrame([], index=[None, "a", "b"]), False), - (pd.DataFrame({"aa": [None, None]}, index=[None, None]), False), - (pd.DataFrame({"aa": [1, 2, 3]}, index=[None, None, None]), False), - ( - pd.DataFrame( - {"aa": [None, 2, 3]}, - index=np.array([1, None, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.DataFrame( - {"aa": [None, 2, 3]}, - index=np.array([100, None, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.DataFrame( - {"aa": [None, None, None]}, - index=np.array([None, None, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.DataFrame( - {"aa": [1, None, 3]}, - index=np.array([10, 15, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.DataFrame( - {"a": [1, 2, None], "v": [10, None, 22], "p": [100, 200, 300]} - ).set_index(["a", "v"]), - False, - ), - ( - pd.DataFrame( - { - "a": [1, 2, None], - "v": ["n", "c", "a"], - "p": [None, None, None], - } - ).set_index(["a", "v"]), - False, - ), - ( - pd.DataFrame( - { - "a": np.array([1, None, None], dtype="datetime64[ns]"), - "v": ["n", "c", "a"], - "p": [None, None, None], - } - ).set_index(["a", "v"]), - False, - ), - ], -) -def test_dataframe_null_index_repr(df, pandas_special_case): - pdf = df - gdf = cudf.from_pandas(pdf) - - expected_repr = repr(pdf).replace("NaN", "").replace("None", "") - actual_repr = repr(gdf) - - if pandas_special_case: - # Pandas inconsistently print Index null values - # as `None` at some places and `NaN` at few other places - # Whereas cudf is consistent with strings `null` values - # to be printed as `None` everywhere. 
- actual_repr = repr(gdf).replace("None", "") - - assert expected_repr.split() == actual_repr.split() - - -@pytest.mark.parametrize( - "sr,pandas_special_case", - [ - (pd.Series([1, 2, 3], index=[10, 20, None]), False), - (pd.Series([1, None, 3], name="a", index=[None, "a", "b"]), True), - (pd.Series(None, index=[None, "a", "b"], dtype="float"), True), - (pd.Series([None, None], name="aa", index=[None, None]), False), - (pd.Series([1, 2, 3], index=[None, None, None]), False), - ( - pd.Series( - [None, 2, 3], - index=np.array([1, None, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.Series( - [None, None, None], - index=np.array([None, None, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.Series( - [1, None, 3], - index=np.array([10, 15, None], dtype="datetime64[ns]"), - ), - False, - ), - ( - pd.DataFrame( - {"a": [1, 2, None], "v": [10, None, 22], "p": [100, 200, 300]} - ).set_index(["a", "v"])["p"], - False, - ), - ( - pd.DataFrame( - { - "a": [1, 2, None], - "v": ["n", "c", "a"], - "p": [None, None, None], - } - ).set_index(["a", "v"])["p"], - False, - ), - ( - pd.DataFrame( - { - "a": np.array([1, None, None], dtype="datetime64[ns]"), - "v": ["n", "c", "a"], - "p": [None, None, None], - } - ).set_index(["a", "v"])["p"], - False, - ), - ], -) -def test_series_null_index_repr(sr, pandas_special_case): - psr = sr - gsr = cudf.from_pandas(psr) - - expected_repr = repr(psr).replace("NaN", "").replace("None", "") - actual_repr = repr(gsr) - - if pandas_special_case: - # Pandas inconsistently print Index null values - # as `None` at some places and `NaN` at few other places - # Whereas cudf is consistent with strings `null` values - # to be printed as `None` everywhere. - actual_repr = repr(gsr).replace("None", "") - assert expected_repr.split() == actual_repr.split() - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [ - 136457654, - 134736784, - 245345345, - 223432411, - 2343241, - 3634548734, - 23234, - ], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ], -) -@pytest.mark.parametrize("dtype", ["timedelta64[s]", "timedelta64[us]"]) -def test_timedelta_series_s_us_repr(data, dtype): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - expected = repr(psr).replace("timedelta64[ns]", dtype) - actual = repr(sr) - - assert expected.split() == actual.split() - - -@pytest.mark.parametrize( - "ser, expected_repr", - [ - ( - cudf.Series([], dtype="timedelta64[ns]"), - textwrap.dedent( - """ - Series([], dtype: timedelta64[ns]) - """ - ), - ), - ( - cudf.Series([], dtype="timedelta64[ms]"), - textwrap.dedent( - """ - Series([], dtype: timedelta64[ms]) - """ - ), - ), - ( - cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), - textwrap.dedent( - """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 0 days 00:00:00.003000000 - dtype: timedelta64[ns] - """ - ), - ), - ( - cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ms]"), - textwrap.dedent( - """ - 0 0 days 00:16:40 - 1 0 days 00:03:20 - 2 0 days 00:50:00 - dtype: timedelta64[ms] - """ - ), - ), - ( - cudf.Series([1000000, 200000, 
None], dtype="timedelta64[ns]"), - textwrap.dedent( - """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 NaT - dtype: timedelta64[ns] - """ - ), - ), - ( - cudf.Series([1000000, 200000, None], dtype="timedelta64[ms]"), - textwrap.dedent( - """ - 0 0 days 00:16:40 - 1 0 days 00:03:20 - 2 NaT - dtype: timedelta64[ms] - """ - ), - ), - ( - cudf.Series( - [None, None, None, None, None], dtype="timedelta64[ns]" - ), - textwrap.dedent( - """ - 0 NaT - 1 NaT - 2 NaT - 3 NaT - 4 NaT - dtype: timedelta64[ns] - """ - ), - ), - ( - cudf.Series( - [None, None, None, None, None], dtype="timedelta64[ms]" - ), - textwrap.dedent( - """ - 0 NaT - 1 NaT - 2 NaT - 3 NaT - 4 NaT - dtype: timedelta64[ms] - """ - ), - ), - ( - cudf.Series( - [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ns]" - ), - textwrap.dedent( - """ - 0 0 days 00:00:00.000000012 - 1 0 days 00:00:00.000000012 - 2 0 days 00:00:00.000000022 - 3 0 days 00:00:00.000000343 - 4 0 days 00:00:00.004353534 - 5 0 days 00:00:00.000435342 - dtype: timedelta64[ns] - """ - ), - ), - ( - cudf.Series( - [12, 12, 22, 343, 4353534, 435342], dtype="timedelta64[ms]" - ), - textwrap.dedent( - """ - 0 0 days 00:00:00.012 - 1 0 days 00:00:00.012 - 2 0 days 00:00:00.022 - 3 0 days 00:00:00.343 - 4 0 days 01:12:33.534 - 5 0 days 00:07:15.342 - dtype: timedelta64[ms] - """ - ), - ), - ( - cudf.Series( - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - dtype="timedelta64[ns]", - ), - textwrap.dedent( - """ - 0 0 days 00:00:00.000000001 - 1 0 days 00:00:00.000001132 - 2 0 days 00:00:00.023223231 - 3 0 days 00:00:00.000000233 - 4 0 days 00:00:00 - 5 0 days 00:00:00.000000332 - 6 0 days 00:00:00.000000323 - dtype: timedelta64[ns] - """ - ), - ), - ( - cudf.Series( - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - dtype="timedelta64[ms]", - ), - textwrap.dedent( - """ - 0 0 days 00:00:00.001 - 1 0 days 00:00:01.132 - 2 0 days 06:27:03.231 - 3 0 days 00:00:00.233 - 4 0 days 00:00:00 - 5 0 days 00:00:00.332 - 6 0 days 00:00:00.323 - dtype: timedelta64[ms] - """ - ), - ), - ( - cudf.Series( - [ - 13645765432432, - 134736784, - 245345345, - 223432411, - 999992343241, - 3634548734, - 23234, - ], - dtype="timedelta64[ms]", - ), - textwrap.dedent( - """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 - dtype: timedelta64[ms] - """ - ), - ), - ( - cudf.Series( - [ - 13645765432432, - 134736784, - 245345345, - 223432411, - 999992343241, - 3634548734, - 23234, - ], - dtype="timedelta64[ns]", - ), - textwrap.dedent( - """ - 0 0 days 03:47:25.765432432 - 1 0 days 00:00:00.134736784 - 2 0 days 00:00:00.245345345 - 3 0 days 00:00:00.223432411 - 4 0 days 00:16:39.992343241 - 5 0 days 00:00:03.634548734 - 6 0 days 00:00:00.000023234 - dtype: timedelta64[ns] - """ - ), - ), - ( - cudf.Series( - [ - 13645765432432, - 134736784, - 245345345, - 223432411, - 999992343241, - 3634548734, - 23234, - ], - dtype="timedelta64[ms]", - name="abc", - ), - textwrap.dedent( - """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 - Name: abc, dtype: timedelta64[ms] - """ - ), - ), - ( - cudf.Series( - [ - 13645765432432, - 134736784, - 245345345, - 223432411, - 999992343241, - 3634548734, - 23234, - ], - dtype="timedelta64[ns]", - index=["a", "b", "z", "x", "y", "l", "m"], - name="hello", - ), - 
textwrap.dedent( - """ - a 0 days 03:47:25.765432432 - b 0 days 00:00:00.134736784 - z 0 days 00:00:00.245345345 - x 0 days 00:00:00.223432411 - y 0 days 00:16:39.992343241 - l 0 days 00:00:03.634548734 - m 0 days 00:00:00.000023234 - Name: hello, dtype: timedelta64[ns] - """ - ), - ), - ], -) -def test_timedelta_series_ns_ms_repr(ser, expected_repr): - expected = expected_repr - actual = repr(ser) - - assert expected.split() == actual.split() - - -@pytest.mark.parametrize( - "df,expected_repr", - [ - ( - cudf.DataFrame( - { - "a": cudf.Series( - [1000000, 200000, 3000000], dtype="timedelta64[s]" - ) - } - ), - textwrap.dedent( - """ - a - 0 11 days 13:46:40 - 1 2 days 07:33:20 - 2 34 days 17:20:00 - """ - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[s]", - ), - "b": [10, 11, 22, 33, 44, 55, 66], - } - ), - textwrap.dedent( - """ - a b - 0 1579 days 08:54:14 10 - 1 NaT 11 - 2 2839 days 15:29:05 22 - 3 2586 days 00:33:31 33 - 4 NaT 44 - 5 42066 days 12:52:14 55 - 6 0 days 06:27:14 66 - """ - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[s]", - index=["a", "b", "c", "d", "e", "f", "g"], - ) - } - ), - textwrap.dedent( - """ - a - a 1579 days 08:54:14 - b NaT - c 2839 days 15:29:05 - d 2586 days 00:33:31 - e NaT - f 42066 days 12:52:14 - g 0 days 06:27:14 - """ - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series( - [1, 2, 3, 4, 5, 6, 7], - index=cudf.Index( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[ms]", - ), - ) - } - ), - textwrap.dedent( - """ - a - 1 days 13:54:17.654 1 - NaT 2 - 2 days 20:09:05.345 3 - 2 days 14:03:52.411 4 - NaT 5 - 42 days 01:35:48.734 6 - 0 days 00:00:23.234 7 - """ - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series( - ["a", "f", "q", "e", "w", "e", "t"], - index=cudf.Index( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[ns]", - ), - ) - } - ), - textwrap.dedent( - """ - a - 0 days 00:00:00.136457654 a - NaT f - 0 days 00:00:00.245345345 q - 0 days 00:00:00.223432411 e - NaT w - 0 days 00:00:03.634548734 e - 0 days 00:00:00.000023234 t - """ - ), - ), - ], -) -def test_timedelta_dataframe_repr(df, expected_repr): - actual_repr = repr(df) - - assert actual_repr.split() == expected_repr.split() - - -@pytest.mark.parametrize( - "index, expected_repr", - [ - ( - cudf.Index([1000000, 200000, 3000000], dtype="timedelta64[ms]"), - "TimedeltaIndex(['0 days 00:16:40', " - "'0 days 00:03:20', '0 days 00:50:00'], " - "dtype='timedelta64[ms]')", - ), - ( - cudf.Index( - [None, None, None, None, None], dtype="timedelta64[us]" - ), - "TimedeltaIndex([NaT, NaT, NaT, NaT, NaT], " - "dtype='timedelta64[us]')", - ), - ( - cudf.Index( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[us]", - ), - "TimedeltaIndex([0 days 00:02:16.457654, NaT, " - "0 days 00:04:05.345345, " - "0 days 00:03:43.432411, NaT," - " 0 days 01:00:34.548734, 0 days 00:00:00.023234]," - " dtype='timedelta64[us]')", - ), - ( - cudf.Index( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[s]", - ), - "TimedeltaIndex([1579 days 08:54:14, NaT, 2839 days 15:29:05," - " 2586 days 00:33:31, NaT, 42066 days 12:52:14, " - "0 days 06:27:14]," - " 
dtype='timedelta64[s]')", - ), - ], -) -def test_timedelta_index_repr(index, expected_repr): - actual_repr = repr(index) - - assert actual_repr.split() == expected_repr.split() - - -@pytest.mark.parametrize( - "pmi", - [ - pd.MultiIndex.from_tuples( - [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] - ), - pd.MultiIndex.from_tuples( - [(1, "red"), (1, "blue"), (2, "red"), (2, "blue")] * 10 - ), - pd.MultiIndex.from_tuples([(1, "red", 102, "sdf")]), - pd.MultiIndex.from_tuples( - [ - ("abc", 0.234, 1), - ("a", -0.34, 0), - ("ai", 111, 4385798), - ("rapids", 0, 34534534), - ], - names=["alphabets", "floats", "ints"], - ), - ], -) -@pytest.mark.parametrize("max_seq_items", [None, 1, 2, 5, 10, 100]) -def test_multiindex_repr(pmi, max_seq_items): - pd.set_option("display.max_seq_items", max_seq_items) - gmi = cudf.from_pandas(pmi) - - assert repr(gmi) == repr(pmi) - pd.reset_option("display.max_seq_items") - - -@pytest.mark.parametrize( - "gdi, expected_repr", - [ - ( - cudf.DataFrame( - { - "a": [None, 1, 2, 3], - "b": ["abc", None, "xyz", None], - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["a", "b"]) - .index, - textwrap.dedent( - """ - MultiIndex([(, 'abc'), - ( 1, ), - ( 2, 'xyz'), - ( 3, )], - names=['a', 'b']) - """ - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series([None, np.nan, 2, 3], nan_as_null=False), - "b": ["abc", None, "xyz", None], - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["a", "b"]) - .index, - textwrap.dedent( - """ - MultiIndex([(, 'abc'), - ( nan, ), - ( 2.0, 'xyz'), - ( 3.0, )], - names=['a', 'b']) - """ - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series([None, 1, 2, 3], dtype="datetime64[ns]"), - "b": ["abc", None, "xyz", None], - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["a", "b"]) - .index, - textwrap.dedent( - """ - MultiIndex([( 'NaT', 'abc'), - ('1970-01-01 00:00:00.000000001', ), - ('1970-01-01 00:00:00.000000002', 'xyz'), - ('1970-01-01 00:00:00.000000003', )], - names=['a', 'b']) - """ - ), - ), - ( - cudf.DataFrame( - { - "a": cudf.Series([None, 1, 2, 3], dtype="datetime64[ns]"), - "b": ["abc", None, "xyz", None], - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["a", "b", "c"]) - .index, - textwrap.dedent( - """ - MultiIndex([( 'NaT', 'abc', 0.345), - ('1970-01-01 00:00:00.000000001', , ), - ('1970-01-01 00:00:00.000000002', 'xyz', 100.0), - ('1970-01-01 00:00:00.000000003', , 10.0)], - names=['a', 'b', 'c']) - """ - ), - ), - ( - cudf.DataFrame( - { - "a": ["abc", None, "xyz", None], - "b": cudf.Series([None, 1, 2, 3], dtype="timedelta64[ns]"), - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["a", "b", "c"]) - .index, - textwrap.dedent( - """ - MultiIndex([('abc', NaT, 0.345), - ( , '0 days 00:00:00.000000001', ), - ('xyz', '0 days 00:00:00.000000002', 100.0), - ( , '0 days 00:00:00.000000003', 10.0)], - names=['a', 'b', 'c']) - """ - ), - ), - ( - cudf.DataFrame( - { - "a": ["abc", None, "xyz", None], - "b": cudf.Series([None, 1, 2, 3], dtype="timedelta64[ns]"), - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["c", "a"]) - .index, - textwrap.dedent( - """ - MultiIndex([(0.345, 'abc'), - ( , ), - (100.0, 'xyz'), - ( 10.0, )], - names=['c', 'a']) - """ - ), - ), - ( - cudf.DataFrame( - { - "a": [None, None, None, None], - "b": cudf.Series( - [None, None, None, None], dtype="timedelta64[ns]" - ), - "c": [0.345, np.nan, 100, 10], - } - ) - .set_index(["b", "a"]) - .index, - textwrap.dedent( - """ - MultiIndex([(NaT, ), - (NaT, ), - (NaT, ), - (NaT, )], - names=['b', 'a']) - """ - ), - ), - ( - 
cudf.DataFrame( - { - "a": [1, 2, None, 3, 5], - "b": [ - "abc", - "def, hi, bye", - None, - ", one, two, three, four", - None, - ], - "c": cudf.Series( - [0.3232, np.nan, 1, None, -0.34534], nan_as_null=False - ), - "d": [None, 100, 2000324, None, None], - } - ) - .set_index(["a", "b", "c", "d"]) - .index, - textwrap.dedent( - """ - MultiIndex([( 1, 'abc', 0.3232, ), - ( 2, 'def, hi, bye', nan, 100), - (, , 1.0, 2000324), - ( 3, ', one, two, three, four', , ), - ( 5, , -0.34534, )], - names=['a', 'b', 'c', 'd']) - """ - ), - ), - ( - cudf.DataFrame( - { - "a": [1, 2, None, 3, 5], - "b": [ - "abc", - "def, hi, bye", - None, - ", one, two, three, four", - None, - ], - "c": cudf.Series( - [0.3232, np.nan, 1, None, -0.34534], nan_as_null=False - ), - "d": [None, 100, 2000324, None, None], - } - ) - .set_index(["b", "a", "c", "d"]) - .index, - textwrap.dedent( - """ - MultiIndex([( 'abc', 1, 0.3232, ), - ( 'def, hi, bye', 2, nan, 100), - ( , , 1.0, 2000324), - (', one, two, three, four', 3, , ), - ( , 5, -0.34534, )], - names=['b', 'a', 'c', 'd']) - """ - ), - ), - ( - cudf.DataFrame( - { - "a": ["(abc", "2", None, "3", "5"], - "b": [ - "abc", - "def, hi, bye", - None, - ", one, two, three, four", - None, - ], - "c": cudf.Series( - [0.3232, np.nan, 1, None, -0.34534], nan_as_null=False - ), - "d": [None, 100, 2000324, None, None], - } - ) - .set_index(["a", "b", "c", "d"]) - .index, - textwrap.dedent( - """ - MultiIndex([('(abc', 'abc', 0.3232, ), - ( '2', 'def, hi, bye', nan, 100), - ( , , 1.0, 2000324), - ( '3', ', one, two, three, four', , ), - ( '5', , -0.34534, )], - names=['a', 'b', 'c', 'd']) - """ - ), - ), - ], -) -def test_multiindex_null_repr(gdi, expected_repr): - actual_repr = repr(gdi) - - assert actual_repr.split() == expected_repr.split() - - -def test_categorical_series_with_nan_repr(): - series = cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") - - expected_repr = textwrap.dedent( - """ - 0 1.0 - 1 2.0 - 2 NaN - 3 10.0 - 4 NaN - 5 - dtype: category - Categories (4, float64): [1.0, 2.0, 10.0, NaN] - """ - ) - - assert repr(series).split() == expected_repr.split() - - sliced_expected_repr = textwrap.dedent( - """ - 2 NaN - 3 10.0 - 4 NaN - 5 - dtype: category - Categories (4, float64): [1.0, 2.0, 10.0, NaN] - """ - ) - - assert repr(series[2:]).split() == sliced_expected_repr.split() - - -def test_categorical_dataframe_with_nan_repr(): - series = cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") - df = cudf.DataFrame({"a": series}) - expected_repr = textwrap.dedent( - """ - a - 0 1.0 - 1 2.0 - 2 NaN - 3 10.0 - 4 NaN - 5 - """ - ) - - assert repr(df).split() == expected_repr.split() - - -def test_categorical_index_with_nan_repr(): - cat_index = cudf.Index( - cudf.Series( - [1, 2, np.nan, 10, np.nan, None], nan_as_null=False - ).astype("category") - ) - - expected_repr = ( - "CategoricalIndex([1.0, 2.0, NaN, 10.0, NaN, ], " - "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')" - ) - - assert repr(cat_index) == expected_repr - - sliced_expected_repr = ( - "CategoricalIndex([NaN, 10.0, NaN, ], " - "categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')" - ) - - assert repr(cat_index[2:]) == sliced_expected_repr - - -def test_empty_series_name(): - ps = pd.Series([], name="abc", dtype="int") - gs = cudf.from_pandas(ps) - - assert repr(ps) == repr(gs) - - -def test_repr_struct_after_concat(): - df = cudf.DataFrame( - { - "a": cudf.Series( - [ - {"sa": 2056831253}, - {"sa": 
-1463792165}, - {"sa": 1735783038}, - {"sa": 103774433}, - {"sa": -1413247520}, - ] - * 13 - ), - "b": cudf.Series( - [ - {"sa": {"ssa": 1140062029}}, - None, - {"sa": {"ssa": 1998862860}}, - {"sa": None}, - {"sa": {"ssa": -395088502}}, - ] - * 13 - ), - } - ) - pdf = df.to_pandas() - - assert repr(df) == repr(pdf) - - -def test_interval_index_repr(): - pi = pd.Index( - [ - np.nan, - pd.Interval(2.0, 3.0, closed="right"), - pd.Interval(3.0, 4.0, closed="right"), - ] - ) - gi = cudf.from_pandas(pi) - - assert repr(pi) == repr(gi) - - -def test_large_unique_categories_repr(): - # Unfortunately, this is a long running test (takes about 1 minute) - # and there is no way we can reduce the time - pi = pd.CategoricalIndex(range(100_000_000)) - gi = cudf.CategoricalIndex(range(100_000_000)) - expected_repr = repr(pi) - with utils.cudf_timeout(6): - actual_repr = repr(gi) - assert expected_repr == actual_repr - - -@pytest.mark.parametrize("ordered", [True, False]) -def test_categorical_index_ordered(ordered): - pi = pd.CategoricalIndex(range(10), ordered=ordered) - gi = cudf.CategoricalIndex(range(10), ordered=ordered) - - assert repr(pi) == repr(gi) diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py deleted file mode 100644 index a61477981f8..00000000000 --- a/python/cudf/cudf/tests/test_resampling.py +++ /dev/null @@ -1,187 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.testing import assert_eq - - -def assert_resample_results_equal(lhs, rhs, **kwargs): - assert_eq( - lhs.sort_index(), - rhs.sort_index(), - check_dtype=False, - check_freq=False, - check_index_type=False, - **kwargs, - ) - - -@pytest.mark.parametrize("ts_resolution", ["ns", "s", "ms"]) -def test_series_downsample_simple(ts_resolution): - # Series with and index of 5min intervals: - - index = pd.date_range(start="2001-01-01", periods=10, freq="1min") - psr = pd.Series(range(10), index=index) - gsr = cudf.from_pandas(psr) - gsr.index = gsr.index.astype(f"datetime64[{ts_resolution}]") - assert_resample_results_equal( - psr.resample("3min").sum(), - gsr.resample("3min").sum(), - check_index=False, - ) - - -def test_series_upsample_simple(): - # Series with and index of 5min intervals: - - index = pd.date_range(start="2001-01-01", periods=10, freq="1min") - psr = pd.Series(range(10), index=index) - gsr = cudf.from_pandas(psr) - assert_resample_results_equal( - psr.resample("3min").sum(), - gsr.resample("3min").sum(), - check_index=False, - ) - - -@pytest.mark.parametrize("rule", ["2s", "10s"]) -def test_series_resample_ffill(rule): - rng = pd.date_range("1/1/2012", periods=10, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) - gts = cudf.from_pandas(ts) - assert_resample_results_equal( - ts.resample(rule).ffill(), gts.resample(rule).ffill() - ) - - -@pytest.mark.parametrize("rule", ["2s", "10s"]) -def test_series_resample_bfill(rule): - rng = pd.date_range("1/1/2012", periods=10, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) - gts = cudf.from_pandas(ts) - assert_resample_results_equal( - ts.resample(rule).bfill(), gts.resample(rule).bfill() - ) - - -@pytest.mark.parametrize("rule", ["2s", "10s"]) -def test_series_resample_asfreq(rule): - rng = pd.date_range("1/1/2012", periods=100, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) - gts 
= cudf.from_pandas(ts) - assert_resample_results_equal( - ts.resample(rule).asfreq(), gts.resample(rule).asfreq() - ) - - -def test_dataframe_resample_aggregation_simple(): - pdf = pd.DataFrame( - np.random.randn(1000, 3), - index=pd.date_range("1/1/2012", freq="s", periods=1000), - columns=["A", "B", "C"], - ) - gdf = cudf.from_pandas(pdf) - assert_resample_results_equal( - pdf.resample("3min").mean(), gdf.resample("3min").mean() - ) - - -def test_dataframe_resample_multiagg(): - pdf = pd.DataFrame( - np.random.randn(1000, 3), - index=pd.date_range("1/1/2012", freq="s", periods=1000), - columns=["A", "B", "C"], - ) - gdf = cudf.from_pandas(pdf) - assert_resample_results_equal( - pdf.resample("3min").agg(["sum", "mean", "std"]), - gdf.resample("3min").agg(["sum", "mean", "std"]), - ) - - -def test_dataframe_resample_on(): - # test resampling on a specified column - pdf = pd.DataFrame( - { - "x": np.random.randn(1000), - "y": pd.date_range("1/1/2012", freq="s", periods=1000), - } - ) - gdf = cudf.from_pandas(pdf) - assert_resample_results_equal( - pdf.resample("3min", on="y").mean(), - gdf.resample("3min", on="y").mean(), - ) - - -def test_dataframe_resample_level(): - # test resampling on a specific level of a MultIndex - pdf = pd.DataFrame( - { - "x": np.random.randn(1000), - "y": pd.date_range("1/1/2012", freq="s", periods=1000), - } - ) - pdi = pd.MultiIndex.from_frame(pdf) - pdf = pd.DataFrame({"a": np.random.randn(1000)}, index=pdi) - gdf = cudf.from_pandas(pdf) - assert_resample_results_equal( - pdf.resample("3min", level="y").mean(), - gdf.resample("3min", level="y").mean(), - ) - - -@pytest.mark.parametrize( - "in_freq, sampling_freq, out_freq", - [ - ("1ns", "1us", "us"), - ("1us", "10us", "us"), - ("ms", "100us", "us"), - ("ms", "1s", "s"), - ("s", "1min", "s"), - ("1min", "30s", "s"), - ("1D", "10D", "s"), - ("10D", "1D", "s"), - ], -) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): - # test that we cast to the appropriate frequency - # when resampling: - pdf = pd.DataFrame( - { - "x": np.random.randn(100), - "y": pd.date_range("1/1/2012", freq=in_freq, periods=100), - } - ) - gdf = cudf.from_pandas(pdf) - expect = pdf.resample(sampling_freq, on="y").mean() - got = gdf.resample(sampling_freq, on="y").mean() - assert_resample_results_equal(expect, got) - - assert got.index.dtype == np.dtype(f"datetime64[{out_freq}]") - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_resampling_downsampling_ms(): - pdf = pd.DataFrame( - { - "time": pd.date_range("2020-01-01", periods=5, freq="1ns"), - "sign": range(5), - } - ) - gdf = cudf.from_pandas(pdf) - expected = pdf.resample("10ms", on="time").mean() - result = gdf.resample("10ms", on="time").mean() - result.index = result.index.astype("datetime64[ns]") - assert_eq(result, expected, check_freq=False) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py deleted file mode 100644 index 4235affd4d1..00000000000 --- a/python/cudf/cudf/tests/test_reshape.py +++ /dev/null @@ -1,842 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
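# A minimal sketch of the comparison pattern the reshape tests below rely on:
# build the same frame in pandas and cudf, melt both, and compare. Illustrative
# only -- the data and column names here are made up, and it assumes nothing
# beyond cudf and pandas being installed.
import pandas as pd

import cudf
from cudf.testing import assert_eq

pdf = pd.DataFrame(
    {"id0": [1, 2], "val0": [10.0, 20.0], "val1": [30.0, 40.0]}
)
gdf = cudf.from_pandas(pdf)

expect = pd.melt(pdf, id_vars=["id0"], value_vars=["val0", "val1"])
got = cudf.melt(gdf, id_vars=["id0"], value_vars=["val0", "val1"])

assert_eq(expect, got)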
- -import re -from itertools import chain - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import ( - PANDAS_CURRENT_SUPPORTED_VERSION, - PANDAS_GE_220, - PANDAS_VERSION, -) -from cudf.core.buffer.spill_manager import get_global_manager -from cudf.testing import assert_eq -from cudf.testing._utils import ( - ALL_TYPES, - DATETIME_TYPES, - NUMERIC_TYPES, - expect_warning_if, -) - -pytest_xfail = pytest.mark.xfail -pytestmark = pytest.mark.spilling - -# If spilling is enabled globally, we skip many test permutations -# to reduce running time. -if get_global_manager() is not None: - ALL_TYPES = ["float32"] # noqa: F811 - DATETIME_TYPES = ["datetime64[ms]"] # noqa: F811 - NUMERIC_TYPES = ["float32"] # noqa: F811 - # To save time, we skip tests marked "pytest.mark.xfail" - pytest_xfail = pytest.mark.skipif - - -@pytest.mark.parametrize("num_id_vars", [0, 1, 2]) -@pytest.mark.parametrize("num_value_vars", [0, 1, 2]) -@pytest.mark.parametrize("num_rows", [1, 2, 100]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) -@pytest.mark.parametrize("nulls", ["none", "some", "all"]) -def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): - if dtype not in ["float32", "float64"] and nulls in ["some", "all"]: - pytest.skip(reason="nulls not supported in dtype: " + dtype) - - pdf = pd.DataFrame() - id_vars = [] - for i in range(num_id_vars): - colname = "id" + str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) - if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) - data[idx] = np.nan - elif nulls == "all": - data[:] = np.nan - pdf[colname] = data - id_vars.append(colname) - - value_vars = [] - for i in range(num_value_vars): - colname = "val" + str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) - if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) - data[idx] = np.nan - elif nulls == "all": - data[:] = np.nan - pdf[colname] = data - value_vars.append(colname) - - gdf = cudf.from_pandas(pdf) - - got = cudf.melt(frame=gdf, id_vars=id_vars, value_vars=value_vars) - got_from_melt_method = gdf.melt(id_vars=id_vars, value_vars=value_vars) - - expect = pd.melt(frame=pdf, id_vars=id_vars, value_vars=value_vars) - - assert_eq(expect, got) - - assert_eq(expect, got_from_melt_method) - - -def test_melt_many_columns(): - mydict = {"id": ["foobar"]} - for i in range(1, 1942): - mydict[f"d_{i}"] = i - - df = pd.DataFrame(mydict) - grid_df = pd.melt(df, id_vars=["id"], var_name="d", value_name="sales") - - df_d = cudf.DataFrame(mydict) - grid_df_d = cudf.melt( - df_d, id_vars=["id"], var_name="d", value_name="sales" - ) - grid_df_d["d"] = grid_df_d["d"] - - assert_eq(grid_df, grid_df_d) - - -def test_melt_str_scalar_id_var(): - data = {"index": [1, 2], "id": [1, 2], "d0": [10, 20], "d1": [30, 40]} - result = cudf.melt( - cudf.DataFrame(data), - id_vars="index", - var_name="column", - value_name="value", - ) - expected = pd.melt( - pd.DataFrame(data), - id_vars="index", - var_name="column", - value_name="value", - ) - assert_eq(result, expected) - - -@pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 1000]) -@pytest.mark.parametrize( - "dtype", list(chain(NUMERIC_TYPES, DATETIME_TYPES, ["str"])) -) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_df_stack(nulls, num_cols, num_rows, dtype): - if dtype not in ["float32", "float64"] and nulls in ["some"]: - 
pytest.skip(reason="nulls not supported in dtype: " + dtype) - - pdf = pd.DataFrame() - for i in range(num_cols): - colname = str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) - if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) - data[idx] = np.nan - pdf[colname] = data - - gdf = cudf.from_pandas(pdf) - - got = gdf.stack() - expect = pdf.stack() - - assert_eq(expect, got) - - -def test_df_stack_reset_index(): - df = cudf.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [10, 11, 12, 13], - "c": ["ab", "cd", None, "gh"], - } - ) - df = df.set_index(["a", "b"]) - pdf = df.to_pandas() - - expected = pdf.stack() - actual = df.stack() - - assert_eq(expected, actual) - - expected = expected.reset_index() - actual = actual.reset_index() - - assert_eq(expected, actual) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Need pandas-2.1.0+ to match `stack` api", -) -@pytest.mark.parametrize( - "columns", - [ - pd.MultiIndex.from_tuples( - [("A", "cat"), ("A", "dog"), ("B", "cat"), ("B", "dog")], - names=["letter", "animal"], - ), - pd.MultiIndex.from_tuples( - [("A", "cat"), ("B", "bird"), ("A", "dog"), ("B", "dog")], - names=["letter", "animal"], - ), - ], -) -@pytest.mark.parametrize( - "level", - [ - -1, - 0, - 1, - "letter", - "animal", - [0, 1], - [1, 0], - ["letter", "animal"], - ["animal", "letter"], - ], -) -@pytest.mark.parametrize( - "index", - [ - pd.RangeIndex(2, name="range"), - pd.Index([9, 8], name="myindex"), - pd.MultiIndex.from_arrays( - [ - ["A", "B"], - [101, 102], - ], - names=["first", "second"], - ), - ], -) -@pytest.mark.parametrize("dropna", [True, False]) -def test_df_stack_multiindex_column_axis(columns, index, level, dropna): - if isinstance(level, list) and len(level) > 1 and not dropna: - pytest.skip( - "Stacking multiple levels with dropna==False is unsupported." 
- ) - - pdf = pd.DataFrame( - data=[[1, 2, 3, 4], [2, 4, 6, 8]], columns=columns, index=index - ) - gdf = cudf.from_pandas(pdf) - - with pytest.warns(FutureWarning): - got = gdf.stack(level=level, dropna=dropna, future_stack=False) - with expect_warning_if(PANDAS_GE_220, FutureWarning): - expect = pdf.stack(level=level, dropna=dropna, future_stack=False) - - assert_eq(expect, got, check_dtype=False) - - got = gdf.stack(level=level, future_stack=True) - expect = pdf.stack(level=level, future_stack=True) - - assert_eq(expect, got, check_dtype=False) - - -def test_df_stack_mixed_dtypes(): - pdf = pd.DataFrame( - { - "A": pd.Series([1, 2, 3], dtype="f4"), - "B": pd.Series([4, 5, 6], dtype="f8"), - } - ) - - gdf = cudf.from_pandas(pdf) - - got = gdf.stack() - expect = pdf.stack() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Need pandas-2.1.0+ to match `stack` api", -) -@pytest.mark.parametrize("level", [["animal", "hair_length"], [1, 2]]) -def test_df_stack_multiindex_column_axis_pd_example(level): - columns = pd.MultiIndex.from_tuples( - [ - ("A", "cat", "long"), - ("B", "cat", "long"), - ("A", "dog", "short"), - ("B", "dog", "short"), - ], - names=["exp", "animal", "hair_length"], - ) - - df = pd.DataFrame(np.random.randn(4, 4), columns=columns) - - with expect_warning_if(PANDAS_GE_220, FutureWarning): - expect = df.stack(level=level, future_stack=False) - gdf = cudf.from_pandas(df) - with pytest.warns(FutureWarning): - got = gdf.stack(level=level, future_stack=False) - - assert_eq(expect, got) - - expect = df.stack(level=level, future_stack=True) - got = gdf.stack(level=level, future_stack=True) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("num_rows", [1, 2, 10, 1000]) -@pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["category"] -) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_interleave_columns(nulls, num_cols, num_rows, dtype): - if dtype not in ["float32", "float64"] and nulls in ["some"]: - pytest.skip(reason="nulls not supported in dtype: " + dtype) - - pdf = pd.DataFrame(dtype=dtype) - for i in range(num_cols): - colname = str(i) - data = pd.Series(np.random.randint(0, 26, num_rows)).astype(dtype) - - if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) - data[idx] = np.nan - pdf[colname] = data - - gdf = cudf.from_pandas(pdf) - - if dtype == "category": - with pytest.raises(ValueError): - assert gdf.interleave_columns() - else: - got = gdf.interleave_columns() - - expect = pd.Series(np.vstack(pdf.to_numpy()).reshape((-1,))).astype( - dtype - ) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("num_cols", [1, 2, 10]) -@pytest.mark.parametrize("num_rows", [1, 2, 1000]) -@pytest.mark.parametrize("count", [1, 2, 10]) -@pytest.mark.parametrize("dtype", ALL_TYPES) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_tile(nulls, num_cols, num_rows, dtype, count): - if dtype not in ["float32", "float64"] and nulls in ["some"]: - pytest.skip(reason="nulls not supported in dtype: " + dtype) - - pdf = pd.DataFrame(dtype=dtype) - for i in range(num_cols): - colname = str(i) - data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype( - dtype - ) - - if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) - data[idx] = np.nan - pdf[colname] = data - - gdf = cudf.from_pandas(pdf) - - got = 
gdf.tile(count) - expect = pd.DataFrame(pd.concat([pdf] * count)) - - assert_eq(expect, got) - - -def _prepare_merge_sorted_test( - size, - nparts, - keys, - add_null=False, - na_position="last", - ascending=True, - series=False, - index=False, -): - if index: - df = ( - cudf.datasets.timeseries()[:size] - .reset_index(drop=False) - .set_index(keys, drop=True) - ) - else: - df = cudf.datasets.timeseries()[:size].reset_index(drop=False) - if add_null: - df.iloc[1, df.columns.get_loc(keys[0])] = None - chunk = int(size / nparts) - indices = [i * chunk for i in range(0, nparts)] + [size] - if index: - dfs = [ - df.iloc[indices[i] : indices[i + 1]] - .copy() - .sort_index(ascending=ascending) - for i in range(nparts) - ] - elif series: - df = df[keys[0]] - dfs = [ - df.iloc[indices[i] : indices[i + 1]] - .copy() - .sort_values(na_position=na_position, ascending=ascending) - for i in range(nparts) - ] - else: - dfs = [ - df.iloc[indices[i] : indices[i + 1]] - .copy() - .sort_values(keys, na_position=na_position, ascending=ascending) - for i in range(nparts) - ] - return df, dfs - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -@pytest.mark.parametrize("keys", [None, ["id"], ["name", "timestamp"]]) -@pytest.mark.parametrize("nparts", [2, 10]) -def test_df_merge_sorted(nparts, keys, na_position, ascending): - size = 100 - keys_1 = keys or ["timestamp"] - # Null values NOT currently supported with Categorical data - # or when `ascending=False` - add_null = keys_1[0] not in ("name") - df, dfs = _prepare_merge_sorted_test( - size, - nparts, - keys_1, - add_null=add_null, - na_position=na_position, - ascending=ascending, - ) - - expect = df.sort_values( - keys_1, na_position=na_position, ascending=ascending - ) - result = cudf.core.reshape._merge_sorted( - dfs, keys=keys, na_position=na_position, ascending=ascending - ) - if keys: - expect = expect[keys] - result = result[keys] - - assert expect.index.dtype == result.index.dtype - assert_eq(expect.reset_index(drop=True), result.reset_index(drop=True)) - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("index", ["id", "x"]) -@pytest.mark.parametrize("nparts", [2, 10]) -def test_df_merge_sorted_index(nparts, index, ascending): - size = 100 - df, dfs = _prepare_merge_sorted_test( - size, nparts, index, ascending=ascending, index=True - ) - - expect = df.sort_index(ascending=ascending) - result = cudf.core.reshape._merge_sorted( - dfs, by_index=True, ascending=ascending - ) - - assert_eq(expect.index, result.index) - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -@pytest.mark.parametrize("keys", [None, ["name", "timestamp"]]) -def test_df_merge_sorted_ignore_index(keys, na_position, ascending): - size = 100 - nparts = 3 - keys_1 = keys or ["timestamp"] - # Null values NOT currently supported with Categorical data - # or when `ascending=False` - add_null = keys_1[0] not in ("name") - df, dfs = _prepare_merge_sorted_test( - size, - nparts, - keys_1, - add_null=add_null, - na_position=na_position, - ascending=ascending, - ) - - expect = df.sort_values( - keys_1, na_position=na_position, ascending=ascending - ) - result = cudf.core.reshape._merge_sorted( - dfs, - keys=keys, - na_position=na_position, - ascending=ascending, - ignore_index=True, - ) - if keys: - expect = expect[keys] - result = result[keys] - - assert_eq(expect.reset_index(drop=True), result) - - 
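# A condensed sketch of the _merge_sorted pattern exercised above: split a
# sorted frame into pre-sorted partitions, merge them back with the helper,
# and compare against a plain sort. Illustrative only -- the data is made up,
# and cudf.core.reshape._merge_sorted is an internal API that may change
# without notice.
import cudf
from cudf.testing import assert_eq

parts = [
    cudf.DataFrame({"key": [1, 4, 7], "val": [10, 40, 70]}),
    cudf.DataFrame({"key": [2, 5, 8], "val": [20, 50, 80]}),
    cudf.DataFrame({"key": [3, 6, 9], "val": [30, 60, 90]}),
]

got = cudf.core.reshape._merge_sorted(parts, keys=["key"], ascending=True)
expect = cudf.concat(parts).sort_values("key")

assert_eq(expect.reset_index(drop=True), got.reset_index(drop=True))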
-@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -@pytest.mark.parametrize("key", ["id", "name", "timestamp"]) -@pytest.mark.parametrize("nparts", [2, 10]) -def test_series_merge_sorted(nparts, key, na_position, ascending): - size = 100 - df, dfs = _prepare_merge_sorted_test( - size, - nparts, - [key], - na_position=na_position, - ascending=ascending, - series=True, - ) - - expect = df.sort_values(na_position=na_position, ascending=ascending) - result = cudf.core.reshape._merge_sorted( - dfs, na_position=na_position, ascending=ascending - ) - - assert_eq(expect.reset_index(drop=True), result.reset_index(drop=True)) - - -@pytest.mark.parametrize( - "index, column, data", - [ - ([], [], []), - ([0], [0], [0]), - ([0, 0], [0, 1], [1, 2.0]), - ([0, 1], [0, 0], [1, 2.0]), - ([0, 1], [0, 1], [1, 2.0]), - (["a", "a", "b", "b"], ["c", "d", "c", "d"], [1, 2, 3, 4]), - ( - ["a", "a", "b", "b", "a"], - ["c", "d", "c", "d", "e"], - [1, 2, 3, 4, 5], - ), - ], -) -def test_pivot_simple(index, column, data): - pdf = pd.DataFrame({"index": index, "column": column, "data": data}) - gdf = cudf.from_pandas(pdf) - - expect = pdf.pivot(columns="column", index="index") - got = gdf.pivot(columns="column", index="index") - - check_index_and_columns = expect.shape != (0, 0) - assert_eq( - expect, - got, - check_dtype=False, - check_index_type=check_index_and_columns, - check_column_type=check_index_and_columns, - ) - - -def test_pivot_multi_values(): - # from Pandas docs: - # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.pivot.html - pdf = pd.DataFrame( - { - "foo": ["one", "one", "one", "two", "two", "two"], - "bar": ["A", "B", "C", "A", "B", "C"], - "baz": [1, 2, 3, 4, 5, 6], - "zoo": ["x", "y", "z", "q", "w", "t"], - } - ) - gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.pivot(index="foo", columns="bar", values=["baz", "zoo"]), - gdf.pivot(index="foo", columns="bar", values=["baz", "zoo"]), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "values", ["z", "z123", ["z123"], ["z", "z123", "123z"]] -) -def test_pivot_values(values): - data = [ - ["A", "a", 0, 0, 0], - ["A", "b", 1, 1, 1], - ["A", "c", 2, 2, 2], - ["B", "a", 0, 0, 0], - ["B", "b", 1, 1, 1], - ["B", "c", 2, 2, 2], - ["C", "a", 0, 0, 0], - ["C", "b", 1, 1, 1], - ["C", "c", 2, 2, 2], - ] - columns = ["x", "y", "z", "z123", "123z"] - pdf = pd.DataFrame(data, columns=columns) - cdf = cudf.DataFrame(data, columns=columns) - expected = pd.pivot(pdf, index="x", columns="y", values=values) - actual = cudf.pivot(cdf, index="x", columns="y", values=values) - assert_eq( - expected, - actual, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "level", - [ - 0, - pytest.param( - 1, - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), - ), - 2, - "foo", - pytest.param( - "bar", - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), - ), - "baz", - [], - pytest.param( - [0, 1], - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), - ), - ["foo"], - pytest.param( - ["foo", "bar"], - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), - ), - pytest.param( - [0, 1, 2], - marks=pytest_xfail(reason="Pandas behaviour unclear"), - ), - pytest.param( - ["foo", "bar", "baz"], - marks=pytest_xfail(reason="Pandas behaviour unclear"), - ), - ], -) -def test_unstack_multiindex(level): - pdf = pd.DataFrame( - { - "foo": ["one", "one", "one", "two", "two", "two"], - "bar": 
pd.Categorical(["A", "B", "C", "A", "B", "C"]), - "baz": [1, 2, 3, 4, 5, 6], - "zoo": ["x", "y", "z", "q", "w", "t"], - } - ).set_index(["foo", "bar", "baz"]) - gdf = cudf.from_pandas(pdf) - assert_eq( - pdf.unstack(level=level), - gdf.unstack(level=level), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "data", - [{"A": [1.0, 2.0, 3.0, 4.0, 5.0], "B": [11.0, 12.0, 13.0, 14.0, 15.0]}], -) -@pytest.mark.parametrize( - "index", - [ - pd.Index(range(0, 5), name=None), - pd.Index(range(0, 5), name="row_index"), - pytest.param( - pd.CategoricalIndex(["d", "e", "f", "g", "h"]), - marks=pytest_xfail( - reason="Categorical column indexes not supported" - ), - ), - ], -) -@pytest.mark.parametrize( - "col_idx", - [ - pd.Index(["a", "b"], name=None), - pd.Index(["a", "b"], name="col_index"), - pd.MultiIndex.from_tuples([("c", 1), ("c", 2)], names=[None, None]), - pd.MultiIndex.from_tuples( - [("c", 1), ("c", 2)], names=["col_index1", "col_index2"] - ), - ], -) -def test_unstack_index(data, index, col_idx): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - pdf.index = index - pdf.columns = col_idx - - gdf.index = cudf.from_pandas(index) - gdf.columns = cudf.from_pandas(col_idx) - - assert_eq(pdf.unstack(), gdf.unstack()) - - -def test_unstack_index_invalid(): - gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) - with pytest.raises( - ValueError, - match=re.escape( - "Calling unstack() on single index dataframe with " - "different column datatype is not supported." - ), - ): - gdf.unstack() - - -def test_pivot_duplicate_error(): - gdf = cudf.DataFrame( - {"a": [0, 1, 2, 2], "b": [1, 2, 3, 3], "d": [1, 2, 3, 4]} - ) - with pytest.raises(ValueError): - gdf.pivot(index="a", columns="b") - with pytest.raises(ValueError): - gdf.pivot(index="b", columns="a") - - -@pytest.mark.parametrize( - "data", - [ - { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), - } - ], -) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -@pytest.mark.parametrize("fill_value", [0]) -def test_pivot_table_simple(data, aggfunc, fill_value): - pdf = pd.DataFrame(data) - expected = pd.pivot_table( - pdf, - values=["D", "E"], - index=["A", "B"], - columns=["C"], - aggfunc=aggfunc, - fill_value=fill_value, - ) - cdf = cudf.DataFrame(data) - actual = cudf.pivot_table( - cdf, - values=["D", "E"], - index=["A", "B"], - columns=["C"], - aggfunc=aggfunc, - fill_value=fill_value, - ) - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - { - "A": ["one", "one", "two", "three"] * 6, - "B": ["A", "B", "C"] * 8, - "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), - } - ], -) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -@pytest.mark.parametrize("fill_value", [0]) -def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): - pdf = pd.DataFrame(data) - expected = pdf.pivot_table( - values=["D", "E"], - index=["A", "B"], - columns=["C"], - aggfunc=aggfunc, - fill_value=fill_value, - ) - cdf = cudf.DataFrame(data) - actual = cdf.pivot_table( - values=["D", "E"], - index=["A", "B"], - columns=["C"], - aggfunc=aggfunc, - fill_value=fill_value, - ) - assert_eq(expected, actual, check_dtype=False) - - -def test_crosstab_simple(): - a = np.array( - [ - "foo", - "foo", - "foo", - "foo", - "bar", - "bar", - 
"bar", - "bar", - "foo", - "foo", - "foo", - ], - dtype=object, - ) - b = np.array( - [ - "one", - "one", - "one", - "two", - "one", - "one", - "one", - "two", - "two", - "two", - "one", - ], - dtype=object, - ) - c = np.array( - [ - "dull", - "dull", - "shiny", - "dull", - "dull", - "shiny", - "shiny", - "dull", - "shiny", - "shiny", - "shiny", - ], - dtype=object, - ) - expected = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - actual = cudf.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) - assert_eq(expected, actual, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py deleted file mode 100644 index 135870f7359..00000000000 --- a/python/cudf/cudf/tests/test_rolling.py +++ /dev/null @@ -1,519 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -import math - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq -from cudf.testing.dataset_generator import rand_dataframe - - -@pytest.mark.parametrize( - "data,index", - [ - ([], []), - ([1, 1, 1, 1], None), - ([1, 2, 3, 4], pd.date_range("2001-01-01", "2001-01-04")), - ([1, 2, 4, 9, 9, 4], ["a", "b", "c", "d", "e", "f"]), - ], -) -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "std", "var"] -) -@pytest.mark.parametrize("nulls", ["none", "one", "some", "all"]) -@pytest.mark.parametrize("center", [True, False]) -def test_rolling_series_basic(data, index, agg, nulls, center): - rng = np.random.default_rng(1) - - if len(data) > 0: - if nulls == "one": - p = rng.integers(0, len(data)) - data[p] = np.nan - elif nulls == "some": - p1, p2 = rng.integers(0, len(data), (2,)) - data[p1] = np.nan - data[p2] = np.nan - elif nulls == "all": - data = [np.nan] * len(data) - - psr = pd.Series(data, index=index) - gsr = cudf.from_pandas(psr) - for window_size in range(1, len(data) + 1): - for min_periods in range(1, window_size + 1): - expect = getattr( - psr.rolling(window_size, min_periods, center), agg - )().fillna(-1) - got = getattr( - gsr.rolling(window_size, min_periods, center), agg - )().fillna(-1) - assert_eq(expect, got, check_dtype=False, check_freq=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [], "b": []}, - {"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}, - {"a": [1, 2, 4, 9, 9, 4], "b": [1, 2, 4, 9, 9, 4]}, - { - "a": np.array([1, 2, 4, 9, 9, 4]), - "b": np.array([1.5, 2.2, 2.2, 8.0, 9.1, 4.2]), - }, - ], -) -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "std", "var"] -) -@pytest.mark.parametrize("nulls", ["none", "one", "some", "all"]) -@pytest.mark.parametrize("center", [True, False]) -def test_rolling_dataframe_basic(data, agg, nulls, center): - rng = np.random.default_rng(0) - pdf = pd.DataFrame(data) - - if len(pdf) > 0: - if nulls == "all": - pdf = pd.DataFrame(np.nan, columns=pdf.columns, index=pdf.index) - else: - for col_idx in range(len(pdf.columns)): - if nulls == "one": - p = rng.integers(0, len(data)) - pdf.iloc[p, col_idx] = np.nan - elif nulls == "some": - p1, p2 = rng.integers(0, len(data), (2,)) - pdf.iloc[p1, col_idx] = np.nan - pdf.iloc[p2, col_idx] = np.nan - - gdf = cudf.from_pandas(pdf) - for window_size in range(1, len(data) + 1): - for min_periods in range(1, window_size + 1): - expect = getattr( - pdf.rolling(window_size, min_periods, center), agg - )().fillna(-1) - got = getattr( - gdf.rolling(window_size, min_periods, center), agg - )().fillna(-1) - assert_eq(expect, got, check_dtype=False) - - 
-@pytest.mark.parametrize( - "agg", - [ - pytest.param("sum"), - pytest.param("min"), - pytest.param("max"), - pytest.param("mean"), - pytest.param("count"), - pytest.param("std"), - pytest.param("var"), - ], -) -def test_rolling_with_offset(agg): - psr = pd.Series( - [1, 2, 4, 4, np.nan, 9], - index=[ - pd.Timestamp("20190101 09:00:00"), - pd.Timestamp("20190101 09:00:01"), - pd.Timestamp("20190101 09:00:02"), - pd.Timestamp("20190101 09:00:04"), - pd.Timestamp("20190101 09:00:07"), - pd.Timestamp("20190101 09:00:08"), - ], - ) - gsr = cudf.from_pandas(psr) - assert_eq( - getattr(psr.rolling("2s"), agg)().fillna(-1), - getattr(gsr.rolling("2s"), agg)().fillna(-1), - check_dtype=False, - ) - - -@pytest.mark.parametrize("agg", ["std", "var"]) -@pytest.mark.parametrize("ddof", [0, 1]) -@pytest.mark.parametrize("center", [True, False]) -@pytest.mark.parametrize("seed", [100, 2000]) -@pytest.mark.parametrize("window_size", [2, 10, 100]) -def test_rolling_var_std_large(agg, ddof, center, seed, window_size): - iupper_bound = math.sqrt(np.iinfo(np.int64).max / window_size) - ilower_bound = -math.sqrt(abs(np.iinfo(np.int64).min) / window_size) - - fupper_bound = math.sqrt(np.finfo(np.float64).max / window_size) - flower_bound = -math.sqrt(abs(np.finfo(np.float64).min) / window_size) - - n_rows = 1_000 - data = rand_dataframe( - dtypes_meta=[ - { - "dtype": "int64", - "null_frequency": 0.4, - "cardinality": n_rows, - "min_bound": ilower_bound, - "max_bound": iupper_bound, - }, - { - "dtype": "float64", - "null_frequency": 0.4, - "cardinality": n_rows, - "min_bound": flower_bound, - "max_bound": fupper_bound, - }, - { - "dtype": "decimal64", - "null_frequency": 0.4, - "cardinality": n_rows, - "min_bound": ilower_bound, - "max_bound": iupper_bound, - }, - ], - rows=n_rows, - use_threads=False, - seed=seed, - ) - pdf = data.to_pandas() - gdf = cudf.from_pandas(pdf) - - expect = getattr(pdf.rolling(window_size, 1, center), agg)(ddof=ddof) - got = getattr(gdf.rolling(window_size, 1, center), agg)(ddof=ddof) - - import platform - - if platform.machine() == "aarch64": - # Due to pandas-37051, pandas rolling var/std on uniform window is - # not reliable. Skipping these rows when comparing. - for col in expect: - mask = (got[col].fillna(-1) != 0).to_pandas() - expect[col] = expect[col][mask] - got[col] = got[col][mask] - assert_eq(expect[col], got[col], check_freq=False) - else: - assert_eq(expect, got, check_freq=False) - - -def test_rolling_var_uniform_window(): - """ - Pandas adopts an online variance calculation algorithm. This gives a - floating point artifact. - - In cudf, each window is computed independently from the previous window, - this gives better numeric precision. - """ - - s = pd.Series([1e8, 5, 5, 5]) - expected = s.rolling(3).var() - got = cudf.from_pandas(s).rolling(3).var() - - assert_eq(expected, got) - - -def test_rolling_count_with_offset(): - """ - This test covers the xfail case from test_rolling_with_offset["count"]. - It is expected that count should return a non-Nan value, even if - the counted value is a Nan, unless the min-periods condition - is not met. - This behaviour is consistent with counts for rolling-windows, - in the non-offset window case. 
- """ - psr = pd.Series( - [1, 2, 4, 4, np.nan, 9], - index=[ - pd.Timestamp("20190101 09:00:00"), - pd.Timestamp("20190101 09:00:01"), - pd.Timestamp("20190101 09:00:02"), - pd.Timestamp("20190101 09:00:04"), - pd.Timestamp("20190101 09:00:07"), - pd.Timestamp("20190101 09:00:08"), - ], - ) - gsr = cudf.from_pandas(psr) - assert_eq( - getattr(gsr.rolling("2s"), "count")().fillna(-1), - pd.Series( - [1, 2, 2, 1, 0, 1], - index=[ - pd.Timestamp("20190101 09:00:00"), - pd.Timestamp("20190101 09:00:01"), - pd.Timestamp("20190101 09:00:02"), - pd.Timestamp("20190101 09:00:04"), - pd.Timestamp("20190101 09:00:07"), - pd.Timestamp("20190101 09:00:08"), - ], - ), - check_dtype=False, - ) - - -def test_rolling_getattr(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.rolling(2).a.sum().fillna(-1), - gdf.rolling(2).a.sum().fillna(-1), - check_dtype=False, - ) - - -def test_rolling_getitem(): - pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.rolling(2)["a"].sum().fillna(-1), - gdf.rolling(2)["a"].sum().fillna(-1), - check_dtype=False, - ) - assert_eq( - pdf.rolling(2)["a", "b"].sum().fillna(-1), - gdf.rolling(2)["a", "b"].sum().fillna(-1), - check_dtype=False, - ) - assert_eq( - pdf.rolling(2)[["a", "b"]].sum().fillna(-1), - gdf.rolling(2)["a", "b"].sum().fillna(-1), - check_dtype=False, - ) - - -def test_rolling_getitem_window(): - index = pd.DatetimeIndex( - pd.date_range("2000-01-01", "2000-01-02", freq="1h") - ) - pdf = pd.DataFrame({"x": np.arange(len(index))}, index=index) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.rolling("2h").x.mean(), - gdf.rolling("2h").x.mean(), - check_freq=False, - ) - - -@pytest.mark.parametrize( - "data,index", [([1.2, 4.5, 5.9, 2.4, 9.3, 7.1], None), ([], [])] -) -@pytest.mark.parametrize("center", [True, False]) -def test_rollling_series_numba_udf_basic(data, index, center): - psr = pd.Series(data, index=index) - gsr = cudf.from_pandas(psr) - - def some_func(A): - b = 0 - for a in A: - b = max(b, math.sqrt(a)) - return b - - for window_size in range(1, len(data) + 1): - for min_periods in range(1, window_size + 1): - assert_eq( - psr.rolling(window_size, min_periods, center) - .apply(some_func) - .fillna(-1), - gsr.rolling(window_size, min_periods, center) - .apply(some_func) - .fillna(-1), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [], "b": []}, - {"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]}, - {"a": [1, 2, 4, 9, 9, 4], "b": [1, 2, 4, 9, 9, 4]}, - { - "a": np.array([1, 2, 4, 9, 9, 4]), - "b": np.array([1.5, 2.2, 2.2, 8.0, 9.1, 4.2]), - }, - ], -) -@pytest.mark.parametrize("center", [True, False]) -def test_rolling_dataframe_numba_udf_basic(data, center): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - def some_func(A): - b = 0 - for a in A: - b = b + a**2 - return b / len(A) - - for window_size in range(1, len(data) + 1): - for min_periods in range(1, window_size + 1): - assert_eq( - pdf.rolling(window_size, min_periods, center) - .apply(some_func) - .fillna(-1), - gdf.rolling(window_size, min_periods, center) - .apply(some_func) - .fillna(-1), - check_dtype=False, - ) - - -def test_rolling_numba_udf_with_offset(): - psr = pd.Series( - [1, 2, 4, 4, 8, 9], - index=[ - pd.Timestamp("20190101 09:00:00"), - pd.Timestamp("20190101 09:00:01"), - pd.Timestamp("20190101 09:00:02"), - pd.Timestamp("20190101 09:00:04"), - pd.Timestamp("20190101 09:00:07"), - pd.Timestamp("20190101 09:00:08"), - ], - 
) - gsr = cudf.from_pandas(psr) - - def some_func(A): - b = 0 - for a in A: - b = b + a - return b / len(A) - - assert_eq( - psr.rolling("2s").apply(some_func).fillna(-1), - gsr.rolling("2s").apply(some_func).fillna(-1), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "var", "std"] -) -def test_rolling_groupby_simple(agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], - "b": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1], - } - ) - gdf = cudf.from_pandas(pdf) - - for window_size in range(1, len(pdf) + 1): - expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( - -1 - ) - got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) - assert_eq(expect, got, check_dtype=False) - - pdf = pd.DataFrame( - {"a": [1, 1, 1, 2, 2], "b": [1, 1, 2, 2, 3], "c": [1, 2, 3, 4, 5]} - ) - gdf = cudf.from_pandas(pdf) - - for window_size in range(1, len(pdf) + 1): - expect = getattr(pdf.groupby("a").rolling(window_size), agg)().fillna( - -1 - ) - got = getattr(gdf.groupby("a").rolling(window_size), agg)().fillna(-1) - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "var", "std"] -) -def test_rolling_groupby_multi(agg): - pdf = pd.DataFrame( - { - "a": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2], - "b": [0, 0, 1, 1, 0, 1, 2, 1, 1, 0], - "c": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1], - } - ) - gdf = cudf.from_pandas(pdf) - - for window_size in range(1, len(pdf) + 1): - expect = getattr( - pdf.groupby(["a", "b"], sort=True).rolling(window_size), agg - )().fillna(-1) - got = getattr( - gdf.groupby(["a", "b"], sort=True).rolling(window_size), agg - )().fillna(-1) - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "agg", ["sum", "min", "max", "mean", "count", "var", "std"] -) -@pytest.mark.parametrize( - "window_size", ["1d", "2d", "3d", "4d", "5d", "6d", "7d"] -) -def test_rolling_groupby_offset(agg, window_size): - pdf = pd.DataFrame( - { - "date": pd.date_range(start="2016-01-01", periods=7, freq="D"), - "group": [1, 2, 2, 1, 1, 2, 1], - "val": [5, 6, 7, 8, 1, 2, 3], - } - ).set_index("date") - gdf = cudf.from_pandas(pdf) - expect = getattr(pdf.groupby("group").rolling(window_size), agg)().fillna( - -1 - ) - got = getattr(gdf.groupby("group").rolling(window_size), agg)().fillna(-1) - assert_eq(expect, got, check_dtype=False) - - -def test_rolling_custom_index_support(): - from pandas.api.indexers import BaseIndexer - - class CustomIndexer(BaseIndexer): - def get_window_bounds( - self, num_values, min_periods, center, closed, step=None - ): - start = np.empty(num_values, dtype=np.int64) - end = np.empty(num_values, dtype=np.int64) - - for i in range(num_values): - if self.use_expanding[i]: - start[i] = 0 - end[i] = i + 1 - else: - start[i] = i - end[i] = i + self.window_size - - return start, end - - use_expanding = [True, False, True, False, True] - indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) - - df = pd.DataFrame({"values": range(5)}) - gdf = cudf.from_pandas(df) - - expected = df.rolling(window=indexer).sum() - actual = gdf.rolling(window=indexer).sum() - - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "indexer", - [ - pd.api.indexers.FixedForwardWindowIndexer(window_size=2), - pd.api.indexers.VariableOffsetWindowIndexer( - index=pd.date_range("2020", periods=5), offset=pd.offsets.BDay(1) - ), - ], -) -def test_rolling_indexer_support(indexer): - df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) - 
gdf = cudf.from_pandas(df) - - expected = df.rolling(window=indexer, min_periods=2).sum() - actual = gdf.rolling(window=indexer, min_periods=2).sum() - - assert_eq(expected, actual) - - -def test_rolling_series(): - df = cudf.DataFrame({"a": range(0, 100), "b": [10, 20, 30, 40, 50] * 20}) - pdf = df.to_pandas() - - expected = pdf.groupby("b")["a"].rolling(5).mean() - actual = df.groupby("b")["a"].rolling(5).mean() - - assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py deleted file mode 100644 index 0958b68084d..00000000000 --- a/python/cudf/cudf/tests/test_s3.py +++ /dev/null @@ -1,531 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import os -import socket -from contextlib import contextmanager -from io import BytesIO, StringIO - -import numpy as np -import pandas as pd -import pytest -from fsspec.core import get_fs_token_paths - -import cudf -from cudf.testing import assert_eq - -moto = pytest.importorskip("moto", minversion="3.1.6") -boto3 = pytest.importorskip("boto3") -s3fs = pytest.importorskip("s3fs") - -ThreadedMotoServer = pytest.importorskip("moto.server").ThreadedMotoServer - - -@pytest.fixture(scope="session") -def endpoint_ip(): - return "127.0.0.1" - - -@pytest.fixture(scope="session") -def endpoint_port(): - # Return a free port per worker session. - sock = socket.socket() - sock.bind(("127.0.0.1", 0)) - port = sock.getsockname()[1] - sock.close() - return port - - -@contextmanager -def ensure_safe_environment_variables(): - """ - Get a context manager to safely set environment variables - All changes will be undone on close, hence environment variables set - within this contextmanager will neither persist nor change global state. - """ - saved_environ = dict(os.environ) - try: - yield - finally: - os.environ.clear() - os.environ.update(saved_environ) - - -@pytest.fixture(scope="session") -def s3_base(endpoint_ip, endpoint_port): - """ - Fixture to set up moto server in separate process - """ - with ensure_safe_environment_variables(): - # Fake aws credentials exported to prevent botocore looking for - # system aws credentials, https://github.com/spulec/moto/issues/1793 - os.environ["AWS_ACCESS_KEY_ID"] = "foobar_key" - os.environ["AWS_SECRET_ACCESS_KEY"] = "foobar_secret" - os.environ["S3FS_LOGGING_LEVEL"] = "DEBUG" - os.environ["AWS_SECURITY_TOKEN"] = "foobar_security_token" - os.environ["AWS_SESSION_TOKEN"] = "foobar_session_token" - os.environ["AWS_DEFAULT_REGION"] = "us-east-1" - - # Launching moto in server mode, i.e., as a separate process - # with an S3 endpoint on localhost - - endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" - - server = ThreadedMotoServer(ip_address=endpoint_ip, port=endpoint_port) - server.start() - yield endpoint_uri - server.stop() - - -@pytest.fixture() -def s3so(endpoint_ip, endpoint_port): - """ - Returns s3 storage options to pass to fsspec - """ - endpoint_uri = f"http://{endpoint_ip}:{endpoint_port}/" - - return {"client_kwargs": {"endpoint_url": endpoint_uri}} - - -@contextmanager -def s3_context(s3_base, bucket, files=None): - if files is None: - files = {} - with ensure_safe_environment_variables(): - client = boto3.client("s3", endpoint_url=s3_base) - client.create_bucket(Bucket=bucket, ACL="public-read-write") - for f, data in files.items(): - client.put_object(Bucket=bucket, Key=f, Body=data) - - yield s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base}) - - for f, data in files.items(): - try: - client.delete_object(Bucket=bucket, Key=f) - except 
Exception: - pass - - -@pytest.fixture -def pdf(scope="module"): - df = pd.DataFrame() - df["Integer"] = np.array([2345, 11987, 9027, 9027]) - df["Float"] = np.array([9.001, 8.343, 6, 2.781]) - df["Integer2"] = np.array([2345, 106, 2088, 789277]) - df["String"] = np.array(["Alpha", "Beta", "Gamma", "Delta"]) - df["Boolean"] = np.array([True, False, True, False]) - return df - - -@pytest.fixture -def pdf_ext(scope="module"): - size = 100 - df = pd.DataFrame() - df["Integer"] = np.array([i for i in range(size)]) - df["List"] = [[i] for i in range(size)] - df["Struct"] = [{"a": i} for i in range(size)] - df["String"] = (["Alpha", "Beta", "Gamma", "Delta"] * (-(size // -4)))[ - :size - ] - return df - - -@pytest.mark.parametrize("bytes_per_thread", [32, 1024]) -def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): - # Write to buffer - fname = "test_csv_reader.csv" - bucket = "csv" - buffer = pdf.to_csv(index=False) - - # Use fsspec file object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - ) - assert_eq(pdf, got) - - -@pytest.mark.parametrize("bytes_per_thread", [32, 1024]) -def test_read_csv_byte_range(s3_base, s3so, pdf, bytes_per_thread): - # Write to buffer - fname = "test_csv_reader_byte_range.csv" - bucket = "csv" - buffer = pdf.to_csv(index=False) - - # Use fsspec file object - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_csv( - f"s3://{bucket}/{fname}", - storage_options=s3so, - byte_range=(74, 73), - bytes_per_thread=bytes_per_thread, - header=None, - names=["Integer", "Float", "Integer2", "String", "Boolean"], - ) - - assert_eq(pdf.iloc[-2:].reset_index(drop=True), got) - - -@pytest.mark.parametrize("chunksize", [None, 3]) -def test_write_csv(s3_base, s3so, pdf, chunksize): - # Write to buffer - fname = "test_csv_writer.csv" - bucket = "csv" - gdf = cudf.from_pandas(pdf) - with s3_context(s3_base=s3_base, bucket=bucket) as s3fs: - gdf.to_csv( - f"s3://{bucket}/{fname}", - index=False, - chunksize=chunksize, - storage_options=s3so, - ) - assert s3fs.exists(f"s3://{bucket}/{fname}") - - # TODO: Update to use `storage_options` from pandas v1.2.0 - got = pd.read_csv(s3fs.open(f"s3://{bucket}/{fname}")) - - assert_eq(pdf, got) - - -@pytest.mark.parametrize("bytes_per_thread", [32, 1024]) -@pytest.mark.parametrize("columns", [None, ["Float", "String"]]) -def test_read_parquet( - s3_base, - s3so, - pdf, - bytes_per_thread, - columns, -): - fname = "test_parquet_reader.parquet" - bucket = "parquet" - buffer = BytesIO() - pdf.to_parquet(path=buffer) - - # Check direct path handling - buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got1 = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - columns=columns, - ) - expect = pdf[columns] if columns else pdf - assert_eq(expect, got1) - - # Check fsspec file-object handling - buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - fs = get_fs_token_paths( - f"s3://{bucket}/{fname}", storage_options=s3so - )[0] - with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f: - got2 = cudf.read_parquet( - f, - bytes_per_thread=bytes_per_thread, - columns=columns, - ) - assert_eq(expect, got2) - - -@pytest.mark.parametrize("method", ["all", "parquet"]) -@pytest.mark.parametrize("blocksize", [1024 * 1024, 1024]) -def 
test_read_parquet_prefetch_options( - s3_base, - s3so, - pdf, - method, - blocksize, -): - bucket = "parquet" - fname_1 = "test_parquet_reader_prefetch_options_1.parquet" - buffer_1 = BytesIO() - pdf.to_parquet(path=buffer_1) - buffer_1.seek(0) - - fname_2 = "test_parquet_reader_prefetch_options_2.parquet" - buffer_2 = BytesIO() - pdf_2 = pdf.copy() - pdf_2["Integer"] += 1 - pdf_2.to_parquet(path=buffer_2) - buffer_2.seek(0) - - with s3_context( - s3_base=s3_base, - bucket=bucket, - files={ - fname_1: buffer_1, - fname_2: buffer_2, - }, - ): - got = cudf.read_parquet( - [ - f"s3://{bucket}/{fname_1}", - f"s3://{bucket}/{fname_2}", - ], - storage_options=s3so, - prefetch_options={ - "method": method, - "blocksize": blocksize, - }, - columns=["String", "Integer"], - ) - - expect = pd.concat([pdf, pdf_2], ignore_index=True)[["String", "Integer"]] - assert_eq(expect, got) - - -@pytest.mark.parametrize("bytes_per_thread", [32, 1024]) -@pytest.mark.parametrize("columns", [None, ["List", "Struct"]]) -@pytest.mark.parametrize("index", [None, "Integer"]) -def test_read_parquet_ext( - s3_base, - s3so, - pdf_ext, - bytes_per_thread, - columns, - index, -): - fname = "test_parquet_reader_ext.parquet" - bucket = "parquet" - buffer = BytesIO() - - if index: - pdf_ext.set_index(index).to_parquet(path=buffer) - else: - pdf_ext.to_parquet(path=buffer) - - # Check direct path handling - buffer.seek(0) - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got1 = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - bytes_per_thread=bytes_per_thread, - columns=columns, - ) - if index: - expect = ( - pdf_ext.set_index(index)[columns] - if columns - else pdf_ext.set_index(index) - ) - else: - expect = pdf_ext[columns] if columns else pdf_ext - assert_eq(expect, got1) - - -def test_read_parquet_filesystem(s3_base, s3so, pdf): - fname = "data.0.parquet" - # NOTE: Need a unique bucket name when a glob pattern - # is used, otherwise fsspec seems to cache the bucket - # contents, and later tests using the same bucket name - # will fail. 
- bucket = "test_read_parquet_filesystem" - buffer = BytesIO() - pdf.to_parquet(path=buffer) - buffer.seek(0) - fs = get_fs_token_paths("s3://", mode="rb", storage_options=s3so)[0] - with s3_context( - s3_base=s3_base, - bucket=bucket, - files={fname: buffer}, - ): - # Check that a glob pattern works - path = f"s3://{bucket}/{'data.*.parquet'}" - got = cudf.read_parquet(path, filesystem=fs) - assert_eq(pdf, got) - - -def test_read_parquet_multi_file(s3_base, s3so, pdf): - fname_1 = "test_parquet_reader_multi_file_1.parquet" - buffer_1 = BytesIO() - pdf.to_parquet(path=buffer_1) - buffer_1.seek(0) - - fname_2 = "test_parquet_reader_multi_file_2.parquet" - buffer_2 = BytesIO() - pdf.to_parquet(path=buffer_2) - buffer_2.seek(0) - - bucket = "parquet" - with s3_context( - s3_base=s3_base, - bucket=bucket, - files={ - fname_1: buffer_1, - fname_2: buffer_2, - }, - ): - got = cudf.read_parquet( - [ - f"s3://{bucket}/{fname_1}", - f"s3://{bucket}/{fname_2}", - ], - storage_options=s3so, - ).reset_index(drop=True) - - expect = pd.concat([pdf, pdf], ignore_index=True) - assert_eq(expect, got) - - -def test_read_parquet_filters(s3_base, s3so, pdf_ext): - fname = "test_parquet_reader_filters.parquet" - bucket = "parquet" - buffer = BytesIO() - pdf_ext.to_parquet(path=buffer) - buffer.seek(0) - filters = [("String", "==", "Omega")] - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_parquet( - f"s3://{bucket}/{fname}", - storage_options=s3so, - filters=filters, - ) - - # All row-groups should be filtered out - assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True)) - - -@pytest.mark.parametrize("partition_cols", [None, ["String"]]) -def test_write_parquet(s3_base, s3so, pdf, partition_cols): - fname_cudf = "test_parquet_writer_cudf" - fname_pandas = "test_parquet_writer_pandas" - bucket = "parquet" - gdf = cudf.from_pandas(pdf) - - with s3_context(s3_base=s3_base, bucket=bucket) as s3fs: - gdf.to_parquet( - f"s3://{bucket}/{fname_cudf}", - partition_cols=partition_cols, - storage_options=s3so, - ) - assert s3fs.exists(f"s3://{bucket}/{fname_cudf}") - pdf.to_parquet( - f"s3://{bucket}/{fname_pandas}", - partition_cols=partition_cols, - storage_options=s3so, - ) - assert s3fs.exists(f"s3://{bucket}/{fname_pandas}") - - got = pd.read_parquet( - f"s3://{bucket}/{fname_pandas}", storage_options=s3so - ) - expect = cudf.read_parquet( - f"s3://{bucket}/{fname_cudf}", storage_options=s3so - ) - - assert_eq(expect, got) - - -def test_read_json(s3_base, s3so): - fname = "test_json_reader.json" - bucket = "json" - buffer = ( - '{"amount": 100, "name": "Alice"}\n' - '{"amount": 200, "name": "Bob"}\n' - '{"amount": 300, "name": "Charlie"}\n' - '{"amount": 400, "name": "Dennis"}\n' - ) - - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_json( - f"s3://{bucket}/{fname}", - engine="cudf", - orient="records", - lines=True, - storage_options=s3so, - ) - - expect = pd.read_json(StringIO(buffer), lines=True) - assert_eq(expect, got) - - -@pytest.mark.parametrize("columns", [None, ["string1"]]) -def test_read_orc(s3_base, s3so, datadir, columns): - source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") - fname = "test_orc_reader.orc" - bucket = "orc" - expect = pd.read_orc(source_file) - - with open(source_file, "rb") as f: - buffer = f.read() - - with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): - got = cudf.read_orc( - f"s3://{bucket}/{fname}", - columns=columns, - storage_options=s3so, - ) - - if columns: 
- expect = expect[columns] - assert_eq(expect, got) - - -def test_write_orc(s3_base, s3so, pdf): - fname = "test_orc_writer.orc" - bucket = "orc" - gdf = cudf.from_pandas(pdf) - with s3_context(s3_base=s3_base, bucket=bucket) as s3fs: - gdf.to_orc(f"s3://{bucket}/{fname}", storage_options=s3so) - assert s3fs.exists(f"s3://{bucket}/{fname}") - - with s3fs.open(f"s3://{bucket}/{fname}") as f: - got = pd.read_orc(f) - - assert_eq(pdf, got) - - -def test_write_chunked_parquet(s3_base, s3so): - df1 = cudf.DataFrame({"b": [10, 11, 12], "a": [1, 2, 3]}) - df2 = cudf.DataFrame({"b": [20, 30, 50], "a": [3, 2, 1]}) - dirname = "chunked_writer_directory" - bucket = "parquet" - from cudf.io.parquet import ParquetDatasetWriter - - with s3_context( - s3_base=s3_base, bucket=bucket, files={dirname: BytesIO()} - ) as s3fs: - with ParquetDatasetWriter( - f"s3://{bucket}/{dirname}", - partition_cols=["a"], - storage_options=s3so, - ) as cw: - cw.write_table(df1) - cw.write_table(df2) - - # TODO: Replace following workaround with: - # expect = cudf.read_parquet(f"s3://{bucket}/{dirname}/", - # storage_options=s3so) - # after the following bug is fixed: - # https://issues.apache.org/jira/browse/ARROW-16438 - - dfs = [] - for folder in {"a=1", "a=2", "a=3"}: - assert s3fs.exists(f"s3://{bucket}/{dirname}/{folder}") - for file in s3fs.ls(f"s3://{bucket}/{dirname}/{folder}"): - df = cudf.read_parquet("s3://" + file, storage_options=s3so) - dfs.append(df) - - actual = cudf.concat(dfs).astype("int64") - assert_eq( - actual.sort_values(["b"]).reset_index(drop=True), - cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True), - ) - - -def test_no_s3fs_on_cudf_import(): - import subprocess - import sys - - output = subprocess.check_output( - [ - sys.executable, - "-c", - "import cudf; import sys; print('pyarrow._s3fs' in sys.modules)", - ], - cwd="/", - ) - assert output.strip() == b"False" diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py deleted file mode 100644 index f2faf4343b6..00000000000 --- a/python/cudf/cudf/tests/test_scalar.py +++ /dev/null @@ -1,480 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
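# A minimal sketch of the host-value round trip that the scalar tests below
# build on: construct a cudf.Scalar from a host value and check that the value
# and validity are reported back unchanged. Illustrative only; the value used
# here is made up, assuming cudf and numpy are installed.
import numpy as np

import cudf

s = cudf.Scalar(np.int64(42))

np.testing.assert_equal(s.value, np.int64(42))
assert s.is_valid() is True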
- -import datetime -import re -from decimal import Decimal - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest -from packaging import version - -import rmm - -import cudf -from cudf._lib.copying import get_element -from cudf.testing._utils import ( - ALL_TYPES, - DATETIME_TYPES, - NUMERIC_TYPES, - TIMEDELTA_TYPES, -) - - -@pytest.fixture(autouse=True) -def clear_scalar_cache(): - cudf.Scalar._clear_instance_cache() - yield - - -TEST_DECIMAL_TYPES = [ - cudf.Decimal64Dtype(1, 1), - cudf.Decimal64Dtype(4, 2), - cudf.Decimal64Dtype(4, -2), - cudf.Decimal32Dtype(3, 1), - cudf.Decimal128Dtype(28, 3), -] - -SCALAR_VALUES = [ - 0, - -1, - 42, - 0.0, - 1.0, - np.int8(0), - np.int8(1), - np.int8(-1), - np.iinfo(np.int8).min, - np.iinfo(np.int8).max, - np.int16(1), - np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - np.int32(42), - np.int32(-42), - np.iinfo(np.int32).min, - np.iinfo(np.int32).max, - np.int64(42), - np.iinfo(np.int64).min, - np.iinfo(np.int64).max, - np.uint8(0), - np.uint8(1), - np.uint8(255), - np.iinfo(np.uint8).min, - np.iinfo(np.uint8).max, - np.uint16(1), - np.iinfo(np.uint16).min, - np.iinfo(np.uint16).max, - np.uint32(42), - np.uint32(4294967254), - np.iinfo(np.uint32).min, - np.iinfo(np.uint32).max, - np.uint64(42), - np.iinfo(np.uint64).min, - np.uint64(np.iinfo(np.uint64).max), - np.float32(1), - np.float32(-1), - np.finfo(np.float32).min, - np.finfo(np.float32).max, - np.float64(1), - np.float64(-1), - np.finfo(np.float64).min, - np.finfo(np.float64).max, - np.float32("NaN"), - np.float64("NaN"), - np.datetime64(0, "s"), - np.datetime64(1, "s"), - np.datetime64(-1, "s"), - np.datetime64(42, "s"), - np.datetime64(np.iinfo(np.int64).max, "s"), - np.datetime64(np.iinfo(np.int64).min + 1, "s"), - np.datetime64(42, "ms"), - np.datetime64(np.iinfo(np.int64).max, "ms"), - np.datetime64(np.iinfo(np.int64).min + 1, "ms"), - np.datetime64(42, "us"), - np.datetime64(np.iinfo(np.int64).max, "us"), - np.datetime64(np.iinfo(np.int64).min + 1, "us"), - np.datetime64(42, "ns"), - np.datetime64(np.iinfo(np.int64).max, "ns"), - np.datetime64(np.iinfo(np.int64).min + 1, "ns"), - np.timedelta64(0, "s"), - np.timedelta64(1, "s"), - np.timedelta64(-1, "s"), - np.timedelta64(42, "s"), - np.timedelta64(np.iinfo(np.int64).max, "s"), - np.timedelta64(np.iinfo(np.int64).min + 1, "s"), - np.timedelta64(42, "ms"), - np.timedelta64(np.iinfo(np.int64).max, "ms"), - np.timedelta64(np.iinfo(np.int64).min + 1, "ms"), - np.timedelta64(42, "us"), - np.timedelta64(np.iinfo(np.int64).max, "us"), - np.timedelta64(np.iinfo(np.int64).min + 1, "us"), - np.timedelta64(42, "ns"), - np.timedelta64(np.iinfo(np.int64).max, "ns"), - np.timedelta64(np.iinfo(np.int64).min + 1, "ns"), - "", - "one", - "1", - True, - False, - np.bool_(True), - np.bool_(False), - np.str_("asdf"), - np.object_("asdf"), -] - -DECIMAL_VALUES = [ - Decimal("100"), - Decimal("0.0042"), - Decimal("1.0042"), -] - - -@pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) -def test_scalar_host_initialization(value): - s = cudf.Scalar(value) - - np.testing.assert_equal(s.value, value) - assert s.is_valid() is True - assert s._is_host_value_current - assert not s._is_device_value_current - - -@pytest.mark.parametrize("value", SCALAR_VALUES) -def test_scalar_device_initialization(value): - column = cudf.Series([value], nan_as_null=False)._column - dev_slr = get_element(column, 0) - - s = cudf.Scalar.from_device_scalar(dev_slr) - - assert s._is_device_value_current - assert not s._is_host_value_current - - assert 
s.value == value or np.isnan(s.value) and np.isnan(value) - - assert s._is_device_value_current - assert s._is_host_value_current - - -@pytest.mark.parametrize("value", DECIMAL_VALUES) -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -def test_scalar_device_initialization_decimal(value, decimal_type): - dtype = decimal_type._from_decimal(value) - column = cudf.Series([str(value)]).astype(dtype)._column - dev_slr = get_element(column, 0) - - s = cudf.Scalar.from_device_scalar(dev_slr) - - assert s._is_device_value_current - assert not s._is_host_value_current - - assert s.value == value - - assert s._is_device_value_current - assert s._is_host_value_current - - -@pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) -def test_scalar_roundtrip(value): - s = cudf.Scalar(value) - - assert s._is_host_value_current - assert not s._is_device_value_current - - # call this property to sync the scalar - s.device_value - - assert s._is_host_value_current - assert s._is_device_value_current - - # invalidate the host cache - s._host_value = None - s._host_dtype = None - - assert not s._is_host_value_current - assert s._is_device_value_current - - # this should trigger a host copy - - assert s.value == value or np.isnan(s.value) and np.isnan(value) - - -@pytest.mark.parametrize( - "dtype", - NUMERIC_TYPES - + DATETIME_TYPES - + TIMEDELTA_TYPES - + ["object"] - + TEST_DECIMAL_TYPES, -) -def test_null_scalar(dtype): - s = cudf.Scalar(None, dtype=dtype) - if s.dtype.kind in "mM": - assert s.value is cudf.NaT - else: - assert s.value is cudf.NA - assert s.dtype == ( - cudf.dtype(dtype) - if not isinstance(dtype, cudf.core.dtypes.DecimalDtype) - else dtype - ) - assert s.is_valid() is False - - -@pytest.mark.parametrize( - "value", - [ - np.datetime64("NaT", "ns"), - np.datetime64("NaT", "us"), - np.datetime64("NaT", "ms"), - np.datetime64("NaT", "s"), - np.timedelta64("NaT", "ns"), - np.timedelta64("NaT", "us"), - np.timedelta64("NaT", "ms"), - np.timedelta64("NaT", "s"), - ], -) -def test_nat_to_null_scalar_succeeds(value): - s = cudf.Scalar(value) - assert s.value is cudf.NaT - assert not s.is_valid() - assert s.dtype == value.dtype - - -@pytest.mark.parametrize( - "value", [None, np.datetime64("NaT"), np.timedelta64("NaT")] -) -def test_generic_null_scalar_construction_fails(value): - with pytest.raises(TypeError): - cudf.Scalar(value) - - -@pytest.mark.parametrize( - "value, dtype", [(1000, "uint8"), (2**30, "int16"), (-1, "uint16")] -) -@pytest.mark.filterwarnings("ignore::DeprecationWarning") -def test_scalar_out_of_bounds_pyint_fails(value, dtype): - # Test that we align with NumPy on scalar creation behavior from - # Python integers. - if version.parse(np.__version__) >= version.parse("2.0"): - with pytest.raises(OverflowError): - cudf.Scalar(value, dtype) - else: - # NumPy allowed this, but it gives a DeprecationWarning on newer - # versions (which cudf did not used to do). 
- assert cudf.Scalar(value, dtype).value == np.dtype(dtype).type(value) - - -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] -) -def test_scalar_dtype_and_validity(dtype): - s = cudf.Scalar(1, dtype=dtype) - - assert s.dtype == cudf.dtype(dtype) - assert s.is_valid() is True - - -@pytest.mark.parametrize( - "slr,dtype,expect", - [ - (1, cudf.Decimal64Dtype(1, 0), Decimal("1")), - (Decimal(1), cudf.Decimal64Dtype(1, 0), Decimal("1")), - (Decimal("1.1"), cudf.Decimal64Dtype(2, 1), Decimal("1.1")), - (Decimal("1.1"), cudf.Decimal64Dtype(4, 3), Decimal("1.100")), - (Decimal("41.123"), cudf.Decimal32Dtype(5, 3), Decimal("41.123")), - ( - Decimal("41345435344353535344373628492731234.123"), - cudf.Decimal128Dtype(38, 3), - Decimal("41345435344353535344373628492731234.123"), - ), - (Decimal("1.11"), cudf.Decimal64Dtype(2, 2), pa.lib.ArrowInvalid), - ], -) -def test_scalar_dtype_and_validity_decimal(slr, dtype, expect): - if expect is pa.lib.ArrowInvalid: - with pytest.raises(expect): - cudf.Scalar(slr, dtype=dtype) - return - else: - result = cudf.Scalar(slr, dtype=dtype) - assert result.dtype == dtype - assert result.is_valid - - -@pytest.mark.parametrize( - "value", - [ - datetime.timedelta(seconds=76), - datetime.timedelta(microseconds=7), - datetime.timedelta(minutes=47), - datetime.timedelta(hours=4427), - datetime.timedelta(weeks=7134), - pd.Timestamp(15133.5, unit="s"), - pd.Timestamp(15133.5, unit="D"), - pd.Timedelta(1513393355.5, unit="s"), - pd.Timedelta(34765, unit="D"), - ], -) -def test_date_duration_scalars(value): - s = cudf.Scalar(value) - - actual = s.value - - if isinstance(value, datetime.datetime): - expected = np.datetime64(value) - elif isinstance(value, datetime.timedelta): - expected = np.timedelta64(value) - elif isinstance(value, pd.Timestamp): - expected = value.to_datetime64() - elif isinstance(value, pd.Timedelta): - expected = value.to_timedelta64() - - np.testing.assert_equal(actual, expected) - assert s.is_valid() is True - - -def test_scalar_implicit_bool_conversion(): - assert cudf.Scalar(True) - assert not cudf.Scalar(False) - assert cudf.Scalar(0) == cudf.Scalar(0) - assert cudf.Scalar(1) <= cudf.Scalar(2) - assert cudf.Scalar(1) <= 2 - - -@pytest.mark.parametrize("value", [1, -1, 1.5, 0, "1.5", "1", True, False]) -def test_scalar_implicit_float_conversion(value): - expect = float(value) - got = float(cudf.Scalar(value)) - - assert expect == got - assert type(expect) == type(got) - - -@pytest.mark.parametrize("value", [1, -1, 1.5, 0, "1", True, False]) -def test_scalar_implicit_int_conversion(value): - expect = int(value) - got = int(cudf.Scalar(value)) - - assert expect == got - assert type(expect) == type(got) - - -@pytest.mark.parametrize("cls", [int, float, bool]) -@pytest.mark.parametrize("dtype", sorted(set(ALL_TYPES) - {"category"})) -def test_scalar_invalid_implicit_conversion(cls, dtype): - try: - cls(pd.NaT if cudf.dtype(dtype).kind in "mM" else pd.NA) - except TypeError as e: - with pytest.raises(TypeError, match=re.escape(str(e))): - slr = cudf.Scalar(None, dtype=dtype) - cls(slr) - - -@pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -def test_device_scalar_direct_construction(value, decimal_type): - value = cudf.utils.dtypes.to_cudf_compatible_scalar(value) - - dtype = ( - value.dtype - if not isinstance(value, Decimal) - else decimal_type._from_decimal(value) - ) - - 
s = cudf._lib.scalar.DeviceScalar(value, dtype) - - assert s.value == value or np.isnan(s.value) and np.isnan(value) - if isinstance( - dtype, (cudf.Decimal64Dtype, cudf.Decimal128Dtype, cudf.Decimal32Dtype) - ): - assert s.dtype.precision == dtype.precision - assert s.dtype.scale == dtype.scale - elif dtype.char == "U": - assert s.dtype == "object" - else: - assert s.dtype == dtype - - -@pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) -def test_construct_from_scalar(value): - value = cudf.utils.dtypes.to_cudf_compatible_scalar(value) - x = cudf.Scalar( - value, value.dtype if not isinstance(value, Decimal) else None - ) - y = cudf.Scalar(x) - assert x.value == y.value or np.isnan(x.value) and np.isnan(y.value) - - # check that this works: - y.device_value - - x._is_host_value_current == y._is_host_value_current - x._is_device_value_current == y._is_device_value_current - - -@pytest.mark.parametrize( - "data", ["20000101", "2000-01-01", "2000-01-01T00:00:00.000000000", "2000"] -) -@pytest.mark.parametrize("dtype", DATETIME_TYPES) -def test_datetime_scalar_from_string(data, dtype): - slr = cudf.Scalar(data, dtype) - - expected = np.datetime64(datetime.datetime(2000, 1, 1)).astype(dtype) - - assert expected == slr.value - - -def test_scalar_cache(): - s = cudf.Scalar(1) - s2 = cudf.Scalar(1) - - assert s is s2 - - -def test_scalar_cache_rmm_hook(): - # test that reinitializing rmm clears the cuDF scalar cache, as we - # register a hook with RMM that does that on reinitialization - s = cudf.Scalar(1) - s2 = cudf.Scalar(1) - - assert s is s2 - - rmm.reinitialize() - - s3 = cudf.Scalar(1) - assert s3 is not s - - -def test_default_integer_bitwidth_scalar(default_integer_bitwidth): - # Test that integer scalars are default to 32 bits under user options. - slr = cudf.Scalar(128) - assert slr.dtype == np.dtype(f"i{default_integer_bitwidth//8}") - - -def test_default_float_bitwidth_scalar(default_float_bitwidth): - # Test that float scalars are default to 32 bits under user options. - slr = cudf.Scalar(128.0) - assert slr.dtype == np.dtype(f"f{default_float_bitwidth//8}") - - -def test_scalar_numpy_casting(): - # binop should upcast to wider type - s1 = cudf.Scalar(1, dtype=np.int32) - s2 = np.int64(2) - assert s1 < s2 - - -def test_construct_timezone_scalar_error(): - pd_scalar = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") - with pytest.raises(NotImplementedError): - cudf.utils.dtypes.to_cudf_compatible_scalar(pd_scalar) - - date_scalar = datetime.datetime.now(datetime.timezone.utc) - with pytest.raises(NotImplementedError): - cudf.utils.dtypes.to_cudf_compatible_scalar(date_scalar) diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py deleted file mode 100644 index b76566b00e2..00000000000 --- a/python/cudf/cudf/tests/test_scan.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
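The scan tests deleted below compare cuDF's cumulative reductions against pandas; a minimal sketch of that comparison, assuming a CUDA-capable environment (the data values are illustrative only):

    import pandas as pd
    import cudf
    from cudf.testing import assert_eq

    data = [1, 2, None, 4, 5]
    gs = cudf.Series(data).astype("float64")
    ps = pd.Series(data).astype("float64")

    # Cumulative reductions should agree with pandas; nulls surface as NaN
    # in the float result while the running totals skip over them.
    assert_eq(gs.cumsum(), ps.cumsum())
    assert_eq(gs.cummax(), ps.cummax())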
- -from itertools import product - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype -from cudf.testing import assert_eq -from cudf.testing._utils import INTEGER_TYPES, NUMERIC_TYPES, gen_rand - -params_sizes = [0, 1, 2, 5] - - -def _gen_params(): - for t, n in product(NUMERIC_TYPES, params_sizes): - if (t == np.int8 or t == np.int16) and n > 20: - # to keep data in range - continue - yield t, n - - -@pytest.mark.parametrize("dtype,nelem", list(_gen_params())) -def test_cumsum(dtype, nelem): - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, nelem, low=-2, high=2) - else: - data = gen_rand(dtype, nelem) - - decimal = 4 if dtype == np.float32 else 6 - - # series - gs = cudf.Series(data) - ps = pd.Series(data) - np.testing.assert_array_almost_equal( - gs.cumsum().to_numpy(), ps.cumsum(), decimal=decimal - ) - - # dataframe series (named series) - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series(data) - pdf = pd.DataFrame() - pdf["a"] = pd.Series(data) - np.testing.assert_array_almost_equal( - gdf.a.cumsum().to_numpy(), pdf.a.cumsum(), decimal=decimal - ) - - -def test_cumsum_masked(): - data = [1, 2, None, 4, 5] - float_types = ["float32", "float64"] - - for type_ in float_types: - gs = cudf.Series(data).astype(type_) - ps = pd.Series(data).astype(type_) - assert_eq(gs.cumsum(), ps.cumsum()) - - for type_ in INTEGER_TYPES: - gs = cudf.Series(data).astype(type_) - got = gs.cumsum() - expected = pd.Series([1, 3, np.nan, 7, 12], dtype="float64") - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(8, 4), - Decimal64Dtype(10, 5), - Decimal64Dtype(12, 7), - Decimal32Dtype(8, 5), - Decimal128Dtype(13, 6), - ], -) -def test_cumsum_decimal(dtype): - data = ["243.32", "48.245", "-7234.298", np.nan, "-467.2"] - gser = cudf.Series(data).astype(dtype) - pser = pd.Series(data, dtype="float64") - - got = gser.cumsum() - expected = cudf.Series.from_pandas(pser.cumsum()).astype(dtype) - - assert_eq(got, expected) - - -@pytest.mark.parametrize("dtype,nelem", list(_gen_params())) -def test_cummin(dtype, nelem): - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, nelem, low=-2, high=2) - else: - data = gen_rand(dtype, nelem) - - decimal = 4 if dtype == np.float32 else 6 - - # series - gs = cudf.Series(data) - ps = pd.Series(data) - np.testing.assert_array_almost_equal( - gs.cummin().to_numpy(), ps.cummin(), decimal=decimal - ) - - # dataframe series (named series) - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series(data) - pdf = pd.DataFrame() - pdf["a"] = pd.Series(data) - np.testing.assert_array_almost_equal( - gdf.a.cummin().to_numpy(), pdf.a.cummin(), decimal=decimal - ) - - -def test_cummin_masked(): - data = [1, 2, None, 4, 5] - float_types = ["float32", "float64"] - - for type_ in float_types: - gs = cudf.Series(data).astype(type_) - ps = pd.Series(data).astype(type_) - assert_eq(gs.cummin(), ps.cummin()) - - for type_ in INTEGER_TYPES: - gs = cudf.Series(data).astype(type_) - expected = pd.Series([1, 1, np.nan, 1, 1]).astype("float64") - assert_eq(gs.cummin(), expected) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(8, 4), - Decimal64Dtype(11, 6), - Decimal64Dtype(14, 7), - Decimal32Dtype(8, 4), - Decimal128Dtype(11, 6), - ], -) -def test_cummin_decimal(dtype): - data = ["8394.294", np.nan, "-9940.444", np.nan, "-23.928"] - gser = cudf.Series(data).astype(dtype) - pser = pd.Series(data, dtype="float64") - - got 
= gser.cummin() - expected = cudf.Series.from_pandas(pser.cummin()).astype(dtype) - - assert_eq(got, expected) - - -@pytest.mark.parametrize("dtype,nelem", list(_gen_params())) -def test_cummax(dtype, nelem): - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, nelem, low=-2, high=2) - else: - data = gen_rand(dtype, nelem) - - decimal = 4 if dtype == np.float32 else 6 - - # series - gs = cudf.Series(data) - ps = pd.Series(data) - np.testing.assert_array_almost_equal( - gs.cummax().to_numpy(), ps.cummax(), decimal=decimal - ) - - # dataframe series (named series) - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series(data) - pdf = pd.DataFrame() - pdf["a"] = pd.Series(data) - np.testing.assert_array_almost_equal( - gdf.a.cummax().to_numpy(), pdf.a.cummax(), decimal=decimal - ) - - -def test_cummax_masked(): - data = [1, 2, None, 4, 5] - float_types = ["float32", "float64"] - - for type_ in float_types: - gs = cudf.Series(data).astype(type_) - ps = pd.Series(data).astype(type_) - assert_eq(gs.cummax(), ps.cummax()) - - for type_ in INTEGER_TYPES: - gs = cudf.Series(data).astype(type_) - expected = pd.Series([1, 2, np.nan, 4, 5]).astype("float64") - assert_eq(gs.cummax(), expected) - - -@pytest.mark.parametrize( - "dtype", - [ - Decimal64Dtype(8, 4), - Decimal64Dtype(11, 6), - Decimal64Dtype(14, 7), - Decimal32Dtype(8, 4), - Decimal128Dtype(11, 6), - ], -) -def test_cummax_decimal(dtype): - data = [np.nan, "54.203", "8.222", "644.32", "-562.272"] - gser = cudf.Series(data).astype(dtype) - pser = pd.Series(data, dtype="float64") - - got = gser.cummax() - expected = cudf.Series.from_pandas(pser.cummax()).astype(dtype) - - assert_eq(got, expected) - - -@pytest.mark.parametrize("dtype,nelem", list(_gen_params())) -def test_cumprod(dtype, nelem): - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, nelem, low=-2, high=2) - else: - data = gen_rand(dtype, nelem) - - decimal = 4 if dtype == np.float32 else 6 - - # series - gs = cudf.Series(data) - ps = pd.Series(data) - np.testing.assert_array_almost_equal( - gs.cumprod().to_numpy(), ps.cumprod(), decimal=decimal - ) - - # dataframe series (named series) - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series(data) - pdf = pd.DataFrame() - pdf["a"] = pd.Series(data) - np.testing.assert_array_almost_equal( - gdf.a.cumprod().to_numpy(), pdf.a.cumprod(), decimal=decimal - ) - - -def test_cumprod_masked(): - data = [1, 2, None, 4, 5] - float_types = ["float32", "float64"] - - for type_ in float_types: - gs = cudf.Series(data).astype(type_) - ps = pd.Series(data).astype(type_) - assert_eq(gs.cumprod(), ps.cumprod()) - - for type_ in INTEGER_TYPES: - gs = cudf.Series(data).astype(type_) - got = gs.cumprod() - expected = pd.Series([1, 2, np.nan, 8, 40], dtype="float64") - assert_eq(got, expected) - - -def test_scan_boolean_cumsum(): - s = cudf.Series([0, -1, -300, 23, 4, -3, 0, 0, 100]) - - # cumsum test - got = (s > 0).cumsum() - expect = (s > 0).to_pandas().cumsum() - - assert_eq(expect, got) - - -def test_scan_boolean_cumprod(): - s = cudf.Series([0, -1, -300, 23, 4, -3, 0, 0, 100]) - - # cumprod test - got = (s > 0).cumprod() - expect = (s > 0).to_pandas().cumprod() - - assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py deleted file mode 100644 index 65943518113..00000000000 --- a/python/cudf/cudf/tests/test_search.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
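The search tests deleted below check Series.searchsorted against pandas; a minimal sketch of that pattern, assuming a CUDA-capable environment (the values are illustrative only):

    import pandas as pd
    import cudf
    from cudf.testing import assert_eq

    psr = pd.Series([1, 2, 3.4, 6])
    gsr = cudf.from_pandas(psr)

    # searchsorted returns the insertion points that keep the series sorted;
    # cuDF should agree with pandas for scalar and list-like search values.
    assert_eq(psr.searchsorted(4), gsr.searchsorted(4))
    assert_eq(
        psr.searchsorted([-100, 2.2, 5.0], side="left"),
        gsr.searchsorted([-100, 2.2, 5.0], side="left"),
    )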
-import cupy -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq -from cudf.testing._utils import gen_rand, random_bitmask - - -@pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("obj_class", ["series", "index", "column"]) -@pytest.mark.parametrize("vals_class", ["series", "index"]) -def test_searchsorted(side, obj_class, vals_class): - nelem = 1000 - column_data = gen_rand("float64", nelem) - column_mask = random_bitmask(nelem) - - values_data = gen_rand("float64", nelem) - values_mask = random_bitmask(nelem) - - sr = cudf.Series.from_masked_array(column_data, column_mask) - vals = cudf.Series.from_masked_array(values_data, values_mask) - - sr = sr.sort_values() - - # Reference object can be Series, Index, or Column - if obj_class == "index": - sr.reset_index(drop=True) - elif obj_class == "column": - sr = sr._column - - # Values can be Series or Index - if vals_class == "index": - vals.reset_index(drop=True) - - psr = sr.to_pandas() - pvals = vals.to_pandas() - - expect = psr.searchsorted(pvals, side) - if obj_class == "column": - got = sr.searchsorted(vals._column, side) - else: - got = sr.searchsorted(vals, side) - - assert_eq(expect, cupy.asnumpy(got)) - - -@pytest.mark.parametrize("side", ["left", "right"]) -@pytest.mark.parametrize("multiindex", [True, False]) -def test_searchsorted_dataframe(side, multiindex): - values = cudf.DataFrame( - { - "a": [1, 0, 5, 1], - "b": [-0.998, 0.031, -0.888, -0.998], - "c": ["C", "A", "G", "B"], - } - ) - base = cudf.DataFrame( - { - "a": [1, 1, 1, 5], - "b": [-0.999, -0.998, -0.997, -0.888], - "c": ["A", "C", "E", "G"], - } - ) - - if multiindex: - base = base.set_index(["a", "b", "c"]).index - values = values.set_index(["a", "b", "c"]).index - - result = base.searchsorted(values, side=side).tolist() - - if side == "left": - assert result == [1, 0, 3, 1] - else: - assert result == [2, 0, 4, 1] - - -def test_search_sorted_dataframe_unequal_number_of_columns(): - values = cudf.DataFrame({"a": [1, 0, 5, 1]}) - base = cudf.DataFrame({"a": [1, 0, 5, 1], "b": ["x", "z", "w", "a"]}) - - with pytest.raises(ValueError, match="Mismatch number of columns"): - base.searchsorted(values) - - -@pytest.mark.parametrize("side", ["left", "right"]) -def test_searchsorted_categorical(side): - cat1 = pd.Categorical( - ["a", "a", "b", "c", "a"], categories=["a", "b", "c"], ordered=True - ) - psr1 = pd.Series(cat1).sort_values() - sr1 = cudf.Series(cat1).sort_values() - cat2 = pd.Categorical( - ["a", "b", "a", "c", "b"], categories=["a", "b", "c"], ordered=True - ) - psr2 = pd.Series(cat2) - sr2 = cudf.Series(cat2) - - expect = psr1.searchsorted(psr2, side) - got = sr1.searchsorted(sr2, side) - - assert_eq(expect, cupy.asnumpy(got)) - - -@pytest.mark.parametrize("side", ["left", "right"]) -def test_searchsorted_datetime(side): - psr1 = pd.Series( - pd.date_range("20190101", "20200101", freq="400h", name="times") - ) - sr1 = cudf.from_pandas(psr1) - - psr2 = pd.Series( - np.array( - [ - np.datetime64("2019-11-20"), - np.datetime64("2019-04-15"), - np.datetime64("2019-02-20"), - np.datetime64("2019-05-31"), - np.datetime64("2020-01-02"), - ] - ) - ) - - sr2 = cudf.from_pandas(psr2) - - expect = psr1.searchsorted(psr2, side) - got = sr1.searchsorted(sr2, side) - - assert_eq(expect, cupy.asnumpy(got)) - - -def test_searchsorted_misc(): - psr = pd.Series([1, 2, 3.4, 6]) - sr = cudf.from_pandas(psr) - - assert_eq(psr.searchsorted(1), sr.searchsorted(1)) - assert_eq(psr.searchsorted(0), 
sr.searchsorted(0)) - assert_eq(psr.searchsorted(4), sr.searchsorted(4)) - assert_eq(psr.searchsorted(5), sr.searchsorted(5)) - assert_eq( - psr.searchsorted([-100, 3.4, 2.2, 2.0, 2.000000001]), - sr.searchsorted([-100, 3.4, 2.2, 2.0, 2.000000001]), - ) - - psr = pd.Series([1, 2, 3]) - sr = cudf.from_pandas(psr) - assert_eq(psr.searchsorted(1), sr.searchsorted(1)) - assert_eq( - psr.searchsorted([0, 1, 2, 3, 4, -4, -3, -2, -1, 0, -120]), - sr.searchsorted([0, 1, 2, 3, 4, -4, -3, -2, -1, 0, -120]), - ) - assert_eq(psr.searchsorted(1.5), sr.searchsorted(1.5)) - assert_eq(psr.searchsorted(1.99), sr.searchsorted(1.99)) - assert_eq(psr.searchsorted(3.00001), sr.searchsorted(3.00001)) - assert_eq( - psr.searchsorted([-100, 3.00001, 2.2, 2.0, 2.000000001]), - sr.searchsorted([-100, 3.00001, 2.2, 2.0, 2.000000001]), - ) - - -@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/54668") -def test_searchsorted_mixed_str_int(): - psr = pd.Series([1, 2, 3], dtype="int") - sr = cudf.from_pandas(psr) - - with pytest.raises(ValueError): - actual = sr.searchsorted("a") - with pytest.raises(ValueError): - expect = psr.searchsorted("a") - assert_eq(expect, actual) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py deleted file mode 100644 index 0b892a51895..00000000000 --- a/python/cudf/cudf/tests/test_serialize.py +++ /dev/null @@ -1,416 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import itertools -import pickle - -import msgpack -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import _utils as utils, assert_eq - - -@pytest.mark.parametrize( - "df", - [ - lambda: cudf.Index([1, 2, 3]), - lambda: cudf.Index([1.0, 2.0, 3.0]), - lambda: cudf.Series([1, 2, 3]), - lambda: cudf.Series([1, 2, 3], index=[4, 5, 6]), - lambda: cudf.Series([1, None, 3]), - lambda: cudf.Series([1, 2, 3], index=[4, 5, None]), - lambda: cudf.Series([1, 2, 3])[:2], - lambda: cudf.Series([1, 2, 3])[:2]._column, - lambda: cudf.Series(["a", "bb", "ccc"]), - lambda: cudf.Series(["a", None, "ccc"]), - lambda: cudf.Series( - [ - {"a": ({"b": [1, 2, 3], "c": [4, 5, 6]}, {"d": [2, 4, 6]})}, - {"e": ({"b": [0, 2, 4], "c": [-1, -2, -3]}, {"d": [1, 1, 1]})}, - ] - ), - lambda: cudf.Series( - [ - 14.12302, - 97938.2, - np.nan, - 0.0, - -8.302014, - np.nan, - -112.2314, - ] - ).astype(cudf.Decimal64Dtype(7, 2)), - lambda: cudf.DataFrame({"x": [1, 2, 3]}), - lambda: cudf.DataFrame({"x": [1, 2, 3], "y": [1.0, None, 3.0]}), - lambda: cudf.DataFrame( - {"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]}, index=[1, None, 3] - ), - lambda: cudf.DataFrame( - {"x": [1, 2, 3], "y": [1.0, None, 3.0]}, index=[1, None, 3] - ), - lambda: cudf.DataFrame( - {"x": ["a", "bb", "ccc"], "y": [1.0, None, 3.0]}, - index=[1, None, 3], - ), - lambda: pd.Index([True, False] * 5), - lambda: pd.CategoricalIndex(["a", "b", "a", "b"], ["a", "b", "c"]), - lambda: ( - cudf.DataFrame( - { - "a": [1, 2, 3], - "b": ["c", "e", "g"], - "d": [True, False, True], - }, - index=cudf.MultiIndex.from_tuples( - [("i1", "i2"), ("i3", "i4"), ("i5", "i6")], - names=["foo", "bar"], - ), - ) - ), - lambda: cudf.Index( - cudf.date_range(start="2011-01-01", end="2012-01-01", periods=13) - ), - lambda: cudf.Index([1.2, 3.4, 5.6]), - lambda: cudf.Series([1.2, 3.4, 5.6]), - lambda: pd.IntervalIndex.from_breaks(range(10)), - lambda: cudf.MultiIndex.from_tuples( - [("i1", "i2"), ("i3", "i4"), ("i5", "i6")], names=["foo", "bar"] - ), - lambda: cudf.RangeIndex(10), - lambda: cudf.DataFrame( - {"a": 
list(range(13)), "b": [float(x) for x in range(13)]}, - index=cudf.Index( - cudf.date_range( - start="2011-01-01", end="2012-01-01", periods=13 - ) - ), - ), - lambda: cudf.Series( - list(range(13)), - index=cudf.Index( - cudf.date_range( - start="2011-01-01", end="2012-01-01", periods=13 - ) - ), - ), - lambda: cudf.TimedeltaIndex( - [1132223, 2023232, 342234324, 4234324], - dtype="timedelta64[ns]", - name="foo", - ), - lambda: cudf.Index( - [ - "y7ssMP1PWJ", - "rZDLbzIQsX", - "NrPwYMsxNw", - "4zja1Vw9Rq", - "Y9TNDhjXgR", - "Ryjt7up2hT", - "dxYKtRGHkb", - "nMCWj5yhMu", - "Rt7S362FNX", - "OGbssOJLUI", - ] - ), - ], - ids=itertools.count(), -) -@pytest.mark.parametrize("to_host", [True, False]) -def test_serialize(df, to_host): - """This should hopefully replace all functions below""" - a = df() - if "cudf" not in type(a).__module__: - a = cudf.from_pandas(a) - if to_host: - header, frames = a.host_serialize() - else: - header, frames = a.device_serialize() - msgpack.dumps(header) # ensure that header is msgpack serializable - ndevice = 0 - for frame in frames: - if hasattr(frame, "__cuda_array_interface__"): - ndevice += 1 - # Indices etc. will not be DeviceNDArray - # but data should be... - if to_host: - assert ndevice == 0 - elif hasattr(df, "_cols"): - assert ndevice >= len(df._data) - else: - # If there are frames, something should be on the device - assert ndevice > 0 or not frames - - typ = type(a) - b = typ.deserialize(header, frames) - assert_eq(a, b) - - -def test_serialize_dtype_error_checking(): - dtype = cudf.IntervalDtype("float", "right") - header, frames = dtype.serialize() - with pytest.raises(AssertionError): - # Invalid number of frames - type(dtype).deserialize(header, [None] * (header["frame_count"] + 1)) - with pytest.raises(AssertionError): - # mismatching class - cudf.StructDtype.deserialize(header, frames) - - -def test_serialize_dataframe(): - df = cudf.DataFrame() - df["a"] = np.arange(100) - df["b"] = np.arange(100, dtype=np.float32) - df["c"] = pd.Categorical( - ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] - ) - outdf = cudf.DataFrame.deserialize(*df.serialize()) - assert_eq(df, outdf) - - -def test_serialize_dataframe_with_index(): - df = cudf.DataFrame() - df["a"] = np.arange(100) - df["b"] = np.random.random(100) - df["c"] = pd.Categorical( - ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] - ) - df = df.sort_values("b") - outdf = cudf.DataFrame.deserialize(*df.serialize()) - assert_eq(df, outdf) - - -def test_serialize_series(): - sr = cudf.Series(np.arange(100)) - outsr = cudf.Series.deserialize(*sr.serialize()) - assert_eq(sr, outsr) - - -def test_serialize_range_index(): - index = cudf.core.index.RangeIndex(10, 20) - outindex = cudf.core.index.RangeIndex.deserialize(*index.serialize()) - assert_eq(index, outindex) - - -def test_serialize_generic_index(): - index = cudf.core.index.Index(cudf.Series(np.arange(10))) - outindex = cudf.core.index.Index.deserialize(*index.serialize()) - assert_eq(index, outindex) - - -def test_serialize_multi_index(): - pdf = pd.DataFrame( - { - "a": [4, 17, 4, 9, 5], - "b": [1, 4, 4, 3, 2], - "x": np.random.normal(size=5), - } - ) - gdf = cudf.DataFrame.from_pandas(pdf) - gdg = gdf.groupby(["a", "b"]).sum() - multiindex = gdg.index - outindex = cudf.core.multiindex.MultiIndex.deserialize( - *multiindex.serialize() - ) - assert_eq(multiindex, outindex) - - -def test_serialize_masked_series(): - nelem = 50 - data = np.random.random(nelem) - mask = utils.random_bitmask(nelem) - bitmask = 
utils.expand_bits_to_bytes(mask)[:nelem] - null_count = utils.count_zero(bitmask) - assert null_count >= 0 - sr = cudf.Series.from_masked_array(data, mask, null_count=null_count) - outsr = cudf.Series.deserialize(*sr.serialize()) - assert_eq(sr, outsr) - - -def test_serialize_groupby_df(): - df = cudf.DataFrame() - df["key_1"] = np.random.randint(0, 20, 100) - df["key_2"] = np.random.randint(0, 20, 100) - df["val"] = np.arange(100, dtype=np.float32) - gb = df.groupby(["key_1", "key_2"], sort=True) - outgb = gb.deserialize(*gb.serialize()) - expect = gb.mean() - got = outgb.mean() - assert_eq(got.sort_index(), expect.sort_index()) - - -def test_serialize_groupby_external(): - df = cudf.DataFrame() - df["val"] = np.arange(100, dtype=np.float32) - gb = df.groupby(cudf.Series(np.random.randint(0, 20, 100))) - outgb = gb.deserialize(*gb.serialize()) - expect = gb.mean() - got = outgb.mean() - assert_eq(got.sort_index(), expect.sort_index()) - - -def test_serialize_groupby_level(): - idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (2, 2)], names=("a", "b")) - pdf = pd.DataFrame({"c": [1, 2, 3], "d": [2, 3, 4]}, index=idx) - df = cudf.from_pandas(pdf) - gb = df.groupby(level="a") - expect = gb.mean() - outgb = gb.deserialize(*gb.serialize()) - got = outgb.mean() - assert_eq(expect.sort_index(), got.sort_index()) - - -def test_serialize_groupby_sr(): - sr = cudf.Series(np.random.randint(0, 20, 100)) - gb = sr.groupby(sr // 2) - outgb = gb.deserialize(*gb.serialize()) - got = gb.mean() - expect = outgb.mean() - assert_eq(got.sort_index(), expect.sort_index()) - - -def test_serialize_datetime(): - # Make frame with datetime column - df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} - ) - ts = np.arange(0, len(df), dtype=np.dtype("datetime64[ms]")) - df["timestamp"] = ts - gdf = cudf.DataFrame.from_pandas(df) - # (De)serialize roundtrip - recreated = cudf.DataFrame.deserialize(*gdf.serialize()) - # Check - assert_eq(recreated, df) - - -def test_serialize_string(): - # Make frame with string column - df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=5), "y": np.random.normal(size=5)} - ) - str_data = ["a", "bc", "def", "ghij", "klmno"] - df["timestamp"] = str_data - gdf = cudf.DataFrame.from_pandas(df) - # (De)serialize roundtrip - recreated = cudf.DataFrame.deserialize(*gdf.serialize()) - # Check - assert_eq(recreated, df) - - -@pytest.mark.parametrize( - "frames", - [ - (cudf.Series([], dtype="str"), pd.Series([], dtype="str")), - (cudf.DataFrame(), pd.DataFrame()), - (cudf.DataFrame([]), pd.DataFrame([])), - (cudf.DataFrame({}), pd.DataFrame({})), - (cudf.DataFrame([1]).head(0), pd.DataFrame([1]).head(0)), - (cudf.DataFrame({"a": []}), pd.DataFrame({"a": []})), - ( - cudf.DataFrame({"a": ["a"]}).head(0), - pd.DataFrame({"a": ["a"]}).head(0), - ), - ( - cudf.DataFrame({"a": [1.0]}).head(0), - pd.DataFrame({"a": [1.0]}).head(0), - ), - ], -) -def test_serialize_empty(frames): - gdf, pdf = frames - - typ = type(gdf) - res = typ.deserialize(*gdf.serialize()) - assert_eq(res, gdf) - - -def test_serialize_all_null_string(): - data = [None, None, None, None, None] - pd_series = pd.Series(data, dtype="str") - gd_series = cudf.Series(data, dtype="str") - - recreated = cudf.Series.deserialize(*gd_series.serialize()) - assert_eq(recreated, pd_series) - - -def test_serialize_named_series(): - gdf = cudf.DataFrame({"a": [1, 2, 3, 4], "b": [5, 1, 2, 5]}) - - ser = gdf["b"] - recreated = cudf.Series.deserialize(*ser.serialize()) - assert_eq(recreated, ser) - - 
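The serialization round trips in this module all follow the same pattern: `.serialize()` yields a metadata header plus a list of buffers (frames), and the class-level `.deserialize()` rebuilds an equal object from that pair. A minimal sketch, assuming a CUDA-capable environment (the frame contents are illustrative only):

    import cudf
    from cudf.testing import assert_eq

    gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", None]})

    # serialize() returns (header, frames); deserialize() reconstructs an
    # equal DataFrame from that pair.
    header, frames = gdf.serialize()
    roundtripped = cudf.DataFrame.deserialize(header, frames)

    assert_eq(gdf, roundtripped)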
-def test_serialize_seriesgroupby(): - gdf = cudf.DataFrame({"a": [1, 2, 3, 4], "b": [5, 1, 2, 5]}) - - gb = gdf.groupby(["a"]).b - recreated = gb.__class__.deserialize(*gb.serialize()) - assert_eq(recreated.sum(), gb.sum()) - - -def test_serialize_seriesresampler(): - index = cudf.date_range(start="2001-01-01", periods=10, freq="1min") - sr = cudf.Series(range(10), index=index) - re_sampler = sr.resample("3min") - actual = re_sampler.sum() - recreated = re_sampler.__class__.deserialize(*re_sampler.serialize()) - expected = recreated.sum() - - assert_eq(actual, expected) - - -def test_serialize_string_check_buffer_sizes(): - df = cudf.DataFrame({"a": ["a", "b", "cd", None]}) - expect = df.memory_usage(deep=True).loc["a"] - header, frames = df.serialize() - got = sum(b.nbytes for b in frames) - assert expect == got - - -def test_deserialize_cudf_23_12(datadir): - fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_23.12.pkl" - - expected = cudf.DataFrame({"a": ["hi", "hello", "world", None]}) - with open(fname, "rb") as f: - actual = pickle.load(f) - - assert_eq(expected, actual) - - -def test_serialize_sliced_string(): - # https://github.com/rapidsai/cudf/issues/7735 - data = ["hi", "hello", None] - pd_series = pd.Series(data, dtype=pd.StringDtype()) - gd_series = cudf.Series(data, dtype="str") - sliced = gd_series[0:3] - serialized_gd_series = gd_series.serialize() - serialized_sliced = sliced.serialize() - - # validate frames are equal or not - # because both should be identical - for i in range(3): - assert_eq( - serialized_gd_series[1][i].memoryview(), - serialized_sliced[1][i].memoryview(), - ) - - recreated = cudf.Series.deserialize(*sliced.serialize()) - assert_eq(recreated.to_pandas(nullable=True), pd_series) - - -@pytest.mark.parametrize( - "columns", - [ - cudf.RangeIndex(2), - cudf.Index([1, 2], dtype="int8"), - cudf.MultiIndex( - levels=[["a", "b"], [1, 2]], codes=[[0, 1], [0, 1]], names=["a", 0] - ), - ], -) -def test_serialize_column_types_preserved(columns): - expected = cudf.DataFrame([[10, 11]], columns=columns) - result = cudf.DataFrame.deserialize(*expected.serialize()) - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py deleted file mode 100644 index a24002dc38e..00000000000 --- a/python/cudf/cudf/tests/test_series.py +++ /dev/null @@ -1,2931 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
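The Series tests deleted below largely compare cudf.Series construction and cudf.concat against their pandas counterparts; a minimal sketch of that comparison, assuming a CUDA-capable environment (the data values are illustrative only):

    import pandas as pd
    import cudf
    from cudf.testing import assert_eq

    data = {"a": 1, "b": 2, "c": 24}

    # Construction from a dict should match pandas, including the index.
    assert_eq(pd.Series(data), cudf.Series(data))

    # cudf.concat mirrors pd.concat for series of the same dtype.
    gsr = cudf.Series([1, 2, 3])
    other = cudf.Series([10, 11, 12])
    expected = pd.concat([gsr.to_pandas(), other.to_pandas()], ignore_index=True)
    actual = cudf.concat([gsr, other], ignore_index=True)
    assert_eq(expected, actual)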
-import datetime -import decimal -import hashlib -import operator -import re -from collections import OrderedDict, defaultdict -from string import ascii_letters, digits - -import cupy as cp -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.api.extensions import no_default -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.errors import MixedTypeError -from cudf.testing import assert_eq -from cudf.testing._utils import ( - NUMERIC_TYPES, - SERIES_OR_INDEX_NAMES, - TIMEDELTA_TYPES, - assert_exceptions_equal, - expect_warning_if, - gen_rand, -) - - -def _series_na_data(): - return [ - pd.Series([0, 1, 2, np.nan, 4, None, 6]), - pd.Series( - [0, 1, 2, np.nan, 4, None, 6], - index=["q", "w", "e", "r", "t", "y", "u"], - name="a", - ), - pd.Series([0, 1, 2, 3, 4]), - pd.Series(["a", "b", "u", "h", "d"]), - pd.Series([None, None, np.nan, None, np.inf, -np.inf]), - pd.Series([], dtype="float64"), - pd.Series( - [pd.NaT, pd.Timestamp("1939-05-27"), pd.Timestamp("1940-04-25")] - ), - pd.Series([np.nan]), - pd.Series([None]), - pd.Series(["a", "b", "", "c", None, "e"]), - ] - - -@pytest.mark.parametrize( - "data", - [ - {"a": 1, "b": 2, "c": 24, "d": 1010}, - {"a": 1}, - {1: "a", 2: "b", 24: "c", 1010: "d"}, - {1: "a"}, - ], -) -def test_series_init_dict(data): - pandas_series = pd.Series(data) - cudf_series = cudf.Series(data) - - assert_eq(pandas_series, cudf_series) - - -@pytest.mark.parametrize( - "data", - [ - { - "a": [1, 2, 3], - "b": [2, 3, 5], - "c": [24, 12212, 22233], - "d": [1010, 101010, 1111], - }, - {"a": [1]}, - ], -) -def test_series_init_dict_lists(data): - assert_eq(pd.Series(data), cudf.Series(data)) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4], - [1.0, 12.221, 12.34, 13.324, 324.3242], - [-10, -1111, 100, 11, 133], - ], -) -@pytest.mark.parametrize( - "others", - [ - [10, 11, 12, 13], - [0.1, 0.002, 324.2332, 0.2342], - [-10, -1111, 100, 11, 133], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_concat_basic(data, others, ignore_index): - psr = pd.Series(data) - gsr = cudf.Series(data) - - other_ps = pd.Series(others) - other_gs = cudf.Series(others) - - expected = pd.concat([psr, other_ps], ignore_index=ignore_index) - actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [ - "abc", - "def", - "this is a string", - "this is another string", - "a", - "b", - "c", - ], - ["a"], - ], -) -@pytest.mark.parametrize( - "others", - [ - [ - "abc", - "def", - "this is a string", - "this is another string", - "a", - "b", - "c", - ], - ["a"], - ["1", "2", "3", "4", "5"], - ["+", "-", "!", "_", "="], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_concat_basic_str(data, others, ignore_index): - psr = pd.Series(data) - gsr = cudf.Series(data) - - other_ps = pd.Series(others) - other_gs = cudf.Series(others) - - expected = pd.concat([psr, other_ps], ignore_index=ignore_index) - actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series( - [ - "abc", - "def", - "this is a string", - "this is another string", - "a", - "b", - "c", - ], - index=[10, 20, 30, 40, 50, 60, 70], - ), - pd.Series(["a"], index=[2]), - ], -) -@pytest.mark.parametrize( - "others", - [ - pd.Series( - [ - "abc", - "def", - "this is a string", - "this is another string", - 
"a", - "b", - "c", - ], - index=[10, 20, 30, 40, 50, 60, 70], - ), - pd.Series(["a"], index=[133]), - pd.Series(["1", "2", "3", "4", "5"], index=[-10, 22, 33, 44, 49]), - pd.Series(["+", "-", "!", "_", "="], index=[11, 22, 33, 44, 2]), - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_concat_series_with_index(data, others, ignore_index): - psr = pd.Series(data) - gsr = cudf.Series(data) - - other_ps = others - other_gs = cudf.from_pandas(others) - - expected = pd.concat([psr, other_ps], ignore_index=ignore_index) - actual = cudf.concat([gsr, other_gs], ignore_index=ignore_index) - - assert_eq(expected, actual) - - -def test_series_concat_error_mixed_types(): - gsr = cudf.Series([1, 2, 3, 4]) - other = cudf.Series(["a", "b", "c", "d"]) - - with pytest.raises( - TypeError, - match="cudf does not support mixed types, please type-cast " - "both series to same dtypes.", - ): - cudf.concat([gsr, other]) - - with pytest.raises( - TypeError, - match="cudf does not support mixed types, please type-cast " - "both series to same dtypes.", - ): - cudf.concat([gsr, gsr, other, gsr, other]) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([1, 2, 3, 4], index=["a", "b", "c", "d"]), - pd.Series( - [1.0, 12.221, 12.34, 13.324, 324.3242], - index=[ - "float one", - "float two", - "float three", - "float four", - "float five", - ], - ), - pd.Series( - [-10, -1111, 100, 11, 133], - index=["one", "two", "three", "four", "five"], - ), - ], -) -@pytest.mark.parametrize( - "others", - [ - [ - pd.Series([10, 11, 12, 13], index=["a", "b", "c", "d"]), - pd.Series([12, 14, 15, 27], index=["d", "e", "z", "x"]), - ], - [ - pd.Series([10, 11, 12, 13], index=["a", "b", "c", "d"]), - pd.Series([12, 14, 15, 27], index=["d", "e", "z", "x"]), - ] - * 25, - [ - pd.Series( - [0.1, 0.002, 324.2332, 0.2342], index=["-", "+", "%", "#"] - ), - pd.Series([12, 14, 15, 27], index=["d", "e", "z", "x"]), - ] - * 46, - [ - pd.Series( - [-10, -1111, 100, 11, 133], - index=["aa", "vv", "bb", "dd", "ll"], - ) - ], - ], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_concat_list_series_with_index(data, others, ignore_index): - psr = pd.Series(data) - gsr = cudf.Series(data) - - other_ps = others - other_gs = [cudf.from_pandas(obj) for obj in others] - - expected = pd.concat([psr] + other_ps, ignore_index=ignore_index) - actual = cudf.concat([gsr] + other_gs, ignore_index=ignore_index) - - assert_eq(expected, actual) - - -def test_series_concat_existing_buffers(): - a1 = np.arange(10, dtype=np.float64) - gs = cudf.Series(a1) - - # Add new buffer - a2 = cudf.Series(np.arange(5)) - gs = cudf.concat([gs, a2]) - assert len(gs) == 15 - np.testing.assert_equal(gs.to_numpy(), np.hstack([a1, a2.to_numpy()])) - - # Ensure appending to previous buffer - a3 = cudf.Series(np.arange(3)) - gs = cudf.concat([gs, a3]) - assert len(gs) == 18 - a4 = np.hstack([a1, a2.to_numpy(), a3.to_numpy()]) - np.testing.assert_equal(gs.to_numpy(), a4) - - # Appending different dtype - a5 = cudf.Series(np.array([1, 2, 3], dtype=np.int32)) - a6 = cudf.Series(np.array([4.5, 5.5, 6.5], dtype=np.float64)) - gs = cudf.concat([a5, a6]) - np.testing.assert_equal( - gs.to_numpy(), np.hstack([a5.to_numpy(), a6.to_numpy()]) - ) - gs = cudf.concat([cudf.Series(a6), a5]) - np.testing.assert_equal( - gs.to_numpy(), np.hstack([a6.to_numpy(), a5.to_numpy()]) - ) - - -def test_series_column_iter_error(): - gs = cudf.Series([1, 2, 3]) - - with pytest.raises( - TypeError, - match=re.escape( - f"{gs.__class__.__name__} 
object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." - ), - ): - iter(gs) - - with pytest.raises( - TypeError, - match=re.escape( - f"{gs.__class__.__name__} object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." - ), - ): - gs.items() - - with pytest.raises( - TypeError, - match=re.escape( - f"{gs.__class__.__name__} object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." - ), - ): - gs.iteritems() - - with pytest.raises(TypeError): - iter(gs._column) - - -@pytest.mark.parametrize( - "data", - [ - [1.0, 2.0, None, 4.0, 5.0], - ["a", "b", "c", "d", "e"], - ["a", "b", None, "d", "e"], - [None, None, None, None, None], - np.array(["1991-11-20", "2004-12-04"], dtype=np.datetime64), - np.array(["1991-11-20", None], dtype=np.datetime64), - np.array( - ["1991-11-20 05:15:00", "2004-12-04 10:00:00"], dtype=np.datetime64 - ), - np.array(["1991-11-20 05:15:00", None], dtype=np.datetime64), - ], -) -def test_series_tolist(data): - psr = pd.Series(data) - gsr = cudf.from_pandas(psr) - - with pytest.raises( - TypeError, - match=re.escape( - r"cuDF does not support conversion to host memory " - r"via the `tolist()` method. Consider using " - r"`.to_arrow().to_pylist()` to construct a Python list." - ), - ): - gsr.tolist() - - -@pytest.mark.parametrize( - "data", - [[], [None, None], ["a"], ["a", "b", "c"] * 500, [1.0, 2.0, 0.3] * 57], -) -def test_series_size(data): - psr = pd.Series(data) - gsr = cudf.Series(data) - - assert_eq(psr.size, gsr.size) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -def test_series_describe_numeric(dtype): - ps = pd.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype) - gs = cudf.from_pandas(ps) - actual = gs.describe() - expected = ps.describe() - - assert_eq(expected, actual, check_dtype=True) - - -@pytest.mark.parametrize("dtype", ["datetime64[ns]"]) -def test_series_describe_datetime(dtype): - # Note that other datetime units are not tested because pandas does not - # support them. When specified coarser units, cuDF datetime columns cannot - # represent fractional time for quantiles of the column, which may require - # interpolation, this differs from pandas which always stay in [ns] unit. - gs = cudf.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype) - ps = gs.to_pandas() - - # Treating datetimes as categoricals is deprecated in pandas and will - # be removed in future. Future behavior is treating datetime as numeric. 
- expected = ps.describe() - actual = gs.describe() - - assert_eq(expected.astype("str"), actual) - - -@pytest.mark.parametrize("dtype", TIMEDELTA_TYPES) -def test_series_describe_timedelta(dtype): - ps = pd.Series([0, 1, 2, 3, 1, 2, 3], dtype=dtype) - gs = cudf.from_pandas(ps) - - expected = ps.describe() - actual = gs.describe() - - assert_eq(actual, expected.astype("str")) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series(["a", "b", "c", "d", "e", "a"]), - pd.Series([True, False, True, True, False]), - pd.Series([], dtype="str"), - pd.Series(["a", "b", "c", "a"], dtype="category"), - pd.Series(["d", "e", "f"], dtype="category"), - pd.Series(pd.Categorical(["d", "e", "f"], categories=["f", "e", "d"])), - pd.Series( - pd.Categorical( - ["d", "e", "f"], categories=["f", "e", "d"], ordered=True - ) - ), - ], -) -def test_series_describe_other_types(ps): - gs = cudf.from_pandas(ps) - - expected = ps.describe() - actual = gs.describe() - - if len(ps) == 0: - assert_eq(expected.fillna("a").astype("str"), actual.fillna("a")) - else: - assert_eq(expected.astype("str"), actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 2, 1], - [1, 2, None, 3, 1, 1], - [], - ["a", "b", "c", None, "z", "a"], - ], -) -@pytest.mark.parametrize("use_na_sentinel", [True, False]) -def test_series_factorize_use_na_sentinel(data, use_na_sentinel): - gsr = cudf.Series(data) - psr = gsr.to_pandas(nullable=True) - - expected_labels, expected_cats = psr.factorize( - use_na_sentinel=use_na_sentinel, sort=True - ) - actual_labels, actual_cats = gsr.factorize( - use_na_sentinel=use_na_sentinel, sort=True - ) - assert_eq(expected_labels, actual_labels.get()) - assert_eq(expected_cats, actual_cats.to_pandas(nullable=True)) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 2, 1], - [1, 2, None, 3, 1, 1], - [], - ["a", "b", "c", None, "z", "a"], - ], -) -@pytest.mark.parametrize("sort", [True, False]) -def test_series_factorize_sort(data, sort): - gsr = cudf.Series(data) - psr = gsr.to_pandas(nullable=True) - - expected_labels, expected_cats = psr.factorize(sort=sort) - actual_labels, actual_cats = gsr.factorize(sort=sort) - assert_eq(expected_labels, actual_labels.get()) - assert_eq(expected_cats, actual_cats.to_pandas(nullable=True)) - - -@pytest.mark.parametrize( - "data", - [ - pd.Series([], dtype="datetime64[ns]"), - pd.Series(pd.date_range("2010-01-01", "2010-02-01")), - pd.Series([None, None], dtype="datetime64[ns]"), - ], -) -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) -@pytest.mark.parametrize("nulls", ["none", "some"]) -def test_series_datetime_value_counts(data, nulls, normalize, dropna): - psr = data.copy() - - if len(data) > 0: - if nulls == "one": - p = np.random.randint(0, len(data)) - psr[p] = None - elif nulls == "some": - p = np.random.randint(0, len(data), 2) - psr[p] = None - - gsr = cudf.from_pandas(psr) - expected = psr.value_counts(dropna=dropna, normalize=normalize) - got = gsr.value_counts(dropna=dropna, normalize=normalize) - - assert_eq(expected.sort_index(), got.sort_index(), check_dtype=False) - assert_eq( - expected.reset_index(drop=True), - got.reset_index(drop=True), - check_dtype=False, - check_index_type=True, - ) - - -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) -@pytest.mark.parametrize("num_elements", [10, 100, 1000]) -def test_categorical_value_counts(dropna, normalize, num_elements): - # create categorical series - np.random.seed(12) - pd_cat = 
pd.Categorical( - pd.Series( - np.random.choice(list(ascii_letters + digits), num_elements), - dtype="category", - ) - ) - - # gdf - gdf = cudf.DataFrame() - gdf["a"] = cudf.Series.from_categorical(pd_cat) - gdf_value_counts = gdf["a"].value_counts( - dropna=dropna, normalize=normalize - ) - - # pandas - pdf = pd.DataFrame() - pdf["a"] = pd_cat - pdf_value_counts = pdf["a"].value_counts( - dropna=dropna, normalize=normalize - ) - - # verify - assert_eq( - pdf_value_counts.sort_index(), - gdf_value_counts.sort_index(), - check_dtype=False, - check_index_type=True, - ) - assert_eq( - pdf_value_counts.reset_index(drop=True), - gdf_value_counts.reset_index(drop=True), - check_dtype=False, - check_index_type=True, - ) - - -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) -def test_series_value_counts(dropna, normalize): - for size in [10**x for x in range(5)]: - arr = np.random.randint(low=-1, high=10, size=size) - mask = arr != -1 - sr = cudf.Series.from_masked_array( - arr, cudf.Series(mask)._column.as_mask() - ) - sr.name = "col" - - expect = ( - sr.to_pandas() - .value_counts(dropna=dropna, normalize=normalize) - .sort_index() - ) - got = sr.value_counts(dropna=dropna, normalize=normalize).sort_index() - - assert_eq(expect, got, check_dtype=True, check_index_type=False) - - -@pytest.mark.parametrize("bins", [1, 2, 3]) -def test_series_value_counts_bins(bins): - psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0]) - gsr = cudf.from_pandas(psr) - - expected = psr.value_counts(bins=bins) - got = gsr.value_counts(bins=bins) - - assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) - - -@pytest.mark.parametrize("bins", [1, 2, 3]) -@pytest.mark.parametrize("dropna", [True, False]) -def test_series_value_counts_bins_dropna(bins, dropna): - psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0, np.nan]) - gsr = cudf.from_pandas(psr) - - expected = psr.value_counts(bins=bins, dropna=dropna) - got = gsr.value_counts(bins=bins, dropna=dropna) - - assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) - - -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) -def test_series_value_counts_optional_arguments(ascending, dropna, normalize): - psr = pd.Series([1.0, 2.0, 2.0, 3.0, 3.0, 3.0, None]) - gsr = cudf.from_pandas(psr) - - expected = psr.value_counts( - ascending=ascending, dropna=dropna, normalize=normalize - ) - got = gsr.value_counts( - ascending=ascending, dropna=dropna, normalize=normalize - ) - - assert_eq(expected.sort_index(), got.sort_index(), check_dtype=True) - assert_eq( - expected.reset_index(drop=True), - got.reset_index(drop=True), - check_dtype=True, - ) - - -@pytest.mark.parametrize( - "gs", - [ - cudf.Series([1, 2, 3]), - cudf.Series([None]), - cudf.Series([4]), - cudf.Series([2, 3, -1, 0, 1], name="test name"), - cudf.Series( - [1, 2, 3, None, 2, 1], index=["a", "v", "d", "e", "f", "g"] - ), - cudf.Series([1, 2, 3, None, 2, 1, None], name="abc"), - cudf.Series(["ab", "bc", "ab", None, "bc", None, None]), - cudf.Series([None, None, None, None, None], dtype="str"), - cudf.Series([None, None, None, None, None]), - cudf.Series( - [ - 123213, - 23123, - 123123, - 12213123, - 12213123, - 12213123, - 23123, - 2312323123, - None, - None, - ], - dtype="timedelta64[ns]", - ), - cudf.Series( - [ - None, - 1, - 2, - 3242434, - 3233243, - 1, - 2, - 1023, - None, - 12213123, - None, - 2312323123, - None, - None, - ], - 
dtype="datetime64[ns]", - ), - cudf.Series(name="empty series", dtype="float64"), - cudf.Series(["a", "b", "c", " ", "a", "b", "z"], dtype="category"), - ], -) -@pytest.mark.parametrize("dropna", [True, False]) -def test_series_mode(gs, dropna): - ps = gs.to_pandas() - - expected = ps.mode(dropna=dropna) - actual = gs.mode(dropna=dropna) - - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "arr", - [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), - np.zeros(100), - np.repeat([-0.6459412758761901], 100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - np.arange(-100.5, 101.5, 1), - ], -) -@pytest.mark.parametrize("decimals", [-5, -3, -1, 0, 1, 4, 12, np.int8(1)]) -def test_series_round(arr, decimals): - pser = pd.Series(arr) - ser = cudf.Series(arr) - result = ser.round(decimals) - expected = pser.round(decimals) - - assert_eq(result, expected) - - # with nulls, maintaining existing null mask - arr = arr.astype("float64") # for pandas nulls - arr.ravel()[ - np.random.choice(arr.shape[0], arr.shape[0] // 2, replace=False) - ] = np.nan - - pser = pd.Series(arr) - ser = cudf.Series(arr) - result = ser.round(decimals) - expected = pser.round(decimals) - - assert_eq(result, expected) - - -def test_series_round_half_up(): - s = cudf.Series([0.0, 1.0, 1.2, 1.7, 0.5, 1.5, 2.5, None]) - expect = cudf.Series([0.0, 1.0, 1.0, 2.0, 1.0, 2.0, 3.0, None]) - got = s.round(how="half_up") - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "series", - [ - cudf.Series([1.0, None, np.nan, 4.0], nan_as_null=False), - cudf.Series([1.24430, None, np.nan, 4.423530], nan_as_null=False), - cudf.Series([1.24430, np.nan, 4.423530], nan_as_null=False), - cudf.Series([-1.24430, np.nan, -4.423530], nan_as_null=False), - cudf.Series(np.repeat(np.nan, 100)), - ], -) -@pytest.mark.parametrize("decimal", [0, 1, 2, 3]) -def test_round_nan_as_null_false(series, decimal): - pser = series.to_pandas() - result = series.round(decimal) - expected = pser.round(decimal) - assert_eq(result, expected, atol=1e-10) - - -@pytest.mark.parametrize("ps", _series_na_data()) -@pytest.mark.parametrize("nan_as_null", [True, False, None]) -def test_series_isnull_isna(ps, nan_as_null): - nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x)) - if nan_as_null is False and ( - nan_contains.any() and not nan_contains.all() and ps.dtype == object - ): - with pytest.raises(MixedTypeError): - cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) - else: - gs = cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) - - assert_eq(ps.isnull(), gs.isnull()) - assert_eq(ps.isna(), gs.isna()) - - -@pytest.mark.parametrize("ps", _series_na_data()) -@pytest.mark.parametrize("nan_as_null", [True, False, None]) -def test_series_notnull_notna(ps, nan_as_null): - nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x)) - if nan_as_null is False and ( - nan_contains.any() and not nan_contains.all() and ps.dtype == object - ): - with pytest.raises(MixedTypeError): - cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) - else: - gs = cudf.Series.from_pandas(ps, nan_as_null=nan_as_null) - - assert_eq(ps.notnull(), gs.notnull()) - assert_eq(ps.notna(), gs.notna()) - - -@pytest.mark.parametrize( - "sr1", [pd.Series([10, 11, 12], index=["a", "b", "z"]), pd.Series(["a"])] -) -@pytest.mark.parametrize( - "sr2", - [pd.Series([], dtype="float64"), pd.Series(["a", "a", "c", "z", "A"])], -) -@pytest.mark.parametrize( - "op", - [ - operator.eq, - operator.ne, - 
operator.lt, - operator.gt, - operator.le, - operator.ge, - ], -) -def test_series_error_equality(sr1, sr2, op): - gsr1 = cudf.from_pandas(sr1) - gsr2 = cudf.from_pandas(sr2) - - assert_exceptions_equal(op, op, ([sr1, sr2],), ([gsr1, gsr2],)) - - -def test_series_memory_usage(): - sr = cudf.Series([1, 2, 3, 4], dtype="int64") - assert sr.memory_usage() == 32 - - sliced_sr = sr[2:] - assert sliced_sr.memory_usage() == 16 - - sliced_sr[3] = None - assert sliced_sr.memory_usage() == 80 - - sr = cudf.Series(["hello world", "rapids ai", "abc", "z"]) - assert sr.memory_usage() == 44 - - assert sr[3:].memory_usage() == 9 # z - assert sr[:1].memory_usage() == 19 # hello world - - -@pytest.mark.parametrize( - "sr,expected_psr", - [ - ( - cudf.Series([1, 2, None, 3], dtype="uint8"), - pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()), - ), - ( - cudf.Series([23, None, None, 32], dtype="uint16"), - pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()), - ), - ( - cudf.Series([None, 123, None, 1], dtype="uint32"), - pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()), - ), - ( - cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"), - pd.Series( - [234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype() - ), - ), - ( - cudf.Series([-10, 1, None, -1, None, 3], dtype="int8"), - pd.Series([-10, 1, None, -1, None, 3], dtype=pd.Int8Dtype()), - ), - ( - cudf.Series([111, None, 222, None, 13], dtype="int16"), - pd.Series([111, None, 222, None, 13], dtype=pd.Int16Dtype()), - ), - ( - cudf.Series([11, None, 22, 33, None, 2, None, 3], dtype="int32"), - pd.Series( - [11, None, 22, 33, None, 2, None, 3], dtype=pd.Int32Dtype() - ), - ), - ( - cudf.Series( - [32431, None, None, 32322, 0, 10, -32324, None], dtype="int64" - ), - pd.Series( - [32431, None, None, 32322, 0, 10, -32324, None], - dtype=pd.Int64Dtype(), - ), - ), - ( - cudf.Series( - [True, None, False, None, False, True, True, False], - dtype="bool_", - ), - pd.Series( - [True, None, False, None, False, True, True, False], - dtype=pd.BooleanDtype(), - ), - ), - ( - cudf.Series( - [ - "abc", - "a", - None, - "hello world", - "foo buzz", - "", - None, - "rapids ai", - ], - dtype="object", - ), - pd.Series( - [ - "abc", - "a", - None, - "hello world", - "foo buzz", - "", - None, - "rapids ai", - ], - dtype=pd.StringDtype(), - ), - ), - ( - cudf.Series( - [1, 2, None, 10.2, None], - dtype="float32", - ), - pd.Series( - [1, 2, None, 10.2, None], - dtype=pd.Float32Dtype(), - ), - ), - ], -) -def test_series_to_pandas_nullable_dtypes(sr, expected_psr): - actual_psr = sr.to_pandas(nullable=True) - - assert_eq(actual_psr, expected_psr) - - -def test_series_pipe(): - psr = pd.Series([10, 20, 30, 40]) - gsr = cudf.Series([10, 20, 30, 40]) - - def custom_add_func(sr, val): - new_sr = sr + val - return new_sr - - def custom_to_str_func(sr, val): - new_sr = sr.astype("str") + val - return new_sr - - expected = ( - psr.pipe(custom_add_func, 11) - .pipe(custom_add_func, val=12) - .pipe(custom_to_str_func, "rapids") - ) - actual = ( - gsr.pipe(custom_add_func, 11) - .pipe(custom_add_func, val=12) - .pipe(custom_to_str_func, "rapids") - ) - - assert_eq(expected, actual) - - expected = ( - psr.pipe((custom_add_func, "sr"), val=11) - .pipe(custom_add_func, val=1) - .pipe(custom_to_str_func, "rapids-ai") - ) - actual = ( - gsr.pipe((custom_add_func, "sr"), val=11) - .pipe(custom_add_func, val=1) - .pipe(custom_to_str_func, "rapids-ai") - ) - - assert_eq(expected, actual) - - -def test_series_pipe_error(): - psr = pd.Series([10, 20, 30, 40]) - gsr = 
cudf.Series([10, 20, 30, 40]) - - def custom_add_func(sr, val): - new_sr = sr + val - return new_sr - - assert_exceptions_equal( - lfunc=psr.pipe, - rfunc=gsr.pipe, - lfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), - rfunc_args_and_kwargs=([(custom_add_func, "val")], {"val": 11}), - ) - - -@pytest.mark.parametrize( - "data", - [cudf.Series([1, 2, 3]), cudf.Series([10, 11, 12], index=[1, 2, 3])], -) -@pytest.mark.parametrize( - "other", - [ - cudf.Series([4, 5, 6]), - cudf.Series([4, 5, 6, 7, 8]), - cudf.Series([4, np.nan, 6], nan_as_null=False), - [4, np.nan, 6], - {1: 9}, - ], -) -def test_series_update(data, other): - gs = data.copy(deep=True) - if isinstance(other, cudf.Series): - g_other = other.copy(deep=True) - p_other = g_other.to_pandas() - else: - g_other = other - p_other = other - - ps = gs.to_pandas() - - ps.update(p_other) - with expect_warning_if( - isinstance(other, cudf.Series) and other.isna().any(), UserWarning - ): - gs.update(g_other) - assert_eq(gs, ps) - - -@pytest.mark.parametrize( - "data", - [ - [1, None, 11, 2.0, np.nan], - [np.nan], - [None, None, None], - [np.nan, 1, 10, 393.32, np.nan], - ], -) -@pytest.mark.parametrize("nan_as_null", [True, False]) -@pytest.mark.parametrize("fill_value", [1.2, 332, np.nan]) -def test_fillna_with_nan(data, nan_as_null, fill_value): - gs = cudf.Series(data, dtype="float64", nan_as_null=nan_as_null) - ps = gs.to_pandas() - - expected = ps.fillna(fill_value) - actual = gs.fillna(fill_value) - - assert_eq(expected, actual) - - -def test_fillna_categorical_with_non_categorical_raises(): - ser = cudf.Series([1, None], dtype="category") - with pytest.raises(TypeError): - ser.fillna(cudf.Series([1, 2])) - - -def test_fillna_categorical_with_different_categories_raises(): - ser = cudf.Series([1, None], dtype="category") - with pytest.raises(TypeError): - ser.fillna(cudf.Series([1, 2], dtype="category")) - - -def test_series_mask_mixed_dtypes_error(): - s = cudf.Series(["a", "b", "c"]) - with pytest.raises( - TypeError, - match=re.escape( - "cudf does not support mixed types, please type-cast " - "the column of dataframe/series and other " - "to same dtypes." 
- ), - ): - s.where([True, False, True], [1, 2, 3]) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series(["a"] * 20, index=range(0, 20)), - pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"), - pd.Series( - ["b", None] * 5, - index=pd.Index(list(range(10)), dtype="uint64"), - name="BSeries", - ), - ], -) -@pytest.mark.parametrize( - "labels", - [ - [1], - [0], - 1, - 5, - [5, 9], - pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), - pd.Index([0, 1, 2, 3, 4], dtype="float32"), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_series_drop_labels(ps, labels, inplace): - ps = ps.copy() - gs = cudf.from_pandas(ps) - - expected = ps.drop(labels=labels, axis=0, inplace=inplace) - actual = gs.drop(labels=labels, axis=0, inplace=inplace) - - if inplace: - expected = ps - actual = gs - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series(["a"] * 20, index=range(0, 20)), - pd.Series(["b", None] * 10, index=range(0, 20), name="ASeries"), - ], -) -@pytest.mark.parametrize( - "index", - [[1], [0], 1, 5, [5, 9], pd.Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_series_drop_index(ps, index, inplace): - ps = ps.copy() - gs = cudf.from_pandas(ps) - - expected = ps.drop(index=index, inplace=inplace) - actual = gs.drop(index=index, inplace=inplace) - - if inplace: - expected = ps - actual = gs - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series( - ["a" if i % 2 == 0 else "b" for i in range(0, 10)], - index=pd.MultiIndex( - levels=[ - ["lama", "cow", "falcon"], - ["speed", "weight", "length"], - ], - codes=[ - [0, 0, 0, 1, 1, 1, 2, 2, 2, 1], - [0, 1, 2, 0, 1, 2, 0, 1, 2, 1], - ], - ), - name="abc", - ) - ], -) -@pytest.mark.parametrize( - "index,level", - [ - ("cow", 0), - ("lama", 0), - ("falcon", 0), - ("speed", 1), - ("weight", 1), - ("length", 1), - ( - "cow", - None, - ), - ( - "lama", - None, - ), - ( - "falcon", - None, - ), - ], -) -@pytest.mark.parametrize("inplace", [True, False]) -def test_series_drop_multiindex(ps, index, level, inplace): - ps = ps.copy() - gs = cudf.from_pandas(ps) - - expected = ps.drop(index=index, inplace=inplace, level=level) - actual = gs.drop(index=index, inplace=inplace, level=level) - - if inplace: - expected = ps - actual = gs - - assert_eq(expected, actual) - - -def test_series_drop_edge_inputs(): - gs = cudf.Series([42], name="a") - ps = gs.to_pandas() - - assert_eq(ps.drop(columns=["b"]), gs.drop(columns=["b"])) - - assert_eq(ps.drop(columns="b"), gs.drop(columns="b")) - - assert_exceptions_equal( - lfunc=ps.drop, - rfunc=gs.drop, - lfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), - rfunc_args_and_kwargs=(["a"], {"columns": "a", "axis": 1}), - ) - - assert_exceptions_equal( - lfunc=ps.drop, - rfunc=gs.drop, - lfunc_args_and_kwargs=([], {}), - rfunc_args_and_kwargs=([], {}), - ) - - assert_exceptions_equal( - lfunc=ps.drop, - rfunc=gs.drop, - lfunc_args_and_kwargs=(["b"], {"axis": 1}), - rfunc_args_and_kwargs=(["b"], {"axis": 1}), - ) - - -def test_series_drop_raises(): - gs = cudf.Series([10, 20, 30], index=["x", "y", "z"], name="c") - ps = gs.to_pandas() - - assert_exceptions_equal( - lfunc=ps.drop, - rfunc=gs.drop, - lfunc_args_and_kwargs=(["p"],), - rfunc_args_and_kwargs=(["p"],), - ) - - # dtype specified mismatch - assert_exceptions_equal( - lfunc=ps.drop, - rfunc=gs.drop, - lfunc_args_and_kwargs=([3],), - rfunc_args_and_kwargs=([3],), - ) - - expect = ps.drop("p", errors="ignore") - actual = 
gs.drop("p", errors="ignore") - - assert_eq(actual, expect) - - -@pytest.mark.parametrize( - "data", - [[[1, 2, 3], None, [4], [], [5, 6]], [1, 2, 3, 4, 5]], -) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize( - "p_index", - [ - None, - ["ia", "ib", "ic", "id", "ie"], - pd.MultiIndex.from_tuples( - [(0, "a"), (0, "b"), (0, "c"), (1, "a"), (1, "b")] - ), - ], -) -def test_explode(data, ignore_index, p_index): - pdf = pd.Series(data, index=p_index, name="someseries") - gdf = cudf.from_pandas(pdf) - - expect = pdf.explode(ignore_index) - got = gdf.explode(ignore_index) - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data, expected", - [ - ( - [cudf.Series([1, 2, 3]), cudf.Series([10, 20])], - cudf.Series([[1, 2, 3], [10, 20]]), - ), - ( - [cudf.Series([1, 2, 3]), None, cudf.Series([10, 20, np.nan])], - cudf.Series([[1, 2, 3], None, [10, 20, np.nan]]), - ), - ( - [cp.array([5, 6]), cudf.NA, cp.array([1])], - cudf.Series([[5, 6], None, [1]]), - ), - ( - [None, None, None, None, None, cudf.Series([10, 20])], - cudf.Series([None, None, None, None, None, [10, 20]]), - ), - ], -) -def test_nested_series_from_sequence_data(data, expected): - actual = cudf.Series(data) - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "data", - [ - cp.ones(5, dtype=cp.float16), - np.ones(5, dtype="float16"), - pd.Series([0.1, 1.2, 3.3], dtype="float16"), - pytest.param( - pa.array(np.ones(5, dtype="float16")), - marks=pytest.mark.xfail( - reason="https://issues.apache.org/jira/browse/ARROW-13762" - ), - ), - ], -) -def test_series_raises_float16(data): - with pytest.raises(TypeError): - cudf.Series(data) - - -@pytest.mark.parametrize( - "index", - [ - pd.RangeIndex(0, 3, 1), - [3.0, 1.0, np.nan], - ["a", "z", None], - pd.RangeIndex(4, -1, -2), - ], -) -@pytest.mark.parametrize("axis", [0, "index"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_series_sort_index( - index, axis, ascending, inplace, ignore_index, na_position -): - ps = pd.Series([10, 3, 12], index=index) - gs = cudf.from_pandas(ps) - - expected = ps.sort_index( - axis=axis, - ascending=ascending, - ignore_index=ignore_index, - inplace=inplace, - na_position=na_position, - ) - got = gs.sort_index( - axis=axis, - ascending=ascending, - ignore_index=ignore_index, - inplace=inplace, - na_position=na_position, - ) - - if inplace is True: - assert_eq(ps, gs, check_index_type=True) - else: - assert_eq(expected, got, check_index_type=True) - - -@pytest.mark.parametrize( - "method", ["md5", "sha1", "sha224", "sha256", "sha384", "sha512"] -) -def test_series_hash_values(method): - inputs = cudf.Series( - [ - "", - "0", - "A 56 character string to test message padding algorithm.", - "A 63 character string to test message padding algorithm, again.", - "A 64 character string to test message padding algorithm, again!!", - ( - "A very long (greater than 128 bytes/char string) to execute " - "a multi hash-step data point in the hash function being " - "tested. This string needed to be longer." 
- ), - "All work and no play makes Jack a dull boy", - "!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", - "\x00\x00\x00\x10\x00\x00\x00\x00", - "\x00\x00\x00\x00", - ] - ) - - def hashlib_compute_digest(data): - hasher = getattr(hashlib, method)() - hasher.update(data.encode("utf-8")) - return hasher.hexdigest() - - hashlib_validation = inputs.to_pandas().apply(hashlib_compute_digest) - validation_results = cudf.Series(hashlib_validation) - hash_values = inputs.hash_values(method=method) - assert_eq(hash_values, validation_results) - - -def test_series_hash_values_invalid_method(): - inputs = cudf.Series(["", "0"]) - with pytest.raises(ValueError): - inputs.hash_values(method="invalid_method") - - -def test_set_index_unequal_length(): - s = cudf.Series(dtype="float64") - with pytest.raises(ValueError): - s.index = [1, 2, 3] - - -@pytest.mark.parametrize( - "lhs, rhs", [("a", "a"), ("a", "b"), (1, 1.0), (None, None), (None, "a")] -) -def test_equals_names(lhs, rhs): - lhs = cudf.Series([1, 2], name=lhs) - rhs = cudf.Series([1, 2], name=rhs) - - got = lhs.equals(rhs) - expect = lhs.to_pandas().equals(rhs.to_pandas()) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", [[True, False, None, True, False], [None, None], []] -) -@pytest.mark.parametrize("bool_dtype", ["bool", "boolean", pd.BooleanDtype()]) -def test_nullable_bool_dtype_series(data, bool_dtype): - psr = pd.Series(data, dtype=pd.BooleanDtype()) - gsr = cudf.Series(data, dtype=bool_dtype) - - assert_eq(psr, gsr.to_pandas(nullable=True)) - - -@pytest.mark.parametrize("level", [None, 0, "l0", 1, ["l0", 1]]) -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("original_name", [None, "original_ser"]) -@pytest.mark.parametrize("name", [None, "ser", no_default]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_reset_index(level, drop, inplace, original_name, name): - midx = pd.MultiIndex.from_tuples( - [("a", 1), ("a", 2), ("b", 1), ("b", 2)], names=["l0", None] - ) - ps = pd.Series(range(4), index=midx, name=original_name) - gs = cudf.from_pandas(ps) - - if not drop and inplace: - pytest.skip( - "For exception checks, see " - "test_reset_index_dup_level_name_exceptions" - ) - - expect = ps.reset_index(level=level, drop=drop, name=name, inplace=inplace) - - got = gs.reset_index(level=level, drop=drop, name=name, inplace=inplace) - if inplace: - expect = ps - got = gs - - assert_eq(expect, got) - - -@pytest.mark.parametrize("level", [None, 0, 1, [None]]) -@pytest.mark.parametrize("drop", [False, True]) -@pytest.mark.parametrize("inplace", [False, True]) -@pytest.mark.parametrize("original_name", [None, "original_ser"]) -@pytest.mark.parametrize("name", [None, "ser"]) -def test_reset_index_dup_level_name(level, drop, inplace, original_name, name): - # midx levels are named [None, None] - midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) - ps = pd.Series(range(4), index=midx, name=original_name) - gs = cudf.from_pandas(ps) - if level == [None] or not drop and inplace: - pytest.skip( - "For exception checks, see " - "test_reset_index_dup_level_name_exceptions" - ) - - expect = ps.reset_index(level=level, drop=drop, inplace=inplace, name=name) - got = gs.reset_index(level=level, drop=drop, inplace=inplace, name=name) - if inplace: - expect = ps - got = gs - - assert_eq(expect, got) - - -@pytest.mark.parametrize("drop", [True, False]) -@pytest.mark.parametrize("inplace", [True, False]) -@pytest.mark.parametrize("original_name", [None, "original_ser"]) 
-@pytest.mark.parametrize("name", [None, "ser"]) -def test_reset_index_named(drop, inplace, original_name, name): - ps = pd.Series(range(4), index=["x", "y", "z", "w"], name=original_name) - gs = cudf.from_pandas(ps) - - ps.index.name = "cudf" - gs.index.name = "cudf" - - if not drop and inplace: - pytest.skip( - "For exception checks, see " - "test_reset_index_dup_level_name_exceptions" - ) - - expect = ps.reset_index(drop=drop, inplace=inplace, name=name) - got = gs.reset_index(drop=drop, inplace=inplace, name=name) - - if inplace: - expect = ps - got = gs - - assert_eq(expect, got) - - -def test_reset_index_dup_level_name_exceptions(): - midx = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]) - ps = pd.Series(range(4), index=midx) - gs = cudf.from_pandas(ps) - - # Duplicate level names must be specified by level number. - assert_exceptions_equal( - lfunc=ps.reset_index, - rfunc=gs.reset_index, - lfunc_args_and_kwargs=( - [], - {"level": [None]}, - ), - rfunc_args_and_kwargs=( - [], - {"level": [None]}, - ), - ) - - # Cannot use drop=False and inplace=True to turn a series into a dataframe. - assert_exceptions_equal( - lfunc=ps.reset_index, - rfunc=gs.reset_index, - lfunc_args_and_kwargs=( - [], - {"drop": False, "inplace": True}, - ), - rfunc_args_and_kwargs=( - [], - {"drop": False, "inplace": True}, - ), - ) - - # Pandas raises the above exception when these two inputs are combined. - assert_exceptions_equal( - lfunc=ps.reset_index, - rfunc=gs.reset_index, - lfunc_args_and_kwargs=( - [], - {"level": [None], "drop": False, "inplace": True}, - ), - rfunc_args_and_kwargs=( - [], - {"level": [None], "drop": False, "inplace": True}, - ), - ) - - -def test_series_add_prefix(): - cd_s = cudf.Series([1, 2, 3, 4]) - pd_s = cd_s.to_pandas() - - got = cd_s.add_prefix("item_") - expected = pd_s.add_prefix("item_") - - assert_eq(got, expected) - - -def test_series_add_suffix(): - cd_s = cudf.Series([1, 2, 3, 4]) - pd_s = cd_s.to_pandas() - - got = cd_s.add_suffix("_item") - expected = pd_s.add_suffix("_item") - - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "cudf_series", - [ - cudf.Series([0.25, 0.5, 0.2, -0.05]), - cudf.Series([0, 1, 2, np.nan, 4, cudf.NA, 6]), - ], -) -@pytest.mark.parametrize("lag", [1, 2, 3, 4]) -def test_autocorr(cudf_series, lag): - psr = cudf_series.to_pandas() - - cudf_corr = cudf_series.autocorr(lag=lag) - - # autocorrelation is undefined (nan) for less than two entries, but pandas - # short-circuits when there are 0 entries and bypasses the numpy function - # call that generates an error. 
- num_both_valid = (psr.notna() & psr.shift(lag).notna()).sum() - with expect_warning_if(num_both_valid == 1, RuntimeWarning): - pd_corr = psr.autocorr(lag=lag) - - assert_eq(pd_corr, cudf_corr) - - -@pytest.mark.parametrize( - "data", - [ - [0, 1, 2, 3], - ["abc", "a", None, "hello world", "foo buzz", "", None, "rapids ai"], - ], -) -def test_series_transpose(data): - psr = pd.Series(data=data) - csr = cudf.Series(data=data) - - cudf_transposed = csr.transpose() - pd_transposed = psr.transpose() - cudf_property = csr.T - pd_property = psr.T - - assert_eq(pd_transposed, cudf_transposed) - assert_eq(pd_property, cudf_property) - assert_eq(cudf_transposed, csr) - - -@pytest.mark.parametrize( - "data", - [1, 3, 5, 7, 7], -) -def test_series_nunique(data): - cd_s = cudf.Series(data) - pd_s = cd_s.to_pandas() - - actual = cd_s.nunique() - expected = pd_s.nunique() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [1, 3, 5, 7, 7], -) -def test_series_nunique_index(data): - cd_s = cudf.Series(data) - pd_s = cd_s.to_pandas() - - actual = cd_s.index.nunique() - expected = pd_s.index.nunique() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [], - [1, 2, 3, 4], - ["a", "b", "c"], - [1.2, 2.2, 4.5], - [np.nan, np.nan], - [None, None, None], - ], -) -def test_axes(data): - csr = cudf.Series(data) - psr = csr.to_pandas() - - expected = psr.axes - actual = csr.axes - - for e, a in zip(expected, actual): - assert_eq(e, a) - - -def test_series_truncate(): - csr = cudf.Series([1, 2, 3, 4]) - psr = csr.to_pandas() - - assert_eq(csr.truncate(), psr.truncate()) - assert_eq(csr.truncate(1, 2), psr.truncate(1, 2)) - assert_eq(csr.truncate(before=1, after=2), psr.truncate(before=1, after=2)) - - -def test_series_truncate_errors(): - csr = cudf.Series([1, 2, 3, 4]) - with pytest.raises(ValueError): - csr.truncate(axis=1) - with pytest.raises(ValueError): - csr.truncate(copy=False) - - csr.index = [3, 2, 1, 6] - psr = csr.to_pandas() - assert_exceptions_equal( - lfunc=csr.truncate, - rfunc=psr.truncate, - ) - - -def test_series_truncate_datetimeindex(): - dates = cudf.date_range( - "2021-01-01 23:45:00", "2021-01-02 23:46:00", freq="s" - ) - csr = cudf.Series(range(len(dates)), index=dates) - psr = csr.to_pandas() - - assert_eq( - csr.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ), - psr.truncate( - before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" - ), - ) - - -@pytest.mark.parametrize( - "data", - [ - [], - [0, 12, 14], - [0, 14, 12, 12, 3, 10, 12, 14], - np.random.randint(-100, 100, 200), - pd.Series([0.0, 1.0, None, 10.0]), - [None, None, None, None], - [np.nan, None, -1, 2, 3], - ], -) -@pytest.mark.parametrize( - "values", - [ - np.random.randint(-100, 100, 10), - [], - [np.nan, None, -1, 2, 3], - [1.0, 12.0, None, None, 120], - [0, 14, 12, 12, 3, 10, 12, 14, None], - [None, None, None], - ["0", "12", "14"], - ["0", "12", "14", "a"], - ], -) -def test_isin_numeric(data, values): - index = np.random.randint(0, 100, len(data)) - psr = pd.Series(data, index=index) - gsr = cudf.Series.from_pandas(psr, nan_as_null=False) - - expected = psr.isin(values) - got = gsr.isin(values) - - assert_eq(got, expected) - - -@pytest.mark.xfail(raises=TypeError) -def test_fill_new_category(): - gs = cudf.Series(pd.Categorical(["a", "b", "c"])) - gs[0:1] = "d" - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warning newly introduced in pandas-2.2.0", -) -@pytest.mark.parametrize( - "data", - [ - [], - 
pd.Series( - ["2018-01-01", "2019-04-03", None, "2019-12-30"], - dtype="datetime64[ns]", - ), - pd.Series( - [ - "2018-01-01", - "2019-04-03", - None, - "2019-12-30", - "2018-01-01", - "2018-01-01", - ], - dtype="datetime64[ns]", - ), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - [1514764800000000000, 1577664000000000000], - [ - 1514764800000000000, - 1577664000000000000, - 1577664000000000000, - 1577664000000000000, - 1514764800000000000, - ], - ["2019-04-03", "2019-12-30", "2012-01-01"], - [ - "2012-01-01", - "2012-01-01", - "2012-01-01", - "2019-04-03", - "2019-12-30", - "2012-01-01", - ], - ], -) -def test_isin_datetime(data, values): - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) - - is_len_str = isinstance(next(iter(values), None), str) and len(data) - with expect_warning_if(is_len_str): - got = gsr.isin(values) - with expect_warning_if(is_len_str): - expected = psr.isin(values) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [], - pd.Series(["this", "is", None, "a", "test"]), - pd.Series(["test", "this", "test", "is", None, "test", "a", "test"]), - pd.Series(["0", "12", "14"]), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - ["this", "is"], - [None, None, None], - ["12", "14", "19"], - [12, 14, 19], - ["is", "this", "is", "this", "is"], - ], -) -def test_isin_string(data, values): - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) - - got = gsr.isin(values) - expected = psr.isin(values) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [], - pd.Series(["a", "b", "c", "c", "c", "d", "e"], dtype="category"), - pd.Series(["a", "b", None, "c", "d", "e"], dtype="category"), - pd.Series([0, 3, 10, 12], dtype="category"), - pd.Series([0, 3, 10, 12, 0, 10, 3, 0, 0, 3, 3], dtype="category"), - ], -) -@pytest.mark.parametrize( - "values", - [ - [], - ["a", "b", None, "f", "words"], - ["0", "12", None, "14"], - [0, 10, 12, None, 39, 40, 1000], - [0, 0, 0, 0, 3, 3, 3, None, 1, 2, 3], - ], -) -def test_isin_categorical(data, values): - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) - - got = gsr.isin(values) - expected = psr.isin(values) - assert_eq(got, expected) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES) -@pytest.mark.parametrize("period", [-1, -5, -10, -20, 0, 1, 5, 10, 20]) -@pytest.mark.parametrize("data_empty", [False, True]) -def test_diff(dtype, period, data_empty): - if data_empty: - data = None - else: - if dtype == np.int8: - # to keep data in range - data = gen_rand(dtype, 100000, low=-2, high=2) - else: - data = gen_rand(dtype, 100000) - - gs = cudf.Series(data, dtype=dtype) - ps = pd.Series(data, dtype=dtype) - - expected_outcome = ps.diff(period) - diffed_outcome = gs.diff(period).astype(expected_outcome.dtype) - - if data_empty: - assert_eq(diffed_outcome, expected_outcome, check_index_type=False) - else: - assert_eq(diffed_outcome, expected_outcome) - - -@pytest.mark.parametrize( - "data", - [ - ["a", "b", "c", "d", "e"], - ], -) -def test_diff_unsupported_dtypes(data): - gs = cudf.Series(data) - with pytest.raises( - TypeError, - match=r"unsupported operand type\(s\)", - ): - gs.diff() - - -@pytest.mark.parametrize( - "data", - [ - pd.date_range("2020-01-01", "2020-01-06", freq="D"), - [True, True, True, False, True, True], - [1.0, 2.0, 3.5, 4.0, 5.0, -1.7], - [1, 2, 3, 3, 4, 5], - [np.nan, None, None, np.nan, np.nan, None], - ], -) -def test_diff_many_dtypes(data): - ps = pd.Series(data) - gs = cudf.from_pandas(ps) - assert_eq(ps.diff(), gs.diff()) - 
assert_eq(ps.diff(periods=2), gs.diff(periods=2)) - - -@pytest.mark.parametrize("num_rows", [1, 100]) -@pytest.mark.parametrize("num_bins", [1, 10]) -@pytest.mark.parametrize("right", [True, False]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) -@pytest.mark.parametrize("series_bins", [True, False]) -def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): - data = np.random.randint(0, 100, num_rows).astype(dtype) - bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) - s = cudf.Series(data) - if series_bins: - s_bins = cudf.Series(bins) - indices = s.digitize(s_bins, right) - else: - indices = s.digitize(bins, right) - np.testing.assert_array_equal( - np.digitize(data, bins, right), indices.to_numpy() - ) - - -def test_series_digitize_invalid_bins(): - s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") - bins = cudf.Series([2, None, None, 50, 90], dtype="int32") - - with pytest.raises( - ValueError, match="`bins` cannot contain null entries." - ): - _ = s.digitize(bins) - - -@pytest.mark.parametrize( - "data,left,right", - [ - ([0, 1, 2, 3, 4, 5, 10], 0, 5), - ([0, 1, 2, 3, 4, 5, 10], 10, 1), - ([0, 1, 2, 3, 4, 5], [0, 10, 11] * 2, [1, 2, 5] * 2), - (["a", "few", "set", "of", "strings", "xyz", "abc"], "banana", "few"), - (["a", "few", "set", "of", "strings", "xyz", "abc"], "phone", "hello"), - ( - ["a", "few", "set", "of", "strings", "xyz", "abc"], - ["a", "hello", "rapids", "ai", "world", "chars", "strs"], - ["yes", "no", "hi", "bye", "test", "pass", "fail"], - ), - ([0, 1, 2, np.nan, 4, np.nan, 10], 10, 1), - ], -) -@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"]) -def test_series_between(data, left, right, inclusive): - ps = pd.Series(data) - gs = cudf.from_pandas(ps, nan_as_null=False) - - expected = ps.between(left, right, inclusive=inclusive) - actual = gs.between(left, right, inclusive=inclusive) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,left,right", - [ - ([0, 1, 2, None, 4, 5, 10], 0, 5), - ([0, 1, 2, 3, None, 5, 10], 10, 1), - ([None, 1, 2, 3, 4, None], [0, 10, 11] * 2, [1, 2, 5] * 2), - ( - ["a", "few", "set", None, "strings", "xyz", "abc"], - ["a", "hello", "rapids", "ai", "world", "chars", "strs"], - ["yes", "no", "hi", "bye", "test", "pass", "fail"], - ), - ], -) -@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"]) -def test_series_between_with_null(data, left, right, inclusive): - gs = cudf.Series(data) - ps = gs.to_pandas(nullable=True) - - expected = ps.between(left, right, inclusive=inclusive) - actual = gs.between(left, right, inclusive=inclusive) - - assert_eq(expected, actual.to_pandas(nullable=True)) - - -def test_default_construction(): - s = cudf.Series([np.int8(8), np.int16(128)]) - assert s.dtype == np.dtype("i2") - - -@pytest.mark.parametrize( - "data", [[0, 1, 2, 3, 4], range(5), [np.int8(8), np.int16(128)]] -) -def test_default_integer_bitwidth_construction(default_integer_bitwidth, data): - s = cudf.Series(data) - assert s.dtype == np.dtype(f"i{default_integer_bitwidth//8}") - - -@pytest.mark.parametrize("data", [[1.5, 2.5, 4.5], [1000, 2000, 4000, 3.14]]) -def test_default_float_bitwidth_construction(default_float_bitwidth, data): - s = cudf.Series(data) - assert s.dtype == np.dtype(f"f{default_float_bitwidth//8}") - - -def test_series_ordered_dedup(): - # part of https://github.com/rapidsai/cudf/issues/11486 - sr = cudf.Series(np.random.randint(0, 100, 1000)) - # pandas unique() preserves order - expect = 
pd.Series(sr.to_pandas().unique()) - got = cudf.Series._from_column(sr._column.unique()) - assert_eq(expect.values, got.values) - - -@pytest.mark.parametrize("dtype", ["int64", "float64"]) -@pytest.mark.parametrize("bool_scalar", [True, False]) -def test_set_bool_error(dtype, bool_scalar): - sr = cudf.Series([1, 2, 3], dtype=dtype) - psr = sr.to_pandas(nullable=True) - - assert_exceptions_equal( - lfunc=sr.__setitem__, - rfunc=psr.__setitem__, - lfunc_args_and_kwargs=([bool_scalar],), - rfunc_args_and_kwargs=([bool_scalar],), - ) - - -def test_int64_equality(): - s = cudf.Series(np.asarray([2**63 - 10, 2**63 - 100], dtype=np.int64)) - assert (s != np.int64(2**63 - 1)).all() - assert (s != cudf.Scalar(2**63 - 1, dtype=np.int64)).all() - - -@pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)]) -def test_series_to_dict(into): - gs = cudf.Series(["ab", "de", "zx"], index=[10, 20, 100]) - ps = gs.to_pandas() - - actual = gs.to_dict(into=into) - expected = ps.to_dict(into=into) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3], - pytest.param( - [np.nan, 10, 15, 16], - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/49818" - ), - ), - [np.nan, None, 10, 20], - ["ab", "zx", "pq"], - ["ab", "zx", None, "pq"], - [], - ], -) -def test_series_hasnans(data): - gs = cudf.Series(data, nan_as_null=False) - ps = gs.to_pandas(nullable=True) - - # Check type to avoid mixing Python bool and NumPy bool - assert isinstance(gs.hasnans, bool) - assert gs.hasnans == ps.hasnans - - -@pytest.mark.parametrize( - "data,index", - [ - ([1, 2, 3], [10, 11, 12]), - ([1, 2, 3, 1, 1, 2, 3, 2], [10, 20, 23, 24, 25, 26, 27, 28]), - ([1, None, 2, None, 3, None, 3, 1], [5, 6, 7, 8, 9, 10, 11, 12]), - ([np.nan, 1.0, np.nan, 5.4, 5.4, 1.0], ["a", "b", "c", "d", "e", "f"]), - ( - ["lama", "cow", "lama", None, "beetle", "lama", None, None], - [1, 4, 10, 11, 2, 100, 200, 400], - ), - ], -) -@pytest.mark.parametrize("keep", ["first", "last", False]) -@pytest.mark.parametrize("name", [None, "a"]) -def test_series_duplicated(data, index, keep, name): - gs = cudf.Series(data, index=index, name=name) - ps = gs.to_pandas() - - assert_eq(gs.duplicated(keep=keep), ps.duplicated(keep=keep)) - - -@pytest.mark.parametrize( - "data", - [ - [1, 2, 3, 4], - [10, 20, None, None], - ], -) -@pytest.mark.parametrize("copy", [True, False]) -def test_series_copy(data, copy): - psr = pd.Series(data) - gsr = cudf.from_pandas(psr) - - new_psr = pd.Series(psr, copy=copy) - new_gsr = cudf.Series(gsr, copy=copy) - - new_psr.iloc[0] = 999 - new_gsr.iloc[0] = 999 - - assert_eq(psr, gsr) - assert_eq(new_psr, new_gsr) - - -@pytest.mark.parametrize( - "data", - [ - {"a": 1, "b": 2, "c": 24, "d": 1010}, - {"a": 1}, - ], -) -@pytest.mark.parametrize( - "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] -) -def test_series_init_dict_with_index(data, index): - pandas_series = pd.Series(data, index=index) - cudf_series = cudf.Series(data, index=index) - - assert_eq(pandas_series, cudf_series) - - -@pytest.mark.parametrize("data", ["abc", None, 1, 3.7]) -@pytest.mark.parametrize( - "index", [None, ["b", "c"], ["d", "a", "c", "b"], ["a"]] -) -def test_series_init_scalar_with_index(data, index): - pandas_series = pd.Series(data, index=index) - cudf_series = cudf.Series(data, index=index) - - assert_eq( - pandas_series, - cudf_series, - check_index_type=data is not None or index is not None, - check_dtype=data is not None, - ) - - -def test_series_init_error(): - 
assert_exceptions_equal( - lfunc=pd.Series, - rfunc=cudf.Series, - lfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}), - rfunc_args_and_kwargs=([], {"data": [11], "index": [10, 11]}), - ) - - -def test_series_init_from_series_and_index(): - ser = cudf.Series([4, 7, -5, 3], index=["d", "b", "a", "c"]) - result = cudf.Series(ser, index=list("abcd")) - expected = cudf.Series([-5, 7, 3, 4], index=list("abcd")) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "dtype", ["datetime64[ns]", "timedelta64[ns]", "object", "str"] -) -def test_series_mixed_dtype_error(dtype): - ps = pd.concat([pd.Series([1, 2, 3], dtype=dtype), pd.Series([10, 11])]) - with pytest.raises(TypeError): - cudf.Series(ps) - with pytest.raises(TypeError): - cudf.Series(ps.array) - - -@pytest.mark.parametrize("data", [[True, False, None], [10, 200, 300]]) -@pytest.mark.parametrize("index", [None, [10, 20, 30]]) -def test_series_contains(data, index): - ps = pd.Series(data, index=index) - gs = cudf.Series(data, index=index) - - assert_eq(1 in ps, 1 in gs) - assert_eq(10 in ps, 10 in gs) - assert_eq(True in ps, True in gs) - assert_eq(False in ps, False in gs) - - -def test_series_from_pandas_sparse(): - pser = pd.Series(range(2), dtype=pd.SparseDtype(np.int64, 0)) - with pytest.raises(NotImplementedError): - cudf.Series(pser) - - -def test_series_constructor_unbounded_sequence(): - class A: - def __getitem__(self, key): - return 1 - - with pytest.raises(TypeError): - cudf.Series(A()) - - -def test_series_constructor_error_mixed_type(): - with pytest.raises(MixedTypeError): - cudf.Series(["abc", np.nan, "123"], nan_as_null=False) - - -def test_series_typecast_to_object_error(): - actual = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(ValueError): - actual.astype(object) - with pytest.raises(ValueError): - actual.astype(np.dtype("object")) - new_series = actual.astype("str") - assert new_series[0] == "1970-01-01 00:00:00.000000001" - - -def test_series_typecast_to_object(): - actual = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - with cudf.option_context("mode.pandas_compatible", False): - new_series = actual.astype(object) - assert new_series[0] == "1970-01-01 00:00:00.000000001" - new_series = actual.astype(np.dtype("object")) - assert new_series[0] == "1970-01-01 00:00:00.000000001" - - -@pytest.mark.parametrize("attr", ["nlargest", "nsmallest"]) -def test_series_nlargest_nsmallest_str_error(attr): - gs = cudf.Series(["a", "b", "c", "d", "e"]) - ps = gs.to_pandas() - - assert_exceptions_equal( - getattr(gs, attr), getattr(ps, attr), ([], {"n": 1}), ([], {"n": 1}) - ) - - -def test_series_unique_pandas_compatibility(): - gs = cudf.Series([10, 11, 12, 11, 10]) - ps = gs.to_pandas() - with cudf.option_context("mode.pandas_compatible", True): - actual = gs.unique() - expected = ps.unique() - assert_eq(actual, expected) - - -@pytest.mark.parametrize("initial_name", SERIES_OR_INDEX_NAMES) -@pytest.mark.parametrize("name", SERIES_OR_INDEX_NAMES) -def test_series_rename(initial_name, name): - gsr = cudf.Series([1, 2, 3], name=initial_name) - psr = pd.Series([1, 2, 3], name=initial_name) - - assert_eq(gsr, psr) - - actual = gsr.rename(name) - expected = psr.rename(name) - - assert_eq(actual, expected) - - -@pytest.mark.parametrize("index", [lambda x: x * 2, {1: 2}]) -def test_rename_index_not_supported(index): - ser = cudf.Series(range(2)) - with pytest.raises(NotImplementedError): - ser.rename(index=index) - - 
-@pytest.mark.parametrize( - "data", - [ - [1.2234242333234, 323432.3243423, np.nan], - pd.Series([34224, 324324, 324342], dtype="datetime64[ns]"), - pd.Series([224.242, None, 2424.234324], dtype="category"), - [ - decimal.Decimal("342.3243234234242"), - decimal.Decimal("89.32432497687622"), - None, - ], - ], -) -@pytest.mark.parametrize("digits", [0, 1, 3, 4, 10]) -def test_series_round_builtin(data, digits): - ps = pd.Series(data) - gs = cudf.from_pandas(ps, nan_as_null=False) - - # TODO: Remove `to_frame` workaround - # after following issue is fixed: - # https://github.com/pandas-dev/pandas/issues/55114 - expected = round(ps.to_frame(), digits)[0] - expected.name = None - actual = round(gs, digits) - - assert_eq(expected, actual) - - -def test_series_empty_dtype(): - expected = pd.Series([]) - actual = cudf.Series([]) - assert_eq(expected, actual, check_dtype=True) - - -@pytest.mark.parametrize("data", [None, {}, []]) -def test_series_empty_index_rangeindex(data): - expected = cudf.RangeIndex(0) - result = cudf.Series(data).index - assert_eq(result, expected) - - -def test_series_count_invalid_param(): - s = cudf.Series([], dtype="float64") - with pytest.raises(TypeError): - s.count(skipna=True) - - -@pytest.mark.parametrize( - "data", [[0, 1, 2], ["a", "b", "c"], [0.324, 32.32, 3243.23]] -) -def test_series_setitem_nat_with_non_datetimes(data): - s = cudf.Series(data) - with pytest.raises(TypeError): - s[0] = cudf.NaT - - -def test_series_string_setitem(): - gs = cudf.Series(["abc", "def", "ghi", "xyz", "pqr"]) - ps = gs.to_pandas() - - gs[0] = "NaT" - gs[1] = "NA" - gs[2] = "" - gs[3] = "NaN" - - ps[0] = "NaT" - ps[1] = "NA" - ps[2] = "" - ps[3] = "NaN" - - assert_eq(gs, ps) - - -def test_multi_dim_series_error(): - arr = cp.array([(1, 2), (3, 4)]) - with pytest.raises(ValueError): - cudf.Series(arr) - - -def test_bool_series_mixed_dtype_error(): - ps = pd.Series([True, False, None]) - all_bool_ps = pd.Series([True, False, True], dtype="object") - # ps now has `object` dtype, which - # isn't supported by `cudf`. 
- with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(TypeError): - cudf.Series(ps) - with pytest.raises(TypeError): - cudf.from_pandas(ps) - with pytest.raises(TypeError): - cudf.Series(ps, dtype=bool) - expected = cudf.Series(all_bool_ps, dtype=bool) - assert_eq(expected, all_bool_ps.astype(bool)) - nan_bools_mix = pd.Series([True, False, True, np.nan], dtype="object") - gs = cudf.Series(nan_bools_mix, nan_as_null=True) - assert_eq(gs.to_pandas(nullable=True), nan_bools_mix.astype("boolean")) - with pytest.raises(TypeError): - cudf.Series(nan_bools_mix, nan_as_null=False) - - -@pytest.mark.parametrize( - "pandas_type", - [ - pd.ArrowDtype(pa.int8()), - pd.ArrowDtype(pa.int16()), - pd.ArrowDtype(pa.int32()), - pd.ArrowDtype(pa.int64()), - pd.ArrowDtype(pa.uint8()), - pd.ArrowDtype(pa.uint16()), - pd.ArrowDtype(pa.uint32()), - pd.ArrowDtype(pa.uint64()), - pd.ArrowDtype(pa.float32()), - pd.ArrowDtype(pa.float64()), - pd.Int8Dtype(), - pd.Int16Dtype(), - pd.Int32Dtype(), - pd.Int64Dtype(), - pd.UInt8Dtype(), - pd.UInt16Dtype(), - pd.UInt32Dtype(), - pd.UInt64Dtype(), - pd.Float32Dtype(), - pd.Float64Dtype(), - ], -) -def test_series_arrow_numeric_types_roundtrip(pandas_type): - ps = pd.Series([1, 2, 3], dtype=pandas_type) - pi = pd.Index(ps) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(ps) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pi) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pdf) - - -@pytest.mark.parametrize( - "pandas_type", [pd.ArrowDtype(pa.bool_()), pd.BooleanDtype()] -) -def test_series_arrow_bool_types_roundtrip(pandas_type): - ps = pd.Series([True, False, None], dtype=pandas_type) - pi = pd.Index(ps) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(ps) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pi) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pdf) - - -@pytest.mark.parametrize( - "pandas_type", [pd.ArrowDtype(pa.string()), pd.StringDtype()] -) -def test_series_arrow_string_types_roundtrip(pandas_type): - ps = pd.Series(["abc", None, "xyz"], dtype=pandas_type) - pi = pd.Index(ps) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(ps) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pi) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pdf) - - -def test_series_arrow_category_types_roundtrip(): - pa_array = pa.array(pd.Series([1, 2, 3], dtype="category")) - ps = pd.Series([1, 2, 3], dtype=pd.ArrowDtype(pa_array.type)) - pi = pd.Index(ps) - pdf = pi.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(ps) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pi) - - with cudf.option_context("mode.pandas_compatible", True): - with 
pytest.raises(NotImplementedError): - cudf.from_pandas(pdf) - - -@pytest.mark.parametrize( - "pa_type", - [pa.decimal128(10, 2), pa.decimal128(5, 2), pa.decimal128(20, 2)], -) -def test_series_arrow_decimal_types_roundtrip(pa_type): - ps = pd.Series( - [ - decimal.Decimal("1.2"), - decimal.Decimal("20.56"), - decimal.Decimal("3"), - ], - dtype=pd.ArrowDtype(pa_type), - ) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(ps) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pdf) - - -def test_series_arrow_struct_types_roundtrip(): - ps = pd.Series( - [{"a": 1}, {"b": "abc"}], - dtype=pd.ArrowDtype(pa.struct({"a": pa.int64(), "b": pa.string()})), - ) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(ps) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pdf) - - -def test_series_arrow_list_types_roundtrip(): - ps = pd.Series([[1], [2], [4]], dtype=pd.ArrowDtype(pa.list_(pa.int64()))) - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(ps) - pdf = ps.to_frame() - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(ps) - - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - cudf.from_pandas(pdf) - - -@pytest.mark.parametrize("base_name", [None, "a"]) -def test_series_to_frame_none_name(base_name): - result = cudf.Series(range(1), name=base_name).to_frame(name=None) - expected = pd.Series(range(1), name=base_name).to_frame(name=None) - assert_eq(result, expected) - - -@pytest.mark.parametrize("klass", [cudf.Index, cudf.Series]) -@pytest.mark.parametrize( - "data", [pa.array([float("nan")]), pa.chunked_array([[float("nan")]])] -) -def test_nan_as_null_from_arrow_objects(klass, data): - result = klass(data, nan_as_null=True) - expected = klass(pa.array([None], type=pa.float64())) - assert_eq(result, expected) - - -@pytest.mark.parametrize("reso", ["M", "ps"]) -@pytest.mark.parametrize("typ", ["M", "m"]) -def test_series_invalid_reso_dtype(reso, typ): - with pytest.raises(TypeError): - cudf.Series([], dtype=f"{typ}8[{reso}]") - - -def test_series_categorical_missing_value_count(): - ps = pd.Series(pd.Categorical(list("abcccb"), categories=list("cabd"))) - gs = cudf.from_pandas(ps) - - expected = ps.value_counts() - actual = gs.value_counts() - - assert_eq(expected, actual, check_dtype=False) - - -def test_series_error_nan_mixed_types(): - ps = pd.Series([np.nan, "ab", "cd"]) - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(MixedTypeError): - cudf.from_pandas(ps) - - -def test_series_error_nan_non_float_dtypes(): - s = cudf.Series(["a", "b", "c"]) - with pytest.raises(TypeError): - s[0] = np.nan - - s = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - with pytest.raises(TypeError): - s[0] = np.nan - - -@pytest.mark.parametrize( - "dtype", - [ - pd.ArrowDtype(pa.int8()), - pd.ArrowDtype(pa.int16()), - pd.ArrowDtype(pa.int32()), - pd.ArrowDtype(pa.int64()), - pd.ArrowDtype(pa.uint8()), - pd.ArrowDtype(pa.uint16()), - pd.ArrowDtype(pa.uint32()), - pd.ArrowDtype(pa.uint64()), - pd.ArrowDtype(pa.float32()), - pd.ArrowDtype(pa.float64()), - 
pd.Int8Dtype(), - pd.Int16Dtype(), - pd.Int32Dtype(), - pd.Int64Dtype(), - pd.UInt8Dtype(), - pd.UInt16Dtype(), - pd.UInt32Dtype(), - pd.UInt64Dtype(), - pd.Float32Dtype(), - pd.Float64Dtype(), - ], -) -@pytest.mark.parametrize("klass", [cudf.Series, cudf.DataFrame, cudf.Index]) -@pytest.mark.parametrize("kind", [lambda x: x, str], ids=["obj", "string"]) -def test_astype_pandas_nullable_pandas_compat(dtype, klass, kind): - ser = klass([1, 2, 3]) - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises(NotImplementedError): - ser.astype(kind(dtype)) - - -@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) -@pytest.mark.parametrize( - "data", - [ - pa.array([1, None], type=pa.int64()), - pa.chunked_array([[1, None]], type=pa.int64()), - ], -) -def test_from_arrow_array_dtype(klass, data): - obj = klass(data, dtype="int8") - assert obj.dtype == np.dtype("int8") - - -@pytest.mark.parametrize("klass", [cudf.Series, cudf.Index]) -def test_from_pandas_object_dtype_passed_dtype(klass): - result = klass(pd.Series([True, False], dtype=object), dtype="int8") - expected = klass(pa.array([1, 0], type=pa.int8())) - assert_eq(result, expected) - - -def test_series_where_mixed_bool_dtype(): - s = cudf.Series([True, False, True]) - with pytest.raises(TypeError): - s.where(~s, 10) - - -def test_series_setitem_mixed_bool_dtype(): - s = cudf.Series([True, False, True]) - with pytest.raises(TypeError): - s[0] = 10 - - -@pytest.mark.parametrize( - "nat, value", - [ - [np.datetime64("nat", "ns"), np.datetime64("2020-01-01", "ns")], - [np.timedelta64("nat", "ns"), np.timedelta64(1, "ns")], - ], -) -@pytest.mark.parametrize("nan_as_null", [True, False]) -def test_series_np_array_nat_nan_as_nulls(nat, value, nan_as_null): - expected = np.array([nat, value]) - ser = cudf.Series(expected, nan_as_null=nan_as_null) - assert ser[0] is pd.NaT - assert ser[1] == value - - -def test_series_unitness_np_datetimelike_units(): - data = np.array([np.timedelta64(1)]) - with pytest.raises(TypeError): - cudf.Series(data) - with pytest.raises(TypeError): - pd.Series(data) - - -def test_series_duplicate_index_reindex(): - gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1]) - ps = gs.to_pandas() - - assert_exceptions_equal( - gs.reindex, - ps.reindex, - lfunc_args_and_kwargs=([10, 11, 12, 13], {}), - rfunc_args_and_kwargs=([10, 11, 12, 13], {}), - ) - - -def test_list_category_like_maintains_dtype(): - dtype = cudf.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True) - data = [1, 2, 3] - result = cudf.Series._from_column( - cudf.core.column.as_column(data, dtype=dtype) - ) - expected = pd.Series(data, dtype=dtype.to_pandas()) - assert_eq(result, expected) - - -def test_list_interval_like_maintains_dtype(): - dtype = cudf.IntervalDtype(subtype=np.int8) - data = [pd.Interval(1, 2)] - result = cudf.Series._from_column( - cudf.core.column.as_column(data, dtype=dtype) - ) - expected = pd.Series(data, dtype=dtype.to_pandas()) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "klass", [cudf.Series, cudf.Index, pd.Series, pd.Index] -) -def test_series_from_named_object_name_priority(klass): - result = cudf.Series(klass([1], name="a"), name="b") - assert result.name == "b" - - -@pytest.mark.parametrize( - "data", - [ - {"a": 1, "b": 2, "c": 3}, - cudf.Series([1, 2, 3], index=list("abc")), - pd.Series([1, 2, 3], index=list("abc")), - ], -) -def test_series_from_object_with_index_index_arg_reindex(data): - result = cudf.Series(data, index=list("bca")) - expected = cudf.Series([2, 3, 1], 
index=list("bca")) - assert_eq(result, expected) - - -@pytest.mark.parametrize( - "data", - [ - {0: 1, 1: 2, 2: 3}, - cudf.Series([1, 2, 3]), - cudf.Index([1, 2, 3]), - pd.Series([1, 2, 3]), - pd.Index([1, 2, 3]), - [1, 2, 3], - ], -) -def test_series_dtype_astypes(data): - result = cudf.Series(data, dtype="float64") - expected = cudf.Series([1.0, 2.0, 3.0]) - assert_eq(result, expected) - - -@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string]) -def test_series_from_large_string(pa_type): - pa_string_array = pa.array(["a", "b", "c"]).cast(pa_type()) - got = cudf.Series(pa_string_array) - expected = pd.Series(pa_string_array) - - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - [1], - decimal.Decimal("1.0"), - ], -) -def test_series_to_pandas_arrow_type_nullable_raises(scalar): - pa_array = pa.array([scalar, None]) - ser = cudf.Series(pa_array) - with pytest.raises(ValueError, match=".* cannot both be set"): - ser.to_pandas(nullable=True, arrow_type=True) - - -@pytest.mark.parametrize( - "scalar", - [ - 1, - 1.0, - "a", - datetime.datetime(2020, 1, 1), - datetime.timedelta(1), - {"1": 2}, - [1], - decimal.Decimal("1.0"), - ], -) -def test_series_to_pandas_arrow_type(scalar): - pa_array = pa.array([scalar, None]) - ser = cudf.Series(pa_array) - result = ser.to_pandas(arrow_type=True) - expected = pd.Series(pd.arrays.ArrowExtensionArray(pa_array)) - pd.testing.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("axis", [None, 0, "index"]) -@pytest.mark.parametrize("data", [[1, 2], [1]]) -def test_squeeze(axis, data): - ser = cudf.Series(data) - result = ser.squeeze(axis=axis) - expected = ser.to_pandas().squeeze(axis=axis) - assert_eq(result, expected) - - -@pytest.mark.parametrize("axis", [1, "columns"]) -def test_squeeze_invalid_axis(axis): - with pytest.raises(ValueError): - cudf.Series([1]).squeeze(axis=axis) - - -def test_series_init_with_nans(): - with cudf.option_context("mode.pandas_compatible", True): - gs = cudf.Series([1, 2, 3, np.nan]) - assert gs.dtype == np.dtype("float64") - ps = pd.Series([1, 2, 3, np.nan]) - assert_eq(ps, gs) - - -@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0]) -def test_timestamp_series_init(data): - scalar = pd.Timestamp(data) - expected = pd.Series([scalar]) - actual = cudf.Series([scalar]) - - assert_eq(expected, actual) - - expected = pd.Series(scalar) - actual = cudf.Series(scalar) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0]) -def test_timedelta_series_init(data): - scalar = pd.Timedelta(data) - expected = pd.Series([scalar]) - actual = cudf.Series([scalar]) - - assert_eq(expected, actual) - - expected = pd.Series(scalar) - actual = cudf.Series(scalar) - - assert_eq(expected, actual) - - -def test_series_from_series_index_no_shallow_copy(): - ser1 = cudf.Series(range(3), index=list("abc")) - ser2 = cudf.Series(ser1) - assert ser1.index is ser2.index - - -@pytest.mark.parametrize("value", [1, 1.1]) -def test_nans_to_nulls_noop_copies_column(value): - ser1 = cudf.Series([value]) - ser2 = ser1.nans_to_nulls() - assert ser1._column is not ser2._column - - -@pytest.mark.parametrize("dropna", [False, True]) -def test_nunique_all_null(dropna): - data = [None, None] - pd_ser = pd.Series(data) - cudf_ser = cudf.Series(data) - result = pd_ser.nunique(dropna=dropna) - expected = cudf_ser.nunique(dropna=dropna) - assert result == expected - - 
-@pytest.mark.parametrize( - "type1", - [ - "category", - "interval[int64, right]", - "int64", - "float64", - "str", - "datetime64[ns]", - "timedelta64[ns]", - ], -) -@pytest.mark.parametrize( - "type2", - [ - "category", - "interval[int64, right]", - "int64", - "float64", - "str", - "datetime64[ns]", - "timedelta64[ns]", - ], -) -@pytest.mark.parametrize( - "as_dtype", [lambda x: x, cudf.dtype], ids=["string", "object"] -) -@pytest.mark.parametrize("copy", [True, False]) -def test_empty_astype_always_castable(type1, type2, as_dtype, copy): - ser = cudf.Series([], dtype=as_dtype(type1)) - result = ser.astype(as_dtype(type2), copy=copy) - expected = cudf.Series([], dtype=as_dtype(type2)) - assert_eq(result, expected) - if not copy and cudf.dtype(type1) == cudf.dtype(type2): - assert ser._column is result._column - else: - assert ser._column is not result._column diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py deleted file mode 100644 index 3d8b6a79d2a..00000000000 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from itertools import product -from math import floor - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import Series -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal - - -def test_series_map_basic(): - gd1 = cudf.Series(["cat", np.nan, "rabbit", "dog"]) - pdf1 = gd1.to_pandas() - - expected_dict = pdf1.map({"cat": "kitten", "dog": "puppy"}) - actual_dict = gd1.map({"cat": "kitten", "dog": "puppy"}) - - assert_eq(expected_dict, actual_dict) - - -@pytest.mark.parametrize("name", ["a", None, 2]) -def test_series_map_series_input(name): - gd1 = cudf.Series(["cat", "dog", np.nan, "rabbit"], name=name) - pdf1 = gd1.to_pandas() - - expected_series = pdf1.map(pd.Series({"cat": "kitten", "dog": "puppy"})) - actual_series = gd1.map(cudf.Series({"cat": "kitten", "dog": "puppy"})) - - assert_eq(expected_series, actual_series) - - -def test_series_map_callable_numeric_basic(): - gd2 = cudf.Series([1, 2, 3, 4, np.nan]) - pdf2 = gd2.to_pandas() - - expected_function = pdf2.map(lambda x: x**2) - actual_function = gd2.map(lambda x: x**2) - - assert_eq(expected_function, actual_function) - - -@pytest.mark.parametrize("nelem", list(product([2, 10, 100, 1000]))) -def test_series_map_callable_numeric_random(nelem): - # Generate data - np.random.seed(0) - data = np.random.random(nelem) * 100 - - sr = Series(data) - pdsr = pd.Series(data) - - # Call map - got = sr.map(lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x))) - expect = pdsr.map( - lambda x: (floor(x) + 1 if x - floor(x) >= 0.5 else floor(x)) - ) - - # Check - assert_eq(expect, got, check_dtype=False) - - -def test_series_map_callable_numeric_random_dtype_change(): - # Test for changing the out_dtype using map - - data = list(range(10)) - - sr = Series(data) - pdsr = pd.Series(data) - - got = sr.map(lambda x: float(x)) - expect = pdsr.map(lambda x: float(x)) - - # Check - assert_eq(expect, got) - - -def test_series_map_non_unique_index(): - # test for checking correct error is produced - - gd1 = cudf.Series([1, 2, 3, 4, np.nan]) - pd1 = pd.Series([1, 2, 3, 4, np.nan]) - - gd_map_series = cudf.Series(["a", "b", "c"], index=[1, 1, 2]) - pd_map_series = pd.Series(["a", "b", "c"], index=[1, 1, 2]) - - assert_exceptions_equal( - lfunc=pd1.map, - rfunc=gd1.map, - check_exception_type=False, - lfunc_args_and_kwargs=([pd_map_series],), - 
rfunc_args_and_kwargs=([gd_map_series],), - ) diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py deleted file mode 100644 index 5406836ba61..00000000000 --- a/python/cudf/cudf/tests/test_setitem.py +++ /dev/null @@ -1,495 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - - -@pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) -@pytest.mark.parametrize("arg", [[True, False, True], [True, True, True]]) -@pytest.mark.parametrize("value", [0, -1]) -def test_dataframe_setitem_bool_mask_scaler(df, arg, value): - gdf = cudf.from_pandas(df) - - df[arg] = value - gdf[arg] = value - assert_eq(df, gdf) - - -def test_dataframe_setitem_scaler_bool(): - df = pd.DataFrame({"a": [1, 2, 3]}) - df[[True, False, True]] = pd.DataFrame({"a": [-1, -2]}) - - gdf = cudf.DataFrame({"a": [1, 2, 3]}) - gdf[[True, False, True]] = cudf.DataFrame({"a": [-1, -2]}) - assert_eq(df, gdf) - - -@pytest.mark.parametrize( - "df", - [pd.DataFrame({"a": [1, 2, 3]}), pd.DataFrame({"a": ["x", "y", "z"]})], -) -@pytest.mark.parametrize("arg", [["a"], "a", "b"]) -@pytest.mark.parametrize( - "value", [-10, pd.DataFrame({"a": [-1, -2, -3]}), "abc"] -) -def test_dataframe_setitem_columns(df, arg, value): - gdf = cudf.from_pandas(df) - cudf_replace_value = value - - if isinstance(cudf_replace_value, pd.DataFrame): - cudf_replace_value = cudf.from_pandas(value) - - df[arg] = value - gdf[arg] = cudf_replace_value - assert_eq(df, gdf, check_dtype=False) - - -@pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})]) -@pytest.mark.parametrize("arg", [["b", "c"]]) -@pytest.mark.parametrize( - "value", - [ - pd.DataFrame({"0": [-1, -2, -3], "1": [-0, -10, -1]}), - 10, - 20, - 30, - "rapids", - "ai", - 0.32234, - np.datetime64(1324232423423342, "ns"), - np.timedelta64(34234324234324234, "ns"), - ], -) -def test_dataframe_setitem_new_columns(df, arg, value): - gdf = cudf.from_pandas(df) - cudf_replace_value = value - - if isinstance(cudf_replace_value, pd.DataFrame): - cudf_replace_value = cudf.from_pandas(value) - - df[arg] = value - gdf[arg] = cudf_replace_value - assert_eq(df, gdf, check_dtype=True) - - -# set_item_series inconsistency -def test_series_setitem_index(): - df = pd.DataFrame( - data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3] - ) - - df["b"] = pd.Series(data=[12, 11, 10], index=[3, 2, 1]) - gdf = cudf.DataFrame( - data={"b": [-1, -2, -3], "c": [1, 2, 3]}, index=[1, 2, 3] - ) - gdf["b"] = cudf.Series(data=[12, 11, 10], index=[3, 2, 1]) - assert_eq(df, gdf, check_dtype=False) - - -@pytest.mark.parametrize("psr", [pd.Series([1, 2, 3], index=["a", "b", "c"])]) -@pytest.mark.parametrize( - "arg", ["b", ["a", "c"], slice(1, 2, 1), [True, False, True]] -) -def test_series_set_item(psr, arg): - gsr = cudf.from_pandas(psr) - - psr[arg] = 11 - gsr[arg] = 11 - - assert_eq(psr, gsr) - - -def test_series_setitem_singleton_range(): - sr = cudf.Series([1, 2, 3], dtype=np.int64) - psr = sr.to_pandas() - value = np.asarray([7], dtype=np.int64) - sr.iloc[:1] = value - psr.iloc[:1] = value - assert_eq(sr, cudf.Series([7, 2, 3], dtype=np.int64)) - assert_eq(sr, psr, check_dtype=True) - - -@pytest.mark.xfail(reason="Copy-on-Write should make a copy") -@pytest.mark.parametrize( - "index", - [ - 
pd.MultiIndex.from_frame( - pd.DataFrame({"b": [3, 2, 1], "c": ["a", "b", "c"]}) - ), - ["a", "b", "c"], - ], -) -def test_setitem_dataframe_series_inplace(index): - gdf = cudf.DataFrame({"a": [1, 2, 3]}, index=index) - expected = gdf.copy() - with cudf.option_context("copy_on_write", True): - gdf["a"].replace(1, 500, inplace=True) - - assert_eq(expected, gdf) - - -@pytest.mark.parametrize( - "replace_data", - [ - [100, 200, 300, 400, 500], - cudf.Series([100, 200, 300, 400, 500]), - cudf.Series([100, 200, 300, 400, 500], index=[2, 3, 4, 5, 6]), - ], -) -def test_series_set_equal_length_object_by_mask(replace_data): - psr = pd.Series([1, 2, 3, 4, 5], dtype="Int64") - gsr = cudf.from_pandas(psr) - - # Lengths match in trivial case - pd_bool_col = pd.Series([True] * len(psr), dtype="boolean") - gd_bool_col = cudf.from_pandas(pd_bool_col) - psr[pd_bool_col] = ( - replace_data.to_pandas(nullable=True) - if hasattr(replace_data, "to_pandas") - else pd.Series(replace_data) - ) - gsr[gd_bool_col] = replace_data - - assert_eq(psr.astype("float"), gsr.astype("float")) - - # Test partial masking - psr[psr > 1] = ( - replace_data.to_pandas() - if hasattr(replace_data, "to_pandas") - else pd.Series(replace_data) - ) - gsr[gsr > 1] = replace_data - - assert_eq(psr.astype("float"), gsr.astype("float")) - - -def test_column_set_equal_length_object_by_mask(): - # Series.__setitem__ might bypass some of the cases - # handled in column.__setitem__ so this test is needed - - data = cudf.Series([0, 0, 1, 1, 1])._column - replace_data = cudf.Series([100, 200, 300, 400, 500])._column - bool_col = cudf.Series([True, True, True, True, True])._column - - data[bool_col] = replace_data - assert_eq( - cudf.Series._from_column(data), - cudf.Series._from_column(replace_data), - ) - - data = cudf.Series([0, 0, 1, 1, 1])._column - bool_col = cudf.Series([True, False, True, False, True])._column - data[bool_col] = replace_data - - assert_eq( - cudf.Series._from_column(data), - cudf.Series([100, 0, 300, 1, 500]), - ) - - -def test_column_set_unequal_length_object_by_mask(): - data = [1, 2, 3, 4, 5] - replace_data_1 = [8, 9] - replace_data_2 = [8, 9, 10, 11] - mask = [True, True, False, True, False] - - psr = pd.Series(data) - gsr = cudf.Series(data) - assert_exceptions_equal( - psr.__setitem__, - gsr.__setitem__, - ([mask, replace_data_1], {}), - ([mask, replace_data_1], {}), - ) - - psr = pd.Series(data) - gsr = cudf.Series(data) - assert_exceptions_equal( - psr.__setitem__, - gsr.__setitem__, - ([mask, replace_data_2], {}), - ([mask, replace_data_2], {}), - ) - - -def test_categorical_setitem_invalid(): - ps = pd.Series([1, 2, 3], dtype="category") - gs = cudf.Series([1, 2, 3], dtype="category") - - assert_exceptions_equal( - lfunc=ps.__setitem__, - rfunc=gs.__setitem__, - lfunc_args_and_kwargs=([0, 5], {}), - rfunc_args_and_kwargs=([0, 5], {}), - ) - - -def test_series_slice_setitem_list(): - actual = cudf.Series([[[1, 2], [2, 3]], [[3, 4]], [[4, 5]], [[6, 7]]]) - actual[slice(0, 3, 1)] = [[10, 11], [12, 23]] - expected = cudf.Series( - [ - [[10, 11], [12, 23]], - [[10, 11], [12, 23]], - [[10, 11], [12, 23]], - [[6, 7]], - ] - ) - assert_eq(actual, expected) - - actual = cudf.Series([[[1, 2], [2, 3]], [[3, 4]], [[4, 5]], [[6, 7]]]) - actual[0:3] = cudf.Scalar([[10, 11], [12, 23]]) - - assert_eq(actual, expected) - - -def test_series_slice_setitem_struct(): - actual = cudf.Series( - [ - {"a": {"b": 10}, "b": 11}, - {"a": {"b": 100}, "b": 5}, - {"a": {"b": 50}, "b": 2}, - {"a": {"b": 1000}, "b": 67}, - {"a": {"b": 4000}, 
"b": 1090}, - ] - ) - actual[slice(0, 3, 1)] = {"a": {"b": 5050}, "b": 101} - expected = cudf.Series( - [ - {"a": {"b": 5050}, "b": 101}, - {"a": {"b": 5050}, "b": 101}, - {"a": {"b": 5050}, "b": 101}, - {"a": {"b": 1000}, "b": 67}, - {"a": {"b": 4000}, "b": 1090}, - ] - ) - assert_eq(actual, expected) - - actual = cudf.Series( - [ - {"a": {"b": 10}, "b": 11}, - {"a": {"b": 100}, "b": 5}, - {"a": {"b": 50}, "b": 2}, - {"a": {"b": 1000}, "b": 67}, - {"a": {"b": 4000}, "b": 1090}, - ] - ) - actual[0:3] = cudf.Scalar({"a": {"b": 5050}, "b": 101}) - - assert_eq(actual, expected) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) -@pytest.mark.parametrize("indices", [0, [1, 2]]) -def test_series_setitem_upcasting(dtype, indices): - sr = pd.Series([0, 0, 0], dtype=dtype) - cr = cudf.from_pandas(sr) - assert_eq(sr, cr) - # Must be a non-integral floating point value that can't be losslessly - # converted to float32, otherwise pandas will try and match the source - # column dtype. - new_value = np.float64(np.pi) - col_ref = cr._column - with expect_warning_if(dtype != np.float64): - sr[indices] = new_value - with expect_warning_if(dtype != np.float64): - cr[indices] = new_value - assert_eq(sr, cr) - - if dtype == np.float64: - # no-op type cast should not modify backing column - assert col_ref == cr._column - - -# TODO: these two tests could perhaps be changed once specifics of -# pandas compat wrt upcasting are decided on; this is just baking in -# status-quo. -def test_series_setitem_upcasting_string_column(): - sr = pd.Series([0, 0, 0], dtype=str) - cr = cudf.from_pandas(sr) - new_value = np.float64(10.5) - sr[0] = str(new_value) - cr[0] = new_value - assert_eq(sr, cr) - - -def test_series_setitem_upcasting_string_value(): - sr = cudf.Series([0, 0, 0], dtype=int) - # This is a distinction with pandas, which lets you instead make an - # object column with ["10", 0, 0] - sr[0] = "10" - assert_eq(pd.Series([10, 0, 0], dtype=int), sr) - with pytest.raises(ValueError): - sr[0] = "non-integer" - - -def test_scatter_by_slice_with_start_and_step(): - source = pd.Series([1, 2, 3, 4, 5]) - csource = cudf.from_pandas(source) - target = pd.Series([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) - ctarget = cudf.from_pandas(target) - target[1::2] = source - ctarget[1::2] = csource - assert_eq(target, ctarget) - - -@pytest.mark.parametrize("n", [1, 3]) -def test_setitem_str_trailing_null(n): - trailing_nulls = "\x00" * n - s = cudf.Series(["a", "b", "c" + trailing_nulls]) - assert s[2] == "c" + trailing_nulls - s[0] = "a" + trailing_nulls - assert s[0] == "a" + trailing_nulls - s[1] = trailing_nulls - assert s[1] == trailing_nulls - s[0] = "" - assert s[0] == "" - s[0] = "\x00" - assert s[0] == "\x00" - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/7448") -def test_iloc_setitem_7448(): - index = pd.MultiIndex.from_product([(1, 2), (3, 4)]) - expect = cudf.Series([1, 2, 3, 4], index=index) - actual = cudf.from_pandas(expect) - expect[(1, 3)] = 101 - actual[(1, 3)] = 101 - assert_eq(expect, actual) - - -@pytest.mark.parametrize( - "value", - [ - "7", - pytest.param( - ["7", "8"], - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/11298" - ), - ), - ], -) -def test_loc_setitem_string_11298(value): - df = pd.DataFrame({"a": ["a", "b", "c"]}) - cdf = cudf.from_pandas(df) - - df.loc[:1, "a"] = value - - cdf.loc[:1, 
"a"] = value - - assert_eq(df, cdf) - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/11944") -def test_loc_setitem_list_11944(): - df = pd.DataFrame( - data={"a": ["yes", "no"], "b": [["l1", "l2"], ["c", "d"]]} - ) - cdf = cudf.from_pandas(df) - df.loc[df.a == "yes", "b"] = [["hello"]] - cdf.loc[df.a == "yes", "b"] = [["hello"]] - assert_eq(df, cdf) - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12504") -def test_loc_setitem_extend_empty_12504(): - df = pd.DataFrame(columns=["a"]) - cdf = cudf.from_pandas(df) - - df.loc[0] = [1] - - cdf.loc[0] = [1] - - assert_eq(df, cdf) - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12505") -def test_loc_setitem_extend_existing_12505(): - df = pd.DataFrame({"a": [0]}) - cdf = cudf.from_pandas(df) - - df.loc[1] = 1 - - cdf.loc[1] = 1 - - assert_eq(df, cdf) - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/12801") -def test_loc_setitem_add_column_partial_12801(): - df = pd.DataFrame({"a": [0, 1, 2]}) - cdf = cudf.from_pandas(df) - - df.loc[df.a < 2, "b"] = 1 - - cdf.loc[cdf.a < 2, "b"] = 1 - - assert_eq(df, cdf) - - -@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/13031") -@pytest.mark.parametrize("other_index", [["1", "3", "2"], [1, 2, 3]]) -def test_loc_setitem_series_index_alignment_13031(other_index): - s = pd.Series([1, 2, 3], index=["1", "2", "3"]) - other = pd.Series([5, 6, 7], index=other_index) - - cs = cudf.from_pandas(s) - cother = cudf.from_pandas(other) - - s.loc[["1", "3"]] = other - - cs.loc[["1", "3"]] = cother - - assert_eq(s, cs) - - -@pytest.mark.parametrize( - "ps", - [ - pd.Series([1, 2, 3], index=pd.RangeIndex(0, 3)), - pd.Series([1, 2, 3], index=pd.RangeIndex(start=2, stop=-1, step=-1)), - pd.Series([1, 2, 3], index=pd.RangeIndex(start=1, stop=6, step=2)), - pd.Series( - [1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-9, step=-2) - ), - pd.Series( - [1, 2, 3, 4, 5], index=pd.RangeIndex(start=1, stop=-12, step=-3) - ), - pd.Series([1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=14, step=4)), - pd.Series( - [1, 2, 3, 4], index=pd.RangeIndex(start=1, stop=-14, step=-4) - ), - ], -) -@pytest.mark.parametrize("arg", list(range(-20, 20)) + [5.6, 3.1]) -def test_series_set_item_range_index(ps, arg): - gsr = cudf.from_pandas(ps) - psr = ps.copy(deep=True) - psr[arg] = 11 - gsr[arg] = 11 - - assert_eq(psr, gsr, check_index_type=True) - - -def test_series_set_item_index_reference(): - gs1 = cudf.Series([1], index=[7]) - gs2 = cudf.Series([2], index=gs1.index) - gs1.loc[11] = 2 - - ps1 = pd.Series([1], index=[7]) - ps2 = pd.Series([2], index=ps1.index) - ps1.loc[11] = 2 - - assert_eq(ps1, gs1) - assert_eq(ps2, gs2) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py deleted file mode 100644 index 2cf2259d9ec..00000000000 --- a/python/cudf/cudf/tests/test_sorting.py +++ /dev/null @@ -1,427 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -import string -from itertools import product - -import numpy as np -import pandas as pd -import pytest - -from cudf import DataFrame, Series -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.core.column import NumericalColumn -from cudf.testing import assert_eq -from cudf.testing._utils import ( - DATETIME_TYPES, - NUMERIC_TYPES, - assert_exceptions_equal, - expect_warning_if, -) - -sort_nelem_args = [2, 257] -sort_dtype_args = [ - np.int32, - np.int64, - np.uint32, - np.uint64, - np.float32, - np.float64, -] -sort_slice_args = [slice(1, None), slice(None, -1), slice(1, -1)] - - -@pytest.mark.parametrize( - "nelem,dtype", list(product(sort_nelem_args, sort_dtype_args)) -) -def test_dataframe_sort_values(nelem, dtype): - np.random.seed(0) - df = DataFrame() - df["a"] = aa = (100 * np.random.random(nelem)).astype(dtype) - df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype) - sorted_df = df.sort_values(by="a") - # Check - sorted_index = np.argsort(aa, kind="mergesort") - assert_eq(sorted_df.index.values, sorted_index) - assert_eq(sorted_df["a"].values, aa[sorted_index]) - assert_eq(sorted_df["b"].values, bb[sorted_index]) - - -@pytest.mark.parametrize("ignore_index", [True, False]) -@pytest.mark.parametrize("index", ["a", "b", ["a", "b"]]) -def test_dataframe_sort_values_ignore_index(index, ignore_index): - if ( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION - and isinstance(index, list) - and not ignore_index - ): - pytest.skip( - reason="Unstable sorting by pandas(numpy): https://github.com/pandas-dev/pandas/issues/57531" - ) - - gdf = DataFrame( - {"a": [1, 3, 5, 2, 4], "b": [1, 1, 2, 2, 3], "c": [9, 7, 7, 7, 1]} - ) - gdf = gdf.set_index(index) - - pdf = gdf.to_pandas() - - expect = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index) - got = gdf.sort_values((gdf.columns), ignore_index=ignore_index) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("ignore_index", [True, False]) -def test_series_sort_values_ignore_index(ignore_index): - gsr = Series([1, 3, 5, 2, 4]) - psr = gsr.to_pandas() - - expect = psr.sort_values(ignore_index=ignore_index) - got = gsr.sort_values(ignore_index=ignore_index) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "nelem,sliceobj", list(product([10, 100], sort_slice_args)) -) -def test_dataframe_sort_values_sliced(nelem, sliceobj): - np.random.seed(0) - df = pd.DataFrame() - df["a"] = np.random.random(nelem) - - expect = df[sliceobj]["a"].sort_values() - gdf = DataFrame.from_pandas(df) - got = gdf[sliceobj]["a"].sort_values() - assert (got.to_pandas() == expect).all() - - -@pytest.mark.parametrize( - "nelem,dtype,asc", - list(product(sort_nelem_args, sort_dtype_args, [True, False])), -) -def test_series_argsort(nelem, dtype, asc): - np.random.seed(0) - sr = Series((100 * np.random.random(nelem)).astype(dtype)) - res = sr.argsort(ascending=asc) - - if asc: - expected = np.argsort(sr.to_numpy(), kind="mergesort") - else: - # -1 multiply works around missing desc sort (may promote to float64) - expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort") - np.testing.assert_array_equal(expected, res.to_numpy()) - - -@pytest.mark.parametrize( - "nelem,asc", list(product(sort_nelem_args, [True, False])) -) -def test_series_sort_index(nelem, asc): - np.random.seed(0) - sr = Series(100 * np.random.random(nelem)) - psr = sr.to_pandas() - - expected = psr.sort_index(ascending=asc) - got = sr.sort_index(ascending=asc) - - assert_eq(expected, got) - - 
-@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) -@pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 7]) -def test_series_nlargest(data, n): - """Indirectly tests Series.sort_values()""" - sr = Series(data) - psr = pd.Series(data) - assert_eq(sr.nlargest(n), psr.nlargest(n)) - assert_eq(sr.nlargest(n, keep="last"), psr.nlargest(n, keep="last")) - - assert_exceptions_equal( - lfunc=psr.nlargest, - rfunc=sr.nlargest, - lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), - rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), - ) - - -@pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) -@pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 9]) -def test_series_nsmallest(data, n): - """Indirectly tests Series.sort_values()""" - sr = Series(data) - psr = pd.Series(data) - assert_eq(sr.nsmallest(n), psr.nsmallest(n)) - assert_eq( - sr.nsmallest(n, keep="last").sort_index(), - psr.nsmallest(n, keep="last").sort_index(), - ) - - assert_exceptions_equal( - lfunc=psr.nsmallest, - rfunc=sr.nsmallest, - lfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), - rfunc_args_and_kwargs=([], {"n": 3, "keep": "what"}), - ) - - -@pytest.mark.parametrize("nelem,n", [(1, 1), (100, 100), (10, 5), (100, 10)]) -@pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) -@pytest.mark.parametrize("columns", ["a", ["b", "a"]]) -def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): - np.random.seed(0) - aa = np.random.random(nelem) - bb = np.random.random(nelem) - - df = DataFrame({"a": aa, "b": bb}) - pdf = df.to_pandas() - assert_eq(getattr(df, op)(n, columns), getattr(pdf, op)(n, columns)) - - -@pytest.mark.parametrize( - "counts,sliceobj", list(product([(10, 5), (100, 10)], sort_slice_args)) -) -def test_dataframe_nlargest_sliced(counts, sliceobj): - nelem, n = counts - np.random.seed(0) - df = pd.DataFrame() - df["a"] = np.random.random(nelem) - df["b"] = np.random.random(nelem) - - expect = df[sliceobj].nlargest(n, "a") - gdf = DataFrame.from_pandas(df) - got = gdf[sliceobj].nlargest(n, "a") - assert (got.to_pandas() == expect).all().all() - - -@pytest.mark.parametrize( - "counts,sliceobj", list(product([(10, 5), (100, 10)], sort_slice_args)) -) -def test_dataframe_nsmallest_sliced(counts, sliceobj): - nelem, n = counts - np.random.seed(0) - df = pd.DataFrame() - df["a"] = np.random.random(nelem) - df["b"] = np.random.random(nelem) - - expect = df[sliceobj].nsmallest(n, "a") - gdf = DataFrame.from_pandas(df) - got = gdf[sliceobj].nsmallest(n, "a") - assert (got.to_pandas() == expect).all().all() - - -@pytest.mark.parametrize("num_cols", [1, 2, 3, 5]) -@pytest.mark.parametrize("num_rows", [0, 1, 2, 1000]) -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_multi_column( - num_cols, num_rows, dtype, ascending, na_position -): - np.random.seed(0) - by = list(string.ascii_lowercase[:num_cols]) - pdf = pd.DataFrame() - - for i in range(5): - colname = string.ascii_lowercase[i] - data = np.random.randint(0, 26, num_rows).astype(dtype) - pdf[colname] = data - - gdf = DataFrame.from_pandas(pdf) - - got = gdf.sort_values(by, ascending=ascending, na_position=na_position) - expect = pdf.sort_values(by, ascending=ascending, na_position=na_position) - - assert_eq( - got[by].reset_index(drop=True), expect[by].reset_index(drop=True) - ) - - -@pytest.mark.parametrize("num_cols", [1, 2, 3]) 
-@pytest.mark.parametrize("num_rows", [0, 1, 2, 3, 5]) -@pytest.mark.parametrize("dtype", ["float32", "float64"]) -@pytest.mark.parametrize("nulls", ["some", "all"]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_multi_column_nulls( - num_cols, num_rows, dtype, nulls, ascending, na_position -): - np.random.seed(0) - by = list(string.ascii_lowercase[:num_cols]) - pdf = pd.DataFrame() - - for i in range(3): - colname = string.ascii_lowercase[i] - data = np.random.randint(0, 26, num_rows).astype(dtype) - if nulls == "some": - idx = np.array([], dtype="int64") - if num_rows > 0: - idx = np.random.choice( - num_rows, size=int(num_rows / 4), replace=False - ) - data[idx] = np.nan - elif nulls == "all": - data[:] = np.nan - pdf[colname] = data - - gdf = DataFrame.from_pandas(pdf) - - got = gdf.sort_values(by, ascending=ascending, na_position=na_position) - expect = pdf.sort_values(by, ascending=ascending, na_position=na_position) - - assert_eq( - got[by].reset_index(drop=True), expect[by].reset_index(drop=True) - ) - - -@pytest.mark.parametrize( - "ascending", list(product((True, False), (True, False))) -) -@pytest.mark.parametrize("na_position", ["first", "last"]) -def test_dataframe_multi_column_nulls_multiple_ascending( - ascending, na_position -): - pdf = pd.DataFrame( - {"a": [3, 1, None, 2, 2, None, 1], "b": [1, 2, 3, 4, 5, 6, 7]} - ) - gdf = DataFrame.from_pandas(pdf) - expect = pdf.sort_values( - by=["a", "b"], ascending=ascending, na_position=na_position - ) - actual = gdf.sort_values( - by=["a", "b"], ascending=ascending, na_position=na_position - ) - - assert_eq(actual, expect) - - -@pytest.mark.parametrize("nelem", [1, 100]) -def test_series_nlargest_nelem(nelem): - np.random.seed(0) - elems = np.random.random(nelem) - gds = Series(elems).nlargest(nelem) - pds = pd.Series(elems).nlargest(nelem) - - assert (pds == gds.to_pandas()).all().all() - - -@pytest.mark.parametrize("map_size", [1, 2, 8]) -@pytest.mark.parametrize("nelem", [1, 10, 100]) -@pytest.mark.parametrize("keep", [True, False]) -def test_dataframe_scatter_by_map(map_size, nelem, keep): - strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"] - np.random.seed(0) - df = DataFrame() - df["a"] = np.random.choice(strlist[:map_size], nelem) - df["b"] = np.random.uniform(low=0, high=map_size, size=nelem) - df["c"] = np.random.randint(map_size, size=nelem) - df["d"] = df["a"].astype("category") - - def _check_scatter_by_map(dfs, col): - assert len(dfs) == map_size - nrows = 0 - # print(col._column) - name = col.name - for i, df in enumerate(dfs): - nrows += len(df) - if len(df) > 0: - # Make sure the column types were preserved - assert isinstance(df[name]._column, type(col._column)) - try: - sr = df[name].astype(np.int32) - except ValueError: - sr = df[name] - assert sr.nunique() <= 1 - if sr.nunique() == 1: - if isinstance(df[name]._column, NumericalColumn): - assert sr.iloc[0] == i - assert nrows == nelem - - with pytest.warns(UserWarning): - _check_scatter_by_map( - df.scatter_by_map("a", map_size, keep_index=keep), df["a"] - ) - _check_scatter_by_map( - df.scatter_by_map("b", map_size, keep_index=keep), df["b"] - ) - _check_scatter_by_map( - df.scatter_by_map("c", map_size, keep_index=keep), df["c"] - ) - with pytest.warns(UserWarning): - _check_scatter_by_map( - df.scatter_by_map("d", map_size, keep_index=keep), df["d"] - ) - - if map_size == 2 and nelem == 100: - with pytest.warns(UserWarning): - df.scatter_by_map("a") # 
Auto-detect map_size - with pytest.raises(ValueError): - with pytest.warns(UserWarning): - df.scatter_by_map("a", map_size=1, debug=True) # Bad map_size - - # Test Index - df2 = df.set_index("c") - generic_result = df2.scatter_by_map("b", map_size, keep_index=keep) - _check_scatter_by_map(generic_result, df2["b"]) - if keep: - for frame in generic_result: - isinstance(frame.index, type(df2.index)) - - # Test MultiIndex - df2 = df.set_index(["a", "c"]) - multiindex_result = df2.scatter_by_map("b", map_size, keep_index=keep) - _check_scatter_by_map(multiindex_result, df2["b"]) - if keep: - for frame in multiindex_result: - isinstance(frame.index, type(df2.index)) - - -@pytest.mark.parametrize( - "nelem,dtype", list(product(sort_nelem_args, sort_dtype_args)) -) -@pytest.mark.parametrize( - "kind", ["quicksort", "mergesort", "heapsort", "stable"] -) -def test_dataframe_sort_values_kind(nelem, dtype, kind): - np.random.seed(0) - df = DataFrame() - df["a"] = aa = (100 * np.random.random(nelem)).astype(dtype) - df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype) - with expect_warning_if(kind != "quicksort", UserWarning): - sorted_df = df.sort_values(by="a", kind=kind) - # Check - sorted_index = np.argsort(aa, kind="mergesort") - assert_eq(sorted_df.index.values, sorted_index) - assert_eq(sorted_df["a"].values, aa[sorted_index]) - assert_eq(sorted_df["b"].values, bb[sorted_index]) - - -@pytest.mark.parametrize("ids", [[-1, 0, 1, 0], [0, 2, 3, 0]]) -def test_dataframe_scatter_by_map_7513(ids): - df = DataFrame({"id": ids, "val": [0, 1, 2, 3]}) - with pytest.raises(ValueError): - df.scatter_by_map(df["id"]) - - -def test_dataframe_scatter_by_map_empty(): - df = DataFrame({"a": [], "b": []}, dtype="float64") - scattered = df.scatter_by_map(df["a"]) - assert len(scattered) == 0 - - -def test_sort_values_by_index_level(): - df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="b")) - cudf_df = DataFrame.from_pandas(df) - result = cudf_df.sort_values("b") - expected = df.sort_values("b") - assert_eq(result, expected) - - -def test_sort_values_by_ambiguous(): - df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="a")) - cudf_df = DataFrame.from_pandas(df) - - assert_exceptions_equal( - lfunc=df.sort_values, - rfunc=cudf_df.sort_values, - lfunc_args_and_kwargs=(["a"], {}), - rfunc_args_and_kwargs=(["a"], {}), - ) diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py deleted file mode 100644 index 3248e7f72c0..00000000000 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. - -import numpy as np - -from cudf import Series - - -def test_to_dense_array(): - data = np.random.random(8) - mask = np.asarray([0b11010110]).astype(np.byte) - - sr = Series.from_masked_array(data=data, mask=mask, null_count=3) - assert sr.has_nulls - assert sr.null_count != len(sr) - filled = sr.to_numpy(na_value=np.nan) - dense = sr.dropna().to_numpy() - assert dense.size < filled.size - assert filled.size == len(sr) diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py deleted file mode 100644 index 7af83a99d60..00000000000 --- a/python/cudf/cudf/tests/test_spilling.py +++ /dev/null @@ -1,786 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
-from __future__ import annotations - -import contextlib -import importlib -import random -import time -import warnings -import weakref -from concurrent.futures import ThreadPoolExecutor - -import cupy -import numpy as np -import pandas -import pandas.testing -import pytest - -import rmm - -import cudf -import cudf.core.buffer.spill_manager -import cudf.options -from cudf.core.abc import Serializable -from cudf.core.buffer import ( - Buffer, - acquire_spill_lock, - as_buffer, - get_spill_lock, -) -from cudf.core.buffer.spill_manager import ( - SpillManager, - get_global_manager, - get_rmm_memory_resource_stack, - set_global_manager, - spill_on_demand_globally, -) -from cudf.core.buffer.spillable_buffer import ( - SpillableBuffer, - SpillableBufferOwner, - SpillLock, -) -from cudf.testing import assert_eq - -if get_global_manager() is not None: - pytest.skip( - "cannot test spilling when enabled globally, set `CUDF_SPILL=off`", - allow_module_level=True, - ) - - -@contextlib.contextmanager -def set_rmm_memory_pool(nbytes: int): - mr = rmm.mr.get_current_device_resource() - rmm.mr.set_current_device_resource( - rmm.mr.PoolMemoryResource( - mr, - initial_pool_size=nbytes, - maximum_pool_size=nbytes, - ) - ) - try: - yield - finally: - rmm.mr.set_current_device_resource(mr) - - -def single_column_df(target="gpu") -> cudf.DataFrame: - """Create a standard single column dataframe used for testing - - Use `single_column_df_data`, `single_column_df_base_data`, - `gen_df_data_nbytes` for easy access to the buffer of the column. - - Notice, this is just for convenience, there is nothing special - about this dataframe. - - Parameters - ---------- - target : str, optional - Set the spill state of the dataframe - - Return - ------ - DataFrame - A standard dataframe with a single column - """ - ret = cudf.DataFrame({"a": [1, 2, 3]}) - if target != "gpu": - single_column_df_data(ret).spill(target=target) - return ret - - -def single_column_df_data(df: cudf.DataFrame) -> SpillableBuffer: - """Access `.data` of the column of a standard dataframe""" - ret = df._data._data["a"].data - assert isinstance(ret, SpillableBuffer) - return ret - - -def single_column_df_base_data(df: cudf.DataFrame) -> SpillableBuffer: - """Access `.base_data` of the column of a standard dataframe""" - ret = df._data._data["a"].base_data - assert isinstance(ret, SpillableBuffer) - return ret - - -# Get number of bytes of the column of a standard dataframe -gen_df_data_nbytes = single_column_df()._data._data["a"].data.nbytes - - -def spilled_and_unspilled(manager: SpillManager) -> tuple[int, int]: - """Get bytes spilled and unspilled known by the manager""" - spilled = sum(buf.size for buf in manager.buffers() if buf.is_spilled) - unspilled = sum( - buf.size for buf in manager.buffers() if not buf.is_spilled - ) - return spilled, unspilled - - -@pytest.fixture -def manager(request): - """Fixture to enable and make a spilling manager availabe""" - kwargs = dict(getattr(request, "param", {})) - with warnings.catch_warnings(): - warnings.simplefilter("error") - set_global_manager(manager=SpillManager(**kwargs)) - yield get_global_manager() - # Retrieving the test result using the `pytest_runtest_makereport` - # hook from conftest.py - if request.node.report["call"].failed: - # Ignore `overwriting non-empty manager` errors when - # test is failing. 
- warnings.simplefilter("ignore") - set_global_manager(manager=None) - - -def test_spillable_buffer(manager: SpillManager): - buf = as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) - assert isinstance(buf, SpillableBuffer) - assert buf.spillable - buf.owner.mark_exposed() - assert buf.owner.exposed - assert not buf.spillable - buf = as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) - # Notice, accessing `__cuda_array_interface__` itself doesn't - # expose the pointer, only accessing the "data" field exposes - # the pointer. - iface = buf.__cuda_array_interface__ - assert not buf.owner.exposed - assert buf.spillable - iface["data"][0] # Expose pointer - assert buf.owner.exposed - assert not buf.spillable - - -@pytest.mark.parametrize( - "attribute", - [ - "get_ptr", - "memoryview", - "is_spilled", - "spillable", - "spill_lock", - "spill", - "memory_info", - ], -) -def test_spillable_buffer_view_attributes(manager: SpillManager, attribute): - base = as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) - view = base[:] - attr_base = getattr(base, attribute) - attr_view = getattr(view, attribute) - if callable(attr_view): - pass - else: - assert attr_base == attr_view - - -@pytest.mark.parametrize("target", ["gpu", "cpu"]) -def test_memory_info(manager: SpillManager, target): - if target == "gpu": - mem = rmm.DeviceBuffer(size=10) - ptr = mem.ptr - elif target == "cpu": - mem = np.empty(10, dtype="u1") - ptr = mem.__array_interface__["data"][0] - b = as_buffer(data=mem, exposed=False) - assert b.memory_info() == (ptr, mem.size, target) - assert b[:].memory_info() == (ptr, mem.size, target) - assert b[:-1].memory_info() == (ptr, mem.size - 1, target) - assert b[1:].memory_info() == (ptr + 1, mem.size - 1, target) - assert b[2:4].memory_info() == (ptr + 2, 2, target) - - -def test_from_pandas(manager: SpillManager): - pdf1 = pandas.DataFrame({"a": [1, 2, 3]}) - df = cudf.from_pandas(pdf1) - assert single_column_df_data(df).spillable - pdf2 = df.to_pandas() - pandas.testing.assert_frame_equal(pdf1, pdf2) - - -def test_creations(manager: SpillManager): - df = single_column_df() - assert single_column_df_data(df).spillable - - df = cudf.datasets.timeseries(dtypes={"a": float}) - assert single_column_df_data(df).spillable - - df = cudf.datasets.randomdata(dtypes={"a": float}) - assert single_column_df_data(df).spillable - - -def test_spillable_df_groupby(manager: SpillManager): - df = cudf.DataFrame({"a": [1, 1, 1]}) - gb = df.groupby("a") - assert len(single_column_df_base_data(df).owner._spill_locks) == 0 - gb._groupby - # `gb._groupby`, which is cached on `gb`, holds a spill lock - assert len(single_column_df_base_data(df).owner._spill_locks) == 1 - assert not single_column_df_data(df).spillable - del gb - assert single_column_df_data(df).spillable - - -def test_spilling_buffer(manager: SpillManager): - buf = as_buffer(rmm.DeviceBuffer(size=10), exposed=False) - buf.spill(target="cpu") - assert buf.is_spilled - buf.owner.mark_exposed() # Expose pointer and trigger unspill - assert not buf.is_spilled - with pytest.raises(ValueError, match="unspillable buffer"): - buf.spill(target="cpu") - - -def _reload_options(): - # In order to enabling monkey patching of the environment variables - # mark the global manager as uninitialized. 
- set_global_manager(None) - cudf.core.buffer.spill_manager._global_manager_uninitialized = True - importlib.reload(cudf.options) - - -@contextlib.contextmanager -def _get_manager_in_env(monkeypatch, var_vals): - with monkeypatch.context() as m: - for var, val in var_vals: - m.setenv(var, val) - _reload_options() - yield get_global_manager() - _reload_options() - - -def test_environment_variables_spill_off(monkeypatch): - with _get_manager_in_env( - monkeypatch, - [("CUDF_SPILL", "off")], - ) as manager: - assert manager is None - - -def test_environment_variables_spill_on(monkeypatch): - with _get_manager_in_env( - monkeypatch, - [("CUDF_SPILL", "on"), ("CUDF_SPILL_ON_DEMAND", "off")], - ) as manager: - assert isinstance(manager, SpillManager) - assert manager._device_memory_limit is None - assert manager.statistics.level == 0 - - -def test_environment_variables_device_limit(monkeypatch): - with _get_manager_in_env( - monkeypatch, - [ - ("CUDF_SPILL", "on"), - ("CUDF_SPILL_ON_DEMAND", "off"), - ("CUDF_SPILL_DEVICE_LIMIT", "1000"), - ], - ) as manager: - assert isinstance(manager, SpillManager) - assert manager._device_memory_limit == 1000 - assert manager.statistics.level == 0 - - -@pytest.mark.parametrize("level", (1, 2)) -def test_environment_variables_spill_stats(monkeypatch, level): - with _get_manager_in_env( - monkeypatch, - [ - ("CUDF_SPILL", "on"), - ("CUDF_SPILL_ON_DEMAND", "off"), - ("CUDF_SPILL_DEVICE_LIMIT", "1000"), - ("CUDF_SPILL_STATS", f"{level}"), - ], - ) as manager: - assert isinstance(manager, SpillManager) - assert manager._device_memory_limit == 1000 - assert manager.statistics.level == level - - -def test_spill_device_memory(manager: SpillManager): - df = single_column_df() - assert spilled_and_unspilled(manager) == (0, gen_df_data_nbytes) - manager.spill_device_memory(nbytes=1) - assert spilled_and_unspilled(manager) == (gen_df_data_nbytes, 0) - del df - assert spilled_and_unspilled(manager) == (0, 0) - df1 = single_column_df() - df2 = single_column_df() - manager.spill_device_memory(nbytes=1) - assert single_column_df_data(df1).is_spilled - assert not single_column_df_data(df2).is_spilled - manager.spill_device_memory(nbytes=1) - assert single_column_df_data(df1).is_spilled - assert single_column_df_data(df2).is_spilled - df3 = df1 + df2 - assert not single_column_df_data(df1).is_spilled - assert not single_column_df_data(df2).is_spilled - assert not single_column_df_data(df3).is_spilled - manager.spill_device_memory(nbytes=1) - assert single_column_df_data(df1).is_spilled - assert not single_column_df_data(df2).is_spilled - assert not single_column_df_data(df3).is_spilled - df2.abs() # Should change the access time - manager.spill_device_memory(nbytes=1) - assert single_column_df_data(df1).is_spilled - assert not single_column_df_data(df2).is_spilled - assert single_column_df_data(df3).is_spilled - - -def test_spill_to_device_limit(manager: SpillManager): - df1 = single_column_df() - df2 = single_column_df() - assert spilled_and_unspilled(manager) == (0, gen_df_data_nbytes * 2) - manager.spill_to_device_limit(device_limit=0) - assert spilled_and_unspilled(manager) == (gen_df_data_nbytes * 2, 0) - df3 = df1 + df2 - manager.spill_to_device_limit(device_limit=0) - assert spilled_and_unspilled(manager) == (gen_df_data_nbytes * 3, 0) - assert single_column_df_data(df1).is_spilled - assert single_column_df_data(df2).is_spilled - assert single_column_df_data(df3).is_spilled - - -@pytest.mark.parametrize( - "manager", [{"device_memory_limit": 0}], indirect=True -) -def 
test_zero_device_limit(manager: SpillManager):
- assert manager._device_memory_limit == 0
- df1 = single_column_df()
- df2 = single_column_df()
- assert spilled_and_unspilled(manager) == (gen_df_data_nbytes * 2, 0)
- df1 + df2
- # Notice, while performing the addition both df1 and df2 are unspillable
- assert spilled_and_unspilled(manager) == (0, gen_df_data_nbytes * 2)
- manager.spill_to_device_limit()
- assert spilled_and_unspilled(manager) == (gen_df_data_nbytes * 2, 0)
-
-
-def test_spill_df_index(manager: SpillManager):
- df = single_column_df()
- df.index = [1, 3, 2] # use a materialized index
- assert spilled_and_unspilled(manager) == (0, gen_df_data_nbytes * 2)
-
- manager.spill_to_device_limit(gen_df_data_nbytes)
- assert spilled_and_unspilled(manager) == (
- gen_df_data_nbytes,
- gen_df_data_nbytes,
- )
-
- manager.spill_to_device_limit(0)
- assert spilled_and_unspilled(manager) == (gen_df_data_nbytes * 2, 0)
-
-
-def test_external_memory(manager):
- cupy.cuda.set_allocator() # uses default allocator
- cpy = cupy.asarray([1, 2, 3])
- s = cudf.Series(cpy)
- # Check that the cupy array is still alive after overwriting `cpy`
- cpy = weakref.ref(cpy)
- assert cpy() is not None
- # Check that the series is spillable and known by the spill manager
- assert len(manager.buffers()) == 1
- assert s._data[None].data.spillable
-
-
-def test_spilling_df_views(manager):
- df = single_column_df(target="cpu")
- assert single_column_df_data(df).is_spilled
- df_view = df.loc[1:]
- assert single_column_df_data(df_view).spillable
- assert single_column_df_data(df).spillable
-
-
-def test_modify_spilled_views(manager):
- df = single_column_df()
- df_view = df.iloc[1:]
- buf = single_column_df_data(df)
- buf.spill(target="cpu")
-
- # modify the spilled df and check that the changes are reflected
- # in the view
- df.iloc[1:] = 0
- assert_eq(df_view, df.iloc[1:])
-
- # now, modify the view and check that the changes are reflected in
- # the df
- df_view.iloc[:] = -1
- assert_eq(df_view, df.iloc[1:])
-
-
-@pytest.mark.parametrize("target", ["gpu", "cpu"])
-def test_get_ptr(manager: SpillManager, target):
- if target == "gpu":
- mem = rmm.DeviceBuffer(size=10)
- elif target == "cpu":
- mem = np.empty(10, dtype="u1")
- buf = as_buffer(data=mem, exposed=False)
- assert buf.spillable
- assert len(buf.owner._spill_locks) == 0
- with acquire_spill_lock():
- buf.get_ptr(mode="read")
- assert not buf.spillable
- with acquire_spill_lock():
- buf.get_ptr(mode="read")
- assert not buf.spillable
- assert not buf.spillable
- assert buf.spillable
-
-
-def test_get_spill_lock(manager: SpillManager):
- @acquire_spill_lock()
- def f(sleep=False, nest=0):
- if sleep:
- time.sleep(random.random() / 100)
- if nest:
- return f(nest=nest - 1)
- return get_spill_lock()
-
- assert get_spill_lock() is None
- slock = f()
- assert isinstance(slock, SpillLock)
- assert get_spill_lock() is None
- slock = f(nest=2)
- assert isinstance(slock, SpillLock)
- assert get_spill_lock() is None
-
- with ThreadPoolExecutor(max_workers=2) as executor:
- futures_with_spill_lock = []
- futures_without_spill_lock = []
- for _ in range(100):
- futures_with_spill_lock.append(
- executor.submit(f, sleep=True, nest=1)
- )
- futures_without_spill_lock.append(
- executor.submit(f, sleep=True, nest=1)
- )
- all(isinstance(f.result(), SpillLock) for f in futures_with_spill_lock)
- all(f is None for f in futures_without_spill_lock)
-
-
-def test_get_spill_lock_no_manager():
- """When spilling is disabled, get_spill_lock() should return None always"""
-
-
@acquire_spill_lock() - def f(): - return get_spill_lock() - - assert get_spill_lock() is None - assert f() is None - - -@pytest.mark.parametrize("target", ["gpu", "cpu"]) -@pytest.mark.parametrize("view", [None, slice(0, 2), slice(1, 3)]) -def test_serialize_device(manager, target, view): - df1 = single_column_df() - if view is not None: - df1 = df1.iloc[view] - single_column_df_data(df1).spill(target=target) - - header, frames = df1.device_serialize() - assert len(frames) == 1 - if target == "gpu": - assert isinstance(frames[0], Buffer) - assert not single_column_df_data(df1).is_spilled - assert not single_column_df_data(df1).spillable - frames[0] = cupy.array(frames[0], copy=True) - else: - assert isinstance(frames[0], memoryview) - assert single_column_df_data(df1).is_spilled - assert single_column_df_data(df1).spillable - - df2 = Serializable.device_deserialize(header, frames) - assert_eq(df1, df2) - - -@pytest.mark.parametrize("target", ["gpu", "cpu"]) -@pytest.mark.parametrize("view", [None, slice(0, 2), slice(1, 3)]) -def test_serialize_host(manager, target, view): - df1 = single_column_df() - if view is not None: - df1 = df1.iloc[view] - single_column_df_data(df1).spill(target=target) - - # Unspilled df becomes spilled after host serialization - header, frames = df1.host_serialize() - assert all(isinstance(f, memoryview) for f in frames) - df2 = Serializable.host_deserialize(header, frames) - assert single_column_df_data(df2).is_spilled - assert_eq(df1, df2) - - -def test_serialize_dask_dataframe(manager: SpillManager): - protocol = pytest.importorskip("distributed.protocol") - - df1 = single_column_df(target="gpu") - header, frames = protocol.serialize( - df1, serializers=("dask",), on_error="raise" - ) - buf = single_column_df_data(df1) - assert len(frames) == 1 - assert isinstance(frames[0], memoryview) - # Check that the memoryview and frames is the same memory - assert ( - np.array(buf.memoryview()).__array_interface__["data"] - == np.array(frames[0]).__array_interface__["data"] - ) - - df2 = protocol.deserialize(header, frames) - assert single_column_df_data(df2).is_spilled - assert_eq(df1, df2) - - -def test_serialize_cuda_dataframe(manager: SpillManager): - protocol = pytest.importorskip("distributed.protocol") - - df1 = single_column_df(target="gpu") - header, frames = protocol.serialize( - df1, serializers=("cuda",), on_error="raise" - ) - buf: SpillableBuffer = single_column_df_data(df1) - assert len(buf.owner._spill_locks) == 1 - assert len(frames) == 1 - assert isinstance(frames[0], Buffer) - assert frames[0].get_ptr(mode="read") == buf.get_ptr(mode="read") - - frames[0] = cupy.array(frames[0], copy=True) - df2 = protocol.deserialize(header, frames) - assert_eq(df1, df2) - - -def test_get_rmm_memory_resource_stack(): - mr1 = rmm.mr.CudaMemoryResource() - assert all( - not isinstance(m, rmm.mr.FailureCallbackResourceAdaptor) - for m in get_rmm_memory_resource_stack(mr1) - ) - - mr2 = rmm.mr.FailureCallbackResourceAdaptor(mr1, lambda x: False) - assert get_rmm_memory_resource_stack(mr2)[0] is mr2 - assert get_rmm_memory_resource_stack(mr2)[1] is mr1 - - mr3 = rmm.mr.FixedSizeMemoryResource(mr2) - assert get_rmm_memory_resource_stack(mr3)[0] is mr3 - assert get_rmm_memory_resource_stack(mr3)[1] is mr2 - assert get_rmm_memory_resource_stack(mr3)[2] is mr1 - - mr4 = rmm.mr.FailureCallbackResourceAdaptor(mr3, lambda x: False) - assert get_rmm_memory_resource_stack(mr4)[0] is mr4 - assert get_rmm_memory_resource_stack(mr4)[1] is mr3 - assert 
get_rmm_memory_resource_stack(mr4)[2] is mr2 - assert get_rmm_memory_resource_stack(mr4)[3] is mr1 - - -def test_df_transpose(manager: SpillManager): - df1 = cudf.DataFrame({"a": [1, 2]}) - df2 = df1.transpose() - # For now, all buffers are marked as exposed - assert df1._data._data["a"].data.owner.exposed - assert df2._data._data[0].data.owner.exposed - assert df2._data._data[1].data.owner.exposed - - -def test_as_buffer_of_spillable_buffer(manager: SpillManager): - data = cupy.arange(10, dtype="u1") - b1 = as_buffer(data, exposed=False) - assert isinstance(b1, SpillableBuffer) - assert isinstance(b1.owner, SpillableBufferOwner) - assert b1.owner.owner is data - b2 = as_buffer(b1) - assert b1 is b2 - - with pytest.raises( - ValueError, - match="owning spillable buffer must either be exposed or spill locked", - ): - # Use `memory_info` to access device point _without_ making - # the buffer unspillable. - b3 = as_buffer(b1.memory_info()[0], size=b1.size, owner=b1) - - with acquire_spill_lock(): - b3 = as_buffer(b1.get_ptr(mode="read"), size=b1.size, owner=b1) - assert isinstance(b3, SpillableBuffer) - assert b3.owner is b1.owner - - b4 = as_buffer( - b1.get_ptr(mode="write") + data.itemsize, - size=b1.size - data.itemsize, - owner=b3, - ) - assert isinstance(b4, SpillableBuffer) - assert b4.owner is b1.owner - assert all(cupy.array(b4.memoryview()) == data[1:]) - - b5 = as_buffer(b4.get_ptr(mode="write"), size=b4.size - 1, owner=b4) - assert isinstance(b5, SpillableBuffer) - assert b5.owner is b1.owner - assert all(cupy.array(b5.memoryview()) == data[1:-1]) - - -@pytest.mark.parametrize("dtype", ["uint8", "uint64"]) -def test_memoryview_slice(manager: SpillManager, dtype): - """Check .memoryview() of a sliced spillable buffer""" - - data = np.arange(10, dtype=dtype) - # memoryview of a sliced spillable buffer - m1 = as_buffer(data=data)[1:-1].memoryview() - # sliced memoryview of data as bytes - m2 = memoryview(data).cast("B")[1:-1] - assert m1 == m2 - - -@pytest.mark.parametrize( - "manager", [{"statistic_level": 0}, {"statistic_level": 1}], indirect=True -) -def test_statistics(manager: SpillManager): - assert len(manager.statistics.spill_totals) == 0 - - buf: SpillableBuffer = as_buffer( - data=rmm.DeviceBuffer(size=10), exposed=False - ) - buf.spill(target="cpu") - - if manager.statistics.level == 0: - assert len(manager.statistics.spill_totals) == 0 - return - - assert len(manager.statistics.spill_totals) == 1 - nbytes, time = manager.statistics.spill_totals[("gpu", "cpu")] - assert nbytes == buf.size - assert time > 0 - - buf.spill(target="gpu") - assert len(manager.statistics.spill_totals) == 2 - nbytes, time = manager.statistics.spill_totals[("cpu", "gpu")] - assert nbytes == buf.size - assert time > 0 - - -@pytest.mark.parametrize("manager", [{"statistic_level": 2}], indirect=True) -def test_statistics_expose(manager: SpillManager): - assert len(manager.statistics.spill_totals) == 0 - - buffers: list[SpillableBuffer] = [ - as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) - for _ in range(10) - ] - - # Expose the first buffer - buffers[0].owner.mark_exposed() - assert len(manager.statistics.exposes) == 1 - stat = list(manager.statistics.exposes.values())[0] - assert stat.count == 1 - assert stat.total_nbytes == buffers[0].nbytes - assert stat.spilled_nbytes == 0 - - # Expose all 10 buffers - for i in range(10): - buffers[i].owner.mark_exposed() - - # The rest of the ptr accesses should accumulate to a single stat - # because they resolve to the same traceback. 
- assert len(manager.statistics.exposes) == 2
- stat = list(manager.statistics.exposes.values())[1]
- assert stat.count == 9
- assert stat.total_nbytes == buffers[0].nbytes * 9
- assert stat.spilled_nbytes == 0
-
- # Create and spill 10 new buffers
- buffers: list[SpillableBuffer] = [
- as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False)
- for _ in range(10)
- ]
-
- manager.spill_to_device_limit(0)
-
- # Expose the new buffers and check that they are counted as spilled
- for i in range(10):
- buffers[i].owner.mark_exposed()
- assert len(manager.statistics.exposes) == 3
- stat = list(manager.statistics.exposes.values())[2]
- assert stat.count == 10
- assert stat.total_nbytes == buffers[0].nbytes * 10
- assert stat.spilled_nbytes == buffers[0].nbytes * 10
-
-
-def test_spill_on_demand(manager: SpillManager):
- with set_rmm_memory_pool(1024):
- a = as_buffer(data=rmm.DeviceBuffer(size=1024))
- assert isinstance(a, SpillableBuffer)
- assert not a.is_spilled
-
- with pytest.raises(MemoryError, match="Maximum pool size exceeded"):
- as_buffer(data=rmm.DeviceBuffer(size=1024))
-
- with spill_on_demand_globally():
- b = as_buffer(data=rmm.DeviceBuffer(size=1024))
- assert a.is_spilled
- assert not b.is_spilled
-
- with pytest.raises(MemoryError, match="Maximum pool size exceeded"):
- as_buffer(data=rmm.DeviceBuffer(size=1024))
-
-
-def test_spilling_and_copy_on_write(manager: SpillManager):
- with cudf.option_context("copy_on_write", True):
- a: SpillableBuffer = as_buffer(data=rmm.DeviceBuffer(size=10))
-
- b = a.copy(deep=False)
- assert a.owner == b.owner
- a.spill(target="cpu")
- assert a.is_spilled
- assert b.is_spilled
-
- # Write access triggers a copy of `a` into `b`, but since `a` is spilled
- # the copy is done in host memory and `a` remains spilled.
- with acquire_spill_lock():
- b.get_ptr(mode="write")
- assert a.is_spilled
- assert not b.is_spilled
-
- # Deep copy of the spilled buffer `a`
- b = a.copy(deep=True)
- assert a.owner != b.owner
- assert a.is_spilled
- assert b.is_spilled
- a.spill(target="gpu")
- assert not a.is_spilled
- assert b.is_spilled
-
- # Deep copy of the unspilled buffer `a`
- b = a.copy(deep=True)
- assert a.spillable
- assert not a.is_spilled
- assert not b.is_spilled
-
- b = a.copy(deep=False)
- assert a.owner == b.owner
- # Write access triggers a copy of `a` into `b` in device memory
- with acquire_spill_lock():
- b.get_ptr(mode="write")
- assert a.owner != b.owner
- assert not a.is_spilled
- assert not b.is_spilled
- # And `a` and `b` are now separated, each with its own spilling status
- a.spill(target="cpu")
- assert a.is_spilled
- assert not b.is_spilled
- b.spill(target="cpu")
- assert a.is_spilled
- assert b.is_spilled
-
- # Read access with a spill lock unspills `a` and allows copy-on-write
- with acquire_spill_lock():
- a.get_ptr(mode="read")
- b = a.copy(deep=False)
- assert a.owner == b.owner
- assert not a.is_spilled
-
- # Read access without a spill lock exposes `a` and forces a deep copy
- a.get_ptr(mode="read")
- b = a.copy(deep=False)
- assert a.owner != b.owner
- assert not a.is_spilled
- assert a.owner.exposed
- assert not b.owner.exposed
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
deleted file mode 100644
index f952cea07f8..00000000000
--- a/python/cudf/cudf/tests/test_stats.py
+++ /dev/null
@@ -1,660 +0,0 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
- -from concurrent.futures import ThreadPoolExecutor - -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.api.extensions import no_default -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.datasets import randomdata -from cudf.testing import assert_eq -from cudf.testing._utils import assert_exceptions_equal, expect_warning_if - -params_dtypes = [np.int32, np.uint32, np.float32, np.float64] -methods = ["min", "max", "sum", "mean", "var", "std"] - -interpolation_methods = ["linear", "lower", "higher", "midpoint", "nearest"] - - -@pytest.mark.parametrize("method", methods) -@pytest.mark.parametrize("dtype", params_dtypes) -@pytest.mark.parametrize("skipna", [True, False]) -def test_series_reductions(method, dtype, skipna): - np.random.seed(0) - arr = np.random.random(100) - if np.issubdtype(dtype, np.integer): - arr *= 100 - mask = arr > 10 - else: - mask = arr > 0.5 - - arr = arr.astype(dtype) - if dtype in (np.float32, np.float64): - arr[[2, 5, 14, 19, 50, 70]] = np.nan - sr = cudf.Series(arr) - sr[~mask] = None - psr = sr.to_pandas() - psr[~mask] = np.nan - - def call_test(sr, skipna): - fn = getattr(sr, method) - if method in ["std", "var"]: - return fn(ddof=1, skipna=skipna) - else: - return fn(skipna=skipna) - - expect, got = call_test(psr, skipna=skipna), call_test(sr, skipna=skipna) - - np.testing.assert_approx_equal(expect, got) - - -@pytest.mark.parametrize("method", methods) -def test_series_reductions_concurrency(method): - e = ThreadPoolExecutor(10) - - np.random.seed(0) - srs = [cudf.Series(np.random.random(10000)) for _ in range(1)] - - def call_test(sr): - fn = getattr(sr, method) - if method in ["std", "var"]: - return fn(ddof=1) - else: - return fn() - - def f(sr): - return call_test(sr + 1) - - list(e.map(f, srs * 50)) - - -@pytest.mark.parametrize("ddof", range(3)) -def test_series_std(ddof): - np.random.seed(0) - arr = np.random.random(100) - 0.5 - sr = cudf.Series(arr) - pd = sr.to_pandas() - got = sr.std(ddof=ddof) - expect = pd.std(ddof=ddof) - np.testing.assert_approx_equal(expect, got) - - -def test_series_unique(): - for size in [10**x for x in range(5)]: - arr = np.random.randint(low=-1, high=10, size=size) - mask = arr != -1 - sr = cudf.Series(arr) - sr[~mask] = None - assert set(arr[mask]) == set(sr.unique().dropna().to_numpy()) - assert len(set(arr[mask])) == sr.nunique() - - -@pytest.mark.parametrize( - "nan_as_null, dropna", - [(True, True), (True, False), (False, True), (False, False)], -) -def test_series_nunique(nan_as_null, dropna): - # We remove nulls as opposed to NaNs using the dropna parameter, - # so to test against pandas we replace NaN with another discrete value - cudf_series = cudf.Series([1, 2, 2, 3, 3], nan_as_null=nan_as_null) - pd_series = pd.Series([1, 2, 2, 3, 3]) - expect = pd_series.nunique(dropna=dropna) - got = cudf_series.nunique(dropna=dropna) - assert expect == got - - cudf_series = cudf.Series( - [1.0, 2.0, 3.0, np.nan, None], nan_as_null=nan_as_null - ) - if nan_as_null is True: - pd_series = pd.Series([1.0, 2.0, 3.0, np.nan, None]) - else: - pd_series = pd.Series([1.0, 2.0, 3.0, -1.0, None]) - - expect = pd_series.nunique(dropna=dropna) - got = cudf_series.nunique(dropna=dropna) - assert expect == got - - cudf_series = cudf.Series([1.0, np.nan, np.nan], nan_as_null=nan_as_null) - if nan_as_null is True: - pd_series = pd.Series([1.0, np.nan, np.nan]) - else: - pd_series = pd.Series([1.0, -1.0, -1.0]) - expect = pd_series.nunique(dropna=dropna) 
- got = cudf_series.nunique(dropna=dropna) - assert expect == got - - -def test_series_scale(): - arr = pd.Series(np.random.randint(low=-10, high=10, size=100)) - sr = cudf.Series(arr) - - vmin = arr.min() - vmax = arr.max() - scaled = (arr - vmin) / (vmax - vmin) - assert scaled.min() == 0 - assert scaled.max() == 1 - assert_eq(sr.scale(), scaled) - - -@pytest.mark.parametrize("int_method", interpolation_methods) -def test_exact_quantiles(int_method): - arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7]) - quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] - - df = pd.DataFrame(arr) - gdf_series = cudf.Series(arr) - - q1 = gdf_series.quantile( - quant_values, interpolation=int_method, exact=True - ) - - q2 = df.quantile(quant_values, interpolation=int_method) - - np.testing.assert_allclose( - q1.to_pandas().values, np.array(q2.values).T.flatten(), rtol=1e-10 - ) - - -@pytest.mark.parametrize("int_method", interpolation_methods) -def test_exact_quantiles_int(int_method): - arr = np.asarray([7, 0, 3, 4, 2, 1, -1, 1, 6]) - quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] - - df = pd.DataFrame(arr) - gdf_series = cudf.Series(arr) - - q1 = gdf_series.quantile( - quant_values, interpolation=int_method, exact=True - ) - - q2 = df.quantile(quant_values, interpolation=int_method) - - np.testing.assert_allclose( - q1.to_pandas().values, np.array(q2.values).T.flatten(), rtol=1e-10 - ) - - -def test_approx_quantiles(): - arr = np.asarray([6.8, 0.15, 3.4, 4.17, 2.13, 1.11, -1.01, 0.8, 5.7]) - quant_values = [0.0, 0.25, 0.33, 0.5, 1.0] - - gdf_series = cudf.Series(arr) - pdf_series = pd.Series(arr) - - q1 = gdf_series.quantile(quant_values, exact=False) - q2 = pdf_series.quantile(quant_values) - - assert_eq(q1, q2) - - -def test_approx_quantiles_int(): - arr = np.asarray([1, 2, 3]) - quant_values = [0.5] - approx_results = [2] - - gdf_series = cudf.Series(arr) - - q1 = gdf_series.quantile(quant_values, exact=False) - - assert approx_results == q1.to_pandas().values - - -@pytest.mark.parametrize("data", [[], [1, 2, 3, 10, 326497]]) -@pytest.mark.parametrize( - "q", - [ - [], - 0.5, - 1, - 0.234, - [0.345], - [0.243, 0.5, 1], - np.array([0.5, 1]), - cp.array([0.5, 1]), - ], -) -def test_misc_quantiles(data, q): - pdf_series = pd.Series(data, dtype="float64" if len(data) == 0 else None) - gdf_series = cudf.from_pandas(pdf_series) - - expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q) - actual = gdf_series.quantile(q) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - {"data": np.random.normal(-100, 100, 1000)}, - {"data": np.random.randint(-50, 50, 1000)}, - {"data": (np.zeros(100))}, - {"data": np.repeat(np.nan, 100)}, - {"data": np.array([1.123, 2.343, np.nan, 0.0])}, - { - "data": [5, 10, 53, None, np.nan, None, 12, 43, -423], - "nan_as_null": False, - }, - {"data": [1.1032, 2.32, 43.4, 13, -312.0], "index": [0, 4, 3, 19, 6]}, - {"data": [], "dtype": "float64"}, - {"data": [-3]}, - ], -) -@pytest.mark.parametrize("null_flag", [False, True]) -@pytest.mark.parametrize("numeric_only", [False, True]) -def test_kurtosis_series(data, null_flag, numeric_only): - gs = cudf.Series(**data) - ps = gs.to_pandas() - - if null_flag and len(gs) > 2: - gs.iloc[[0, 2]] = None - ps.iloc[[0, 2]] = None - - got = gs.kurtosis(numeric_only=numeric_only) - expected = ps.kurtosis(numeric_only=numeric_only) - - assert_eq(got, expected) - - got = gs.kurt(numeric_only=numeric_only) - expected = ps.kurt(numeric_only=numeric_only) - - assert_eq(got, expected) - - 
-@pytest.mark.parametrize("op", ["skew", "kurt"]) -def test_kurt_skew_error(op): - gs = cudf.Series(["ab", "cd"]) - ps = gs.to_pandas() - - assert_exceptions_equal( - getattr(gs, op), - getattr(ps, op), - lfunc_args_and_kwargs=([], {"numeric_only": True}), - rfunc_args_and_kwargs=([], {"numeric_only": True}), - ) - - -@pytest.mark.parametrize( - "data", - [ - cudf.Series(np.random.normal(-100, 100, 1000)), - cudf.Series(np.random.randint(-50, 50, 1000)), - cudf.Series(np.zeros(100)), - cudf.Series(np.repeat(np.nan, 100)), - cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), - cudf.Series( - [5, 10, 53, None, np.nan, None, 12, 43, -423], nan_as_null=False - ), - cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), - cudf.Series([], dtype="float64"), - cudf.Series([-3]), - ], -) -@pytest.mark.parametrize("null_flag", [False, True]) -@pytest.mark.parametrize("numeric_only", [False, True]) -def test_skew_series(data, null_flag, numeric_only): - pdata = data.to_pandas() - - if null_flag and len(data) > 2: - data.iloc[[0, 2]] = None - pdata.iloc[[0, 2]] = None - - got = data.skew(numeric_only=numeric_only) - expected = pdata.skew(numeric_only=numeric_only) - - assert_eq(got, expected) - - -@pytest.mark.parametrize("dtype", params_dtypes) -@pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100]) -def test_series_median(dtype, num_na): - np.random.seed(0) - arr = np.random.random(100) - if np.issubdtype(dtype, np.integer): - arr *= 100 - mask = np.arange(100) >= num_na - - arr = arr.astype(dtype) - sr = cudf.Series(arr) - sr[~mask] = None - arr2 = arr[mask] - ps = pd.Series(arr2, dtype=dtype) - - actual = sr.median(skipna=True) - desired = ps.median(skipna=True) - - np.testing.assert_approx_equal(actual, desired) - - # only for float until integer null supported convert to pandas in cudf - # eg. 
pd.Int64Dtype - if np.issubdtype(dtype, np.floating): - ps = sr.to_pandas() - actual = sr.median(skipna=False) - desired = ps.median(skipna=False) - np.testing.assert_approx_equal(actual, desired) - - -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="warning not present in older pandas versions", -) -@pytest.mark.parametrize( - "data", - [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), - np.zeros(100), - np.array([1.123, 2.343, np.nan, 0.0]), - np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]), - cudf.Series([], dtype="float64"), - cudf.Series([-3]), - ], -) -@pytest.mark.parametrize("periods", range(-5, 5)) -@pytest.mark.parametrize( - "fill_method", ["ffill", "bfill", "pad", "backfill", no_default, None] -) -def test_series_pct_change(data, periods, fill_method): - cs = cudf.Series(data) - ps = cs.to_pandas() - - if np.abs(periods) <= len(cs): - with expect_warning_if(fill_method not in (no_default, None)): - got = cs.pct_change(periods=periods, fill_method=fill_method) - with expect_warning_if( - ( - fill_method not in (no_default, None) - or (fill_method is not None and ps.isna().any()) - ) - ): - expected = ps.pct_change(periods=periods, fill_method=fill_method) - np.testing.assert_array_almost_equal( - got.to_numpy(na_value=np.nan), expected - ) - - -@pytest.mark.parametrize( - "data1", - [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), - np.zeros(100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), - cudf.Series([1.1, 2.32, 43.4], index=[0, 4, 3]), - cudf.Series([], dtype="float64"), - cudf.Series([-3]), - ], -) -@pytest.mark.parametrize( - "data2", - [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), - np.zeros(100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - cudf.Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), - cudf.Series([5]), - ], -) -def test_cov1d(data1, data2): - gs1 = cudf.Series(data1) - gs2 = cudf.Series(data2) - - ps1 = gs1.to_pandas() - ps2 = gs2.to_pandas() - - got = gs1.cov(gs2) - ps1_align, ps2_align = ps1.align(ps2, join="inner") - with expect_warning_if( - (len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0) - or (len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0), - RuntimeWarning, - ): - expected = ps1.cov(ps2) - np.testing.assert_approx_equal(got, expected, significant=8) - - -@pytest.mark.parametrize( - "data1", - [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), - np.zeros(100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - cudf.Series([5, 10, 53, None, np.nan, None], nan_as_null=False), - cudf.Series([1.1032, 2.32, 43.4], index=[0, 4, 3]), - cudf.Series([], dtype="float64"), - cudf.Series([-3]), - ], -) -@pytest.mark.parametrize( - "data2", - [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), - np.zeros(100), - np.repeat(np.nan, 100), - np.array([1.123, 2.343, np.nan, 0.0]), - cudf.Series([1.1, 2.32, 43.4], index=[0, 500, 4000]), - cudf.Series([5]), - ], -) -@pytest.mark.parametrize("method", ["spearman", "pearson"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Warnings missing on older pandas (scipy version seems unrelated?)", -) -def test_corr1d(data1, data2, method): - if method == "spearman": - # Pandas uses scipy.stats.spearmanr code-path - pytest.importorskip("scipy") - - gs1 = 
cudf.Series(data1) - gs2 = cudf.Series(data2) - - ps1 = gs1.to_pandas() - ps2 = gs2.to_pandas() - - got = gs1.corr(gs2, method) - - ps1_align, ps2_align = ps1.align(ps2, join="inner") - - is_singular = ( - len(ps1_align.dropna()) == 1 and len(ps2_align.dropna()) > 0 - ) or (len(ps2_align.dropna()) == 1 and len(ps1_align.dropna()) > 0) - is_identical = ( - len(ps1_align.dropna().unique()) == 1 and len(ps2_align.dropna()) > 0 - ) or ( - len(ps2_align.dropna().unique()) == 1 and len(ps1_align.dropna()) > 0 - ) - - # Pearson correlation leads to division by 0 when either sample size is 1. - # Spearman allows for size 1 samples, but will error if all data in a - # sample is identical since the covariance is zero and so the correlation - # coefficient is not defined. - cond = ((is_singular or is_identical) and method == "pearson") or ( - is_identical and not is_singular and method == "spearman" - ) - if method == "spearman": - # SciPy has shuffled around the warning it throws a couple of times. - # It's not worth the effort of conditionally importing the appropriate - # warning based on the scipy version, just catching a base Warning is - # good enough validation. - expected_warning = Warning - elif method == "pearson": - expected_warning = RuntimeWarning - - with expect_warning_if(cond, expected_warning): - expected = ps1.corr(ps2, method) - np.testing.assert_approx_equal(got, expected, significant=8) - - -@pytest.mark.parametrize("method", ["spearman", "pearson"]) -def test_df_corr(method): - gdf = randomdata(100, {str(x): float for x in range(50)}) - pdf = gdf.to_pandas() - got = gdf.corr(method) - expected = pdf.corr(method) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100], - [np.nan] * 3, - [1, 5, 3], - [], - ], -) -@pytest.mark.parametrize( - "ops", - [ - "mean", - "min", - "max", - "sum", - "product", - "var", - "std", - "prod", - "kurtosis", - "skew", - "any", - "all", - "cummin", - "cummax", - "cumsum", - "cumprod", - ], -) -@pytest.mark.parametrize("skipna", [True, False]) -def test_nans_stats(data, ops, skipna): - psr = pd.Series(data, dtype="float64" if len(data) == 0 else None) - gsr = cudf.from_pandas(psr) - - assert_eq( - getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) - ) - - gsr = cudf.Series( - data, dtype="float64" if len(data) == 0 else None, nan_as_null=False - ) - # Since there is no concept of `nan_as_null` in pandas, - # nulls will be returned in the operations. 
So only - # testing for `skipna=True` when `nan_as_null=False` - assert_eq(getattr(psr, ops)(skipna=True), getattr(gsr, ops)(skipna=True)) - - -@pytest.mark.parametrize( - "data", - [ - [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100], - [np.nan] * 3, - [1, 5, 3], - ], -) -@pytest.mark.parametrize("ops", ["sum", "product", "prod"]) -@pytest.mark.parametrize("skipna", [True, False]) -@pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 5, 10]) -def test_min_count_ops(data, ops, skipna, min_count): - psr = pd.Series(data) - gsr = cudf.Series(data, nan_as_null=False) - - assert_eq( - getattr(psr, ops)(skipna=skipna, min_count=min_count), - getattr(gsr, ops)(skipna=skipna, min_count=min_count), - ) - - -@pytest.mark.parametrize( - "data1", - [ - [1, 2, 3, 4], - [10, 1, 3, 5], - ], -) -@pytest.mark.parametrize( - "data2", - [ - [1, 2, 3, 4], - [10, 1, 3, 5], - ], -) -@pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"]) -@pytest.mark.skipif( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) -def test_cov_corr_datetime_timedelta(data1, data2, dtype): - gsr1 = cudf.Series(data1, dtype=dtype) - gsr2 = cudf.Series(data2, dtype=dtype) - psr1 = gsr1.to_pandas() - psr2 = gsr2.to_pandas() - - assert_eq(psr1.corr(psr2), gsr1.corr(gsr2)) - assert_eq(psr1.cov(psr2), gsr1.cov(gsr2)) - - -@pytest.mark.parametrize( - "data", - [ - randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ), - ], -) -@pytest.mark.parametrize("null_flag", [False, True]) -@pytest.mark.parametrize("numeric_only", [False, True]) -def test_kurtosis_df(data, null_flag, numeric_only): - if not numeric_only: - data = data.select_dtypes(include="number") - pdata = data.to_pandas() - - if null_flag and len(data) > 2: - data.iloc[[0, 2]] = None - pdata.iloc[[0, 2]] = None - - got = data.kurtosis(numeric_only=numeric_only) - got = got if np.isscalar(got) else got.to_numpy() - - expected = pdata.kurtosis(numeric_only=numeric_only) - np.testing.assert_array_almost_equal(got, expected) - - got = data.kurt(numeric_only=numeric_only) - got = got if np.isscalar(got) else got.to_numpy() - - expected = pdata.kurt(numeric_only=numeric_only) - np.testing.assert_array_almost_equal(got, expected) - - -@pytest.mark.parametrize( - "data", - [ - randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ), - ], -) -@pytest.mark.parametrize("null_flag", [False, True]) -@pytest.mark.parametrize("numeric_only", [False, True]) -def test_skew_df(data, null_flag, numeric_only): - if not numeric_only: - data = data.select_dtypes(include="number") - pdata = data.to_pandas() - - if null_flag and len(data) > 2: - data.iloc[[0, 2]] = None - pdata.iloc[[0, 2]] = None - - got = data.skew(numeric_only=numeric_only) - expected = pdata.skew(numeric_only=numeric_only) - got = got if np.isscalar(got) else got.to_numpy() - np.testing.assert_array_almost_equal(got, expected) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py deleted file mode 100644 index cc88cc79769..00000000000 --- a/python/cudf/cudf/tests/test_string.py +++ /dev/null @@ -1,3514 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -import json -import re -import urllib.parse -from contextlib import ExitStack as does_not_raise -from decimal import Decimal -from sys import getsizeof - -import cupy -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf import concat -from cudf.core.column.string import StringColumn -from cudf.core.index import Index -from cudf.testing import assert_eq -from cudf.testing._utils import ( - DATETIME_TYPES, - NUMERIC_TYPES, - assert_exceptions_equal, -) -from cudf.utils import dtypes as dtypeutils - -data_list = [ - ["AbC", "de", "FGHI", "j", "kLm"], - ["nOPq", None, "RsT", None, "uVw"], - [None, None, None, None, None], -] - -data_id_list = ["no_nulls", "some_nulls", "all_nulls"] - -idx_list = [None, [10, 11, 12, 13, 14]] - -idx_id_list = ["None_index", "Set_index"] - - -def raise_builder(flags, exceptions): - if any(flags): - return pytest.raises(exceptions) - else: - return does_not_raise() - - -@pytest.fixture(params=data_list, ids=data_id_list) -def data(request): - return request.param - - -@pytest.fixture(params=idx_list, ids=idx_id_list) -def index(request): - return request.param - - -@pytest.fixture -def ps_gs(data, index): - ps = pd.Series(data, index=index, dtype="str", name="nice name") - gs = cudf.Series(data, index=index, dtype="str", name="nice name") - return (ps, gs) - - -@pytest.mark.parametrize("construct", [list, np.array, pd.Series, pa.array]) -def test_string_ingest(construct): - expect = ["a", "a", "b", "c", "a"] - data = construct(expect) - got = cudf.Series(data) - assert got.dtype == np.dtype("object") - assert len(got) == 5 - for idx, val in enumerate(expect): - assert expect[idx] == got[idx] - - -def test_string_export(ps_gs): - ps, gs = ps_gs - - expect = ps - got = gs.to_pandas() - assert_eq(expect, got) - - expect = np.array(ps) - got = gs.to_numpy() - assert_eq(expect, got) - - expect = pa.Array.from_pandas(ps) - got = gs.to_arrow() - - assert pa.Array.equals(expect, got) - - -@pytest.mark.parametrize( - "item", - [ - 0, - 2, - 4, - slice(1, 3), - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - [0, 1, 2, 3, 4, 4, 3, 2, 1, 0], - np.array([0, 1, 2, 3, 4]), - cupy.asarray(np.array([0, 1, 2, 3, 4])), - ], -) -def test_string_get_item(ps_gs, item): - ps, gs = ps_gs - - got = gs.iloc[item] - if isinstance(got, cudf.Series): - got = got.to_arrow() - - if isinstance(item, cupy.ndarray): - item = cupy.asnumpy(item) - - expect = ps.iloc[item] - if isinstance(expect, pd.Series): - expect = pa.Array.from_pandas(expect) - pa.Array.equals(expect, got) - else: - if got is cudf.NA and expect is None: - return - assert expect == got - - -@pytest.mark.parametrize( - "item", - [ - [True] * 5, - [False] * 5, - np.array([True] * 5), - np.array([False] * 5), - cupy.asarray(np.array([True] * 5)), - cupy.asarray(np.array([False] * 5)), - np.random.randint(0, 2, 5).astype("bool").tolist(), - np.random.randint(0, 2, 5).astype("bool"), - cupy.asarray(np.random.randint(0, 2, 5).astype("bool")), - ], -) -def test_string_bool_mask(ps_gs, item): - ps, gs = ps_gs - - got = gs.iloc[item] - if isinstance(got, cudf.Series): - got = got.to_arrow() - - if isinstance(item, cupy.ndarray): - item = cupy.asnumpy(item) - - expect = ps[item] - if isinstance(expect, pd.Series): - expect = pa.Array.from_pandas(expect) - pa.Array.equals(expect, got) - else: - assert expect == got - - -@pytest.mark.parametrize("item", [0, slice(1, 3), slice(5)]) -def test_string_repr(ps_gs, item): - ps, gs = ps_gs - - got_out = gs.iloc[item] - expect_out = ps.iloc[item] - - expect = 
str(expect_out) - got = str(got_out) - - if got_out is not cudf.NA and len(got_out) > 1: - expect = expect.replace("None", "") - - assert expect == got or (expect == "None" and got == "") - - -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"] -) -def test_string_astype(dtype): - if ( - dtype.startswith("int") - or dtype.startswith("uint") - or dtype.startswith("long") - ): - data = ["1", "2", "3", "4", "5"] - elif dtype.startswith("float"): - data = [ - "1.0", - "2.0", - "3.0", - "4.0", - None, - "5.0", - "nan", - "-INF", - "NaN", - "inF", - "NAn", - ] - elif dtype.startswith("bool"): - data = ["True", "False", "True", "False", "False"] - elif dtype.startswith("datetime64"): - data = [ - "2019-06-04T00:00:00", - "2019-06-04T12:12:12", - "2019-06-03T00:00:00", - "2019-05-04T00:00:00", - "2018-06-04T00:00:00", - "1922-07-21T01:02:03", - ] - elif dtype == "str" or dtype == "object": - data = ["ab", "cd", "ef", "gh", "ij"] - ps = pd.Series(data) - gs = cudf.Series(data) - - expect = ps.astype(dtype) - got = gs.astype(dtype) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data, scale, precision", - [ - (["1.11", "2.22", "3.33"], 2, 3), - (["111", "222", "33"], 0, 3), - (["111000", "22000", "3000"], -3, 3), - ([None, None, None], 0, 5), - ([None, "-2345", None], 0, 5), - ([], 0, 5), - ], -) -@pytest.mark.parametrize( - "decimal_dtype", - [cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype], -) -def test_string_to_decimal(data, scale, precision, decimal_dtype): - gs = cudf.Series(data, dtype="str") - fp = gs.astype(decimal_dtype(scale=scale, precision=precision)) - got = fp.astype("str") - assert_eq(gs, got) - - -def test_string_empty_to_decimal(): - gs = cudf.Series(["", "-85", ""], dtype="str") - got = gs.astype(cudf.Decimal64Dtype(scale=0, precision=5)) - expected = cudf.Series( - [0, -85, 0], - dtype=cudf.Decimal64Dtype(scale=0, precision=5), - ) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data, scale, precision", - [ - (["1.23", "-2.34", "3.45"], 2, 3), - (["123", "-234", "345"], 0, 3), - (["12300", "-400", "5000.0"], -2, 5), - ([None, None, None], 0, 5), - ([None, "-100", None], 0, 5), - ([], 0, 5), - ], -) -@pytest.mark.parametrize( - "decimal_dtype", - [cudf.Decimal128Dtype, cudf.Decimal32Dtype, cudf.Decimal64Dtype], -) -def test_string_from_decimal(data, scale, precision, decimal_dtype): - decimal_data = [] - for d in data: - if d is None: - decimal_data.append(None) - else: - decimal_data.append(Decimal(d)) - fp = cudf.Series( - decimal_data, - dtype=decimal_dtype(scale=scale, precision=precision), - ) - gs = fp.astype("str") - got = gs.astype(decimal_dtype(scale=scale, precision=precision)) - assert_eq(fp, got) - - -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool", "object", "str"] -) -def test_string_empty_astype(dtype): - data = [] - ps = pd.Series(data, dtype="str") - gs = cudf.Series(data, dtype="str") - - expect = ps.astype(dtype) - got = gs.astype(dtype) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool"]) -def test_string_numeric_astype(dtype): - if dtype.startswith("bool"): - data = [1, 0, 1, 0, 1] - elif ( - dtype.startswith("int") - or dtype.startswith("uint") - or dtype.startswith("long") - ): - data = [1, 2, 3, 4, 5] - elif dtype.startswith("float"): - data = [1.0, 2.0, 3.0, 4.0, 5.0] - elif dtype.startswith("datetime64"): - # pandas rounds the output format based on the data - # Use numpy instead - # 
but fix '2011-01-01T00:00:00' -> '2011-01-01 00:00:00' - data = [1000000001, 2000000001, 3000000001, 4000000001, 5000000001] - ps = np.asarray(data, dtype=dtype).astype(str) - ps = np.array([i.replace("T", " ") for i in ps]) - - if not dtype.startswith("datetime64"): - ps = pd.Series(data, dtype=dtype) - - gs = cudf.Series(data, dtype=dtype) - - expect = pd.Series(ps.astype("str")) - got = gs.astype("str") - - assert_eq(expect, got) - - -@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES + ["bool"]) -def test_string_empty_numeric_astype(dtype): - data = [] - - if dtype.startswith("datetime64"): - ps = pd.Series(data, dtype="datetime64[ns]") - else: - ps = pd.Series(data, dtype=dtype) - gs = cudf.Series(data, dtype=dtype) - - expect = ps.astype("str") - got = gs.astype("str") - - assert_eq(expect, got) - - -def test_string_concat(): - data1 = ["a", "b", "c", "d", "e"] - data2 = ["f", "g", "h", "i", "j"] - index = [1, 2, 3, 4, 5] - - ps1 = pd.Series(data1, index=index) - ps2 = pd.Series(data2, index=index) - gs1 = cudf.Series(data1, index=index) - gs2 = cudf.Series(data2, index=index) - - expect = pd.concat([ps1, ps2]) - got = concat([gs1, gs2]) - - assert_eq(expect, got) - - expect = ps1.str.cat(ps2) - got = gs1.str.cat(gs2) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("ascending", [True, False]) -def test_string_sort(ps_gs, ascending): - ps, gs = ps_gs - - expect = ps.sort_values(ascending=ascending) - got = gs.sort_values(ascending=ascending) - - assert_eq(expect, got) - - -def test_string_len(ps_gs): - ps, gs = ps_gs - - expect = ps.str.len() - got = gs.str.len() - - # Can't handle nulls in Pandas so use PyArrow instead - # Pandas will return as a float64 so need to typecast to int32 - expect = pa.array(expect, from_pandas=True).cast(pa.int32()) - got = got.to_arrow() - assert pa.Array.equals(expect, got) - - -def _cat_convert_seq_to_cudf(others): - pd_others = others - if isinstance(pd_others, (pd.Series, pd.Index)): - gd_others = cudf.from_pandas(pd_others) - else: - gd_others = pd_others - if isinstance(gd_others, (list, tuple)): - temp_tuple = [ - cudf.from_pandas(elem) - if isinstance(elem, (pd.Series, pd.Index)) - else elem - for elem in gd_others - ] - - if isinstance(gd_others, tuple): - gd_others = tuple(temp_tuple) - else: - gd_others = list(temp_tuple) - return gd_others - - -@pytest.mark.parametrize( - "others", - [ - None, - ["f", "g", "h", "i", "j"], - ("f", "g", "h", "i", "j"), - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pd.Index(["f", "g", "h", "i", "j"]), - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - ( - np.array(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ), - [ - np.array(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ], - [ - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["f", "g", "h", "i", "j"]), - ], - ( - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["f", "g", "h", "i", "j"]), - ), - [ - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ], - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ), - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ), - [ - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", 
"b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ], - [ - pd.Series(["hello", "world", "abc", "xyz", "pqr"]), - pd.Series(["abc", "xyz", "hello", "pqr", "world"]), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=[10, 11, 12, 13, 14], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=[10, 15, 11, 13, 14], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["10", "11", "12", "13", "14"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["10", "11", "12", "13", "14"], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["10", "11", "12", "13", "14"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["10", "15", "11", "13", "14"], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["1", "2", "3", "4", "5"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["10", "11", "12", "13", "14"], - ), - ], - ], -) -@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) -@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) -@pytest.mark.parametrize( - "index", - [["1", "2", "3", "4", "5"]], -) -def test_string_cat(ps_gs, others, sep, na_rep, index): - ps, gs = ps_gs - - pd_others = others - gd_others = _cat_convert_seq_to_cudf(others) - - expect = ps.str.cat(others=pd_others, sep=sep, na_rep=na_rep) - got = gs.str.cat(others=gd_others, sep=sep, na_rep=na_rep) - assert_eq(expect, got) - - ps.index = index - gs.index = index - - expect = ps.str.cat(others=ps.index, sep=sep, na_rep=na_rep) - got = gs.str.cat(others=gs.index, sep=sep, na_rep=na_rep) - - assert_eq(expect, got) - - expect = ps.str.cat(others=[ps.index] + [ps.index], sep=sep, na_rep=na_rep) - got = gs.str.cat(others=[gs.index] + [gs.index], sep=sep, na_rep=na_rep) - - assert_eq(expect, got) - - expect = ps.str.cat(others=(ps.index, ps.index), sep=sep, na_rep=na_rep) - got = gs.str.cat(others=(gs.index, gs.index), sep=sep, na_rep=na_rep) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["1", "2", "3", "4", "5"], - ["a", "b", "c", "d", "e"], - ["a", "b", "c", None, "e"], - ], -) -@pytest.mark.parametrize( - "others", - [ - None, - ["f", "g", "h", "i", "j"], - ("f", "g", "h", "i", "j"), - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pd.Index(["f", "g", "h", "i", "j"]), - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - ( - np.array(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ), - [ - np.array(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ], - [ - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["f", "g", "h", "i", "j"]), - ], - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ), - [ - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", 
"g", "h", "i", "j"]), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["a", "b", "c", "d", "e"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["a", "b", "c", "d", "e"], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=[10, 11, 12, 13, 14], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=[10, 15, 11, 13, 14], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["1", "2", "3", "4", "5"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["1", "2", "3", "4", "5"], - ), - ], - ], -) -@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) -@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) -@pytest.mark.parametrize("name", [None, "This is the name"]) -def test_string_index_str_cat(data, others, sep, na_rep, name): - pi, gi = pd.Index(data, name=name), cudf.Index(data, name=name) - - pd_others = others - gd_others = _cat_convert_seq_to_cudf(others) - - expect = pi.str.cat(others=pd_others, sep=sep, na_rep=na_rep) - got = gi.str.cat(others=gd_others, sep=sep, na_rep=na_rep) - - assert_eq( - expect, - got, - exact=False, - ) - - -@pytest.mark.parametrize( - "data", - [["a", None, "c", None, "e"], ["a", "b", "c", "d", "a"]], -) -@pytest.mark.parametrize( - "others", - [ - None, - ["f", "g", "h", "i", "j"], - pd.Series(["AbC", "de", "FGHI", "j", "kLm"]), - pd.Index(["f", "g", "h", "i", "j"]), - pd.Index(["AbC", "de", "FGHI", "j", "kLm"]), - [ - np.array(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ], - [ - pd.Series(["f", "g", "h", "i", "j"]), - pd.Series(["f", "g", "h", "i", "j"]), - ], - pytest.param( - [ - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "g", "h", "i", "j"]), - ], - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/5862" - ), - ), - pytest.param( - ( - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Series(["f", "g", "h", "i", "j"]), - np.array(["f", "a", "b", "f", "a"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["1", "2", "3", "4", "5"]), - np.array(["f", "a", "b", "f", "a"]), - pd.Index(["f", "g", "h", "i", "j"]), - ), - marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/33436" - ), - ), - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["a", "b", "c", "d", "e"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["a", "b", "c", "d", "e"], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=[10, 11, 12, 13, 14], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=[10, 15, 11, 13, 14], - ), - ], - [ - pd.Series( - ["hello", "world", "abc", "xyz", "pqr"], - index=["1", "2", "3", "4", "5"], - ), - pd.Series( - ["abc", "xyz", "hello", "pqr", "world"], - index=["1", "2", "3", "4", "5"], - ), - ], - ], -) -@pytest.mark.parametrize("sep", [None, "", " ", "|", ",", "|||"]) -@pytest.mark.parametrize("na_rep", [None, "", "null", "a"]) -@pytest.mark.parametrize("name", [None, "This is the name"]) -def test_string_index_duplicate_str_cat(data, others, sep, na_rep, name): - pi, gi = pd.Index(data, name=name), cudf.Index(data, name=name) - - pd_others = others - gd_others = _cat_convert_seq_to_cudf(others) - - got = gi.str.cat(others=gd_others, sep=sep, na_rep=na_rep) - expect = pi.str.cat(others=pd_others, sep=sep, na_rep=na_rep) - - # TODO: Remove got.sort_values call once we have `join` param support - # in `.str.cat` - # 
https://github.com/rapidsai/cudf/issues/5862 - - assert_eq( - expect.sort_values() if not isinstance(expect, str) else expect, - got.sort_values() if not isinstance(got, str) else got, - exact=False, - ) - - -def test_string_cat_str_error(): - gs = cudf.Series(["a", "v", "s"]) - # https://github.com/pandas-dev/pandas/issues/28277 - # ability to pass StringMethods is being removed in future. - with pytest.raises( - TypeError, - match=re.escape( - "others must be Series, Index, DataFrame, np.ndarrary " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ), - ): - gs.str.cat(gs.str) - - -@pytest.mark.parametrize("sep", ["", " ", "|", ",", "|||"]) -def test_string_join(ps_gs, sep): - ps, gs = ps_gs - - expect = ps.str.join(sep) - got = gs.str.join(sep) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("pat", [r"(a)", r"(f)", r"([a-z])", r"([A-Z])"]) -@pytest.mark.parametrize("expand", [True, False]) -@pytest.mark.parametrize( - "flags,flags_raise", [(0, 0), (re.M | re.S, 0), (re.I, 1)] -) -def test_string_extract(ps_gs, pat, expand, flags, flags_raise): - ps, gs = ps_gs - expectation = raise_builder([flags_raise], NotImplementedError) - - with expectation: - expect = ps.str.extract(pat, flags=flags, expand=expand) - got = gs.str.extract(pat, flags=flags, expand=expand) - - assert_eq(expect, got) - - -def test_string_invalid_regex(): - gs = cudf.Series(["a"]) - with pytest.raises(RuntimeError): - gs.str.extract(r"{\}") - - -@pytest.mark.parametrize( - "pat,regex", - [ - ("a", False), - ("a", True), - ("f", False), - (r"[a-z]", True), - (r"[A-Z]", True), - ("hello", False), - ("FGHI", False), - ], -) -@pytest.mark.parametrize( - "flags,flags_raise", - [(0, 0), (re.MULTILINE | re.DOTALL, 0), (re.I, 1), (re.I | re.DOTALL, 1)], -) -@pytest.mark.parametrize("na,na_raise", [(np.nan, 0), (None, 1), ("", 1)]) -def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise): - ps, gs = ps_gs - - expectation = does_not_raise() - if flags_raise or na_raise: - expectation = pytest.raises(NotImplementedError) - - with expectation: - expect = ps.str.contains(pat, flags=flags, na=na, regex=regex) - got = gs.str.contains(pat, flags=flags, na=na, regex=regex) - assert_eq(expect, got) - - -def test_string_contains_case(ps_gs): - ps, gs = ps_gs - with pytest.raises(NotImplementedError): - gs.str.contains("A", case=False) - expected = ps.str.contains("A", regex=False, case=False) - got = gs.str.contains("A", regex=False, case=False) - assert_eq(expected, got) - got = gs.str.contains("a", regex=False, case=False) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "pat,esc,expect", - [ - ("abc", "", [True, False, False, False, False, False]), - ("b%", "/", [False, True, False, False, False, False]), - ("%b", ":", [False, True, False, False, False, False]), - ("%b%", "*", [True, True, False, False, False, False]), - ("___", "", [True, True, True, False, False, False]), - ("__/%", "/", [False, False, True, False, False, False]), - ("55/____", "/", [False, False, False, True, False, False]), - ("%:%%", ":", [False, False, True, False, False, False]), - ("55*_100", "*", [False, False, False, True, False, False]), - ("abc", "abc", [True, False, False, False, False, False]), - ], -) -def test_string_like(pat, esc, expect): - expectation = does_not_raise() - if len(esc) > 1: - expectation = pytest.raises(ValueError) - - with expectation: - gs = cudf.Series(["abc", "bab", "99%", "55_100", "", "556100"]) - got = 
gs.str.like(pat, esc) - expect = cudf.Series(expect) - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [["hello", "world", None, "", "!"]], -) -@pytest.mark.parametrize( - "repeats", - [ - 2, - 0, - -3, - [5, 4, 3, 2, 6], - [5, None, 3, 2, 6], - [0, 0, 0, 0, 0], - [-1, -2, -3, -4, -5], - [None, None, None, None, None], - ], -) -def test_string_repeat(data, repeats): - ps = pd.Series(data) - gs = cudf.from_pandas(ps) - - expect = ps.str.repeat(repeats) - got = gs.str.repeat(repeats) - - assert_eq(expect, got) - - -# Pandas doesn't respect the `n` parameter so ignoring it in test parameters -@pytest.mark.parametrize( - "pat,regex", - [("a", False), ("f", False), (r"[a-z]", True), (r"[A-Z]", True)], -) -@pytest.mark.parametrize("repl", ["qwerty", "", " "]) -@pytest.mark.parametrize("case,case_raise", [(None, 0), (True, 1), (False, 1)]) -@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (re.U, 1)]) -def test_string_replace( - ps_gs, pat, repl, case, case_raise, flags, flags_raise, regex -): - ps, gs = ps_gs - - expectation = raise_builder([case_raise, flags_raise], NotImplementedError) - - with expectation: - expect = ps.str.replace(pat, repl, case=case, flags=flags, regex=regex) - got = gs.str.replace(pat, repl, case=case, flags=flags, regex=regex) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("pat", ["A*", "F?H?"]) -def test_string_replace_zero_length(ps_gs, pat): - ps, gs = ps_gs - - expect = ps.str.replace(pat, "_", regex=True) - got = gs.str.replace(pat, "_", regex=True) - - assert_eq(expect, got) - - -def test_string_lower(ps_gs): - ps, gs = ps_gs - - expect = ps.str.lower() - got = gs.str.lower() - - assert_eq(expect, got) - - -def test_string_upper(ps_gs): - ps, gs = ps_gs - - expect = ps.str.upper() - got = gs.str.upper() - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["a b", " c ", " d", "e ", "f"], - ["a-b", "-c-", "---d", "e---", "f"], - ["ab", "c", "d", "e", "f"], - [None, None, None, None, None], - ], -) -@pytest.mark.parametrize("pat", [None, " ", "-"]) -@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) -@pytest.mark.parametrize("expand", [True, False]) -def test_string_split(data, pat, n, expand): - ps = pd.Series(data, dtype="str") - gs = cudf.Series(data, dtype="str") - - expect = ps.str.split(pat=pat, n=n, expand=expand) - got = gs.str.split(pat=pat, n=n, expand=expand) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["a b", " c ", " d", "e ", "f"], - ["a-b", "-c-", "---d", "e---", "f"], - ["ab", "c", "d", "e", "f"], - [None, None, None, None, None], - ], -) -@pytest.mark.parametrize("pat", [None, " ", "\\-+", "\\s+"]) -@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) -@pytest.mark.parametrize("expand", [True, False]) -def test_string_split_re(data, pat, n, expand): - ps = pd.Series(data, dtype="str") - gs = cudf.Series(data, dtype="str") - - expect = ps.str.split(pat=pat, n=n, expand=expand, regex=True) - got = gs.str.split(pat=pat, n=n, expand=expand, regex=True) - - assert_eq(expect, got) - - -@pytest.mark.parametrize("pat", [None, "\\s+"]) -@pytest.mark.parametrize("regex", [False, True]) -@pytest.mark.parametrize("expand", [False, True]) -def test_string_split_all_empty(pat, regex, expand): - ps = pd.Series(["", "", "", ""], dtype="str") - gs = cudf.Series(["", "", "", ""], dtype="str") - - expect = ps.str.split(pat=pat, expand=expand, regex=regex) - got = gs.str.split(pat=pat, expand=expand, regex=regex) - - if isinstance(got, cudf.DataFrame): - assert_eq(expect, 
got, check_column_type=False) - else: - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] -) -@pytest.mark.parametrize("num_keys", [1, 2, 3]) -def test_string_groupby_key(str_data, num_keys): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_keys): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - - expect = pdf.groupby(list(range(num_keys)), as_index=False).count() - got = gdf.groupby(list(range(num_keys)), as_index=False).count() - - expect = expect.sort_values([0]).reset_index(drop=True) - got = got.sort_values([0]).reset_index(drop=True) - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] -) -@pytest.mark.parametrize("num_cols", [1, 2, 3]) -@pytest.mark.parametrize("agg", ["count", "max", "min"]) -def test_string_groupby_non_key(str_data, num_cols, agg): - other_data = [1, 2, 3, 4, 5][: len(str_data)] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - for i in range(num_cols): - pdf[i] = pd.Series(str_data, dtype="str") - gdf[i] = cudf.Series(str_data, dtype="str") - pdf["a"] = other_data - gdf["a"] = other_data - - expect = getattr(pdf.groupby("a", as_index=False), agg)() - got = getattr(gdf.groupby("a", as_index=False), agg)() - - expect = expect.sort_values(["a"]).reset_index(drop=True) - got = got.sort_values(["a"]).reset_index(drop=True) - - if agg in ["min", "max"] and len(expect) == 0 and len(got) == 0: - for i in range(num_cols): - expect[i] = expect[i].astype("str") - - assert_eq(expect, got, check_dtype=False) - - -def test_string_groupby_key_index(): - str_data = ["a", "b", "c", "d", "e"] - other_data = [1, 2, 3, 4, 5] - - pdf = pd.DataFrame() - gdf = cudf.DataFrame() - pdf["a"] = pd.Series(str_data, dtype="str") - gdf["a"] = cudf.Series(str_data, dtype="str") - pdf["b"] = other_data - gdf["b"] = other_data - - expect = pdf.groupby("a", sort=True).count() - got = gdf.groupby("a", sort=True).count() - - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize("scalar", ["a", None]) -def test_string_set_scalar(scalar): - pdf = pd.DataFrame() - pdf["a"] = [1, 2, 3, 4, 5] - gdf = cudf.DataFrame.from_pandas(pdf) - - pdf["b"] = "a" - gdf["b"] = "a" - - assert_eq(pdf["b"], gdf["b"]) - assert_eq(pdf, gdf) - - -def test_string_index(): - pdf = pd.DataFrame(np.random.rand(5, 5)) - gdf = cudf.DataFrame.from_pandas(pdf) - stringIndex = ["a", "b", "c", "d", "e"] - pdf.index = stringIndex - gdf.index = stringIndex - assert_eq(pdf, gdf) - stringIndex = np.array(["a", "b", "c", "d", "e"]) - pdf.index = stringIndex - gdf.index = stringIndex - assert_eq(pdf, gdf) - stringIndex = Index(["a", "b", "c", "d", "e"], name="name") - pdf.index = stringIndex.to_pandas() - gdf.index = stringIndex - assert_eq(pdf, gdf) - stringIndex = cudf.Index._from_column( - cudf.core.column.as_column(["a", "b", "c", "d", "e"]), name="name" - ) - pdf.index = stringIndex.to_pandas() - gdf.index = stringIndex - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "item", - [ - ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], - ["a", "a", "a", "a", "A"], - ["A"], - ["abc", "xyz", None, "ab", "123"], - [None, None, "abc", None, "abc"], - ], -) -def test_string_unique(item): - ps = pd.Series(item) - gs = cudf.Series(item) - # Pandas `unique` returns a numpy array - pres = 
pd.Series(ps.unique()) - # cudf returns a cudf.Series - gres = gs.unique() - assert_eq(pres, gres) - - -def test_string_slice(): - df = cudf.DataFrame({"a": ["hello", "world"]}) - pdf = pd.DataFrame({"a": ["hello", "world"]}) - a_slice_got = df.a.str.slice(0, 2) - a_slice_expected = pdf.a.str.slice(0, 2) - - assert isinstance(a_slice_got, cudf.Series) - assert_eq(a_slice_expected, a_slice_got) - - -def test_string_equality(): - data1 = ["b", "c", "d", "a", "c"] - data2 = ["a", None, "c", "a", "c"] - - ps1 = pd.Series(data1) - ps2 = pd.Series(data2) - gs1 = cudf.Series(data1) - gs2 = cudf.Series(data2) - - expect = ps1 == ps2 - got = gs1 == gs2 - - assert_eq(expect, got.fillna(False)) - - expect = ps1 == "m" - got = gs1 == "m" - - assert_eq(expect, got.fillna(False)) - - ps1 = pd.Series(["a"]) - gs1 = cudf.Series(["a"]) - - expect = ps1 == "m" - got = gs1 == "m" - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "lhs", - [ - ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], - ["abc", "xyz", "a", "ab", "123", "097"], - ], -) -@pytest.mark.parametrize( - "rhs", - [ - ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], - ["a", "a", "a", "a", "A", "z"], - ], -) -def test_string_binary_op_add(lhs, rhs): - pds = pd.Series(lhs) + pd.Series(rhs) - gds = cudf.Series(lhs) + cudf.Series(rhs) - - assert_eq(pds, gds) - - -@pytest.mark.parametrize("name", [None, "new name", 123]) -def test_string_misc_name(ps_gs, name): - ps, gs = ps_gs - ps.name = name - gs.name = name - - expect = ps.str.slice(0, 1) - got = gs.str.slice(0, 1) - - assert_eq(expect, got) - assert_eq(ps + ps, gs + gs) - assert_eq(ps + "RAPIDS", gs + "RAPIDS") - assert_eq("RAPIDS" + ps, "RAPIDS" + gs) - - -def test_string_no_children_properties(): - empty_col = StringColumn(children=()) - assert empty_col.base_children == () - assert empty_col.base_size == 0 - - assert empty_col.children == () - assert empty_col.size == 0 - - assert getsizeof(empty_col) >= 0 # Accounts for Python GC overhead - - -@pytest.mark.parametrize( - "string", - [ - ["Cbe", "cbe", "CbeD", "Cb", "ghi", "Cb"], - ["abc", "xyz", "a", "ab", "123", "097"], - ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], - ], -) -@pytest.mark.parametrize( - "index", [-100, -5, -2, -6, -1, 0, 1, 2, 3, 9, 10, 100] -) -def test_string_get(string, index): - pds = pd.Series(string) - gds = cudf.Series(string) - - assert_eq( - pds.str.get(index).fillna(""), - gds.str.get(index).fillna(""), - ) - - -@pytest.mark.parametrize( - "string", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], - ["koala", "fox", "chameleon"], - ], -) -@pytest.mark.parametrize( - "number", - [-10, 0, 1, 3, 10], -) -@pytest.mark.parametrize( - "diff", - [0, 2, 5, 9], -) -def test_string_slice_str(string, number, diff): - pds = pd.Series(string) - gds = cudf.Series(string) - - assert_eq(pds.str.slice(start=number), gds.str.slice(start=number)) - assert_eq(pds.str.slice(stop=number), gds.str.slice(stop=number)) - assert_eq(pds.str.slice(), gds.str.slice()) - assert_eq( - pds.str.slice(start=number, stop=number + diff), - gds.str.slice(start=number, stop=number + diff), - ) - if diff != 0: - assert_eq(pds.str.slice(step=diff), gds.str.slice(step=diff)) - assert_eq( - pds.str.slice(start=number, stop=number + diff, step=diff), - gds.str.slice(start=number, stop=number + diff, step=diff), - ) - - -def test_string_slice_from(): - gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) - d_starts = cudf.Series([2, 3, 0, -1, -1], 
dtype=np.int32) - d_stops = cudf.Series([-1, -1, 0, -1, -1], dtype=np.int32) - got = gs.str.slice_from(starts=d_starts._column, stops=d_stops._column) - expected = cudf.Series(["llo world", "y accéntéd", "", None, ""]) - assert_eq(got, expected) - - -@pytest.mark.parametrize( - "string", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], - ["koala", "fox", "chameleon"], - ], -) -@pytest.mark.parametrize("number", [0, 1, 10]) -@pytest.mark.parametrize("diff", [0, 2, 9]) -@pytest.mark.parametrize("repr", ["2", "!!"]) -def test_string_slice_replace(string, number, diff, repr): - pds = pd.Series(string) - gds = cudf.Series(string) - - assert_eq( - pds.str.slice_replace(start=number, repl=repr), - gds.str.slice_replace(start=number, repl=repr), - check_dtype=False, - ) - assert_eq( - pds.str.slice_replace(stop=number, repl=repr), - gds.str.slice_replace(stop=number, repl=repr), - ) - assert_eq(pds.str.slice_replace(), gds.str.slice_replace()) - assert_eq( - pds.str.slice_replace(start=number, stop=number + diff), - gds.str.slice_replace(start=number, stop=number + diff), - ) - assert_eq( - pds.str.slice_replace(start=number, stop=number + diff, repl=repr), - gds.str.slice_replace(start=number, stop=number + diff, repl=repr), - check_dtype=False, - ) - - -def test_string_slice_replace_fail(): - gs = cudf.Series(["abc", "xyz", ""]) - with pytest.raises(TypeError): - gs.str.slice_replace(0, 1, ["_"]) - - -def test_string_insert(): - gs = cudf.Series(["hello world", "holy accéntéd", "batman", None, ""]) - - ps = pd.Series(["hello world", "holy accéntéd", "batman", None, ""]) - - assert_eq(gs.str.insert(0, ""), gs) - assert_eq(gs.str.insert(0, "+"), "+" + ps) - assert_eq(gs.str.insert(-1, "---"), ps + "---") - assert_eq( - gs.str.insert(5, "---"), - ps.str.slice(stop=5) + "---" + ps.str.slice(start=5), - ) - - with pytest.raises(TypeError): - gs.str.insert(0, ["+"]) - - -_string_char_types_data = [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["abcdefghij", "0123456789", "9876543210", None, "accénted", ""], - ["koala", "fox", "chameleon"], - [ - "1234567890", - "de", - "1.75", - "-34", - "+9.8", - "7¼", - "x³", - "2³", - "12⅝", - "", - "\t\r\n ", - ], - ["one", "one1", "1", ""], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["leopard", "Golden Eagle", "SNAKE", ""], - [r"¯\_(ツ)_/¯", "(╯°□°)╯︵ ┻━┻", "┬─┬ノ( º _ ºノ)"], - ["a1", "A1", "a!", "A!", "!1", "aA"], -] - - -@pytest.mark.parametrize( - "type_op", - [ - "isdecimal", - "isalnum", - "isalpha", - "isdigit", - "isnumeric", - "isupper", - "islower", - ], -) -@pytest.mark.parametrize("data", _string_char_types_data) -def test_string_char_types(type_op, data): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(getattr(gs.str, type_op)(), getattr(ps.str, type_op)()) - - -def test_string_filter_alphanum(): - data = ["1234567890", "!@#$%^&*()", ",./<>?;:[]}{|+=", "abc DEF"] - expected = [] - for st in data: - rs = "" - for c in st: - if str.isalnum(c): - rs = rs + c - expected.append(rs) - - gs = cudf.Series(data) - assert_eq(gs.str.filter_alphanum(), cudf.Series(expected)) - - expected = [] - for st in data: - rs = "" - for c in st: - if not str.isalnum(c): - rs = rs + c - expected.append(rs) - assert_eq(gs.str.filter_alphanum(keep=False), cudf.Series(expected)) - - expected = [] - for st in data: - rs = "" - for c in st: - if str.isalnum(c): - rs = rs + c - else: - rs = rs + "*" - expected.append(rs) - assert_eq(gs.str.filter_alphanum("*"), 
cudf.Series(expected)) - - expected = [] - for st in data: - rs = "" - for c in st: - if not str.isalnum(c): - rs = rs + c - else: - rs = rs + "*" - expected.append(rs) - assert_eq(gs.str.filter_alphanum("*", keep=False), cudf.Series(expected)) - - with pytest.raises(TypeError): - gs.str.filter_alphanum(["a"]) - - -@pytest.mark.parametrize( - "case_op", ["title", "capitalize", "lower", "upper", "swapcase"] -) -@pytest.mark.parametrize( - "data", - [ - *_string_char_types_data, - [ - None, - "The quick bRoWn fox juMps over the laze DOG", - '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"', - "accénted", - ], - ], -) -def test_string_char_case(case_op, data): - gs = cudf.Series(data) - ps = pd.Series(data) - - s = gs.str - a = getattr(s, case_op) - - assert_eq(a(), getattr(ps.str, case_op)()) - - assert_eq(gs.str.capitalize(), ps.str.capitalize()) - assert_eq(gs.str.isdecimal(), ps.str.isdecimal()) - assert_eq(gs.str.isalnum(), ps.str.isalnum()) - assert_eq(gs.str.isalpha(), ps.str.isalpha()) - assert_eq(gs.str.isdigit(), ps.str.isdigit()) - assert_eq(gs.str.isnumeric(), ps.str.isnumeric()) - assert_eq(gs.str.isspace(), ps.str.isspace()) - - assert_eq(gs.str.isempty(), ps == "") - - -def test_string_is_title(): - data = [ - "leopard", - "Golden Eagle", - "SNAKE", - "", - "!A", - "hello World", - "A B C", - "#", - "AƻB", - "Ⓑⓖ", - "Art of War", - ] - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(gs.str.istitle(), ps.str.istitle()) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ], -) -def test_strings_rpartition(data): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(ps.str.rpartition(), gs.str.rpartition()) - assert_eq(ps.str.rpartition("-"), gs.str.rpartition("-")) - assert_eq(ps.str.rpartition(","), gs.str.rpartition(",")) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ], -) -def test_strings_partition(data): - gs = cudf.Series(data, name="str_name") - ps = pd.Series(data, name="str_name") - - assert_eq(ps.str.partition(), gs.str.partition()) - assert_eq(ps.str.partition(","), gs.str.partition(",")) - assert_eq(ps.str.partition("-"), gs.str.partition("-")) - - gi = cudf.Index(data, name="new name") - pi = pd.Index(data, name="new name") - assert_eq(pi.str.partition(), gi.str.partition()) - assert_eq(pi.str.partition(","), gi.str.partition(",")) - assert_eq(pi.str.partition("-"), gi.str.partition("-")) - - -def test_string_partition_fail(): - gs = cudf.Series(["abc", "aa", "cba"]) - with pytest.raises(TypeError): - gs.str.partition(["a"]) - with pytest.raises(TypeError): - gs.str.rpartition(["a"]) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - [ - "this is a regular sentence", - "https://docs.python.org/3/tutorial/index.html", - None, - ], - ], -) -@pytest.mark.parametrize("n", [-1, 2, 1, 9]) -@pytest.mark.parametrize("expand", [True, False]) -def test_strings_rsplit(data, n, expand): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - ps.str.rsplit(n=n, expand=expand).reset_index(), - gs.str.rsplit(n=n, expand=expand).reset_index(), - check_index_type=False, - ) - assert_eq( - 
ps.str.rsplit(",", n=n, expand=expand), - gs.str.rsplit(",", n=n, expand=expand), - ) - assert_eq( - ps.str.rsplit("-", n=n, expand=expand), - gs.str.rsplit("-", n=n, expand=expand), - ) - - -@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) -@pytest.mark.parametrize("expand", [True, False]) -def test_string_rsplit_re(n, expand): - data = ["a b", " c ", " d", "e ", "f"] - ps = pd.Series(data, dtype="str") - gs = cudf.Series(data, dtype="str") - - # Pandas does not yet support the regex parameter for rsplit - import inspect - - assert ( - "regex" - not in inspect.signature(pd.Series.str.rsplit).parameters.keys() - ) - - expect = ps.str.rsplit(pat=" ", n=n, expand=expand) - got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - [ - "this is a regular sentence", - "https://docs.python.org/3/tutorial/index.html", - None, - ], - ], -) -@pytest.mark.parametrize("n", [-1, 2, 1, 9]) -@pytest.mark.parametrize("expand", [True, False]) -def test_strings_split(data, n, expand): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - ps.str.split(n=n, expand=expand).reset_index(), - gs.str.split(n=n, expand=expand).reset_index(), - check_index_type=False, - ) - - assert_eq( - ps.str.split(",", n=n, expand=expand), - gs.str.split(",", n=n, expand=expand), - ) - assert_eq( - ps.str.split("-", n=n, expand=expand), - gs.str.split("-", n=n, expand=expand), - ) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - [ - "this is a regular sentence", - "https://docs.python.org/3/tutorial/index.html", - None, - ], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -@pytest.mark.parametrize( - "to_strip", ["⅕", None, "123.", ".!? \n\t", "123.!? \n\t", " ", ".", ","] -) -def test_strings_strip_tests(data, to_strip): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(ps.str.strip(to_strip=to_strip), gs.str.strip(to_strip=to_strip)) - assert_eq( - ps.str.rstrip(to_strip=to_strip), gs.str.rstrip(to_strip=to_strip) - ) - assert_eq( - ps.str.lstrip(to_strip=to_strip), gs.str.lstrip(to_strip=to_strip) - ) - - gi = cudf.Index(data) - pi = pd.Index(data) - - assert_eq(pi.str.strip(to_strip=to_strip), gi.str.strip(to_strip=to_strip)) - assert_eq( - pi.str.rstrip(to_strip=to_strip), gi.str.rstrip(to_strip=to_strip) - ) - assert_eq( - pi.str.lstrip(to_strip=to_strip), gi.str.lstrip(to_strip=to_strip) - ) - - -def test_string_strip_fail(): - gs = cudf.Series(["a", "aa", ""]) - with pytest.raises(TypeError): - gs.str.strip(["a"]) - with pytest.raises(TypeError): - gs.str.lstrip(["a"]) - with pytest.raises(TypeError): - gs.str.rstrip(["a"]) - - -@pytest.mark.parametrize( - "data", - [ - ["koala", "fox", "chameleon"], - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - [ - "this is a regular sentence", - "https://docs.python.org/3/tutorial/index.html", - None, - ], - ["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", None], - ], -) -@pytest.mark.parametrize("width", [0, 1, 4, 9, 100]) -@pytest.mark.parametrize("fillchar", ["⅕", "1", ".", "t", " ", ","]) -def test_strings_filling_tests(data, width, fillchar): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - ps.str.center(width=width, fillchar=fillchar), - gs.str.center(width=width, fillchar=fillchar), - ) - assert_eq( - ps.str.ljust(width=width, fillchar=fillchar), - gs.str.ljust(width=width, fillchar=fillchar), - ) - assert_eq( - ps.str.rjust(width=width, fillchar=fillchar), - gs.str.rjust(width=width, fillchar=fillchar), - ) - - gi = cudf.Index(data) - pi = pd.Index(data) - - assert_eq( - pi.str.center(width=width, fillchar=fillchar), - gi.str.center(width=width, fillchar=fillchar), - ) - assert_eq( - pi.str.ljust(width=width, fillchar=fillchar), - gi.str.ljust(width=width, fillchar=fillchar), - ) - assert_eq( - pi.str.rjust(width=width, fillchar=fillchar), - gi.str.rjust(width=width, fillchar=fillchar), - ) - - -@pytest.mark.parametrize( - "data", - [ - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["³", "⅕", ""], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - [" ", "\t\r\n ", ""], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -@pytest.mark.parametrize("width", [0, 1, 4, 6, 9, 100]) -def test_strings_zfill_tests(data, width): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq(ps.str.zfill(width=width), gs.str.zfill(width=width)) - - gi = cudf.Index(data) - pi = pd.Index(data) - - assert_eq(pi.str.zfill(width=width), gi.str.zfill(width=width)) - - -@pytest.mark.parametrize( - "data", - [ - ["A,,B", "1,,5", "3,00,0"], - ["Linda van der Berg", "George Pitt-Rivers"], - ["+23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", None], - ], -) -@pytest.mark.parametrize("width", [0, 1, 4, 9, 100]) -@pytest.mark.parametrize( - "side", - ["left", "right", "both"], -) -@pytest.mark.parametrize("fillchar", [" ", ".", "\n", "+", "\t"]) -def test_strings_pad_tests(data, width, side, fillchar): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - ps.str.pad(width=width, side=side, fillchar=fillchar), - gs.str.pad(width=width, side=side, fillchar=fillchar), - ) - - gi = cudf.Index(data) - pi = pd.Index(data) - - assert_eq( - pi.str.pad(width=width, side=side, fillchar=fillchar), - gi.str.pad(width=width, side=side, fillchar=fillchar), - ) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - # [" ", "\t\r\n ", ""], - ["leopard", "Golden Eagle", "SNAKE", ""], - ["line to be wrapped", "another line to be wrapped"], - ], -) -@pytest.mark.parametrize("width", [1, 4, 8, 12, 100]) -def test_string_wrap(data, width): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - gs.str.wrap( - width=width, - break_long_words=False, - expand_tabs=False, - replace_whitespace=True, - drop_whitespace=True, - break_on_hyphens=False, - ), - ps.str.wrap( - width=width, - break_long_words=False, - expand_tabs=False, - replace_whitespace=True, - drop_whitespace=True, - break_on_hyphens=False, - ), - ) - - gi = cudf.Index(data) - pi = pd.Index(data) - - assert_eq( - gi.str.wrap( - width=width, - break_long_words=False, - expand_tabs=False, - replace_whitespace=True, - drop_whitespace=True, - break_on_hyphens=False, - ), - pi.str.wrap( - width=width, - break_long_words=False, - expand_tabs=False, - replace_whitespace=True, - drop_whitespace=True, - break_on_hyphens=False, - ), - ) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat", "cat\ndog"], - ["line\nto be wrapped", "another\nline\nto be wrapped"], - ], -) -@pytest.mark.parametrize( - "pat", - ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be", "cat$"], -) -@pytest.mark.parametrize("flags", [0, re.MULTILINE, re.DOTALL]) -def test_string_count(data, pat, flags): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - gs.str.count(pat=pat, flags=flags), - ps.str.count(pat=pat, flags=flags), - check_dtype=False, - ) - assert_eq( - cudf.Index(gs).str.count(pat=pat), - pd.Index(ps).str.count(pat=pat), - exact=False, - ) - - -@pytest.mark.parametrize( - "pat, flags", - [ - ("Monkey", 0), - ("on", 0), - ("b", 0), - ("on$", 0), - ("on$", re.MULTILINE), - ("o.*k", re.DOTALL), - ], -) -def test_string_findall(pat, flags): - test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] - ps = pd.Series(test_data) - gs = cudf.Series(test_data) - - expected = ps.str.findall(pat, flags) - actual = gs.str.findall(pat, flags) - assert_eq(expected, actual) - - -def test_string_replace_multi(): - ps = pd.Series(["hello", "goodbye"]) - gs = cudf.Series(["hello", "goodbye"]) - expect = ps.str.replace("e", "E").str.replace("o", "O") - got = gs.str.replace(["e", "o"], ["E", "O"]) - - assert_eq(expect, got) - - ps = pd.Series(["foo", "fuz", np.nan]) - gs = cudf.Series.from_pandas(ps) - - expect = ps.str.replace("f.", "ba", regex=True) - got = gs.str.replace(["f."], ["ba"], regex=True) - assert_eq(expect, got) - - ps = pd.Series(["f.o", "fuz", np.nan]) - gs = cudf.Series.from_pandas(ps) - - expect = ps.str.replace("f.", "ba", regex=False) - 
got = gs.str.replace(["f."], ["ba"], regex=False) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "find", - [ - "(\\d)(\\d)", - "(\\d)(\\d)", - "(\\d)(\\d)", - "(\\d)(\\d)", - "([a-z])-([a-z])", - "([a-z])-([a-zé])", - "([a-z])-([a-z])", - "([a-z])-([a-zé])", - re.compile("([A-Z])(\\d)"), - ], -) -@pytest.mark.parametrize( - "replace", - ["\\1-\\2", "V\\2-\\1", "\\1 \\2", "\\2 \\1", "X\\1+\\2Z", "X\\1+\\2Z"], -) -def test_string_replace_with_backrefs(find, replace): - s = [ - "A543", - "Z756", - "", - None, - "tést-string", - "two-thréé four-fivé", - "abcd-éfgh", - "tést-string-again", - ] - ps = pd.Series(s) - gs = cudf.Series(s) - got = gs.str.replace_with_backrefs(find, replace) - expected = ps.str.replace(find, replace, regex=True) - assert_eq(got, expected) - - got = cudf.Index(gs).str.replace_with_backrefs(find, replace) - expected = pd.Index(ps).str.replace(find, replace, regex=True) - assert_eq(got, expected) - - -def test_string_table_view_creation(): - data = ["hi"] * 25 + [None] * 2027 - psr = pd.Series(data) - gsr = cudf.Series.from_pandas(psr) - - expect = psr[:1] - got = gsr[:1] - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["line to be wrapped", "another line to be wrapped"], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -@pytest.mark.parametrize( - "pat", - ["", None, " ", "a", "abc", "cat", "$", "\n"], -) -def test_string_starts_ends(data, pat): - ps = pd.Series(data) - gs = cudf.Series(data) - - if pat is None: - assert_exceptions_equal( - lfunc=ps.str.startswith, - rfunc=gs.str.startswith, - lfunc_args_and_kwargs=([pat],), - rfunc_args_and_kwargs=([pat],), - ) - assert_exceptions_equal( - lfunc=ps.str.endswith, - rfunc=gs.str.endswith, - lfunc_args_and_kwargs=([pat],), - rfunc_args_and_kwargs=([pat],), - ) - else: - assert_eq( - ps.str.startswith(pat), gs.str.startswith(pat), check_dtype=False - ) - assert_eq( - ps.str.endswith(pat), gs.str.endswith(pat), check_dtype=False - ) - - -@pytest.mark.parametrize( - "data,pat", - [ - ( - ["abc", "xyz", "a", "ab", "123", "097"], - ["abc", "x", "a", "b", "3", "7"], - ), - (["A B", "1.5", "3,000"], ["A ", ".", ","]), - (["23", "³", "⅕", ""], ["23", "³", "⅕", ""]), - ([" ", "\t\r\n ", ""], ["d", "\n ", ""]), - ( - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["$", "$", "a", "<", "(", "#"], - ), - ( - ["line to be wrapped", "another line to be wrapped"], - ["another", "wrapped"], - ), - ( - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["hsdjfk", None, "ll", "+", "-", "w", "-", "én"], - ), - ( - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", None], - ), - ], -) -def test_string_starts_ends_list_like_pat(data, pat): - gs = cudf.Series(data) - - starts_expected = [] - ends_expected = [] - for i in range(len(pat)): - if data[i] is None: - starts_expected.append(None) - ends_expected.append(None) - else: - if pat[i] is None: - starts_expected.append(False) - ends_expected.append(False) - else: - starts_expected.append(data[i].startswith(pat[i])) - ends_expected.append(data[i].endswith(pat[i])) - starts_expected = pd.Series(starts_expected) - ends_expected = pd.Series(ends_expected) - assert_eq(starts_expected, gs.str.startswith(pat), check_dtype=False) - assert_eq(ends_expected, gs.str.endswith(pat), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - ["str_foo", "str_bar", "no_prefix", "", None], - ["foo_str", "bar_str", "no_suffix", "", None], - ], -) -def test_string_remove_suffix_prefix(data): - ps = pd.Series(data) - gs = cudf.Series(data) - - got = gs.str.removeprefix("str_") - expect = ps.str.removeprefix("str_") - assert_eq( - expect, - got, - check_dtype=False, - ) - got = gs.str.removesuffix("_str") - expect = ps.str.removesuffix("_str") - assert_eq( - expect, - got, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["line to be wrapped", "another line to be wrapped"], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -@pytest.mark.parametrize( - "sub", - ["", " ", "a", "abc", "cat", "$", "\n"], -) -def test_string_find(data, sub): - ps = pd.Series(data) - gs = cudf.Series(data) - - got = gs.str.find(sub) - expect = ps.str.find(sub) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.find(sub, start=1) - expect = ps.str.find(sub, start=1) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.find(sub, end=10) - expect = ps.str.find(sub, end=10) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.find(sub, start=2, end=10) - expect = ps.str.find(sub, start=2, end=10) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.rfind(sub) - expect = ps.str.rfind(sub) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.rfind(sub, start=1) - expect = ps.str.rfind(sub, start=1) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.rfind(sub, end=10) - expect = ps.str.rfind(sub, end=10) - assert_eq( - expect, - got, - check_dtype=False, - ) - - got = gs.str.rfind(sub, start=2, end=10) - expect = ps.str.rfind(sub, start=2, end=10) - assert_eq( - expect, - got, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "data,sub,er", - [ - (["abc", "xyz", "a", "ab", "123", "097"], "a", ValueError), - (["A B", "1.5", "3,000"], "abc", ValueError), - (["23", "³", "⅕", ""], "⅕", ValueError), - ([" ", "\t\r\n ", ""], "\n", ValueError), - (["$", "B", "Aab$", "$$ca", "C$B$", "cat"], "$", ValueError), - (["line to be wrapped", "another line to be wrapped"], " ", None), - ( - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - "+", - ValueError, - ), - (["line to be wrapped", "another line to be wrapped"], "", None), - ], -) -def test_string_str_index(data, sub, er): - ps = pd.Series(data) - gs = cudf.Series(data) - - if er is None: - assert_eq(ps.str.index(sub), gs.str.index(sub), check_dtype=False) - - try: - ps.str.index(sub) - except 
er: - pass - else: - assert not er - - try: - gs.str.index(sub) - except er: - pass - else: - assert not er - - -@pytest.mark.parametrize( - "data,sub,er", - [ - (["abc", "xyz", "a", "ab", "123", "097"], "a", ValueError), - (["A B", "1.5", "3,000"], "abc", ValueError), - (["23", "³", "⅕", ""], "⅕", ValueError), - ([" ", "\t\r\n ", ""], "\n", ValueError), - (["$", "B", "Aab$", "$$ca", "C$B$", "cat"], "$", ValueError), - (["line to be wrapped", "another line to be wrapped"], " ", None), - ( - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - "+", - ValueError, - ), - (["line to be wrapped", "another line to be wrapped"], "", None), - ], -) -def test_string_str_rindex(data, sub, er): - ps = pd.Series(data) - gs = cudf.Series(data) - - if er is None: - assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False) - assert_eq( - pd.Index(ps).str.rindex(sub), - cudf.Index(gs).str.rindex(sub), - exact=False, - ) - - try: - ps.str.rindex(sub) - except er: - pass - else: - assert not er - - try: - gs.str.rindex(sub) - except er: - pass - else: - assert not er - - -@pytest.mark.parametrize( - "data,sub,expect", - [ - ( - ["abc", "xyz", "a", "ab", "123", "097"], - ["b", "y", "a", "c", "4", "8"], - [True, True, True, False, False, False], - ), - ( - ["A B", "1.5", "3,000", "23", "³", "⅕"], - ["A B", ".", ",", "1", " ", " "], - [True, True, True, False, False, False], - ), - ( - [" ", "\t", "\r", "\f ", "\n", ""], - ["", "\t", "\r", "xx", "yy", "zz"], - [True, True, True, False, False, False], - ), - ( - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["$", "B", "ab", "*", "@", "dog"], - [True, True, True, False, False, False], - ), - ( - ["hello", "there", "world", "-1234", None, "accént"], - ["lo", "e", "o", "+1234", " ", "e"], - [True, True, True, False, None, False], - ), - ( - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", "", "x", None], - ["A", "B", "C", " ", "y", "e"], - [True, True, True, False, False, None], - ), - ], -) -def test_string_contains_multi(data, sub, expect): - gs = cudf.Series(data) - sub = cudf.Series(sub) - got = gs.str.contains(sub) - expect = cudf.Series(expect) - assert_eq(expect, got, check_dtype=False) - - -# Pandas does not allow 'case' or 'flags' if 'pat' is re.Pattern -# This covers contains, match, count, and replace -@pytest.mark.parametrize( - "pat", - [re.compile("[n-z]"), re.compile("[A-Z]"), re.compile("de"), "A"], -) -@pytest.mark.parametrize("repl", ["xyz", "", " "]) -def test_string_compiled_re(ps_gs, pat, repl): - ps, gs = ps_gs - - expect = ps.str.contains(pat, regex=True) - got = gs.str.contains(pat, regex=True) - assert_eq(expect, got) - - expect = ps.str.match(pat) - got = gs.str.match(pat) - assert_eq(expect, got) - - expect = ps.str.count(pat) - got = gs.str.count(pat) - assert_eq(expect, got, check_dtype=False) - - expect = ps.str.replace(pat, repl, regex=True) - got = gs.str.replace(pat, repl, regex=True) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["line to be wrapped", "another line to be wrapped"], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["1. Ant. ", "2. Bee!\n", "3. 
Cat?\t", None], - ], -) -@pytest.mark.parametrize("pat", ["", " ", "a", "abc", "cat", "$", "\n"]) -def test_string_str_match(data, pat): - ps = pd.Series(data) - gs = cudf.Series(data) - - assert_eq(ps.str.match(pat), gs.str.match(pat)) - assert_eq( - pd.Index(pd.Index(ps).str.match(pat)), cudf.Index(gs).str.match(pat) - ) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "a", "ab", "123", "097"], - ["A B", "1.5", "3,000"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["line to be wrapped", "another line to be wrapped"], - ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ], -) -def test_string_str_translate(data): - ps = pd.Series(data) - gs = cudf.Series(data) - - assert_eq( - ps.str.translate(str.maketrans({"a": "z"})), - gs.str.translate(str.maketrans({"a": "z"})), - ) - assert_eq( - pd.Index(ps).str.translate(str.maketrans({"a": "z"})), - cudf.Index(gs).str.translate(str.maketrans({"a": "z"})), - ) - assert_eq( - ps.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), - gs.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})), - ) - assert_eq( - pd.Index(ps).str.translate( - str.maketrans({"a": "z", "i": "$", "z": "1"}) - ), - cudf.Index(gs).str.translate( - str.maketrans({"a": "z", "i": "$", "z": "1"}) - ), - ) - assert_eq( - ps.str.translate( - str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) - ), - gs.str.translate( - str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) - ), - ) - assert_eq( - pd.Index(ps).str.translate( - str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) - ), - cudf.Index(gs).str.translate( - str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."}) - ), - ) - assert_eq( - ps.str.translate(str.maketrans({"é": "É"})), - gs.str.translate(str.maketrans({"é": "É"})), - ) - - -def test_string_str_filter_characters(): - data = [ - "hello world", - "A+B+C+D", - "?!@#$%^&*()", - "accént", - None, - "$1.50", - "", - ] - gs = cudf.Series(data) - expected = cudf.Series( - ["helloworld", "ABCD", "", "accnt", None, "150", ""] - ) - filter = {"a": "z", "A": "Z", "0": "9"} - assert_eq(expected, gs.str.filter_characters(filter)) - - expected = cudf.Series([" ", "+++", "?!@#$%^&*()", "é", None, "$.", ""]) - assert_eq(expected, gs.str.filter_characters(filter, False)) - - expected = cudf.Series( - ["hello world", "A B C D", " ", "acc nt", None, " 1 50", ""] - ) - assert_eq(expected, gs.str.filter_characters(filter, True, " ")) - - with pytest.raises(TypeError): - gs.str.filter_characters(filter, True, ["a"]) - - -def test_string_str_code_points(): - data = [ - "abc", - "Def", - None, - "jLl", - "dog and cat", - "accénted", - "", - " 1234 ", - "XYZ", - ] - gs = cudf.Series(data) - expected = [ - 97, - 98, - 99, - 68, - 101, - 102, - 106, - 76, - 108, - 100, - 111, - 103, - 32, - 97, - 110, - 100, - 32, - 99, - 97, - 116, - 97, - 99, - 99, - 50089, - 110, - 116, - 101, - 100, - 32, - 49, - 50, - 51, - 52, - 32, - 88, - 89, - 90, - ] - expected = cudf.Series(expected) - - assert_eq(expected, gs.str.code_points(), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - ["http://www.hellow.com", "/home/nvidia/nfs", "123.45 ~ABCDEF"], - ["23", "³", "⅕", ""], - [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ], -) -def test_string_str_url_encode(data): - gs = cudf.Series(data) - - got = gs.str.url_encode() - expected = pd.Series([urllib.parse.quote(url, safe="~") for url in data]) - assert_eq(expected, 
got) - - -@pytest.mark.parametrize( - "data", - [ - [ - "http://www.hellow.com?k1=acc%C3%A9nted&k2=a%2F/b.c", - "%2Fhome%2fnfs", - "987%20ZYX", - ] - ], -) -def test_string_str_decode_url(data): - gs = cudf.Series(data) - - got = gs.str.url_decode() - expected = pd.Series([urllib.parse.unquote(url) for url in data]) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data,dtype", - [ - (["0.1", "10.2", "10.876"], "float"), - (["-0.1", "10.2", "+10.876"], "float"), - (["1", "10.2", "10.876"], "float32"), - (["+123", "6344556789", "0"], "int"), - (["+123", "6344556789", "0"], "uint64"), - (["+123", "6344556789", "0"], "float"), - (["0.1", "-10.2", "10.876", None], "float"), - ], -) -@pytest.mark.parametrize("obj_type", [None, "str", "category"]) -def test_string_typecast(data, obj_type, dtype): - psr = pd.Series(data, dtype=obj_type) - gsr = cudf.Series(data, dtype=obj_type) - - expect = psr.astype(dtype=dtype) - actual = gsr.astype(dtype=dtype) - assert_eq(expect, actual) - - -@pytest.mark.parametrize( - "data,dtype", - [ - (["0.1", "10.2", "10.876"], "int"), - (["1", "10.2", "+10.876"], "int"), - (["abc", "1", "2", " "], "int"), - (["0.1", "10.2", "10.876"], "uint64"), - (["1", "10.2", "+10.876"], "uint64"), - (["abc", "1", "2", " "], "uint64"), - ([" ", "0.1", "2"], "float"), - ([""], "int"), - ([""], "uint64"), - ([" "], "float"), - (["\n"], "int"), - (["\n"], "uint64"), - (["0.1", "-10.2", "10.876", None], "int"), - (["0.1", "-10.2", "10.876", None], "uint64"), - (["0.1", "-10.2", "10.876", None, "ab"], "float"), - (["+", "-"], "float"), - (["+", "-"], "int"), - (["+", "-"], "uint64"), - (["1++++", "--2"], "float"), - (["1++++", "--2"], "int"), - (["1++++", "--2"], "uint64"), - (["++++1", "--2"], "float"), - (["++++1", "--2"], "int"), - (["++++1", "--2"], "uint64"), - ], -) -@pytest.mark.parametrize("obj_type", [None, "str", "category"]) -def test_string_typecast_error(data, obj_type, dtype): - psr = pd.Series(data, dtype=obj_type) - gsr = cudf.Series(data, dtype=obj_type) - - assert_exceptions_equal( - lfunc=psr.astype, - rfunc=gsr.astype, - lfunc_args_and_kwargs=([dtype],), - rfunc_args_and_kwargs=([dtype],), - ) - - -@pytest.mark.parametrize( - "data", - [ - ["f0:18:98:22:c2:e4", "00:00:00:00:00:00", "ff:ff:ff:ff:ff:ff"], - ["f0189822c2e4", "000000000000", "ffffffffffff"], - ["0xf0189822c2e4", "0x000000000000", "0xffffffffffff"], - ["0Xf0189822c2e4", "0X000000000000", "0Xffffffffffff"], - ], -) -def test_string_hex_to_int(data): - gsr = cudf.Series(data) - - expected = cudf.Series([263988422296292, 0, 281474976710655]) - - got = gsr.str.htoi() - assert_eq(expected, got) - - got = gsr.str.hex_to_int() # alias - assert_eq(expected, got) - - -def test_string_ishex(): - gsr = cudf.Series(["", None, "0x01a2b3c4d5e6f", "0789", "ABCDEF0"]) - got = gsr.str.ishex() - expected = cudf.Series([False, None, True, True, True]) - assert_eq(expected, got) - - -def test_string_istimestamp(): - gsr = cudf.Series( - [ - "", - None, - "20201009 123456.987654AM+0100", - "1920111 012345.000001", - "18201235 012345.1", - "20201009 250001.2", - "20201009 129901.3", - "20201009 123499.4", - "20201009 000000.500000PM-0130", - "20201009:000000.600000", - "20201009 010203.700000PM-2500", - "20201009 010203.800000AM+0590", - "20201009 010203.900000AP-0000", - ] - ) - got = gsr.str.istimestamp(r"%Y%m%d %H%M%S.%f%p%z") - expected = cudf.Series( - [ - False, - None, - True, - False, - False, - False, - False, - False, - True, - False, - False, - False, - False, - ] - ) - assert_eq(expected, got) - - -def 
test_istimestamp_empty(): - gsr = cudf.Series([], dtype="object") - result = gsr.str.istimestamp("%Y%m%d") - expected = cudf.Series([], dtype="bool") - assert_eq(result, expected) - - -def test_string_ip4_to_int(): - gsr = cudf.Series( - ["", None, "hello", "41.168.0.1", "127.0.0.1", "41.197.0.1"] - ) - expected = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]) - - got = gsr.str.ip2int() - assert_eq(expected, got) - - got = gsr.str.ip_to_int() # alias - assert_eq(expected, got) - - -def test_string_int_to_ipv4(): - gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]).astype( - "uint32" - ) - expected = cudf.Series( - ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"] - ) - - got = cudf.Series._from_column(gsr._column.int2ip()) - - assert_eq(expected, got) - - -def test_string_isipv4(): - gsr = cudf.Series( - [ - "", - None, - "1...1", - "141.168.0.1", - "127.0.0.1", - "1.255.0.1", - "256.27.28.26", - "25.257.28.26", - "25.27.258.26", - "25.27.28.256", - "-1.0.0.0", - ] - ) - got = gsr.str.isipv4() - expected = cudf.Series( - [ - False, - None, - False, - True, - True, - True, - False, - False, - False, - False, - False, - ] - ) - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"uint32"})) -) -def test_string_int_to_ipv4_dtype_fail(dtype): - gsr = cudf.Series([1, 2, 3, 4, 5]).astype(dtype) - with pytest.raises(TypeError): - gsr._column.int2ip() - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "xyz", "pqr", "tuv"], - ["aaaaaaaaaaaa"], - ["aaaaaaaaaaaa", "bdfeqwert", "poiuytre"], - ], -) -@pytest.mark.parametrize( - "index", - [ - 0, - 1, - 2, - slice(0, 1, 2), - slice(0, 5, 2), - slice(-1, -2, 1), - slice(-1, -2, -1), - slice(-2, -1, -1), - slice(-2, -1, 1), - slice(0), - slice(None), - ], -) -def test_string_str_subscriptable(data, index): - psr = pd.Series(data) - gsr = cudf.Series(data) - - assert_eq(psr.str[index], gsr.str[index]) - - psi = pd.Index(data) - gsi = cudf.Index(data) - - assert_eq(psi.str[index], gsi.str[index]) - - -@pytest.mark.parametrize( - "data,expected", - [ - (["abc", "xyz", "pqr", "tuv"], [3, 3, 3, 3]), - (["aaaaaaaaaaaa"], [12]), - (["aaaaaaaaaaaa", "bdfeqwert", "poiuytre"], [12, 9, 8]), - (["abc", "d", "ef"], [3, 1, 2]), - (["Hello", "Bye", "Thanks 😊"], [5, 3, 11]), - (["\n\t", "Bye", "Thanks 😊"], [2, 3, 11]), - ], -) -def test_string_str_byte_count(data, expected): - sr = cudf.Series(data) - expected = cudf.Series(expected, dtype="int32") - actual = sr.str.byte_count() - assert_eq(expected, actual) - - si = cudf.Index(data) - expected = cudf.Index(expected, dtype="int32") - actual = si.str.byte_count() - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,expected", - [ - (["1", "2", "3", "4", "5"], [True, True, True, True, True]), - ( - ["1.1", "2.0", "3.2", "4.3", "5."], - [False, False, False, False, False], - ), - ( - [".12312", "213123.", ".3223.", "323423.."], - [False, False, False, False], - ), - ([""], [False]), - ( - ["1..1", "+2", "++3", "4++", "-5"], - [False, True, False, False, True], - ), - ( - [ - "24313345435345 ", - "+2632726478", - "++367293674326", - "4382493264392746.237649274692++", - "-578239479238469264", - ], - [False, True, False, False, True], - ), - ( - ["2a2b", "a+b", "++a", "a.b++", "-b"], - [False, False, False, False, False], - ), - ( - ["2a2b", "1+3", "9.0++a", "+", "-"], - [False, False, False, False, False], - ), - ], -) -def test_str_isinteger(data, expected): - sr = cudf.Series(data, dtype="str") - 
expected = cudf.Series(expected) - actual = sr.str.isinteger() - assert_eq(expected, actual) - - sr = cudf.Index(data) - expected = cudf.Index(expected) - actual = sr.str.isinteger() - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,expected", - [ - (["1", "2", "3", "4", "5"], [True, True, True, True, True]), - (["1.1", "2.0", "3.2", "4.3", "5."], [True, True, True, True, True]), - ([""], [False]), - ( - [".12312", "213123.", ".3223.", "323423.."], - [True, True, False, False], - ), - ( - ["1.00.323.1", "+2.1", "++3.30", "4.9991++", "-5.3"], - [False, True, False, False, True], - ), - ( - [ - "24313345435345 ", - "+2632726478", - "++367293674326", - "4382493264392746.237649274692++", - "-578239479238469264", - ], - [False, True, False, False, True], - ), - ( - [ - "24313345435345.32732 ", - "+2632726478.3627638276", - "++0.326294632367293674326", - "4382493264392746.237649274692++", - "-57823947923.8469264", - ], - [False, True, False, False, True], - ), - ( - ["2a2b", "a+b", "++a", "a.b++", "-b"], - [False, False, False, False, False], - ), - ( - ["2a2b", "1+3", "9.0++a", "+", "-"], - [False, False, False, False, False], - ), - ], -) -def test_str_isfloat(data, expected): - sr = cudf.Series(data, dtype="str") - expected = cudf.Series(expected) - actual = sr.str.isfloat() - assert_eq(expected, actual) - - sr = cudf.Index(data) - expected = cudf.Index(expected) - actual = sr.str.isfloat() - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - ["a", "b", "c", "d", "e"], - ["a", "z", ".", '"', "aa", "zz"], - ["aa", "zz"], - ["z", "a", "zz", "aa"], - ["1", "2", "3", "4", "5"], - [""], - ["a"], - ["hello"], - ["small text", "this is a larger text......"], - ["👋🏻", "🔥", "🥇"], - ["This is 💯", "here is a calendar", "📅"], - ["", ".", ";", "[", "]"], - ["\t", ".", "\n", "\n\t", "\t\n"], - ], -) -def test_str_min(data): - psr = pd.Series(data) - sr = cudf.Series(data) - - assert_eq(psr.min(), sr.min()) - - -@pytest.mark.parametrize( - "data", - [ - ["a", "b", "c", "d", "e"], - ["a", "z", ".", '"', "aa", "zz"], - ["aa", "zz"], - ["z", "a", "zz", "aa"], - ["1", "2", "3", "4", "5"], - [""], - ["a"], - ["hello"], - ["small text", "this is a larger text......"], - ["👋🏻", "🔥", "🥇"], - ["This is 💯", "here is a calendar", "📅"], - ["", ".", ";", "[", "]"], - ["\t", ".", "\n", "\n\t", "\t\n"], - ], -) -def test_str_max(data): - psr = pd.Series(data) - sr = cudf.Series(data) - - assert_eq(psr.max(), sr.max()) - - -@pytest.mark.parametrize( - "data", - [ - ["a", "b", "c", "d", "e"], - ["a", "z", ".", '"', "aa", "zz"], - ["aa", "zz"], - ["z", "a", "zz", "aa"], - ["1", "2", "3", "4", "5"], - [""], - ["a"], - ["hello"], - ["small text", "this is a larger text......"], - ["👋🏻", "🔥", "🥇"], - ["This is 💯", "here is a calendar", "📅"], - ["", ".", ";", "[", "]"], - ["\t", ".", "\n", "\n\t", "\t\n"], - ], -) -def test_str_sum(data): - psr = pd.Series(data) - sr = cudf.Series(data) - - assert_eq(psr.sum(), sr.sum()) - - -def test_str_mean(): - sr = cudf.Series(["a", "b", "c", "d", "e"]) - - with pytest.raises(TypeError): - sr.mean() - - -def test_string_product(): - psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = cudf.Series(["1", "2", "3", "4", "5"]) - - assert_exceptions_equal( - lfunc=psr.product, - rfunc=sr.product, - ) - - -def test_string_var(): - psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = cudf.Series(["1", "2", "3", "4", "5"]) - - assert_exceptions_equal(lfunc=psr.var, rfunc=sr.var) - - -def test_string_std(): - psr = pd.Series(["1", "2", "3", "4", "5"]) - sr = 
cudf.Series(["1", "2", "3", "4", "5"]) - - assert_exceptions_equal(lfunc=psr.std, rfunc=sr.std) - - -def test_string_slice_with_mask(): - actual = cudf.Series(["hi", "hello", None]) - expected = actual[0:3] - - assert actual._column.base_size == 3 - assert_eq(actual._column.base_size, expected._column.base_size) - assert_eq(actual._column.null_count, expected._column.null_count) - - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "data", - [ - [ - """ - { - "store":{ - "book":[ - { - "category":"reference", - "author":"Nigel Rees", - "title":"Sayings of the Century", - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - """ - ], - [ - """ - { - "store":{ - "book":[ - { - "category":"reference", - "author":"Nigel Rees", - "title":"Sayings of the Century", - "price":8.95 - } - ] - } - } - """, - """ - { - "store":{ - "book":[ - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - """, - ], - ], -) -def test_string_get_json_object_n(data): - gs = cudf.Series(data) - ps = pd.Series(data) - - assert_eq( - json.loads(gs.str.get_json_object("$.store")[0]), - ps.apply(lambda x: json.loads(x)["store"])[0], - ) - assert_eq( - json.loads(gs.str.get_json_object("$.store.book")[0]), - ps.apply(lambda x: json.loads(x)["store"]["book"])[0], - ) - assert_eq( - gs.str.get_json_object("$.store.book[0].category"), - ps.apply(lambda x: json.loads(x)["store"]["book"][0]["category"]), - ) - - -@pytest.mark.parametrize( - "json_path", ["$.store", "$.store.book", "$.store.book[*].category", " "] -) -def test_string_get_json_object_empty_json_strings(json_path): - gs = cudf.Series( - [ - """ - { - "":{ - "":[ - { - "":"", - "":"", - "":"" - }, - { - "":"fiction", - "":"", - "title":"" - } - ] - } - } - """ - ] - ) - - got = gs.str.get_json_object(json_path) - expect = cudf.Series([None], dtype="object") - - assert_eq(got, expect) - - -@pytest.mark.parametrize("json_path", ["a", ".", "/.store"]) -def test_string_get_json_object_invalid_JSONPath(json_path): - gs = cudf.Series( - [ - """ - { - "store":{ - "book":[ - { - "category":"reference", - "author":"Nigel Rees", - "title":"Sayings of the Century", - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - """ - ] - ) - - with pytest.raises(ValueError): - gs.str.get_json_object(json_path) - - -def test_string_get_json_object_allow_single_quotes(): - gs = cudf.Series( - [ - """ - { - "store":{ - "book":[ - { - 'author':"Nigel Rees", - "title":'Sayings of the Century', - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - 'title':"Sword of Honour", - "price":12.99 - } - ] - } - } - """ - ] - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[0].author", allow_single_quotes=True - ), - cudf.Series(["Nigel Rees"]), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].title", allow_single_quotes=True - ), - cudf.Series(["['Sayings of the Century',\"Sword of Honour\"]"]), - ) - - assert_eq( - gs.str.get_json_object( - "$.store.book[0].author", allow_single_quotes=False - ), - cudf.Series([None]), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].title", allow_single_quotes=False - ), - cudf.Series([None]), - ) - - -def test_string_get_json_object_strip_quotes_from_single_strings(): - gs = cudf.Series( - [ - """ - { - "store":{ - "book":[ - { - "author":"Nigel Rees", - "title":"Sayings 
of the Century", - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - """ - ] - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[0].author", strip_quotes_from_single_strings=True - ), - cudf.Series(["Nigel Rees"]), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].title", strip_quotes_from_single_strings=True - ), - cudf.Series(['["Sayings of the Century","Sword of Honour"]']), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[0].author", strip_quotes_from_single_strings=False - ), - cudf.Series(['"Nigel Rees"']), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].title", strip_quotes_from_single_strings=False - ), - cudf.Series(['["Sayings of the Century","Sword of Honour"]']), - ) - - -def test_string_get_json_object_missing_fields_as_nulls(): - gs = cudf.Series( - [ - """ - { - "store":{ - "book":[ - { - "author":"Nigel Rees", - "title":"Sayings of the Century", - "price":8.95 - }, - { - "category":"fiction", - "author":"Evelyn Waugh", - "title":"Sword of Honour", - "price":12.99 - } - ] - } - } - """ - ] - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[0].category", missing_fields_as_nulls=True - ), - cudf.Series(["null"]), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].category", missing_fields_as_nulls=True - ), - cudf.Series(['[null,"fiction"]']), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[0].category", missing_fields_as_nulls=False - ), - cudf.Series([None]), - ) - assert_eq( - gs.str.get_json_object( - "$.store.book[*].category", missing_fields_as_nulls=False - ), - cudf.Series(['["fiction"]']), - ) - - -def test_str_join_lists_error(): - sr = cudf.Series([["a", "a"], ["b"], ["c"]]) - - with pytest.raises( - ValueError, match="sep_na_rep cannot be defined when `sep` is scalar." 
- ): - sr.str.join(sep="-", sep_na_rep="-") - - with pytest.raises( - TypeError, - match=re.escape( - "string_na_rep should be a string scalar, got [10, 20] of type " - ": " - ), - ): - sr.str.join(string_na_rep=[10, 20]) - - with pytest.raises( - ValueError, - match=re.escape( - "sep should be of similar size to the series, got: 2, expected: 3" - ), - ): - sr.str.join(sep=["=", "-"]) - - with pytest.raises( - TypeError, - match=re.escape( - "sep_na_rep should be a string scalar, got " - "['na'] of type: " - ), - ): - sr.str.join(sep=["-", "+", "."], sep_na_rep=["na"]) - - with pytest.raises( - TypeError, - match=re.escape( - "sep should be an str, array-like or Series object, " - "found " - ), - ): - sr.str.join(sep=cudf.DataFrame()) - - -@pytest.mark.parametrize( - "sr,sep,string_na_rep,sep_na_rep,expected", - [ - ( - cudf.Series([["a", "a"], ["b"], ["c"]]), - "-", - None, - None, - cudf.Series(["a-a", "b", "c"]), - ), - ( - cudf.Series([["a", "b"], [None], [None, "hello", None, "world"]]), - "__", - "=", - None, - cudf.Series(["a__b", None, "=__hello__=__world"]), - ), - ( - cudf.Series( - [ - ["a", None, "b"], - [None], - [None, "hello", None, "world"], - None, - ] - ), - ["-", "_", "**", "!"], - None, - None, - cudf.Series(["a--b", None, "**hello****world", None]), - ), - ( - cudf.Series( - [ - ["a", None, "b"], - [None], - [None, "hello", None, "world"], - None, - ] - ), - ["-", "_", "**", None], - "rep_str", - "sep_str", - cudf.Series( - ["a-rep_str-b", None, "rep_str**hello**rep_str**world", None] - ), - ), - ( - cudf.Series([[None, "a"], [None], None]), - ["-", "_", None], - "rep_str", - None, - cudf.Series(["rep_str-a", None, None]), - ), - ( - cudf.Series([[None, "a"], [None], None]), - ["-", "_", None], - None, - "sep_str", - cudf.Series(["-a", None, None]), - ), - ], -) -def test_str_join_lists(sr, sep, string_na_rep, sep_na_rep, expected): - actual = sr.str.join( - sep=sep, string_na_rep=string_na_rep, sep_na_rep=sep_na_rep - ) - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "patterns, expected", - [ - ( - lambda: ["a", "s", "g", "i", "o", "r"], - [ - [-1, 0, 5, 3, -1, 2], - [-1, -1, -1, -1, 1, -1], - [2, 0, -1, -1, -1, 3], - [-1, -1, -1, 0, -1, -1], - ], - ), - ( - lambda: cudf.Series(["a", "string", "g", "inn", "o", "r", "sea"]), - [ - [-1, 0, 5, -1, -1, 2, -1], - [-1, -1, -1, -1, 1, -1, -1], - [2, -1, -1, -1, -1, 3, 0], - [-1, -1, -1, -1, -1, -1, -1], - ], - ), - ], -) -def test_str_find_multiple(patterns, expected): - s = cudf.Series(["strings", "to", "search", "in"]) - t = patterns() - - expected = cudf.Series(expected) - - # We convert to pandas because find_multiple returns ListDtype(int32) - # and expected is ListDtype(int64). - # Currently there is no easy way to type-cast these to match. 
- assert_eq(s.str.find_multiple(t).to_pandas(), expected.to_pandas()) - - s = cudf.Index(s) - t = cudf.Index(t) - - expected.index = s - - assert_eq(s.str.find_multiple(t).to_pandas(), expected.to_pandas()) - - -def test_str_find_multiple_error(): - s = cudf.Series(["strings", "to", "search", "in"]) - with pytest.raises( - TypeError, - match=re.escape( - "patterns should be an array-like or a Series object, found " - "" - ), - ): - s.str.find_multiple("a") - - t = cudf.Series([1, 2, 3]) - with pytest.raises( - TypeError, - match=re.escape("patterns can only be of 'string' dtype, got: int64"), - ): - s.str.find_multiple(t) - - -def test_str_iterate_error(): - s = cudf.Series(["abc", "xyz"]) - with pytest.raises(TypeError): - iter(s.str) - - -def test_string_reduction_error(): - s = cudf.Series([None, None], dtype="str") - ps = s.to_pandas(nullable=True) - assert_exceptions_equal( - s.any, - ps.any, - lfunc_args_and_kwargs=([], {"skipna": False}), - rfunc_args_and_kwargs=([], {"skipna": False}), - ) - - assert_exceptions_equal( - s.all, - ps.all, - lfunc_args_and_kwargs=([], {"skipna": False}), - rfunc_args_and_kwargs=([], {"skipna": False}), - ) diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py deleted file mode 100644 index 69876d97aad..00000000000 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ /dev/null @@ -1,374 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -import numba -import numpy as np -import pandas as pd -import pytest -from numba import cuda -from numba.core.typing import signature as nb_signature -from numba.types import CPointer, void - -import rmm - -import cudf -from cudf._lib.strings_udf import ( - column_from_udf_string_array, - column_to_string_view_array, -) -from cudf.core.udf.strings_typing import ( - str_view_arg_handler, - string_view, - udf_string, -) -from cudf.core.udf.utils import _get_extensionty_size, _ptx_file -from cudf.testing import assert_eq -from cudf.testing._utils import sv_to_udf_str -from cudf.utils._numba import _CUDFNumbaConfig - -_PTX_FILE = _ptx_file() - - -def get_kernels(func, dtype, size): - """ - Create two kernels for testing a single scalar string function. - The first tests the function's action on a string_view object and - the second tests the same except using a udf_string object. - Allocates an output vector with a dtype specified by the caller - The returned kernels execute the input function on each data - element of the input and returns the output into the output vector - """ - - func = cuda.jit(device=True)(func) - - if dtype == "str": - outty = CPointer(udf_string) - else: - outty = numba.np.numpy_support.from_dtype(dtype)[::1] - sig = nb_signature(void, CPointer(string_view), outty) - - @cuda.jit(sig, link=[_PTX_FILE], extensions=[str_view_arg_handler]) - def string_view_kernel(input_strings, output_col): - id = cuda.grid(1) - if id < size: - st = input_strings[id] - result = func(st) - output_col[id] = result - - @cuda.jit(sig, link=[_PTX_FILE], extensions=[str_view_arg_handler]) - def udf_string_kernel(input_strings, output_col): - # test the string function with a udf_string as input - id = cuda.grid(1) - if id < size: - st = input_strings[id] - st = sv_to_udf_str(st) - result = func(st) - output_col[id] = result - - return string_view_kernel, udf_string_kernel - - -def run_udf_test(data, func, dtype): - """ - Run a test kernel on a set of input data - Converts the input data to a cuDF column and subsequently - to an array of cudf::string_view objects. 
It then creates - a CUDA kernel using get_kernel which calls the input function, - and then assembles the result back into a cuDF series before - comparing it with the equivalent pandas result - """ - if dtype == "str": - output = rmm.DeviceBuffer( - size=len(data) * _get_extensionty_size(udf_string) - ) - else: - dtype = np.dtype(dtype) - output = cudf.core.column.column_empty(len(data), dtype=dtype) - - cudf_column = cudf.core.column.as_column(data) - str_views = column_to_string_view_array(cudf_column) - sv_kernel, udf_str_kernel = get_kernels(func, dtype, len(data)) - - expect = pd.Series(data).apply(func) - with _CUDFNumbaConfig(): - sv_kernel.forall(len(data))(str_views, output) - if dtype == "str": - result = column_from_udf_string_array(output) - else: - result = output - - got = cudf.Series._from_column(result.astype(dtype)) - assert_eq(expect, got, check_dtype=False) - with _CUDFNumbaConfig(): - udf_str_kernel.forall(len(data))(str_views, output) - if dtype == "str": - result = column_from_udf_string_array(output) - else: - result = output - - got = cudf.Series._from_column(result.astype(dtype)) - assert_eq(expect, got, check_dtype=False) - - -@pytest.fixture(scope="module") -def data(): - return [ - "abc", - "ABC", - "AbC", - "123", - "123aBc", - "123@.!", - "", - "rapids ai", - "gpu", - "True", - "False", - "1.234", - ".123a", - "0.013", - "1.0", - "01", - "20010101", - "cudf", - "cuda", - "gpu", - "This Is A Title", - "This is Not a Title", - "Neither is This a Title", - "NoT a TiTlE", - "123 Title Works", - ] - - -@pytest.fixture(params=["cudf", "cuda", "gpucudf", "abc"]) -def rhs(request): - return request.param - - -@pytest.fixture(params=["c", "cu", "2", "abc", "", "gpu"]) -def substr(request): - return request.param - - -def test_string_udf_eq(data, rhs): - def func(st): - return st == rhs - - run_udf_test(data, func, "bool") - - -def test_string_udf_ne(data, rhs): - def func(st): - return st != rhs - - run_udf_test(data, func, "bool") - - -def test_string_udf_ge(data, rhs): - def func(st): - return st >= rhs - - run_udf_test(data, func, "bool") - - -def test_string_udf_le(data, rhs): - def func(st): - return st <= rhs - - run_udf_test(data, func, "bool") - - -def test_string_udf_gt(data, rhs): - def func(st): - return st > rhs - - run_udf_test(data, func, "bool") - - -def test_string_udf_lt(data, rhs): - def func(st): - return st < rhs - - run_udf_test(data, func, "bool") - - -def test_string_udf_contains(data, substr): - def func(st): - return substr in st - - run_udf_test(data, func, "bool") - - -def test_string_udf_count(data, substr): - def func(st): - return st.count(substr) - - run_udf_test(data, func, "int32") - - -def test_string_udf_find(data, substr): - def func(st): - return st.find(substr) - - run_udf_test(data, func, "int32") - - -def test_string_udf_endswith(data, substr): - def func(st): - return st.endswith(substr) - - run_udf_test(data, func, "bool") - - -def test_string_udf_isalnum(data): - def func(st): - return st.isalnum() - - run_udf_test(data, func, "bool") - - -def test_string_udf_isalpha(data): - def func(st): - return st.isalpha() - - run_udf_test(data, func, "bool") - - -def test_string_udf_isdecimal(data): - def func(st): - return st.isdecimal() - - run_udf_test(data, func, "bool") - - -def test_string_udf_isdigit(data): - def func(st): - return st.isdigit() - - run_udf_test(data, func, "bool") - - -def test_string_udf_islower(data): - def func(st): - return st.islower() - - run_udf_test(data, func, "bool") - - -def 
test_string_udf_isnumeric(data): - def func(st): - return st.isnumeric() - - run_udf_test(data, func, "bool") - - -def test_string_udf_isspace(data): - def func(st): - return st.isspace() - - run_udf_test(data, func, "bool") - - -def test_string_udf_isupper(data): - def func(st): - return st.isupper() - - run_udf_test(data, func, "bool") - - -def test_string_udf_istitle(data): - def func(st): - return st.istitle() - - run_udf_test(data, func, "bool") - - -def test_string_udf_len(data): - def func(st): - return len(st) - - run_udf_test(data, func, "int64") - - -def test_string_udf_rfind(data, substr): - def func(st): - return st.rfind(substr) - - run_udf_test(data, func, "int32") - - -def test_string_udf_startswith(data, substr): - def func(st): - return st.startswith(substr) - - run_udf_test(data, func, "bool") - - -def test_string_udf_return_string(data): - def func(st): - return st - - run_udf_test(data, func, "str") - - -@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) -def test_string_udf_strip(data, strip_char): - def func(st): - return st.strip(strip_char) - - run_udf_test(data, func, "str") - - -@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) -def test_string_udf_lstrip(data, strip_char): - def func(st): - return st.lstrip(strip_char) - - run_udf_test(data, func, "str") - - -@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) -def test_string_udf_rstrip(data, strip_char): - def func(st): - return st.rstrip(strip_char) - - run_udf_test(data, func, "str") - - -def test_string_udf_upper(data): - def func(st): - return st.upper() - - run_udf_test(data, func, "str") - - -def test_string_udf_lower(data): - def func(st): - return st.lower() - - run_udf_test(data, func, "str") - - -@pytest.mark.parametrize("concat_char", ["1", "a", "12", " ", "", ".", "@"]) -def test_string_udf_concat(data, concat_char): - def func(st): - return st + concat_char - - run_udf_test(data, func, "str") - - -@pytest.mark.parametrize("concat_char", ["1", "a", "12", " ", "", ".", "@"]) -def test_string_udf_concat_reflected(data, concat_char): - def func(st): - return concat_char + st - - run_udf_test(data, func, "str") - - -@pytest.mark.parametrize("to_replace", ["a", "1", "", "@"]) -@pytest.mark.parametrize("replacement", ["a", "1", "", "@"]) -def test_string_udf_replace(data, to_replace, replacement): - def func(st): - return st.replace(to_replace, replacement) - - run_udf_test(data, func, "str") diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py deleted file mode 100644 index e91edc9eec6..00000000000 --- a/python/cudf/cudf/tests/test_struct.py +++ /dev/null @@ -1,451 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.core.dtypes import StructDtype -from cudf.testing import assert_eq -from cudf.testing._utils import DATETIME_TYPES, TIMEDELTA_TYPES - - -@pytest.mark.parametrize( - "data", - [ - [{}], - [{"a": None}], - [{"a": 1}], - [{"a": "one"}], - [{"a": 1}, {"a": 2}], - [{"a": 1, "b": "one"}, {"a": 2, "b": "two"}], - [{"b": "two", "a": None}, None, {"a": "one", "b": "two"}], - ], -) -def test_create_struct_series(data): - expect = pd.Series(data) - got = cudf.Series(data) - assert_eq(expect, got, check_dtype=False) - - -def test_struct_of_struct_copy(): - sr = cudf.Series([{"a": {"b": 1}}]) - assert_eq(sr, sr.copy()) - - -def test_struct_of_struct_loc(): - df = cudf.DataFrame({"col": [{"a": {"b": 1}}]}) - expect = cudf.Series([{"a": {"b": 1}}], name="col") - assert_eq(expect, df["col"]) - - -@pytest.mark.parametrize( - "key, expect", [(0, [1, 3]), (1, [2, 4]), ("a", [1, 3]), ("b", [2, 4])] -) -def test_struct_for_field(key, expect): - sr = cudf.Series([{"a": 1, "b": 2}, {"a": 3, "b": 4}]) - expect = cudf.Series(expect) - got = sr.struct.field(key) - assert_eq(expect, got) - - -@pytest.mark.parametrize("input_obj", [[{"a": 1, "b": cudf.NA, "c": 3}]]) -def test_series_construction_with_nulls(input_obj): - expect = pa.array(input_obj, from_pandas=True) - got = cudf.Series(input_obj).to_arrow() - - assert expect == got - - -@pytest.mark.parametrize( - "fields", - [ - {"a": np.dtype(np.int64)}, - {"a": np.dtype(np.int64), "b": None}, - { - "a": cudf.ListDtype(np.dtype(np.int64)), - "b": cudf.Decimal64Dtype(1, 0), - }, - { - "a": cudf.ListDtype(cudf.StructDtype({"b": np.dtype(np.int64)})), - "b": cudf.ListDtype(cudf.ListDtype(np.dtype(np.int64))), - }, - ], -) -def test_serialize_struct_dtype(fields): - dtype = cudf.StructDtype(fields) - recreated = dtype.__class__.deserialize(*dtype.serialize()) - assert recreated == dtype - - -@pytest.mark.parametrize( - "series, expected", - [ - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": 1}, - {}, - ], - {"a": "Hello world", "b": [], "c": cudf.NA}, - ), - ([{}], {}), - ( - [{"b": True}, {"a": 1, "c": [1, 2, 3], "d": "1", "b": False}], - {"a": cudf.NA, "c": cudf.NA, "d": cudf.NA, "b": True}, - ), - ], -) -def test_struct_getitem(series, expected): - sr = cudf.Series(series) - assert sr[0] == expected - - -@pytest.mark.parametrize( - "data, item", - [ - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, - {"a": "abcde", "b": [4, 5, 6], "c": 9}, - ], - {"a": "Hello world", "b": [], "c": cudf.NA}, - ), - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, - {"a": "abcde", "b": [4, 5, 6], "c": 9}, - ], - {}, - ), - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, - {"a": "abcde", "b": [4, 5, 6], "c": 9}, - ], - cudf.NA, - ), - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": cudf.NA}, - {"a": "abcde", "b": [4, 5, 6], "c": 9}, - ], - {"a": "Second element", "b": [1, 2], "c": 1000}, - ), - ], -) -def test_struct_setitem(data, item): - sr = cudf.Series(data) - sr[1] = item - data[1] = item - expected = cudf.Series(data) - assert sr.to_arrow() == expected.to_arrow() - - -@pytest.mark.parametrize( - "data", - [ - {"a": 1, "b": "rapids", "c": [1, 2, 3, 4]}, - {"a": "Hello"}, - ], -) -def test_struct_scalar_host_construction(data): - slr = cudf.Scalar(data) - assert slr.value == data - assert 
list(slr.device_value.value.values()) == list(data.values()) - - -@pytest.mark.parametrize( - ("data", "dtype"), - [ - ( - {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, - cudf.StructDtype( - { - "a": np.dtype(np.int64), - "b": np.dtype(np.str_), - "c": cudf.ListDtype(np.dtype(np.int64)), - "d": np.dtype(np.int64), - } - ), - ), - ( - {"b": [], "c": [1, 2, 3]}, - cudf.StructDtype( - { - "b": cudf.ListDtype(np.dtype(np.int64)), - "c": cudf.ListDtype(np.dtype(np.int64)), - } - ), - ), - ], -) -def test_struct_scalar_host_construction_no_dtype_inference(data, dtype): - # cudf cannot infer the dtype of the scalar when it contains only nulls or - # is empty. - slr = cudf.Scalar(data, dtype=dtype) - assert slr.value == data - assert list(slr.device_value.value.values()) == list(data.values()) - - -def test_struct_scalar_null(): - slr = cudf.Scalar(cudf.NA, dtype=StructDtype) - assert slr.device_value.value is cudf.NA - - -def test_struct_explode(): - s = cudf.Series([], dtype=cudf.StructDtype({})) - expect = cudf.DataFrame({}) - assert_eq(expect, s.struct.explode()) - - s = cudf.Series( - [ - {"a": 1, "b": "x"}, - {"a": 2, "b": "y"}, - {"a": 3, "b": "z"}, - {"a": 4, "b": "a"}, - ] - ) - expect = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "y", "z", "a"]}) - got = s.struct.explode() - assert_eq(expect, got) - - # check that a copy was made: - got["a"][0] = 5 - assert_eq(s.struct.explode(), expect) - - -def test_dataframe_to_struct(): - df = cudf.DataFrame() - expect = cudf.Series(dtype=cudf.StructDtype({})) - got = df.to_struct() - assert_eq(expect, got) - - df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}) - expect = cudf.Series( - [{"a": 1, "b": "x"}, {"a": 2, "b": "y"}, {"a": 3, "b": "z"}] - ) - got = df.to_struct() - assert_eq(expect, got) - - # check that a copy was made: - df["a"][0] = 5 - assert_eq(got, expect) - - # check that a non-string (but convertible to string) named column can be - # converted to struct - df = cudf.DataFrame([[1, 2], [3, 4]], columns=[(1, "b"), 0]) - expect = cudf.Series([{"(1, 'b')": 1, "0": 2}, {"(1, 'b')": 3, "0": 4}]) - with pytest.warns(UserWarning, match="will be casted"): - got = df.to_struct() - assert_eq(got, expect) - - -@pytest.mark.parametrize( - "series, slce", - [ - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": 1}, - {}, - None, - ], - slice(1, None), - ), - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": 1}, - {}, - None, - {"d": ["Hello", "rapids"]}, - None, - cudf.NA, - ], - slice(1, 5), - ), - ( - [ - {"a": "Hello world", "b": []}, - {"a": "CUDF", "b": [1, 2, 3], "c": 1}, - {}, - None, - {"c": 5}, - None, - cudf.NA, - ], - slice(None, 4), - ), - ([{"a": {"b": 42, "c": -1}}, {"a": {"b": 0, "c": None}}], slice(0, 1)), - ], -) -def test_struct_slice(series, slce): - got = cudf.Series(series)[slce] - expected = cudf.Series(series[slce]) - assert got.to_arrow() == expected.to_arrow() - - -def test_struct_slice_nested_struct(): - data = [ - {"a": {"b": 42, "c": "abc"}}, - {"a": {"b": 42, "c": "hello world"}}, - ] - - got = cudf.Series(data)[0:1] - expect = cudf.Series(data[0:1]) - assert got.to_arrow() == expect.to_arrow() - - -@pytest.mark.parametrize( - "data", - [ - [{}], - [{"a": None}], - [{"a": 1}], - [{"a": "one"}], - [{"a": 1}, {"a": 2}], - [{"a": 1, "b": "one"}, {"a": 2, "b": "two"}], - [{"b": "two", "a": None}, None, {"a": "one", "b": "two"}], - ], -) -def test_struct_field_errors(data): - got = cudf.Series(data) - - with pytest.raises(KeyError): - 
got.struct.field("notWithinFields") - - with pytest.raises(IndexError): - got.struct.field(100) - - -@pytest.mark.parametrize("dtype", DATETIME_TYPES + TIMEDELTA_TYPES) -def test_struct_with_datetime_and_timedelta(dtype): - df = cudf.DataFrame( - { - "a": [12, 232, 2334], - "datetime": cudf.Series([23432, 3432423, 324324], dtype=dtype), - } - ) - series = df.to_struct() - a_array = np.array([12, 232, 2334]) - datetime_array = np.array([23432, 3432423, 324324]).astype(dtype) - - actual = series.to_pandas() - values_list = [] - for i, val in enumerate(a_array): - values_list.append({"a": val, "datetime": datetime_array[i]}) - - expected = pd.Series(values_list) - assert_eq(expected, actual) - - -def test_struct_int_values(): - series = cudf.Series( - [{"a": 1, "b": 2}, {"a": 10, "b": None}, {"a": 5, "b": 6}] - ) - actual_series = series.to_pandas() - - assert isinstance(actual_series[0]["b"], int) - assert isinstance(actual_series[1]["b"], type(None)) - assert isinstance(actual_series[2]["b"], int) - - -def test_nested_struct_from_pandas_empty(): - # tests constructing nested structs columns that would result in - # libcudf EMPTY type child columns inheriting their parent's null - # mask. See GH PR: #10761 - pdf = pd.Series([[{"c": {"x": None}}], [{"c": None}]]) - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf, gdf) - - -def _nested_na_replace(struct_scalar): - """ - Replace `cudf.NA` with `None` in the dict - """ - for key, value in struct_scalar.items(): - if value is cudf.NA: - struct_scalar[key] = None - return struct_scalar - - -@pytest.mark.parametrize( - "data, idx, expected", - [ - ( - [{"f2": {"a": "sf21"}, "f1": "a"}, {"f1": "sf12", "f2": None}], - 0, - {"f1": "a", "f2": {"a": "sf21"}}, - ), - ( - [ - {"f2": {"a": "sf21"}}, - {"f1": "sf12", "f2": None}, - ], - 0, - {"f1": cudf.NA, "f2": {"a": "sf21"}}, - ), - ( - [{"a": "123"}, {"a": "sf12", "b": {"a": {"b": "c"}}}], - 1, - {"a": "sf12", "b": {"a": {"b": "c"}}}, - ), - ], -) -def test_nested_struct_extract_host_scalars(data, idx, expected): - series = cudf.Series(data) - - assert _nested_na_replace(series[idx]) == _nested_na_replace(expected) - - -def test_struct_memory_usage(): - s = cudf.Series([{"a": 1, "b": 10}, {"a": 2, "b": 20}, {"a": 3, "b": 30}]) - df = s.struct.explode() - - assert_eq(s.memory_usage(), df.memory_usage().sum()) - - -def test_struct_with_null_memory_usage(): - df = cudf.DataFrame( - { - "a": cudf.Series([1, 2, -1, -1, 3], dtype="int64"), - "b": cudf.Series([10, 20, -1, -1, 30], dtype="int64"), - } - ) - s = df.to_struct() - assert s.memory_usage() == 80 - - s[2:4] = None - assert s.memory_usage() == 272 - - -@pytest.mark.parametrize( - "indices", - [slice(0, 3), slice(1, 4), slice(None, None, 2), slice(1, None, 2)], - ids=[":3", "1:4", "0::2", "1::2"], -) -@pytest.mark.parametrize( - "values", - [[None, {}, {}, None], [{}, {}, {}, {}]], - ids=["nulls", "no_nulls"], -) -def test_struct_empty_children_slice(indices, values): - s = cudf.Series(values) - actual = s.iloc[indices] - expect = cudf.Series(values[indices], index=range(len(values))[indices]) - assert_eq(actual, expect) - - -def test_struct_iterate_error(): - s = cudf.Series( - [{"f2": {"a": "sf21"}, "f1": "a"}, {"f1": "sf12", "f2": None}] - ) - with pytest.raises(TypeError): - iter(s.struct) diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py deleted file mode 100644 index c3620db3880..00000000000 --- a/python/cudf/cudf/tests/test_testing.py +++ /dev/null @@ -1,437 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA 
CORPORATION. - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.core.column.column import as_column -from cudf.testing import ( - assert_frame_equal, - assert_index_equal, - assert_series_equal, -) -from cudf.testing._utils import ( - NUMERIC_TYPES, - OTHER_TYPES, - assert_column_memory_eq, - assert_column_memory_ne, -) -from cudf.testing.testing import assert_column_equal, assert_eq - - -@pytest.fixture( - params=[ - pa.array([*range(10)]), - pa.array(["hello", "world", "rapids", "AI"]), - pa.array([[1, 2, 3], [4, 5], [6], [], [7]]), - pa.array([{"f0": "hello", "f1": 42}, {"f0": "world", "f1": 3}]), - ] -) -def arrow_arrays(request): - return request.param - - -@pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]]) -@pytest.mark.parametrize("exact", ["equiv", True, False]) -@pytest.mark.parametrize("check_names", [True, False]) -@pytest.mark.parametrize("rname", ["a", "b"]) -@pytest.mark.parametrize("check_categorical", [True, False]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"] -) -def test_basic_assert_index_equal( - rdata, - exact, - check_names, - rname, - check_categorical, - dtype, -): - p_left = pd.Index([1, 2, 3], name="a", dtype=dtype) - p_right = pd.Index(rdata, name=rname, dtype=dtype) - - left = cudf.from_pandas(p_left) - right = cudf.from_pandas(p_right) - - kind = None - try: - pd.testing.assert_index_equal( - p_left, - p_right, - exact=exact, - check_names=check_names, - check_categorical=check_categorical, - ) - except BaseException as e: - kind = type(e) - msg = str(e) - - if kind is not None: - if (kind == TypeError) and ( - msg - == ( - "Categoricals can only be compared " - "if 'categories' are the same." - ) - ): - kind = AssertionError - with pytest.raises(kind): - assert_index_equal( - left, - right, - exact=exact, - check_names=check_names, - check_categorical=check_categorical, - ) - else: - assert_index_equal( - left, - right, - exact=exact, - check_names=check_names, - check_categorical=check_categorical, - ) - - -@pytest.mark.parametrize("rdata", [[1, 2, 5], [1, 2, 6], [1, 2, 5, 6]]) -@pytest.mark.parametrize("check_names", [True, False]) -@pytest.mark.parametrize("rname", ["a", "b"]) -@pytest.mark.parametrize("check_category_order", [True, False]) -@pytest.mark.parametrize("check_categorical", [True, False]) -@pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + OTHER_TYPES + ["datetime64[ns]"] -) -def test_basic_assert_series_equal( - rdata, - rname, - check_names, - check_category_order, - check_categorical, - dtype, -): - p_left = pd.Series([1, 2, 3], name="a", dtype=dtype) - p_right = pd.Series(rdata, name=rname, dtype=dtype) - - left = cudf.from_pandas(p_left) - right = cudf.from_pandas(p_right) - - kind = None - try: - pd.testing.assert_series_equal( - p_left, - p_right, - check_names=check_names, - check_categorical=check_categorical, - check_category_order=check_category_order, - ) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_series_equal( - left, - right, - check_names=check_names, - check_categorical=check_categorical, - check_category_order=check_category_order, - ) - else: - assert_series_equal( - left, - right, - check_names=check_names, - check_categorical=check_categorical, - check_category_order=check_category_order, - ) - - -@pytest.mark.parametrize( - "other", - [ - as_column(["1", "2", "3"]), - as_column([[1], [2], [3]]), - as_column([{"a": 1}, {"a": 2}, {"a": 3}]), - 
], -) -def test_assert_column_equal_dtype_edge_cases(other): - # string series should be 100% different - # even when the elements are the same - base = as_column([1, 2, 3]) - - # for these dtypes, the diff should always be 100% regardless of the values - with pytest.raises( - AssertionError, match=r".*values are different \(100.0 %\).*" - ): - assert_column_equal(base, other, check_dtype=False) - - # the exceptions are the empty and all null cases - assert_column_equal(base.slice(0, 0), other.slice(0, 0), check_dtype=False) - assert_column_equal(other.slice(0, 0), base.slice(0, 0), check_dtype=False) - - base = as_column(cudf.NA, length=len(base), dtype=base.dtype) - other = as_column(cudf.NA, length=len(other), dtype=other.dtype) - - assert_column_equal(base, other, check_dtype=False) - assert_column_equal(other, base, check_dtype=False) - - -@pytest.mark.parametrize( - "rdtype", [["int8", "int16", "int64"], ["int64", "int16", "int8"]] -) -@pytest.mark.parametrize("rname", [["a", "b", "c"], ["b", "c", "a"]]) -@pytest.mark.parametrize("index", [[1, 2, 3], [3, 2, 1]]) -@pytest.mark.parametrize("check_exact", [True, False]) -@pytest.mark.parametrize("check_dtype", [True, False]) -@pytest.mark.parametrize("check_names", [True, False]) -@pytest.mark.parametrize("check_like", [True, False]) -@pytest.mark.parametrize("mismatch", [True, False]) -def test_basic_assert_frame_equal( - rdtype, - rname, - index, - check_exact, - check_dtype, - check_names, - check_like, - mismatch, -): - data = [1, 2, 1] - p_left = pd.DataFrame(index=[1, 2, 3]) - p_left["a"] = np.array(data, dtype="int8") - p_left["b"] = np.array(data, dtype="int16") - if mismatch: - p_left["c"] = np.array([1, 2, 3], dtype="int64") - else: - p_left["c"] = np.array(data, dtype="int64") - - p_right = pd.DataFrame(index=index) - for dtype, name in zip(rdtype, rname): - p_right[name] = np.array(data, dtype=dtype) - - left = cudf.from_pandas(p_left) - right = cudf.from_pandas(p_right) - - kind = None - try: - pd.testing.assert_frame_equal( - p_left, - p_right, - check_exact=check_exact, - check_dtype=check_dtype, - check_names=check_names, - check_like=check_like, - ) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_frame_equal( - left, - right, - check_exact=check_exact, - check_dtype=check_dtype, - check_names=check_names, - check_like=check_like, - ) - else: - assert_frame_equal( - left, - right, - check_exact=check_exact, - check_dtype=check_dtype, - check_names=check_names, - check_like=check_like, - ) - - -@pytest.mark.parametrize("rdata", [[0, 1, 2, 3], [0, 1, 2, 4]]) -@pytest.mark.parametrize("check_datetimelike_compat", [True, False]) -def test_datetime_like_compaibility(rdata, check_datetimelike_compat): - psr1 = pd.Series([0, 1, 2, 3], dtype="datetime64[ns]") - psr2 = pd.Series(rdata, dtype="datetime64[ns]").astype("str") - - sr1 = cudf.from_pandas(psr1) - sr2 = cudf.from_pandas(psr2) - - kind = None - try: - pd.testing.assert_series_equal( - psr1, psr2, check_datetimelike_compat=check_datetimelike_compat - ) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_series_equal( - sr1, sr2, check_datetimelike_compat=check_datetimelike_compat - ) - else: - assert_series_equal( - sr1, sr2, check_datetimelike_compat=check_datetimelike_compat - ) - - -@pytest.mark.parametrize( - "rdata", - [ - [[0, 1, 2, 3], ["G", "O", "N", "E"]], - [[0, 1, 2, 4], ["G", "O", "N", "E"]], - ], -) -def test_multiindex_equal(rdata): - pidx1 = 
pd.MultiIndex.from_arrays( - [[0, 1, 2, 3], ["G", "O", "N", "E"]], names=("n", "id") - ) - pidx2 = pd.MultiIndex.from_arrays(rdata, names=("n", "id")) - - idx1 = cudf.from_pandas(pidx1) - idx2 = cudf.from_pandas(pidx2) - - kind = None - try: - pd.testing.assert_index_equal(pidx1, pidx2) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_index_equal(idx1, idx2) - else: - assert_index_equal(idx1, idx2) - - -@pytest.mark.parametrize("dtype", ["int8", "uint8", "float32"]) -@pytest.mark.parametrize("check_exact", [True, False]) -@pytest.mark.parametrize("check_dtype", [True, False]) -def test_series_different_type_cases(dtype, check_exact, check_dtype): - data = [0, 1, 2, 3] - - psr1 = pd.Series(data, dtype="uint8") - psr2 = pd.Series(data, dtype=dtype) - - sr1 = cudf.from_pandas(psr1) - sr2 = cudf.from_pandas(psr2) - - kind = None - try: - pd.testing.assert_series_equal( - psr1, psr2, check_exact=check_exact, check_dtype=check_dtype - ) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_series_equal( - sr1, sr2, check_exact=check_exact, check_dtype=check_dtype - ) - else: - assert_series_equal( - sr1, sr2, check_exact=check_exact, check_dtype=check_dtype - ) - - -@pytest.mark.parametrize( - "dtype", - ["int8", "int16", "int32", "int64"], -) -@pytest.mark.parametrize("exact", ["equiv", True, False]) -def test_range_index_and_int_index_eqaulity(dtype, exact): - pidx1 = pd.RangeIndex(0, stop=5, step=1) - pidx2 = pd.Index([0, 1, 2, 3, 4]) - idx1 = cudf.from_pandas(pidx1) - idx2 = cudf.Index([0, 1, 2, 3, 4], dtype=dtype) - - kind = None - try: - pd.testing.assert_index_equal(pidx1, pidx2, exact=exact) - except BaseException as e: - kind = type(e) - - if kind is not None: - with pytest.raises(kind): - assert_index_equal(idx1, idx2, exact=exact) - else: - assert_index_equal(idx1, idx2, exact=exact) - - -@pytest.mark.parametrize( - "left, right", - [ - (1493282, 1493282), - (1493282.0, 1493282.0 + 1e-8), - ("abc", "abc"), - (0, np.array(0)), - ( - np.datetime64(123456, "ns"), - pd.Timestamp(np.datetime64(123456, "ns")), - ), - ("int64", np.dtype("int64")), - (np.nan, np.nan), - ], -) -def test_basic_scalar_equality(left, right): - assert_eq(left, right) - - -@pytest.mark.parametrize( - "left, right", - [ - (1493282, 1493274), - (1493282.0, 1493282.0 + 1e-6), - ("abc", "abd"), - (0, np.array(1)), - ( - np.datetime64(123456, "ns"), - pd.Timestamp(np.datetime64(123457, "ns")), - ), - ("int64", np.dtype("int32")), - ], -) -def test_basic_scalar_inequality(left, right): - with pytest.raises(AssertionError, match=r".*not (almost )?equal.*"): - assert_eq(left, right) - - -def test_assert_column_memory_basic(arrow_arrays): - left = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - right = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - - with pytest.raises(AssertionError): - assert_column_memory_eq(left, right) - assert_column_memory_ne(left, right) - - -def test_assert_column_memory_slice(arrow_arrays): - col = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - left = col.slice(0, 1) - right = col.slice(1, 2) - - with pytest.raises(AssertionError): - assert_column_memory_eq(left, right) - assert_column_memory_ne(left, right) - - with pytest.raises(AssertionError): - assert_column_memory_eq(left, col) - assert_column_memory_ne(left, col) - - with pytest.raises(AssertionError): - assert_column_memory_eq(right, col) - assert_column_memory_ne(right, col) - - -def 
test_assert_column_memory_basic_same(arrow_arrays): - data = cudf.core.column.ColumnBase.from_arrow(arrow_arrays) - buf = cudf.core.buffer.as_buffer(data.base_data) - - left = cudf.core.column.build_column(buf, dtype=np.int8) - right = cudf.core.column.build_column(buf, dtype=np.int8) - - assert_column_memory_eq(left, right) - with pytest.raises(AssertionError): - assert_column_memory_ne(left, right) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py deleted file mode 100644 index d622ff6b94e..00000000000 --- a/python/cudf/cudf/tests/test_timedelta.py +++ /dev/null @@ -1,1508 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import datetime -import operator - -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import _utils as utils, assert_eq -from cudf.testing._utils import assert_exceptions_equal - -_TIMEDELTA_DATA = [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [ - 136457654736252, - 134736784364431, - 245345345545332, - 223432411, - 2343241, - 3634548734, - 23234, - ], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], -] - -_TIMEDELTA_DATA_NON_OVERFLOW = [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], -] - -_cmpops = [ - operator.lt, - operator.gt, - operator.le, - operator.ge, - operator.eq, - operator.ne, -] - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - [0.3534, 12, 22, 343, 43.53534, 4353.42], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_series_create(data, dtype): - if dtype not in ("timedelta64[ns]"): - pytest.skip( - "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" - ) - psr = pd.Series( - cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype - ) - gsr = cudf.Series(data, dtype=dtype) - - assert_eq(psr, gsr) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [12, 12, 22, 343, 4353534, 435342], - [0.3534, 12, 22, 343, 43.53534, 4353.42], - cp.asarray([10, 20, 30, 100]), - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize("cast_dtype", ["int64", "category"]) -def test_timedelta_from_typecast(data, dtype, cast_dtype): - if dtype not in ("timedelta64[ns]"): - pytest.skip( - "Bug in pandas : https://github.com/pandas-dev/pandas/issues/35465" - ) - psr = pd.Series( - cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype - ) - gsr = cudf.Series(data, 
dtype=dtype) - - if cast_dtype == "int64": - assert_eq(psr.values.view(cast_dtype), gsr.astype(cast_dtype).values) - else: - assert_eq(psr.astype(cast_dtype), gsr.astype(cast_dtype)) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [12, 12, 22, 343, 4353534, 435342], - [0.3534, 12, 22, 343, 43.53534, 4353.42], - cp.asarray([10, 20, 30, 100]), - ], -) -@pytest.mark.parametrize("cast_dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_to_typecast(data, cast_dtype): - psr = pd.Series(cp.asnumpy(data) if isinstance(data, cp.ndarray) else data) - gsr = cudf.Series(data) - - assert_eq(psr.astype(cast_dtype), gsr.astype(cast_dtype)) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - [0.3534, 12, 22, 343, 43.53534, 4353.42], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_from_pandas(data, dtype): - psr = pd.Series( - cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype - ) - gsr = cudf.from_pandas(psr) - - assert_eq(psr, gsr) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_series_to_numpy(data, dtype): - gsr = cudf.Series(data, dtype=dtype) - - expected = np.array( - cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype - ) - expected = expected[~np.isnan(expected)] - - actual = gsr.dropna().to_numpy() - - np.testing.assert_array_equal(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_series_to_pandas(data, dtype): - gsr = cudf.Series(data, dtype=dtype) - - expected = np.array( - cp.asnumpy(data) if isinstance(data, cp.ndarray) else data, dtype=dtype - ) - - expected = pd.Series(expected) - actual = gsr.to_pandas() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data,other", - [ - ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), - ([1000000, 200000, None], [1000000, 200000, None]), - ([], []), - ([None], [None]), - ([None, None, None, None, None], [None, None, None, None, None]), - ( - [12, 12, 22, 343, 4353534, 435342], - [12, 12, 22, 343, 4353534, 435342], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ([1000000, 200000, 3000000], [200000, 34543, 3000000]), - ([1000000, 200000, None], [1000000, 200000, 3000000]), - ([None], [1]), - ( - [12, 12, 22, 343, 4353534, 435342], - [None, 1, 220, 3, 34, 4353423287], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize( - "ops", - [ - "eq", - "ne", - "lt", - "gt", - "le", - "ge", - "add", - "radd", - "sub", - "rsub", - "floordiv", - 
"truediv", - "mod", - ], -) -def test_timedelta_ops_misc_inputs(data, other, dtype, ops): - gsr = cudf.Series(data, dtype=dtype) - other_gsr = cudf.Series(other, dtype=dtype) - - psr = gsr.to_pandas() - other_psr = other_gsr.to_pandas() - - expected = getattr(psr, ops)(other_psr) - actual = getattr(gsr, ops)(other_gsr) - if ops in ("eq", "lt", "gt", "le", "ge"): - actual = actual.fillna(False) - elif ops == "ne": - actual = actual.fillna(True) - - if ops == "floordiv": - expected[actual.isna().to_pandas()] = np.nan - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "datetime_data,timedelta_data", - [ - ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), - ([1000000, 200000, None], [1000000, 200000, None]), - ([], []), - ([None], [None]), - ([None, None, None, None, None], [None, None, None, None, None]), - ( - [12, 12, 22, 343, 4353534, 435342], - [12, 12, 22, 343, 4353534, 435342], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ([1000000, 200000, 3000000], [200000, 34543, 3000000]), - ([1000000, 200000, None], [1000000, 200000, 3000000]), - ([None], [1]), - ( - [12, 12, 22, 343, 4353534, 435342], - [None, 1, 220, 3, 34, 4353423287], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ( - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - ), - ( - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - ), - ( - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ), - ( - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ), - ], -) -@pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) -@pytest.mark.parametrize("timedelta_dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize( - "ops", - ["add", "sub"], -) -def test_timedelta_ops_datetime_inputs( - datetime_data, timedelta_data, datetime_dtype, timedelta_dtype, ops -): - gsr_datetime = cudf.Series(datetime_data, dtype=datetime_dtype) - gsr_timedelta = cudf.Series(timedelta_data, dtype=timedelta_dtype) - - psr_datetime = gsr_datetime.to_pandas() - psr_timedelta = gsr_timedelta.to_pandas() - - expected = getattr(psr_datetime, ops)(psr_timedelta) - actual = getattr(gsr_datetime, ops)(gsr_timedelta) - - assert_eq(expected, actual) - - if ops == "add": - expected = getattr(psr_timedelta, ops)(psr_datetime) - actual = getattr(gsr_timedelta, ops)(gsr_datetime) - - assert_eq(expected, actual) - elif ops == "sub": - assert_exceptions_equal( - lfunc=operator.sub, - rfunc=operator.sub, - lfunc_args_and_kwargs=([psr_timedelta, psr_datetime],), - rfunc_args_and_kwargs=([gsr_timedelta, gsr_datetime],), - ) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame( - { - "A": pd.Series(pd.date_range("2012-1-1", periods=3, freq="D")), - "B": pd.Series([pd.Timedelta(days=i) for i in range(3)]), - } - ), - pd.DataFrame( - { - "A": pd.Series( - pd.date_range("1994-1-1", periods=50, freq="D") - ), - "B": pd.Series([pd.Timedelta(days=i) for i in range(50)]), - } - ), - ], -) -@pytest.mark.parametrize("op", ["add", "sub"]) -def test_timedelta_dataframe_ops(df, op): - pdf = df - gdf = cudf.from_pandas(pdf) - - if op == "add": - pdf["C"] = pdf["A"] + pdf["B"] - gdf["C"] = gdf["A"] + gdf["B"] - elif op == "sub": - 
pdf["C"] = pdf["A"] - pdf["B"] - gdf["C"] = gdf["A"] - gdf["B"] - - assert_eq(pdf, gdf) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ], -) -@pytest.mark.parametrize( - "other_scalars", - [ - datetime.timedelta(days=768), - datetime.timedelta(seconds=768), - datetime.timedelta(microseconds=7), - datetime.timedelta(minutes=447), - datetime.timedelta(hours=447), - datetime.timedelta(weeks=734), - np.timedelta64(4, "s"), - np.timedelta64(456, "D"), - np.timedelta64(46, "h"), - np.timedelta64("nat"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize( - "op", - [ - "add", - "sub", - "truediv", - "mod", - "floordiv", - ], -) -def test_timedelta_series_ops_with_scalars(data, other_scalars, dtype, op): - gsr = cudf.Series(data=data, dtype=dtype) - psr = gsr.to_pandas() - - if op == "add": - expected = psr + other_scalars - actual = gsr + other_scalars - elif op == "sub": - expected = psr - other_scalars - actual = gsr - other_scalars - elif op == "truediv": - expected = psr / other_scalars - actual = gsr / other_scalars - elif op == "floordiv": - expected = psr // other_scalars - actual = gsr // other_scalars - elif op == "mod": - expected = psr % other_scalars - actual = gsr % other_scalars - - assert_eq(expected, actual) - - if op == "add": - expected = other_scalars + psr - actual = other_scalars + gsr - elif op == "sub": - expected = other_scalars - psr - actual = other_scalars - gsr - elif op == "truediv": - expected = other_scalars / psr - actual = other_scalars / gsr - elif op == "floordiv": - expected = other_scalars // psr - actual = other_scalars // gsr - elif op == "mod": - expected = other_scalars % psr - actual = other_scalars % gsr - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "reverse", - [ - False, - pytest.param( - True, - marks=pytest.mark.xfail( - strict=True, - reason=( - "timedelta modulo by zero is dubiously defined in " - "both pandas and cuDF " - "(see https://github.com/rapidsai/cudf/issues/5938)" - ), - ), - ), - ], -) -def test_timedelta_series_mod_with_scalar_zero(reverse): - gsr = cudf.Series(data=[0.2434], dtype=np.timedelta64(1, "ns")) - psr = gsr.to_pandas() - scalar = datetime.timedelta(days=768) - if reverse: - expected = scalar % psr - actual = scalar % gsr - else: - expected = psr % scalar - actual = gsr % scalar - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ], -) -@pytest.mark.parametrize( - 
"cpu_scalar", - [ - datetime.timedelta(seconds=768), - datetime.timedelta(microseconds=7), - np.timedelta64(4, "s"), - np.timedelta64("nat", "s"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64("nat", "ns"), - np.timedelta64(1, "ns"), - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize( - "op", - [ - "add", - "sub", - "truediv", - "mod", - "floordiv", - ], -) -def test_timedelta_series_ops_with_cudf_scalars(data, cpu_scalar, dtype, op): - gpu_scalar = cudf.Scalar(cpu_scalar) - - gsr = cudf.Series(data=data, dtype=dtype) - psr = gsr.to_pandas() - - if op == "add": - expected = psr + cpu_scalar - actual = gsr + gpu_scalar - elif op == "sub": - expected = psr - cpu_scalar - actual = gsr - gpu_scalar - elif op == "truediv": - expected = psr / cpu_scalar - actual = gsr / gpu_scalar - elif op == "floordiv": - expected = psr // cpu_scalar - actual = gsr // gpu_scalar - elif op == "mod": - expected = psr % cpu_scalar - actual = gsr % gpu_scalar - - assert_eq(expected, actual) - - if op == "add": - expected = cpu_scalar + psr - actual = gpu_scalar + gsr - elif op == "sub": - expected = cpu_scalar - psr - actual = gpu_scalar - gsr - elif op == "truediv": - expected = cpu_scalar / psr - actual = gpu_scalar / gsr - elif op == "floordiv": - expected = cpu_scalar // psr - actual = gpu_scalar // gsr - elif op == "mod": - expected = cpu_scalar % psr - actual = gpu_scalar % gsr - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "reverse", - [ - False, - pytest.param( - True, - marks=pytest.mark.xfail( - strict=True, - reason=( - "timedelta modulo by zero is dubiously defined in " - "both pandas and cuDF " - "(see https://github.com/rapidsai/cudf/issues/5938)" - ), - ), - ), - ], -) -def test_timedelta_series_mod_with_cudf_scalar_zero(reverse): - gsr = cudf.Series(data=[0.2434], dtype=np.timedelta64(1, "ns")) - psr = gsr.to_pandas() - scalar = datetime.timedelta(days=768) - gpu_scalar = cudf.Scalar(scalar) - if reverse: - expected = scalar % psr - actual = gpu_scalar % gsr - else: - expected = psr % scalar - actual = gsr % gpu_scalar - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - [ - [1000000, 200000, 3000000], - [1000000, 200000, None], - [], - [None], - [None, None, None, None, None], - [12, 12, 22, 343, 4353534, 435342], - np.array([10, 20, 30, None, 100]), - cp.asarray([10, 20, 30, 100]), - [1000000, 200000, 3000000], - [1000000, 200000, None], - [1], - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize("reduction_op", ["sum", "mean", "median", "quantile"]) -def test_timedelta_reduction_ops(data, dtype, reduction_op): - gsr = cudf.Series(data, dtype=dtype) - psr = gsr.to_pandas() - - if len(psr) > 0 and psr.isnull().all() and reduction_op == "median": - with pytest.warns(RuntimeWarning, match="Mean of empty slice"): - expected = getattr(psr, reduction_op)() - else: - expected = getattr(psr, reduction_op)() - actual = getattr(gsr, reduction_op)() - if pd.isna(expected) and pd.isna(actual): - pass - elif isinstance(expected, pd.Timedelta) and isinstance( - actual, pd.Timedelta - ): - assert ( - expected.round(gsr._column.time_unit).value - == actual.round(gsr._column.time_unit).value - ) - else: - assert_eq(expected, 
actual) - - -@pytest.mark.parametrize( - "data", - _TIMEDELTA_DATA, -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_dt_components(data, dtype): - gsr = cudf.Series(data, dtype=dtype) - psr = gsr.to_pandas() - - expected = psr.dt.components - actual = gsr.dt.components - - if gsr.isnull().any(): - assert_eq(expected, actual.astype("float")) - else: - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "data", - _TIMEDELTA_DATA, -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_dt_properties(data, dtype): - gsr = cudf.Series(data, dtype=dtype) - psr = gsr.to_pandas() - - def local_assert(expected, actual, **kwargs): - if gsr.isnull().any(): - assert_eq(expected, actual.astype("float"), **kwargs) - else: - assert_eq(expected, actual, **kwargs) - - expected_days = psr.dt.days - actual_days = gsr.dt.days - - local_assert(expected_days, actual_days, check_dtype=False) - - expected_seconds = psr.dt.seconds - actual_seconds = gsr.dt.seconds - - local_assert(expected_seconds, actual_seconds, check_dtype=False) - - expected_microseconds = psr.dt.microseconds - actual_microseconds = gsr.dt.microseconds - - local_assert(expected_microseconds, actual_microseconds, check_dtype=False) - - expected_nanoseconds = psr.dt.nanoseconds - actual_nanoseconds = gsr.dt.nanoseconds - - local_assert(expected_nanoseconds, actual_nanoseconds, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - _TIMEDELTA_DATA, -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_index(data, dtype): - gdi = cudf.Index(data, dtype=dtype) - pdi = gdi.to_pandas() - - assert_eq(pdi, gdi) - - -@pytest.mark.parametrize("data", _TIMEDELTA_DATA_NON_OVERFLOW) -@pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) -@pytest.mark.parametrize("timedelta_dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_index_datetime_index_ops( - data, datetime_dtype, timedelta_dtype -): - gdt = cudf.Index(data, dtype=datetime_dtype) - gtd = cudf.Index(data, dtype=timedelta_dtype) - - pdt = gdt.to_pandas() - ptd = gtd.to_pandas() - - assert_eq(gdt - gtd, pdt - ptd) - assert_eq(gdt + gtd, pdt + ptd) - - -@pytest.mark.parametrize( - "datetime_data,timedelta_data", - [ - ([1000000, 200000, 3000000], [1000000, 200000, 3000000]), - ([1000000, 200000, None], [1000000, 200000, None]), - ([], []), - ([None], [None]), - ([None, None, None, None, None], [None, None, None, None, None]), - ( - [12, 12, 22, 343, 4353534, 435342], - [12, 12, 22, 343, 4353534, 435342], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ([1000000, 200000, 3000000], [200000, 34543, 3000000]), - ([1000000, 200000, None], [1000000, 200000, 3000000]), - ([None], [1]), - ( - [12, 12, 22, 343, 4353534, 435342], - [None, 1, 220, 3, 34, 4353423287], - ), - (np.array([10, 20, 30, None, 100]), np.array([10, 20, 30, None, 100])), - (cp.asarray([10, 20, 30, 100]), cp.asarray([10, 20, 30, 100])), - ( - [12, 11, 232, 223432411, 2343241, 234324, 23234], - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - ), - ( - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - ), - ( - [11, 1132324, 2322323111, 23341, 2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ), - ( - [1.321, 1132.324, 23223231.11, 233.41, 0.2434, 332, 323], - [12, 11, 2.32, 2234.32411, 2343.241, 23432.4, 23234], - ), - ], -) 
-@pytest.mark.parametrize("datetime_dtype", utils.DATETIME_TYPES) -@pytest.mark.parametrize("timedelta_dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_datetime_index_ops_misc( - datetime_data, timedelta_data, datetime_dtype, timedelta_dtype -): - gdt = cudf.Index(datetime_data, dtype=datetime_dtype) - gtd = cudf.Index(timedelta_data, dtype=timedelta_dtype) - - pdt = gdt.to_pandas() - ptd = gtd.to_pandas() - - assert_eq(gdt - gtd, pdt - ptd) - assert_eq(gdt + gtd, pdt + ptd) - - -@pytest.mark.parametrize("data", _TIMEDELTA_DATA_NON_OVERFLOW) -@pytest.mark.parametrize( - "other_scalars", - [ - pd.Timedelta(1513393355.5, unit="s"), - pd.Timedelta(34765, unit="D"), - datetime.timedelta(days=768), - datetime.timedelta(seconds=768), - datetime.timedelta(microseconds=7), - datetime.timedelta(minutes=447), - datetime.timedelta(hours=447), - datetime.timedelta(weeks=734), - np.timedelta64(4, "s"), - np.timedelta64(456, "D"), - np.timedelta64(46, "h"), - np.timedelta64("nat"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize( - "op", - [ - "add", - "sub", - "truediv", - "floordiv", - ], -) -@pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning:pandas") -def test_timedelta_index_ops_with_scalars( - request, data, other_scalars, dtype, op -): - gtdi = cudf.Index(data=data, dtype=dtype) - ptdi = gtdi.to_pandas() - - if op == "add": - expected = ptdi + other_scalars - actual = gtdi + other_scalars - elif op == "sub": - expected = ptdi - other_scalars - actual = gtdi - other_scalars - elif op == "truediv": - expected = ptdi / other_scalars - actual = gtdi / other_scalars - elif op == "floordiv": - expected = ptdi // other_scalars - actual = gtdi // other_scalars - - assert_eq(expected, actual) - - if op == "add": - expected = other_scalars + ptdi - actual = other_scalars + gtdi - elif op == "sub": - expected = other_scalars - ptdi - actual = other_scalars - gtdi - elif op == "truediv": - expected = other_scalars / ptdi - actual = other_scalars / gtdi - elif op == "floordiv": - expected = other_scalars // ptdi - actual = other_scalars // gtdi - - # Division by zero for datetime or timedelta is - # dubiously defined in both pandas (Any // 0 -> 0 in - # pandas) and cuDF (undefined behaviour) - request.applymarker( - pytest.mark.xfail( - condition=( - op == "floordiv" - and 0 in ptdi.astype("int") - and np.timedelta64(other_scalars).item() is not None - ), - reason="Related to https://github.com/rapidsai/cudf/issues/5938", - ) - ) - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", _TIMEDELTA_DATA_NON_OVERFLOW) -@pytest.mark.parametrize( - "cpu_scalar", - [ - pd.Timedelta(1513393355.5, unit="s"), - datetime.timedelta(seconds=768), - datetime.timedelta(microseconds=7), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize( - "op", - [ - "add", - "sub", - "truediv", - "floordiv", - ], -) -def test_timedelta_index_ops_with_cudf_scalars( - request, data, cpu_scalar, dtype, op -): - gtdi = cudf.Index(data=data, dtype=dtype) - ptdi = gtdi.to_pandas() - - gpu_scalar = cudf.Scalar(cpu_scalar) - - if op == "add": - expected = ptdi + cpu_scalar - actual = gtdi + gpu_scalar - elif op == "sub": - expected = ptdi - cpu_scalar - actual = gtdi - gpu_scalar - elif op == "truediv": - expected = ptdi / 
cpu_scalar - actual = gtdi / gpu_scalar - elif op == "floordiv": - expected = ptdi // cpu_scalar - actual = gtdi // gpu_scalar - - assert_eq(expected, actual) - - if op == "add": - expected = cpu_scalar + ptdi - actual = gpu_scalar + gtdi - elif op == "sub": - expected = cpu_scalar - ptdi - actual = gpu_scalar - gtdi - elif op == "truediv": - expected = cpu_scalar / ptdi - actual = gpu_scalar / gtdi - elif op == "floordiv": - expected = cpu_scalar // ptdi - actual = gpu_scalar // gtdi - - # Division by zero for datetime or timedelta is - # dubiously defined in both pandas (Any // 0 -> 0 in - # pandas) and cuDF (undefined behaviour) - request.applymarker( - pytest.mark.xfail( - condition=( - op == "floordiv" - and 0 in ptdi.astype("int") - and np.timedelta64(cpu_scalar).item() is not None - ), - reason="https://github.com/rapidsai/cudf/issues/5938", - ) - ) - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", _TIMEDELTA_DATA) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize("name", ["abcd", None]) -def test_timedelta_index_properties(data, dtype, name): - gdi = cudf.Index(data, dtype=dtype, name=name) - pdi = gdi.to_pandas() - - def local_assert(expected, actual): - if actual._values.null_count: - assert_eq(expected, actual.astype("float64")) - else: - assert_eq(expected, actual) - - expected_days = pdi.days - actual_days = gdi.days - - local_assert(expected_days, actual_days) - - expected_seconds = pdi.seconds - actual_seconds = gdi.seconds - - local_assert(expected_seconds, actual_seconds) - - expected_microseconds = pdi.microseconds - actual_microseconds = gdi.microseconds - - local_assert(expected_microseconds, actual_microseconds) - - expected_nanoseconds = pdi.nanoseconds - actual_nanoseconds = gdi.nanoseconds - - local_assert(expected_nanoseconds, actual_nanoseconds) - - expected_components = pdi.components - actual_components = gdi.components - - if actual_components.isnull().any().any(): - assert_eq(expected_components, actual_components.astype("float")) - else: - assert_eq( - expected_components, - actual_components, - check_index_type=not actual_components.empty, - ) - - -@pytest.mark.parametrize("data", _TIMEDELTA_DATA) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize( - "fill_value", - [ - np.timedelta64(4, "s"), - np.timedelta64(456, "D"), - np.timedelta64("nat"), - np.timedelta64(1, "s"), - np.timedelta64(1, "ms"), - np.timedelta64(1, "us"), - np.timedelta64(1, "ns"), - "NaT", - ], -) -def test_timedelta_fillna(data, dtype, fill_value): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - expected = psr.dropna() - actual = sr.dropna() - - assert_eq(expected, actual) - - expected = psr.fillna(fill_value) - actual = sr.fillna(fill_value) - assert_eq(expected, actual) - - expected = expected.dropna() - actual = actual.dropna() - - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "gsr,expected_series", - [ - ( - cudf.Series([1, 2, 3], dtype="timedelta64[ns]"), - cudf.Series( - [ - "0 days 00:00:00.000000001", - "0 days 00:00:00.000000002", - "0 days 00:00:00.000000003", - ] - ), - ), - ( - cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ms]"), - cudf.Series( - ["0 days 00:16:40", "0 days 00:03:20", "0 days 00:50:00"] - ), - ), - ( - cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[s]"), - cudf.Series( - ["11 days 13:46:40", "2 days 07:33:20", "34 days 17:20:00"] - ), - ), - ( - cudf.Series( - [None, None, None, None, None], dtype="timedelta64[us]" - ), 
- cudf.Series([None, None, None, None, None], dtype="str"), - ), - ( - cudf.Series( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[us]", - ), - cudf.Series( - [ - "0 days 00:02:16.457654", - None, - "0 days 00:04:05.345345", - "0 days 00:03:43.432411", - None, - "0 days 01:00:34.548734", - "0 days 00:00:00.023234", - ] - ), - ), - ( - cudf.Series( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[ms]", - ), - cudf.Series( - [ - "1 days 13:54:17.654", - None, - "2 days 20:09:05.345", - "2 days 14:03:52.411", - None, - "42 days 01:35:48.734", - "0 days 00:00:23.234", - ] - ), - ), - ( - cudf.Series( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[s]", - ), - cudf.Series( - [ - "1579 days 08:54:14", - None, - "2839 days 15:29:05", - "2586 days 00:33:31", - None, - "42066 days 12:52:14", - "0 days 06:27:14", - ] - ), - ), - ( - cudf.Series( - [ - 136457654, - None, - 245345345, - 223432411, - None, - 3634548734, - 23234, - ], - dtype="timedelta64[ns]", - ), - cudf.Series( - [ - "0 days 00:00:00.136457654", - None, - "0 days 00:00:00.245345345", - "0 days 00:00:00.223432411", - None, - "0 days 00:00:03.634548734", - "0 days 00:00:00.000023234", - ] - ), - ), - ], -) -def test_timedelta_str_roundtrip(gsr, expected_series): - actual_series = gsr.astype("str") - - assert_eq(expected_series, actual_series) - - assert_eq(gsr, actual_series.astype(gsr.dtype)) - - -def test_timedelta_invalid_ops(): - sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") - psr = sr.to_pandas() - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([psr, 1],), - rfunc_args_and_kwargs=([sr, 1],), - ) - - assert_exceptions_equal( - lfunc=operator.add, - rfunc=operator.add, - lfunc_args_and_kwargs=([psr, "a"],), - rfunc_args_and_kwargs=([sr, "a"],), - ) - - dt_sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - dt_psr = dt_sr.to_pandas() - - assert_exceptions_equal( - lfunc=operator.mod, - rfunc=operator.mod, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.mod, - rfunc=operator.mod, - lfunc_args_and_kwargs=([psr, "a"],), - rfunc_args_and_kwargs=([sr, "a"],), - check_exception_type=False, - ) - - assert_exceptions_equal( - lfunc=operator.gt, - rfunc=operator.gt, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.lt, - rfunc=operator.lt, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.ge, - rfunc=operator.ge, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.le, - rfunc=operator.le, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.truediv, - rfunc=operator.truediv, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.floordiv, - rfunc=operator.floordiv, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( - lfunc=operator.mul, - rfunc=operator.mul, - lfunc_args_and_kwargs=([psr, dt_psr],), - rfunc_args_and_kwargs=([sr, dt_sr],), - ) - - assert_exceptions_equal( 
- lfunc=operator.mul, - rfunc=operator.mul, - lfunc_args_and_kwargs=([psr, psr],), - rfunc_args_and_kwargs=([sr, sr],), - check_exception_type=False, - ) - - assert_exceptions_equal( - lfunc=operator.xor, - rfunc=operator.xor, - lfunc_args_and_kwargs=([psr, psr],), - rfunc_args_and_kwargs=([sr, sr],), - ) - - -def test_timedelta_datetime_cast_invalid(): - sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") - psr = sr.to_pandas() - - assert_exceptions_equal( - psr.astype, - sr.astype, - (["datetime64[ns]"],), - (["datetime64[ns]"],), - ) - - sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]") - psr = sr.to_pandas() - - assert_exceptions_equal( - psr.astype, - sr.astype, - (["timedelta64[ns]"],), - (["timedelta64[ns]"],), - ) - - -@pytest.mark.parametrize("data", [[], [1, 2, 3, 4, 5]]) -@pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) -@pytest.mark.parametrize("timedelta_dtype", utils.TIMEDELTA_TYPES) -def test_numeric_to_timedelta(data, dtype, timedelta_dtype): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - actual = sr.astype(timedelta_dtype) - expected = psr.astype(timedelta_dtype) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [[], [1, 2, 3, 4, 5]]) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize( - "scalar", - [ - 1, - 2, - 3, - "a", - np.timedelta64(1, "s"), - np.timedelta64(2, "s"), - np.timedelta64(2, "D"), - np.timedelta64(3, "ms"), - np.timedelta64(4, "us"), - np.timedelta64(5, "ns"), - np.timedelta64(6, "ns"), - np.datetime64(6, "s"), - ], -) -def test_timedelta_contains(data, dtype, scalar): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - expected = scalar in sr - actual = scalar in psr - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("data", [[1, 2, 3], [], [1, 20, 1000, None]]) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize("ddof", [1, 2, 3]) -def test_timedelta_std(data, dtype, ddof): - gsr = cudf.Series(data, dtype=dtype) - psr = gsr.to_pandas() - - expected = psr.std(ddof=ddof) - actual = gsr.std(ddof=ddof) - - if np.isnat(expected.to_numpy()) and np.isnat(actual.to_numpy()): - assert True - else: - np.testing.assert_allclose( - expected.to_numpy().astype("float64"), - actual.to_numpy().astype("float64"), - rtol=1e-5, - atol=0, - ) - - -@pytest.mark.parametrize("op", ["max", "min"]) -@pytest.mark.parametrize( - "data", - [ - [], - [1, 2, 3, 100], - [10, None, 100, None, None], - [None, None, None], - [1231], - ], -) -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -def test_timedelta_reductions(data, op, dtype): - sr = cudf.Series(data, dtype=dtype) - psr = sr.to_pandas() - - actual = getattr(sr, op)() - expected = getattr(psr, op)() - - if np.isnat(expected.to_numpy()) and np.isnat(actual): - assert True - else: - assert_eq(expected.to_numpy(), actual) - - -def test_error_values(): - s = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") - with pytest.raises( - NotImplementedError, - match="TimeDelta Arrays is not yet implemented in cudf", - ): - s.values - - -@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) -@pytest.mark.parametrize("name", [None, "delta-index"]) -def test_create_TimedeltaIndex(dtype, name): - gdi = cudf.TimedeltaIndex( - [1132223, 2023232, 342234324, 4234324], dtype=dtype, name=name - ) - pdi = gdi.to_pandas() - assert_eq(pdi, gdi) - - -@pytest.mark.parametrize("data", [[43534, 43543, 37897, 2000]]) -@pytest.mark.parametrize("dtype", ["timedelta64[ns]"]) -def test_timedelta_constructor(data, 
dtype): - expected = pd.TimedeltaIndex(data=data, dtype=dtype) - actual = cudf.TimedeltaIndex(data=data, dtype=dtype) - - assert_eq(expected, actual) - - expected = pd.TimedeltaIndex(data=pd.Series(data), dtype=dtype) - actual = cudf.TimedeltaIndex(data=cudf.Series(data), dtype=dtype) - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("op", [operator.add, operator.sub]) -def test_timdelta_binop_tz_timestamp(op): - s = cudf.Series([1, 2, 3], dtype="timedelta64[ns]") - pd_tz_timestamp = pd.Timestamp("1970-01-01 00:00:00.000000001", tz="utc") - with pytest.raises(NotImplementedError): - op(s, pd_tz_timestamp) - date_tz_scalar = datetime.datetime.now(datetime.timezone.utc) - with pytest.raises(NotImplementedError): - op(s, date_tz_scalar) - - -def test_timedelta_getitem_na(): - s = cudf.Series([1, 2, None, 3], dtype="timedelta64[ns]") - assert s[2] is cudf.NaT - - -@pytest.mark.parametrize("data1", [[123, 456, None, 321, None]]) -@pytest.mark.parametrize("data2", [[123, 456, 789, None, None]]) -@pytest.mark.parametrize("op", _cmpops) -def test_timedelta_series_cmpops_pandas_compatibility(data1, data2, op): - gsr1 = cudf.Series(data=data1, dtype="timedelta64[ns]") - psr1 = gsr1.to_pandas() - - gsr2 = cudf.Series(data=data2, dtype="timedelta64[ns]") - psr2 = gsr2.to_pandas() - - expect = op(psr1, psr2) - with cudf.option_context("mode.pandas_compatible", True): - got = op(gsr1, gsr2) - - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "method, kwargs", - [ - ["sum", {}], - ["mean", {}], - ["median", {}], - ["std", {}], - ["std", {"ddof": 0}], - ], -) -def test_tdi_reductions(method, kwargs): - pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"]) - cudf_tdi = cudf.from_pandas(pd_tdi) - - result = getattr(pd_tdi, method)(**kwargs) - expected = getattr(cudf_tdi, method)(**kwargs) - assert result == expected - - -def test_tdi_asi8(): - pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"]) - cudf_tdi = cudf.from_pandas(pd_tdi) - - result = pd_tdi.asi8 - expected = cudf_tdi.asi8 - assert_eq(result, expected) - - -def test_tdi_unit(): - pd_tdi = pd.TimedeltaIndex( - ["1 day", "2 days", "3 days"], dtype="timedelta64[ns]" - ) - cudf_tdi = cudf.from_pandas(pd_tdi) - - result = pd_tdi.unit - expected = cudf_tdi.unit - assert result == expected diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py deleted file mode 100644 index 88938457545..00000000000 --- a/python/cudf/cudf/tests/test_transform.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- - -import numpy as np -import pytest - -from cudf import Series -from cudf.testing._utils import NUMERIC_TYPES - -supported_types = NUMERIC_TYPES - - -def _generic_function(a): - return a**3 - - -@pytest.mark.parametrize("dtype", supported_types) -@pytest.mark.parametrize( - "udf,testfunc", - [ - (_generic_function, lambda ser: ser**3), - (lambda x: x in [1, 2, 3, 4], lambda ser: np.isin(ser, [1, 2, 3, 4])), - ], -) -def test_apply_python_lambda(dtype, udf, testfunc): - size = 500 - - lhs_arr = np.random.random(size).astype(dtype) - lhs_ser = Series(lhs_arr) - - out_ser = lhs_ser.apply(udf) - result = testfunc(lhs_arr) - np.testing.assert_almost_equal(result, out_ser.to_numpy()) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py deleted file mode 100644 index 087d10b8295..00000000000 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ /dev/null @@ -1,1024 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. -import math -import operator - -import numpy as np -import pytest -from numba import cuda - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.core.missing import NA -from cudf.core.udf._ops import ( - arith_ops, - bitwise_ops, - comparison_ops, - unary_ops, -) -from cudf.core.udf.api import Masked -from cudf.core.udf.utils import precompiled -from cudf.testing import assert_eq -from cudf.testing._utils import ( - _decimal_series, - parametrize_numeric_dtypes_pairwise, - sv_to_udf_str, -) - - -@pytest.fixture(scope="module") -def str_udf_data(): - return cudf.DataFrame( - { - "str_col": [ - "abc", - "ABC", - "AbC", - "123", - "123aBc", - "123@.!", - "", - "rapids ai", - "gpu", - "True", - "False", - "1.234", - ".123a", - "0.013", - "1.0", - "01", - "20010101", - "cudf", - "cuda", - "gpu", - "This Is A Title", - "This is Not a Title", - "Neither is This a Title", - "NoT a TiTlE", - "123 Title Works", - ] - } - ) - - -@pytest.fixture(params=["a", "cu", "2", "gpu", "", " "]) -def substr(request): - return request.param - - -def run_masked_udf_test(func, data, args=(), nullable=True, **kwargs): - gdf = data - pdf = data.to_pandas(nullable=nullable) - - expect = pdf.apply(func, args=args, axis=1) - obtain = gdf.apply(func, args=args, axis=1) - assert_eq(expect, obtain, **kwargs) - - -def run_masked_string_udf_test(func, data, args=(), **kwargs): - gdf = data - pdf = data.to_pandas(nullable=True) - - def row_wrapper(row): - st = row["str_col"] - return func(st) - - expect = pdf.apply(row_wrapper, args=args, axis=1) - - func = cuda.jit(device=True)(func) - obtain = gdf.apply(row_wrapper, args=args, axis=1) - assert_eq(expect, obtain, **kwargs) - - # strings that come directly from input columns are backed by - # MaskedType(string_view) types. But new strings that are returned - # from functions or operators are backed by MaskedType(udf_string) - # types. We need to make sure all of our methods work on both kind - # of MaskedType. 
This function promotes the former to the latter - # prior to running the input function - def udf_string_wrapper(row): - masked_udf_str = Masked( - sv_to_udf_str(row["str_col"].value), row["str_col"].valid - ) - return func(masked_udf_str) - - obtain = gdf.apply(udf_string_wrapper, args=args, axis=1) - assert_eq(expect, obtain, **kwargs) - - -def run_masked_udf_series(func, data, args=(), **kwargs): - gsr = data - psr = data.to_pandas(nullable=True) - - expect = psr.apply(func, args=args) - obtain = gsr.apply(func, args=args) - assert_eq(expect, obtain, **kwargs) - - -@pytest.mark.parametrize("op", arith_ops) -def test_arith_masked_vs_masked(op): - # This test should test all the typing - # and lowering for arithmetic ops between - # two columns - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", bitwise_ops) -def test_bitwise_masked_vs_masked(op): - # This test should test all the typing - # and lowering for bitwise ops between - # two columns - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame( - { - "a": [1, 0, 1, 0, 0b1011, 42, None], - "b": [1, 1, 0, 0, 0b1100, -42, 5], - } - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize( - "dtype_l", - ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"], -) -@pytest.mark.parametrize( - "dtype_r", - [ - "timedelta64[ns]", - "timedelta64[us]", - "timedelta64[ms]", - "timedelta64[s]", - "datetime64[ns]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[s]", - ], -) -@pytest.mark.parametrize("op", [operator.add, operator.sub]) -def test_arith_masked_vs_masked_datelike(op, dtype_l, dtype_r): - # Datetime version of the above - # does not test all dtype combinations for now - if "datetime" in dtype_l and "datetime" in dtype_r and op is operator.add: - # don't try adding datetimes to datetimes. - pytest.skip("Adding datetime to datetime is not valid") - - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame( - { - "a": ["2011-01-01", cudf.NA, "2011-03-01", cudf.NA], - "b": [4, 5, cudf.NA, cudf.NA], - } - ) - gdf["a"] = gdf["a"].astype(dtype_l) - gdf["b"] = gdf["b"].astype(dtype_r) - - pdf = gdf.to_pandas() - expect = op(pdf["a"], pdf["b"]) - obtain = gdf.apply(func, axis=1) - assert_eq(expect, obtain, check_dtype=False) - # TODO: After the following pandas issue is - # fixed, uncomment the following line and delete - # through `to_pandas()` statement. 
- # https://github.com/pandas-dev/pandas/issues/52411 - - # run_masked_udf_test(func, gdf, nullable=False, check_dtype=False) - - -@pytest.mark.parametrize("op", comparison_ops) -def test_compare_masked_vs_masked(op): - # this test should test all the - # typing and lowering for comparisons - # between columns - - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - # we should get: - # [?, ?, , , ] - gdf = cudf.DataFrame( - {"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]} - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, True, False]) -@pytest.mark.parametrize("data", [[1, 2, cudf.NA]]) -def test_arith_masked_vs_constant(op, constant, data): - def func(row): - x = row["data"] - return op(x, constant) - - gdf = cudf.DataFrame({"data": data}) - - if constant is False and op in { - operator.mod, - operator.pow, - operator.truediv, - operator.floordiv, - operator.imod, - operator.ipow, - operator.itruediv, - operator.ifloordiv, - }: - # The following tests cases yield undefined behavior: - # - truediv(x, False) because its dividing by zero - # - floordiv(x, False) because its dividing by zero - # - mod(x, False) because its mod by zero, - # - pow(x, False) because we have an NA in the series and pandas - # insists that (NA**0 == 1) where we do not - pytest.skip() - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, True, False]) -@pytest.mark.parametrize("data", [[2, 3, cudf.NA], [1, cudf.NA, 1]]) -def test_arith_masked_vs_constant_reflected(request, op, constant, data): - def func(row): - x = row["data"] - return op(constant, x) - - # Just a single column -> result will be all NA - gdf = cudf.DataFrame({"data": data}) - - # cudf differs from pandas for 1**NA - request.applymarker( - pytest.mark.xfail( - condition=(constant == 1 and op in {operator.pow, operator.ipow}), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("data", [[1, cudf.NA, 3], [2, 3, cudf.NA]]) -def test_arith_masked_vs_null(request, op, data): - def func(row): - x = row["data"] - return op(x, NA) - - gdf = cudf.DataFrame({"data": data}) - - # In pandas, 1**NA == 1. 
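# --- Editor's aside (hedged illustration; not part of the deleted file). The
# pandas side of the divergence noted in the comment above can be checked on
# the CPU alone: pandas defines 1 ** pd.NA and pd.NA ** 0 as 1, whereas (per
# the surrounding test comments and rapidsai/cudf#7478) cudf's masked UDFs
# propagate NA through the power op instead.
import pandas as pd

assert (1 ** pd.NA) == 1  # pandas short-circuits a base of 1
assert (pd.NA ** 0) == 1  # and a zero exponent, so no NA propagates
# --- end aside ---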
- request.applymarker( - pytest.mark.xfail( - condition=( - (gdf["data"] == 1).any() - and op in {operator.pow, operator.ipow} - ), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -def test_arith_masked_vs_null_reflected(op): - def func(row): - x = row["data"] - return op(NA, x) - - gdf = cudf.DataFrame({"data": [1, None, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", unary_ops) -def test_unary_masked(op): - # This test should test all the typing - # and lowering for unary ops - - def func(row): - x = row["a"] - return op(x) if x is not NA else NA - - if "log" in op.__name__: - gdf = cudf.DataFrame({"a": [0.1, 1.0, None, 3.5, 1e8]}) - elif op.__name__ in {"asin", "acos"}: - gdf = cudf.DataFrame({"a": [0.0, 0.5, None, 1.0]}) - elif op.__name__ in {"atanh"}: - gdf = cudf.DataFrame({"a": [0.0, -0.5, None, 0.8]}) - elif op.__name__ in {"acosh", "sqrt", "lgamma"}: - gdf = cudf.DataFrame({"a": [1.0, 2.0, None, 11.0]}) - elif op.__name__ in {"gamma"}: - gdf = cudf.DataFrame({"a": [0.1, 2, None, 4]}) - elif op.__name__ in {"invert"}: - gdf = cudf.DataFrame({"a": [-100, 128, None, 0]}, dtype="int64") - else: - gdf = cudf.DataFrame({"a": [-125.60, 395.2, 0.0, None]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_masked_is_null_conditional(): - def func(row): - x = row["a"] - y = row["b"] - if x is NA: - return y - else: - return x + y - - gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_contains(): - def func(row): - x = row["a"] - return x in [1, 2] - - gdf = cudf.DataFrame({"a": [1, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@parametrize_numeric_dtypes_pairwise -@pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) -def test_apply_mixed_dtypes(left_dtype, right_dtype, op): - """ - Test that operations can be performed between columns - of different dtypes and return a column with the correct - values and nulls - """ - - # First perform the op on two dummy data on host, if numpy can - # safely type cast, we should expect it to work in udf too. 
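# --- Editor's aside (hedged sketch; not part of the deleted file). The
# host-side probe described in the comment above amounts to: if NumPy itself
# cannot apply the operator to scalars of the two dtypes, skip the pairing
# rather than expect the UDF engine to handle it. A concrete instance of both
# outcomes, mirroring the op(np.dtype(...).type(0), np.dtype(...).type(42))
# call used just below:
import operator

import numpy as np

operator.add(np.dtype("int32").type(0), np.dtype("float64").type(42))  # allowed
try:
    operator.and_(np.dtype("float64").type(0), np.dtype("int32").type(42))
except TypeError:
    pass  # bitwise ops are undefined for floats, so this dtype pair is skipped
# --- end aside ---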
- try: - op(np.dtype(left_dtype).type(0), np.dtype(right_dtype).type(42)) - except TypeError: - pytest.skip("Operation is unsupported for corresponding dtype.") - - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) - gdf["a"] = gdf["a"].astype(left_dtype) - gdf["b"] = gdf["b"].astype(right_dtype) - - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("val", [5, 5.5]) -def test_apply_return_literal(val): - """ - Test unification codepath for scalars and MaskedType - makes sure that numba knows how to cast a scalar value - to a MaskedType - """ - - def func(row): - x = row["a"] - y = row["b"] - if x is not NA and x < 2: - return val - else: - return x + y - - gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_return_null(): - """ - Tests casting / unification of Masked and NA - """ - - def func(row): - x = row["a"] - if x is NA: - return NA - else: - return x - - gdf = cudf.DataFrame({"a": [1, None, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_return_either_null_or_literal(): - def func(row): - x = row["a"] - if x > 5: - return 2 - else: - return NA - - gdf = cudf.DataFrame({"a": [1, 3, 6]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_return_literal_only(): - def func(x): - return 5 - - gdf = cudf.DataFrame({"a": [1, None, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_everything(): - def func(row): - w = row["a"] - x = row["b"] - y = row["c"] - z = row["d"] - if x is NA: - return w + y - z - elif ((z > y) is not NA) and z > y: - return x - elif ((x + y) is not NA) and x + y == 0: - return z / x - elif x + y is NA: - return 2.5 - elif w > 100: - return ( - math.sin(x) - + math.sqrt(y) - - (-z) - + math.lgamma(x) * math.fabs(-0.8) / math.radians(3.14) - ) - else: - return y > 2 - - gdf = cudf.DataFrame( - { - "a": [1, 3, 6, 0, None, 5, None, 101], - "b": [3.0, 2.5, None, 5.0, 1.0, 5.0, 11.0, 1.0], - "c": [2, 3, 6, 0, None, 5, None, 6], - "d": [4, None, 6, 0, None, 5, None, 7.5], - } - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -### - - -@pytest.mark.parametrize( - "data,name", - [([1, 2, 3], None), ([1, cudf.NA, 3], None), ([1, 2, 3], "test_name")], -) -def test_series_apply_basic(data, name): - data = cudf.Series(data, name=name) - - def func(x): - return x + 1 - - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/57390", -) -def test_series_apply_null_conditional(): - def func(x): - if x is NA: - return 42 - else: - return x - 1 - - data = cudf.Series([1, cudf.NA, 3]) - - run_masked_udf_series(func, data) - - -### - - -@pytest.mark.parametrize("op", arith_ops) -def test_series_arith_masked_vs_masked(op): - def func(x): - return op(x, x) - - data = cudf.Series([1, cudf.NA, 3]) - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/57390", -) -@pytest.mark.parametrize("op", comparison_ops) -def test_series_compare_masked_vs_masked(op): - """ - In the series case, only one other MaskedType to compare with - - itself - """ - - def func(x): - return op(x, x) - - data = cudf.Series([1, cudf.NA, 3]) - 
run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) -def test_series_arith_masked_vs_constant(request, op, constant): - def func(x): - return op(x, constant) - - # Just a single column -> result will be all NA - data = cudf.Series([1, 2, cudf.NA]) - # in pandas, 1**NA == 1. In cudf, 1**NA == NA. - request.applymarker( - pytest.mark.xfail( - condition=( - constant is cudf.NA and op in {operator.pow, operator.ipow} - ), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) -def test_series_arith_masked_vs_constant_reflected(request, op, constant): - def func(x): - return op(constant, x) - - # Just a single column -> result will be all NA - data = cudf.Series([1, 2, cudf.NA]) - # Using in {1} since bool(NA == 1) raises a TypeError since NA is - # neither truthy nor falsy - # in pandas, 1**NA == 1. In cudf, 1**NA == NA. - request.applymarker( - pytest.mark.xfail( - condition=( - constant in {1} and op in {operator.pow, operator.ipow} - ), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/57390", -) -def test_series_masked_is_null_conditional(): - def func(x): - if x is NA: - return 42 - else: - return x - - data = cudf.Series([1, cudf.NA, 3, cudf.NA]) - - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_lambda_support(op): - func = lambda row: op(row["a"], row["b"]) # noqa: E731 - - data = cudf.DataFrame( - {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} - ) - - run_masked_udf_test(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_nested_function_support(op): - """ - Nested functions need to be explicitly jitted by the user - for numba to recognize them. Unfortunately the object - representing the jitted function can not itself be used in - pandas udfs. 
- """ - - def inner(x, y): - return op(x, y) - - def outer(row): - x = row["a"] - y = row["b"] - return inner(x, y) - - gdf = cudf.DataFrame( - {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} - ) - - with pytest.raises(ValueError): - gdf.apply(outer, axis=1) - - pdf = gdf.to_pandas(nullable=True) - inner_gpu = cuda.jit(device=True)(inner) - - def outer_gpu(row): - x = row["a"] - y = row["b"] - return inner_gpu(x, y) - - got = gdf.apply(outer_gpu, axis=1) - expect = pdf.apply(outer, axis=1) - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, - {"a": [1, 2, 3], "c": [4, 5, 6], "b": [7, 8, 9]}, - {"a": [1, 2, 3], "b": [4, 5, 6], "c": ["a", "b", "c"]}, - ], -) -def test_masked_udf_subset_selection(data): - def func(row): - return row["a"] + row["b"] - - data = cudf.DataFrame(data) - run_masked_udf_test(func, data) - - -@pytest.mark.parametrize( - "unsupported_col", - [ - _decimal_series( - ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) - ), - cudf.Series([1, 2, 3], dtype="category"), - cudf.interval_range(start=0, end=3), - [[1, 2], [3, 4], [5, 6]], - [{"a": 1}, {"a": 2}, {"a": 3}], - ], -) -def test_masked_udf_unsupported_dtype(unsupported_col): - data = cudf.DataFrame() - data["unsupported_col"] = unsupported_col - - def func(row): - return row["unsupported_col"] - - # check that we fail when an unsupported type is used within a function - with pytest.raises(ValueError): - data.apply(func, axis=1) - - # also check that a DF containing unsupported dtypes can still run a - # function that does NOT involve any of the unsupported dtype columns - data["supported_col"] = 1 - - def other_func(row): - return row["supported_col"] - - expect = cudf.Series(np.ones(len(data))) - got = data.apply(other_func, axis=1) - - assert_eq(expect, got, check_dtype=False) - - -# tests for `DataFrame.apply(f, args=(x,y,z))` -# testing the whole space of possibilities is intractable -# these test the most rudimentary guaranteed functionality -@pytest.mark.parametrize( - "data", - [ - {"a": [1, cudf.NA, 3]}, - {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, - {"a": [True, False, cudf.NA]}, - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_scalar_args_binops(data, op): - data = cudf.DataFrame(data) - - def func(row, c): - return op(row["a"], c) - - run_masked_udf_test(func, data, args=(1,), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, cudf.NA, 3]}, - {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, - {"a": [True, False, cudf.NA]}, - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_scalar_args_binops_multiple(data, op): - data = cudf.DataFrame(data) - - def func(row, c, k): - x = op(row["a"], c) - y = op(x, k) - return y - - run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - [1, cudf.NA, 3], - [0.5, 2.0, cudf.NA, cudf.NA, 5.0], - [True, False, cudf.NA], - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_mask_udf_scalar_args_binops_series(data, op): - data = cudf.Series(data) - - def func(x, c): - return x + c - - run_masked_udf_series(func, data, args=(1,), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - [1, cudf.NA, 3], - [0.5, 2.0, cudf.NA, cudf.NA, 5.0], - [True, False, cudf.NA], - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_scalar_args_binops_multiple_series(request, 
data, op): - data = cudf.Series(data) - request.applymarker( - pytest.mark.xfail( - op in comparison_ops - and PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION - and data.dtype.kind != "b", - reason="https://github.com/pandas-dev/pandas/issues/57390", - ) - ) - - def func(data, c, k): - x = op(data, c) - y = op(x, k) - return y - - run_masked_udf_series(func, data, args=(1, 2), check_dtype=False) - - -def test_masked_udf_caching(): - # Make sure similar functions that differ - # by simple things like constants actually - # recompile - - data = cudf.Series([1, 2, 3]) - - expect = data**2 - got = data.apply(lambda x: x**2) - assert_eq(expect, got, check_dtype=False) - - # update the constant value being used and make sure - # it does not result in a cache hit - - expect = data**3 - got = data.apply(lambda x: x**3) - assert_eq(expect, got, check_dtype=False) - - # make sure we get a hit when reapplying - def f(x): - return x + 1 - - precompiled.clear() - assert precompiled.currsize == 0 - data.apply(f) - - assert precompiled.currsize == 1 - data.apply(f) - - assert precompiled.currsize == 1 - - # validate that changing the type of a scalar arg - # results in a miss - precompiled.clear() - - def f(x, c): - return x + c - - data.apply(f, args=(1,)) - assert precompiled.currsize == 1 - - data.apply(f, args=(1.5,)) - assert precompiled.currsize == 2 - - -@pytest.mark.parametrize( - "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] -) -@pytest.mark.parametrize("operator", [float, int, bool]) -def test_masked_udf_casting(operator, data): - data = cudf.Series(data) - - def func(x): - return operator(x) - - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - np.array( - [0, 1, -1, 0, np.iinfo("int64").min, np.iinfo("int64").max], - dtype="int64", - ), - np.array([0, 0, 1, np.iinfo("uint64").max], dtype="uint64"), - np.array( - [ - 0, - 0.0, - -1.0, - 1.5, - -1.5, - np.finfo("float64").min, - np.finfo("float64").max, - np.nan, - np.inf, - -np.inf, - ], - dtype="float64", - ), - [False, True, False, cudf.NA], - ], -) -def test_masked_udf_abs(data): - data = cudf.Series(data) - data[0] = cudf.NA - - def func(x): - return abs(x) - - run_masked_udf_series(func, data, check_dtype=False) - - -class TestStringUDFs: - def test_string_udf_len(self, str_udf_data): - def func(row): - return len(row["str_col"]) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_startswith(self, str_udf_data, substr): - def func(row): - return row["str_col"].startswith(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_endswith(self, str_udf_data, substr): - def func(row): - return row["str_col"].endswith(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_find(self, str_udf_data, substr): - def func(row): - return row["str_col"].find(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_rfind(self, str_udf_data, substr): - def func(row): - return row["str_col"].rfind(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_contains(self, str_udf_data, substr): - def func(row): - return substr in row["str_col"] - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("other", ["cudf", "123", "", " "]) - @pytest.mark.parametrize("cmpop", comparison_ops) - def test_string_udf_cmpops(self, str_udf_data, other, cmpop): - def func(row): - return 
cmpop(row["str_col"], other) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isalnum(self, str_udf_data): - def func(row): - return row["str_col"].isalnum() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isalpha(self, str_udf_data): - def func(row): - return row["str_col"].isalpha() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isdigit(self, str_udf_data): - def func(row): - return row["str_col"].isdigit() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isdecimal(self, str_udf_data): - def func(row): - return row["str_col"].isdecimal() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isupper(self, str_udf_data): - def func(row): - return row["str_col"].isupper() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_islower(self, str_udf_data): - def func(row): - return row["str_col"].islower() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isspace(self, str_udf_data): - def func(row): - return row["str_col"].isspace() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_istitle(self, str_udf_data): - def func(row): - return row["str_col"].istitle() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_count(self, str_udf_data, substr): - def func(row): - return row["str_col"].count(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_return_string(self, str_udf_data): - def func(row): - return row["str_col"] - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) - def test_string_udf_strip(self, str_udf_data, strip_char): - def func(row): - return row["str_col"].strip(strip_char) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) - def test_string_udf_lstrip(self, str_udf_data, strip_char): - def func(row): - return row["str_col"].lstrip(strip_char) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) - def test_string_udf_rstrip(self, str_udf_data, strip_char): - def func(row): - return row["str_col"].rstrip(strip_char) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_upper(self, str_udf_data): - def func(row): - return row["str_col"].upper() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_lower(self, str_udf_data): - def func(row): - return row["str_col"].lower() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize( - "concat_char", ["1", "a", "12", " ", "", ".", "@"] - ) - def test_string_udf_concat(self, str_udf_data, concat_char): - def func(row): - return row["str_col"] + concat_char - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("to_replace", ["a", "1", "", "@"]) - @pytest.mark.parametrize("replacement", ["a", "1", "", "@"]) - def test_string_udf_replace(self, str_udf_data, to_replace, replacement): - def func(row): - return row["str_col"].replace(to_replace, replacement) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) diff --git a/python/cudf/cudf/tests/test_unaops.py 
b/python/cudf/cudf/tests/test_unaops.py deleted file mode 100644 index 5f5d79c1dce..00000000000 --- a/python/cudf/cudf/tests/test_unaops.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import itertools -import operator -import re - -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf import Series -from cudf.testing import _utils as utils, assert_eq - -_unaops = [operator.abs, operator.invert, operator.neg, np.ceil, np.floor] - - -@pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) -def test_series_abs(dtype): - arr = (np.random.random(1000) * 100).astype(dtype) - sr = Series(arr) - np.testing.assert_equal(sr.abs().to_numpy(), np.abs(arr)) - np.testing.assert_equal(abs(sr).to_numpy(), abs(arr)) - - -@pytest.mark.parametrize("dtype", utils.INTEGER_TYPES) -def test_series_invert(dtype): - arr = (np.random.random(1000) * 100).astype(dtype) - sr = Series(arr) - np.testing.assert_equal((~sr).to_numpy(), np.invert(arr)) - np.testing.assert_equal((~sr).to_numpy(), ~arr) - - -def test_series_neg(): - arr = np.random.random(100) * 100 - sr = Series(arr) - np.testing.assert_equal((-sr).to_numpy(), -arr) - - -@pytest.mark.parametrize("mth", ["min", "max", "sum", "product"]) -def test_series_pandas_methods(mth): - np.random.seed(0) - arr = (1 + np.random.random(5) * 100).astype(np.int64) - sr = Series(arr) - psr = pd.Series(arr) - np.testing.assert_equal(getattr(sr, mth)(), getattr(psr, mth)()) - - -@pytest.mark.parametrize("mth", ["min", "max", "sum", "product", "quantile"]) -def test_series_pandas_methods_empty(mth): - arr = np.array([]) - sr = Series(arr) - psr = pd.Series(arr) - np.testing.assert_equal(getattr(sr, mth)(), getattr(psr, mth)()) - - -def generate_valid_scalar_unaop_combos(): - results = [] - - # All ops valid for integer values - int_values = [0, 1, -1] - int_dtypes = utils.INTEGER_TYPES - int_ops = _unaops - - results += list(itertools.product(int_values, int_dtypes, int_ops)) - - float_values = [0.0, 1.0, -1.1] - float_dtypes = utils.FLOAT_TYPES - float_ops = [op for op in _unaops if op is not operator.invert] - results += list(itertools.product(float_values, float_dtypes, float_ops)) - - bool_values = [True, False] - bool_dtypes = ["bool"] - bool_ops = [op for op in _unaops if op is not operator.neg] - results += list(itertools.product(bool_values, bool_dtypes, bool_ops)) - - return results - - -@pytest.mark.filterwarnings("ignore:overflow encountered in scalar negative") -@pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos()) -def test_scalar_unary_operations(slr, dtype, op): - slr_host = np.array([slr])[0].astype(cudf.dtype(dtype)) - # The scalar may be out of bounds, so go via array force-cast - # NOTE: This is a change in behavior - slr = np.array(slr).astype(dtype)[()] - slr_device = cudf.Scalar(slr) - - expect = op(slr_host) - got = op(slr_device) - - assert expect == got.value - - # f16 for small ints with ceil and float - if expect.dtype == np.dtype("float16"): - assert got.dtype == np.dtype("float32") - else: - assert expect.dtype == got.dtype - - -def test_scalar_logical(): - T = cudf.Scalar(True) - F = cudf.Scalar(False) - - assert T - assert not F - - assert T and T - assert not (T and F) - assert not (F and T) - assert not (F and F) - - assert T or T - assert T or F - assert F or T - assert not (F or F) - - -def test_scalar_no_negative_bools(): - x = cudf.Scalar(True) - with pytest.raises( - TypeError, - match=re.escape( - "Boolean scalars in cuDF do not " - "support 
negation, use logical not" - ), - ): - -x - - -def test_series_bool_neg(): - sr = Series([True, False, True, None, False, None, True, True]) - psr = sr.to_pandas(nullable=True) - assert_eq((-sr).to_pandas(nullable=True), -psr, check_dtype=True) diff --git a/python/cudf/cudf/tests/test_unique.py b/python/cudf/cudf/tests/test_unique.py deleted file mode 100644 index 699b3340521..00000000000 --- a/python/cudf/cudf/tests/test_unique.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -import cupy as cp -import numpy as np -import pandas as pd -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.fixture -def df(): - df = cudf.DataFrame() - np.random.seed(0) - - arr = np.random.randint(2, size=10, dtype=np.int64) - df["foo"] = arr - df["bar"] = cudf.Series([pd.Timestamp(x) for x in arr]) - - return df - - -@pytest.fixture(params=["foo", "bar"]) -def series_test_vals(request, df): - actual = cudf.unique(df[request.param]) - expected = pd.unique(df[request.param].to_pandas()) - return actual, expected - - -def test_unique_series_obj(series_test_vals): - actual, expected = series_test_vals - - assert isinstance(expected, np.ndarray) - assert isinstance(actual, cudf.Series) - assert_eq(actual, pd.Series(expected, name=actual.name)) - - -@pytest.mark.parametrize( - "index", - [ - (cudf.Index, pd.Index), - (cudf.MultiIndex, pd.MultiIndex), - (cudf.DatetimeIndex, pd.DatetimeIndex), - (cudf.CategoricalIndex, pd.CategoricalIndex), - ], -) -@pytest.mark.parametrize("col", ["foo", "bar"]) -def test_unique_index_obj(index, col, df): - if index[0] == cudf.MultiIndex: - df.index = cudf.MultiIndex.from_arrays([df[col], df[col]]) - else: - df.index = index[0](df[col]) - actual = cudf.unique(df.index) - expected = pd.unique(df.index.to_pandas()) - - isinstance(expected, np.ndarray) - assert isinstance(actual, index[0]) - - if index[0] == cudf.MultiIndex: - expect = index[1].from_arrays( - [ - [x[0] for x in expected], - [x[1] for x in expected], - ], - names=actual.names, - ) - assert_eq(actual, expect) - else: - assert_eq(actual, index[1](expected, name=actual.name)) - - -def test_unique_cupy_ndarray(df): - arr = np.asarray(df["foo"].to_pandas()) - garr = cp.asarray(df["foo"]) - - expected = pd.unique(arr) - actual = cudf.unique(garr) - - isinstance(expected, np.ndarray) - isinstance(actual, cp.ndarray) - assert_eq(actual, expected) - - -@pytest.mark.parametrize( - "data", - [ - ["abc", "def", "abc", "a", "def", None], - [10, 20, 100, -10, 0, 1, None, 10, 100], - ], -) -def test_category_dtype_unique(data): - gs = cudf.Series(data, dtype="category") - ps = gs.to_pandas() - - actual = cudf.unique(gs) - expected = pd.unique(ps) - - assert isinstance(expected, pd.Categorical) - assert isinstance(actual, cudf.Series) - assert_eq(actual, pd.Series(expected)) - - -def test_unique_fails_value_error(df): - with pytest.raises( - ValueError, - match="Must pass cudf.Series, cudf.Index, or cupy.ndarray object", - ): - cudf.unique(df) - - -def test_unique_fails_not_implemented_error(df): - with cudf.option_context("mode.pandas_compatible", True): - with pytest.raises( - NotImplementedError, match="cudf.Categorical is not implemented" - ): - cudf.unique(cudf.Series(["foo", "foo"], dtype="category")) diff --git a/python/cudf/cudf/tests/test_version.py b/python/cudf/cudf/tests/test_version.py deleted file mode 100644 index 8c10cc20a9a..00000000000 --- a/python/cudf/cudf/tests/test_version.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
- -import cudf - - -def test_version_constants_are_populated(): - # __git_commit__ will only be non-empty in a built distribution - assert isinstance(cudf.__git_commit__, str) - - # __version__ should always be non-empty - assert isinstance(cudf.__version__, str) - assert len(cudf.__version__) > 0 diff --git a/python/cudf/cudf/tests/text/__init__.py b/python/cudf/cudf/tests/text/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py deleted file mode 100644 index 78b58344374..00000000000 --- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -import os - -import cupy -import numpy as np -import pytest - -import cudf -from cudf.core.subword_tokenizer import SubwordTokenizer -from cudf.testing import assert_eq - - -@pytest.fixture(scope="module") -def datadir(datadir): - return os.path.join(datadir, "subword_tokenizer_data") - - -def assert_equal_tokenization_outputs(hf_output, cudf_output): - assert ( - np.sum(hf_output["input_ids"] != cudf_output["input_ids"].get()) == 0 - ) - assert ( - np.sum( - hf_output["attention_mask"] != cudf_output["attention_mask"].get() - ) - == 0 - ) - - -@pytest.mark.skip(reason="segfaults") -@pytest.mark.parametrize("seq_len", [32, 64]) -@pytest.mark.parametrize("stride", [0, 15, 30]) -@pytest.mark.parametrize("add_special_tokens", [True, False]) -@pytest.mark.parametrize("do_lower_case", [True, False]) -def test_subword_tokenize( - seq_len, stride, add_special_tokens, do_lower_case, datadir -): - with open( - os.path.join(datadir, "test_sentences.txt"), encoding="utf-8" - ) as file: - input_sentence_ls = [line.strip() for line in file] - - vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") - - transformers = pytest.importorskip("transformers") - - hf_tokenizer = transformers.BertTokenizer.from_pretrained( - vocab_dir, do_lower_case=do_lower_case - ) - - hf_output = hf_tokenizer( - input_sentence_ls, - max_length=seq_len, - stride=stride, - padding="max_length", - return_tensors="np", - truncation=True, - add_special_tokens=add_special_tokens, - ) - - vocab_hash = os.path.join(vocab_dir, "vocab-hash.txt") - str_series = cudf.Series(input_sentence_ls) - cudf_tokenizer = SubwordTokenizer(vocab_hash, do_lower_case=do_lower_case) - cudf_output = cudf_tokenizer( - str_series, - max_length=seq_len, - max_num_rows=len(str_series), - stride=stride, - padding="max_length", - return_tensors="cp", - truncation=True, - add_special_tokens=add_special_tokens, - ) - assert_equal_tokenization_outputs(hf_output, cudf_output) - - -def test_subword_tokenize_with_truncation(datadir): - vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") - vocab_hash = os.path.join(vocab_dir, "vocab-hash.txt") - str_series = cudf.Series(["Test error"]) - cudf_tokenizer = SubwordTokenizer(vocab_hash) - - error_msg = ( - "Adding special tokens is not supported with truncation = False. " - "Custom Cupy kernel can potentially " - "be used to add it. 
For reference " - "see: _bert_add_special_tokens" - ) - - with pytest.raises(NotImplementedError, match=error_msg): - cudf_tokenizer( - str_series, - max_length=64, - max_num_rows=len(str_series), - truncation=False, - add_special_tokens=True, - ) - - -def test_text_subword_tokenize(tmpdir): - sr = cudf.Series( - [ - "This is a test", - "A test this is", - "Is test a this", - "Test test", - "this This", - ] - ) - hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt") - content = "1\n0\n23\n" - coefficients = [65559] * 23 - for c in coefficients: - content = content + str(c) + " 0\n" - # based on values from the bert_hash_table.txt file for the - # test words used here: 'this' 'is' 'a' test' - table = [0] * 23 - table[0] = 3015668 - table[1] = 6205475701751155871 - table[5] = 6358029 - table[16] = 451412625363 - table[20] = 6206321707968235495 - content = content + "23\n" - for v in table: - content = content + str(v) + "\n" - content = content + "100\n101\n102\n\n" - hash_file.write(content) - - cudf_tokenizer = SubwordTokenizer(hash_file) - - token_d = cudf_tokenizer( - sr, 8, 8, add_special_tokens=False, truncation=True - ) - tokens, masks, metadata = ( - token_d["input_ids"], - token_d["attention_mask"], - token_d["metadata"], - ) - expected_tokens = cupy.asarray( - [ - 2023, - 2003, - 1037, - 3231, - 0, - 0, - 0, - 0, - 1037, - 3231, - 2023, - 2003, - 0, - 0, - 0, - 0, - 2003, - 3231, - 1037, - 2023, - 0, - 0, - 0, - 0, - 3231, - 3231, - 0, - 0, - 0, - 0, - 0, - 0, - 2023, - 2023, - 0, - 0, - 0, - 0, - 0, - 0, - ], - dtype=np.uint32, - ) - expected_tokens = expected_tokens.reshape(-1, 8) - assert_eq(expected_tokens, tokens) - - expected_masks = cupy.asarray( - [ - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - ], - dtype=np.uint32, - ) - expected_masks = expected_masks.reshape(-1, 8) - assert_eq(expected_masks, masks) - - expected_metadata = cupy.asarray( - [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32 - ) - expected_metadata = expected_metadata.reshape(-1, 3) - assert_eq(expected_metadata, metadata) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py deleted file mode 100644 index 997ca357986..00000000000 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ /dev/null @@ -1,1121 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
- -import random -import string - -import numpy as np -import pytest - -import cudf -from cudf.core.byte_pair_encoding import BytePairEncoder -from cudf.core.tokenize_vocabulary import TokenizeVocabulary -from cudf.testing import assert_eq - - -def test_tokenize(): - strings = cudf.Series( - [ - "the quick fox jumped over the lazy dog", - "the siamésé cat jumped under the sofa", - None, - "", - ] - ) - - expected_values = [ - "the", - "quick", - "fox", - "jumped", - "over", - "the", - "lazy", - "dog", - "the", - "siamésé", - "cat", - "jumped", - "under", - "the", - "sofa", - ] - expected_index = strings.index.repeat(strings.str.token_count()) - expected = cudf.Series(expected_values, index=expected_index) - - actual = strings.str.tokenize() - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_tokenize_delimiter(): - strings = cudf.Series( - [ - "the quick fox jumped over the lazy dog", - "the siamésé cat jumped under the sofa", - None, - "", - ] - ) - - expected_values = [ - "the quick f", - "x jumped ", - "ver the lazy d", - "g", - "the siamésé cat jumped under the s", - "fa", - ] - expected_index = strings.index.repeat(strings.str.token_count("o")) - expected = cudf.Series(expected_values, index=expected_index) - - actual = strings.str.tokenize(delimiter="o") - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_detokenize(): - strings = cudf.Series( - [ - "the", - "quick", - "fox", - "jumped", - "over", - "the", - "lazy", - "dog", - "the", - "siamésé", - "cat", - "jumped", - "under", - "the", - "sofa", - ] - ) - - indices = cudf.Series([0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]) - actual = strings.str.detokenize(indices) - expected = cudf.Series( - [ - "the quick fox", - "jumped over", - "the lazy dog", - "the siamésé cat jumped under the sofa", - ] - ) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - indices = cudf.Series( - [4, 0, 0, 0, 0, 4, 1, 1, 4, 2, 2, 2, 2, 4, 3], dtype=np.int8 - ) - actual = strings.str.detokenize(indices, "+") - expected = cudf.Series( - [ - "quick+fox+jumped+over", - "lazy+dog", - "siamésé+cat+jumped+under", - "sofa", - "the+the+the+the", - ] - ) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "delimiter, expected_token_counts", - [ - ("", [10, 9, 0, 0, 5]), - ("o", [6, 3, 0, 0, 1]), - (["a", "e", "i", "o", "u"], [13, 13, 0, 0, 6]), - (["a", "e", "i", "o"], [12, 11, 0, 0, 6]), - ], -) -def test_token_count(delimiter, expected_token_counts): - strings = cudf.Series( - [ - "the quick brown fox jumped over the lazy brown dog", - "the sable siamésé cat jumped under the brown sofa", - None, - "", - "test_str\x01test_str\x02test_str\x03test_str\x04test_str\x05", - ] - ) - - expected = cudf.Series(expected_token_counts) - - actual = strings.str.token_count(delimiter) - - assert type(expected) == type(actual) - assert_eq(expected, actual, check_dtype=False) - - -@pytest.mark.parametrize( - "delimiter, input, default_id, results", - [ - ( - "", - "the quick brown fox jumps over the lazy brown dog", - 99, - [0, 1, 2, 3, 4, 5, 0, 99, 2, 6], - ), - ( - " ", - " the sable siamésé cat jumps under the brown sofa ", - -1, - [0, 7, 8, 9, 4, 10, 0, 2, 11], - ), - ( - "_", - "the_quick_brown_fox_jumped__over_the_lazy_brown_dog", - -99, - [0, 1, 2, 3, -99, 5, 0, -99, 2, 6], - ), - ], -) -def test_tokenize_with_vocabulary(delimiter, input, default_id, results): - vocabulary = cudf.Series( - [ - "the", - "quick", - "brown", - "fox", - 
"jumps", - "over", - "dog", - "sable", - "siamésé", - "cat", - "under", - "sofa", - ] - ) - tokenizer = TokenizeVocabulary(vocabulary) - - strings = cudf.Series([input, None, "", input]) - - expected = cudf.Series( - [ - cudf.Series(results, dtype=np.int32), - None, - cudf.Series([], dtype=np.int32), - cudf.Series(results, dtype=np.int32), - ] - ) - - actual = tokenizer.tokenize(strings, delimiter, default_id) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_normalize_spaces(): - strings = cudf.Series( - [ - " the\t quick fox jumped over the lazy dog", - "the siamésé cat\f jumped\t\tunder the sofa ", - None, - "", - ] - ) - expected = cudf.Series( - [ - "the quick fox jumped over the lazy dog", - "the siamésé cat jumped under the sofa", - None, - "", - ] - ) - - actual = strings.str.normalize_spaces() - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_normalize_characters(): - strings = cudf.Series( - ["乾 \t 乿", "ĂĆCĖÑTÜATE", "âscénd, Descend", "", None, "Stock^ $1"] - ) - expected = cudf.Series( - [ - " 乾 乿 ", - "accentuate", - "ascend , descend", - "", - None, - "stock ^ $ 1", - ] - ) - - actual = strings.str.normalize_characters() - assert type(expected) == type(actual) - assert_eq(expected, actual) - - expected = cudf.Series( - [ - " 乾 乿 ", - "ĂĆCĖÑTÜATE", - "âscénd , Descend", - "", - None, - "Stock ^ $ 1", - ] - ) - actual = strings.str.normalize_characters(do_lower=False) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "n, separator, expected_values", - [ - ( - 2, - "_", - [ - "this_is", - "is_my", - "my_favorite", - "favorite_book", - "book_on", - "on_my", - "my_bookshelf", - ], - ), - ( - 3, - "-", - [ - "this-is-my", - "is-my-favorite", - "my-favorite-book", - "favorite-book-on", - "book-on-my", - "on-my-bookshelf", - ], - ), - ], -) -def test_ngrams(n, separator, expected_values): - strings = cudf.Series( - ["this", "is", "my", "favorite", "book", "on", "my", "bookshelf"] - ) - - expected = cudf.Series(expected_values) - - actual = strings.str.ngrams(n=n, separator=separator) - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "n, expected_values, expected_index, as_list", - [ - ( - 2, - [ - "th", - "hi", - "is", - "is", - "my", - "bo", - "oo", - "ok", - "he", - "er", - "re", - ], - [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5], - False, - ), - ( - 3, - [ - "thi", - "his", - "boo", - "ook", - "her", - "ere", - ], - [1, 1, 4, 4, 5, 5], - False, - ), - ( - 3, - [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []], - [1, 2, 3, 4, 5, 6], - True, - ), - ], -) -def test_character_ngrams(n, expected_values, expected_index, as_list): - strings = cudf.Series( - ["this", "is", "my", "book", "here", ""], index=[1, 2, 3, 4, 5, 6] - ) - - expected = cudf.Series(expected_values, index=expected_index) - - actual = strings.str.character_ngrams(n=n, as_list=as_list) - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_hash_character_ngrams(): - strings = cudf.Series(["abcdefg", "stuvwxyz"]) - expected = cudf.Series( - [ - cudf.Series([3902511862, 570445242, 4202475763], dtype=np.uint32), - cudf.Series( - [556054766, 3166857694, 3760633458, 192452857], dtype=np.uint32 - ), - ] - ) - actual = strings.str.hash_character_ngrams(5, True) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - actual = strings.str.hash_character_ngrams(5) - expected = expected.explode() - assert 
type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "n, separator, expected_values", - [ - ( - 2, - "_", - [ - "this_is", - "is_my", - "my_favorite", - "book_on", - "on_my", - "my_bookshelf", - ], - ), - ( - 3, - "-", - ["this-is-my", "is-my-favorite", "book-on-my", "on-my-bookshelf"], - ), - ], -) -def test_ngrams_tokenize(n, separator, expected_values): - strings = cudf.Series(["this is my favorite", "book on my bookshelf"]) - - expected = cudf.Series(expected_values) - - actual = strings.str.ngrams_tokenize(n=n, separator=separator) - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_character_tokenize_series(): - sr = cudf.Series( - [ - "hello world", - "sdf", - ( - "goodbye, one-two:three~four+five_six@sev" - "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" - ), - ] - ) - expected_values = [ - "h", - "e", - "l", - "l", - "o", - " ", - "w", - "o", - "r", - "l", - "d", - "s", - "d", - "f", - "g", - "o", - "o", - "d", - "b", - "y", - "e", - ",", - " ", - "o", - "n", - "e", - "-", - "t", - "w", - "o", - ":", - "t", - "h", - "r", - "e", - "e", - "~", - "f", - "o", - "u", - "r", - "+", - "f", - "i", - "v", - "e", - "_", - "s", - "i", - "x", - "@", - "s", - "e", - "v", - "e", - "n", - "#", - "e", - "i", - "g", - "h", - "t", - "^", - "n", - "i", - "n", - "e", - " ", - "h", - "e", - "Œ", - "Ž", - "‘", - "•", - "™", - "œ", - "$", - "µ", - "¾", - "Ť", - "Ơ", - "é", - " ", - "DŽ", - ] - expected_index = sr.index.repeat(sr.str.len().fillna(0)) - expected = cudf.Series(expected_values, index=expected_index) - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - sr = cudf.Series([""]) - expected = cudf.Series([], dtype="object") - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - sr = cudf.Series(["a"]) - expected = cudf.Series(["a"]) - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - -def test_character_tokenize_index(): - sr = cudf.Index( - [ - "hello world", - "sdf", - ( - "goodbye, one-two:three~four+five_six@sev" - "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" - ), - ] - ) - expected = cudf.Index( - [ - "h", - "e", - "l", - "l", - "o", - " ", - "w", - "o", - "r", - "l", - "d", - "s", - "d", - "f", - "g", - "o", - "o", - "d", - "b", - "y", - "e", - ",", - " ", - "o", - "n", - "e", - "-", - "t", - "w", - "o", - ":", - "t", - "h", - "r", - "e", - "e", - "~", - "f", - "o", - "u", - "r", - "+", - "f", - "i", - "v", - "e", - "_", - "s", - "i", - "x", - "@", - "s", - "e", - "v", - "e", - "n", - "#", - "e", - "i", - "g", - "h", - "t", - "^", - "n", - "i", - "n", - "e", - " ", - "h", - "e", - "Œ", - "Ž", - "‘", - "•", - "™", - "œ", - "$", - "µ", - "¾", - "Ť", - "Ơ", - "é", - " ", - "DŽ", - ] - ) - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - sr = cudf.Index([""]) - expected = cudf.Index([], dtype="object") - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - sr = cudf.Index(["a"]) - expected = cudf.Index(["a"]) - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - -def test_text_replace_tokens(): - sr = cudf.Series(["this is me", "theme music", ""]) - targets = cudf.Series(["is", "me"]) - - expected = cudf.Series(["this _ _", "theme music", ""]) - actual = sr.str.replace_tokens(targets, "_") - - assert_eq(expected, actual) - - replacements = cudf.Series(["IS", "ME"]) - expected = cudf.Series(["this IS ME", "theme music", ""]) - actual = sr.str.replace_tokens(targets, replacements) - - assert_eq(expected, actual) - - sr 
= cudf.Series( - [ - "this is a small text ☕", - "this \t\t is ; ; - + a looooooooooonnnnnnnggggggg text \n\t", - "emptyme", - ], - ) - targets = cudf.Series( - ["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"] - ) - replacements = cudf.Series(["the", "🚒", "🚒🚒🚒🚒", "🔥🔥", ""]) - - expected = cudf.Series( - [ - "this is the small text 🚒", - "this \t\t is ; ; - + the 🔥🔥 text \n\t", - "", - ] - ) - actual = sr.str.replace_tokens(targets, replacements) - - assert_eq(expected, actual) - - sr = cudf.Series( - ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"] - ) - targets = cudf.Series(["🌬", "🔥", "🌊"]) - replacements = "🚰" - - expected = cudf.Series( - ["All-we-need;is;🚰", "\tall-we-need0is;🚰", "all;we:need+is;🚰"] - ) - actual = sr.str.replace_tokens(targets, replacements, delimiter=";") - - assert_eq(expected, actual) - assert_eq(sr, sr.str.replace_tokens(targets, replacements)) - assert_eq(sr, sr.str.replace_tokens([""], [""])) - - -def test_text_replace_tokens_error_cases(): - sr = cudf.Series(["this is me", "theme music", ""]) - - with pytest.raises( - TypeError, - match="targets should be an array-like or a Series object, " - "found ", - ): - sr.str.replace_tokens("me", ["a"]) - - with pytest.raises( - ValueError, - match="targets and replacements should be same size" - " sequences unless replacements is a string.", - ): - sr.str.replace_tokens(["a"], ["me", "ki"]) - - with pytest.raises( - TypeError, - match="replacements should be an str, array-like or Series object," - " found ", - ): - sr.str.replace_tokens(["a"], {"s"}) - - with pytest.raises( - TypeError, - match="Type of delimiter should be a string, found ", - ): - sr.str.replace_tokens(["a"], ["s"], delimiter=["a", "b"]) - - -def test_text_filter_tokens(): - sr = cudf.Series(["the quick brown fox jumped", "over the lazy dog", ""]) - - expected = cudf.Series([" quick brown jumped", " ", ""]) - actual = sr.str.filter_tokens(5) - assert_eq(expected, actual) - - expected = cudf.Series(["🔥 quick brown 🔥 jumped", "🔥 🔥 🔥 🔥", ""]) - actual = sr.str.filter_tokens(5, "🔥") - assert_eq(expected, actual) - - sr = cudf.Series( - ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"] - ) - expected = cudf.Series( - ["All-we-need;is;--", "\tall-we-need0is;--", "all;we:need+is;--"] - ) - actual = sr.str.filter_tokens(2, "--", ";") - assert_eq(expected, actual) - - assert_eq(sr, sr.str.filter_tokens(1)) - - -def test_text_filter_tokens_error_cases(): - sr = cudf.Series(["abc", "def", ""]) - - with pytest.raises( - TypeError, - match="Type of replacement should be a string, found ", - ): - sr.str.filter_tokens(3, replacement=["a", "b"]) - - with pytest.raises( - TypeError, - match="Type of delimiter should be a string, found ", - ): - sr.str.filter_tokens(3, delimiter=["a", "b"]) - - -def test_edit_distance(): - sr = cudf.Series(["kitten", "saturday", "address", "book"]) - tg = cudf.Series(["sitting", "sunday", "addressee", "back"]) - - expected = cudf.Series([3, 3, 2, 2], dtype=np.int32) - actual = sr.str.edit_distance(tg) - assert_eq(expected, actual) - - expected = cudf.Series([0, 7, 6, 6], dtype=np.int32) - actual = sr.str.edit_distance("kitten") - assert_eq(expected, actual) - - -def test_edit_distance_matrix(): - # normal - sr = cudf.Series(["rounded", "bounded", "bounce", "trounce", "ounce"]) - - expected = cudf.Series( - [ - [0, 1, 3, 3, 3], - [1, 0, 2, 4, 3], - [3, 2, 0, 2, 1], - [3, 4, 2, 0, 2], - [3, 3, 1, 2, 0], - ] - ) - got = sr.str.edit_distance_matrix() - - assert_eq(expected, got, check_dtype=False) - - # 
1-row series - sr2 = cudf.Series(["x"]) - with pytest.raises(ValueError, match="Require size >= 2"): - sr2.str.edit_distance_matrix() - - # null rows - sr3 = cudf.Series(["rounded", None, "bounce", "trounce", "ounce"]) - with pytest.raises(ValueError, match="Cannot compute"): - sr3.str.edit_distance_matrix() - - -def test_porter_stemmer_measure(): - strings = cudf.Series( - [ - "tr", - "ee", - "tree", - "y", - "by", - "trouble", - "oats", - "trees", - "ivy", - "troubles", - "private", - "oaten", - "orrery", - None, - "", - ] - ) - expected = cudf.Series( - [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, None, 0], dtype=np.int32 - ) - - actual = strings.str.porter_stemmer_measure() - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_is_vowel_consonant(): - strings = cudf.Series( - ["tr", "ee", "tree", "y", "by", "oats", "ivy", "orrery", None, ""] - ) - expected = cudf.Series( - [False, False, True, False, False, False, True, False, None, False] - ) - actual = strings.str.is_vowel(2) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - expected = cudf.Series( - [True, False, True, False, False, False, True, True, None, False] - ) - actual = strings.str.is_consonant(1) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - indices = cudf.Series([2, 1, 0, 0, 1, 2, 0, 3, 0, 0]) - expected = cudf.Series( - [False, True, False, False, True, False, True, True, None, False] - ) - actual = strings.str.is_vowel(indices) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - expected = cudf.Series( - [False, False, True, True, False, True, False, False, None, False] - ) - actual = strings.str.is_consonant(indices) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_minhash(): - strings = cudf.Series(["this is my", "favorite book", None, ""]) - - expected = cudf.Series( - [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - None, - cudf.Series([0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), - cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), - None, - cudf.Series([0, 0, 0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash(seeds=seeds, width=5) - assert_eq(expected, actual) - - expected = cudf.Series( - [ - cudf.Series([3232308021562742685], dtype=np.uint64), - cudf.Series([23008204270530356], dtype=np.uint64), - None, - cudf.Series([0], dtype=np.uint64), - ] - ) - actual = strings.str.minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - expected = cudf.Series( - [ - cudf.Series( - [7082801294247314046, 185949556058924788, 167570629329462454], - dtype=np.uint64, - ), - cudf.Series( - [382665377781028452, 86243762733551437, 7688750597953083512], - dtype=np.uint64, - ), - None, - cudf.Series([0, 0, 0], dtype=np.uint64), - ] - ) - actual = strings.str.minhash64(seeds=seeds, width=5) - assert_eq(expected, actual) - - # test wrong seed types - with pytest.raises(ValueError): - strings.str.minhash(seeds="a") - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash(seeds=seeds) - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64(seeds=seeds) - - -def test_word_minhash(): - ls = cudf.Series([["this", "is", 
"my"], ["favorite", "book"]]) - - expected = cudf.Series( - [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - ] - ) - actual = ls.str.word_minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32), - cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32), - ] - ) - actual = ls.str.word_minhash(seeds=seeds) - assert_eq(expected, actual) - - expected = cudf.Series( - [ - cudf.Series([2603139454418834912], dtype=np.uint64), - cudf.Series([5240044617220523711], dtype=np.uint64), - ] - ) - actual = ls.str.word_minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) - expected = cudf.Series( - [ - cudf.Series( - [ - 2603139454418834912, - 8644371945174847701, - 5541030711534384340, - ], - dtype=np.uint64, - ), - cudf.Series( - [5240044617220523711, 5847101123925041457, 153762819128779913], - dtype=np.uint64, - ), - ] - ) - actual = ls.str.word_minhash64(seeds=seeds) - assert_eq(expected, actual) - - # test wrong seed types - with pytest.raises(ValueError): - ls.str.word_minhash(seeds="a") - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - ls.str.word_minhash(seeds=seeds) - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - ls.str.word_minhash64(seeds=seeds) - - -def test_jaccard_index(): - str1 = cudf.Series(["the brown dog", "jumped about"]) - str2 = cudf.Series(["the black cat", "jumped around"]) - - expected = cudf.Series([0.058824, 0.307692], dtype=np.float32) - actual = str1.str.jaccard_index(str2, 5) - assert_eq(expected, actual) - - actual = str2.str.jaccard_index(str1, 5) - assert_eq(expected, actual) - - with pytest.raises(ValueError): - str1.str.jaccard_index(str2, 1) - with pytest.raises(ValueError): - str3 = cudf.Series(["not enough rows"]) - str1.str.jaccard_index(str3, 5) - - -def _make_list_of_strings_of_random_length( - num_strings, min_length, max_length -): - return [ - "".join( - random.choice(string.ascii_lowercase) - for _ in range(random.randint(min_length, max_length)) - ) - for _ in range(num_strings) - ] - - -def test_jaccard_index_random_strings(): - # Seed the rng before random string generation. 
- random.seed(42) - num_strings = 100 - jaccard_width = 5 - common_strings = _make_list_of_strings_of_random_length( - num_strings, jaccard_width, 50 - ) - uncommon_strings1 = _make_list_of_strings_of_random_length( - num_strings, jaccard_width, 10 - ) - uncommon_strings2 = _make_list_of_strings_of_random_length( - num_strings, jaccard_width, 20 - ) - str1 = cudf.Series(uncommon_strings1).str.cat(cudf.Series(common_strings)) - str2 = cudf.Series(uncommon_strings2).str.cat(cudf.Series(common_strings)) - - # adopted from https://github.com/rapidsai/rapids-deduplication/issues/36 - da = str1.str.character_ngrams(jaccard_width, True) - db = str2.str.character_ngrams(jaccard_width, True) - da = da.list.unique() - db = db.list.unique() - da = da.explode() - db = db.explode() - da = da.to_frame() - db = db.to_frame() - da = da.reset_index() - db = db.reset_index() - da = da.rename(columns={0: "token"}) - db = db.rename(columns={0: "token"}) - db["match"] = 1 - inter = da.merge(db, on=["index", "token"], how="left") - inter = inter.groupby("index")["match"].sum() - union = da.merge(db, on=["index", "token"], how="outer") - union = union.groupby("index").size() - res = inter / union - res.fillna(0, inplace=True) - res = res.sort_index() - res = res.values.astype("float32") - expected = cudf.Series(res) - - actual = str1.str.jaccard_index(str2, jaccard_width) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "separator, input, results", - [ - (" ", "thetestsentence", "the test sent ence"), - ("_", "sentenceistest", "sent_ence_is_test"), - ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"), - ], -) -def test_byte_pair_encoding(separator, input, results): - pairs_table = cudf.Series( - [ - "t he", - "h e", - "e n", - "i t", - "i s", - "e s", - "en t", - "c e", - "es t", - "en ce", - "t h", - "h i", - "th is", - "t est", - "s i", - "s ent", - ] - ) - encoder = BytePairEncoder(pairs_table) - - strings = cudf.Series([input, None, "", input]) - - expected = cudf.Series([results, None, "", results]) - - actual = encoder(strings, separator) - assert type(expected) == type(actual) - assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/window/__init__.py b/python/cudf/cudf/tests/window/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/tests/window/test_rolling.py b/python/cudf/cudf/tests/window/test_rolling.py deleted file mode 100644 index 06777c8e6af..00000000000 --- a/python/cudf/cudf/tests/window/test_rolling.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/utils/__init__.py b/python/cudf/cudf/utils/__init__.py deleted file mode 100644 index ccbb16256fb..00000000000 --- a/python/cudf/cudf/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py deleted file mode 100644 index d9dde58d998..00000000000 --- a/python/cudf/cudf/utils/_numba.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -import glob -import os -import sys -from functools import lru_cache - -from numba import config as numba_config - - -# Use an lru_cache with a single value to allow a delayed import of -# strings_udf. 
This is the easiest way to break an otherwise circular import -# loop of _lib.*->cudautils->_numba->_lib.strings_udf -@lru_cache -def _get_cuda_build_version(): - from cudf._lib import strings_udf - - # The version is an integer, parsed as 1000 * major + 10 * minor - cuda_build_version = strings_udf.get_cuda_build_version() - cuda_major_version = cuda_build_version // 1000 - cuda_minor_version = (cuda_build_version % 1000) // 10 - return (cuda_major_version, cuda_minor_version) - - -def _get_best_ptx_file(archs, max_compute_capability): - """ - Determine of the available PTX files which one is - the most recent up to and including the device compute capability. - """ - filtered_archs = [x for x in archs if x[0] <= max_compute_capability] - if filtered_archs: - return max(filtered_archs, key=lambda x: x[0]) - else: - return None - - -def _get_ptx_file(path, prefix): - if "RAPIDS_NO_INITIALIZE" in os.environ: - # cc=70 ptx is always built - cc = int(os.environ.get("STRINGS_UDF_CC", "70")) - else: - from numba import cuda - - dev = cuda.get_current_device() - - # Load the highest compute capability file available that is less than - # the current device's. - cc = int("".join(str(x) for x in dev.compute_capability)) - files = glob.glob(os.path.join(path, f"{prefix}*.ptx")) - if len(files) == 0: - raise RuntimeError(f"Missing PTX files for cc={cc}") - regular_sms = [] - - for f in files: - file_name = os.path.basename(f) - sm_number = file_name.rstrip(".ptx").lstrip(prefix) - if sm_number.endswith("a"): - processed_sm_number = int(sm_number.rstrip("a")) - if processed_sm_number == cc: - return f - else: - regular_sms.append((int(sm_number), f)) - - regular_result = None - - if regular_sms: - regular_result = _get_best_ptx_file(regular_sms, cc) - - if regular_result is None: - raise RuntimeError( - "This cuDF installation is missing the necessary PTX " - f"files that are <={cc}." - ) - else: - return regular_result[1] - - -def patch_numba_linker_cuda_11(): - # Enable the config option for minor version compatibility - numba_config.CUDA_ENABLE_MINOR_VERSION_COMPATIBILITY = 1 - - if "numba.cuda" in sys.modules: - # Patch numba for version 0.57.0 MVC support, which must know the - # config value at import time. We cannot guarantee the order of imports - # between cudf and numba.cuda so we patch numba to ensure it has these - # names available. - # See https://github.com/numba/numba/issues/8977 for details. - import numba.cuda - from cubinlinker import CubinLinker, CubinLinkerError - from ptxcompiler import compile_ptx - - numba.cuda.cudadrv.driver.compile_ptx = compile_ptx - numba.cuda.cudadrv.driver.CubinLinker = CubinLinker - numba.cuda.cudadrv.driver.CubinLinkerError = CubinLinkerError - - -def _setup_numba(): - """ - Configure the numba linker for use with cuDF. This consists of - potentially putting numba into enhanced compatibility mode - based on the user driver and runtime versions as well as the - version of the CUDA Toolkit used to build the PTX files shipped - with the user cuDF package. - """ - - # Either ptxcompiler, or our vendored version (_ptxcompiler.py) - # is needed to determine the driver and runtime CUDA versions in - # the environment. In a CUDA 11.x environment, ptxcompiler is used - # to provide MVC directly, whereas for CUDA 12.x this is provided - # through pynvjitlink. The presence of either package does not - # perturb cuDF's operation in situations where they aren't used. 
- try: - from ptxcompiler.patch import NO_DRIVER, safe_get_versions - except ModuleNotFoundError: - # use vendored version - from cudf.utils._ptxcompiler import NO_DRIVER, safe_get_versions - - versions = safe_get_versions() - if versions != NO_DRIVER: - driver_version, runtime_version = versions - shim_ptx_cuda_version = _get_cuda_build_version() - - # MVC is required whenever any PTX is newer than the driver - # This could be the shipped shim PTX file (determined by the CUDA - # version used at build time) or the PTX emitted by the version of NVVM - # on the user system (determined by the user's CUDA runtime version) - if (driver_version < shim_ptx_cuda_version) or ( - driver_version < runtime_version - ): - if driver_version < (12, 0): - patch_numba_linker_cuda_11() - else: - from pynvjitlink.patch import patch_numba_linker - - patch_numba_linker() - - -class _CUDFNumbaConfig: - def __enter__(self): - self.CUDA_LOW_OCCUPANCY_WARNINGS = ( - numba_config.CUDA_LOW_OCCUPANCY_WARNINGS - ) - numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 - - self.CAPTURED_ERRORS = numba_config.CAPTURED_ERRORS - numba_config.CAPTURED_ERRORS = "new_style" - - def __exit__(self, exc_type, exc_value, traceback): - numba_config.CUDA_LOW_OCCUPANCY_WARNINGS = ( - self.CUDA_LOW_OCCUPANCY_WARNINGS - ) - numba_config.CAPTURED_ERRORS = self.CAPTURED_ERRORS diff --git a/python/cudf/cudf/utils/_ptxcompiler.py b/python/cudf/cudf/utils/_ptxcompiler.py deleted file mode 100644 index 9d7071d55a5..00000000000 --- a/python/cudf/cudf/utils/_ptxcompiler.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import os -import re -import subprocess -import sys -import warnings - -NO_DRIVER = (math.inf, math.inf) -START_TAG = "_VER_START" -END_TAG = "_VER_END" - -NUMBA_CHECK_VERSION_CMD = """\ -from ctypes import c_int, byref -from numba import cuda -dv = c_int(0) -cuda.cudadrv.driver.driver.cuDriverGetVersion(byref(dv)) -drv_major = dv.value // 1000 -drv_minor = (dv.value - (drv_major * 1000)) // 10 -run_major, run_minor = cuda.runtime.get_version() -print(f'_VER_START{drv_major} {drv_minor} {run_major} {run_minor}_VER_END') -""" - - -def check_disabled_in_env(): - # We should avoid checking whether the patch is - # needed if the user requested that we don't check - # (e.g. 
in a non-fork-safe environment) - check = os.getenv("PTXCOMPILER_CHECK_NUMBA_CODEGEN_PATCH_NEEDED") - if check is not None: - try: - check = int(check) - except ValueError: - check = False - else: - check = True - - return not check - - -def get_versions(): - cp = subprocess.run( - [sys.executable, "-c", NUMBA_CHECK_VERSION_CMD], capture_output=True - ) - if cp.returncode: - msg = ( - f"Error getting driver and runtime versions:\n\nstdout:\n\n" - f"{cp.stdout.decode()}\n\nstderr:\n\n{cp.stderr.decode()}\n\n" - "Not patching Numba" - ) - warnings.warn(msg, UserWarning) - return NO_DRIVER - - pattern = r"_VER_START(.*?)_VER_END" - - ver_str = re.search(pattern, cp.stdout.decode()).group(1) - - versions = [int(s) for s in ver_str.strip().split()] - driver_version = tuple(versions[:2]) - runtime_version = tuple(versions[2:]) - - return driver_version, runtime_version - - -def safe_get_versions(): - """ - Return a 2-tuple of deduced driver and runtime versions. - - To ensure that this function does not initialize a CUDA context, - calls to the runtime and driver are made in a subprocess. - - If PTXCOMPILER_CHECK_NUMBA_CODEGEN_PATCH_NEEDED is set - in the environment, then this subprocess call is not launched. - To specify the driver and runtime versions of the environment - in this case, set PTXCOMPILER_KNOWN_DRIVER_VERSION and - PTXCOMPILER_KNOWN_RUNTIME_VERSION appropriately. - """ - if check_disabled_in_env(): - try: - # allow user to specify driver/runtime - # versions manually, if necessary - driver_version = os.environ[ - "PTXCOMPILER_KNOWN_DRIVER_VERSION" - ].split(".") - runtime_version = os.environ[ - "PTXCOMPILER_KNOWN_RUNTIME_VERSION" - ].split(".") - driver_version, runtime_version = ( - tuple(map(int, driver_version)), - tuple(map(int, runtime_version)), - ) - except (KeyError, ValueError): - warnings.warn( - "No way to determine driver and runtime versions for " - "patching, set PTXCOMPILER_KNOWN_DRIVER_VERSION and " - "PTXCOMPILER_KNOWN_RUNTIME_VERSION" - ) - return NO_DRIVER - else: - driver_version, runtime_version = get_versions() - return driver_version, runtime_version diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py deleted file mode 100644 index cd7fe5ee023..00000000000 --- a/python/cudf/cudf/utils/applyutils.py +++ /dev/null @@ -1,375 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import functools -from typing import Any - -import cupy as cp -from numba import cuda -from numba.core.utils import pysignature - -import cudf -from cudf import _lib as libcudf -from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import column -from cudf.utils import utils -from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.docutils import docfmt_partial - -_doc_applyparams = """ -df : DataFrame - The source dataframe. -func : function - The transformation function that will be executed on the CUDA GPU. -incols: list or dict - A list of names of input columns that match the function arguments. - Or, a dictionary mapping input column names to their corresponding - function arguments such as {'col1': 'arg1'}. -outcols: dict - A dictionary of output column names and their dtype. -kwargs: dict - name-value of extra arguments. These values are passed - directly into the function. -pessimistic_nulls : bool - Whether or not apply_rows output should be null when any corresponding - input is null. 
If False, all outputs will be non-null, but will be the - result of applying func against the underlying column data, which - may be garbage. -""" - -_doc_applychunkparams = """ -chunks : int or Series-like - If it is an ``int``, it is the chunksize. - If it is an array, it contains integer offset for the start of each chunk. - The span of a chunk for chunk i-th is ``data[chunks[i] : chunks[i + 1]]`` - for any ``i + 1 < chunks.size``; or, ``data[chunks[i]:]`` for the - ``i == len(chunks) - 1``. -tpb : int; optional - The threads-per-block for the underlying kernel. - If not specified (Default), uses Numba ``.forall(...)`` built-in to query - the CUDA Driver API to determine optimal kernel launch configuration. - Specify 1 to emulate serial execution for each chunk. It is a good - starting point but inefficient. - Its maximum possible value is limited by the available CUDA GPU resources. -blkct : int; optional - The number of blocks for the underlying kernel. - If not specified (Default) and ``tpb`` is not specified (Default), uses - Numba ``.forall(...)`` built-in to query the CUDA Driver API to determine - optimal kernel launch configuration. - If not specified (Default) and ``tpb`` is specified, uses ``chunks`` as the - number of blocks. -""" - -doc_apply = docfmt_partial(params=_doc_applyparams) -doc_applychunks = docfmt_partial( - params=_doc_applyparams, params_chunks=_doc_applychunkparams -) - - -@doc_apply() -def apply_rows( - df, func, incols, outcols, kwargs, pessimistic_nulls, cache_key -): - """Row-wise transformation - - Parameters - ---------- - {params} - """ - applyrows = ApplyRowsCompiler( - func, incols, outcols, kwargs, pessimistic_nulls, cache_key=cache_key - ) - return applyrows.run(df) - - -@doc_applychunks() -def apply_chunks( - df, - func, - incols, - outcols, - kwargs, - pessimistic_nulls, - chunks, - blkct=None, - tpb=None, -): - """Chunk-wise transformation - - Parameters - ---------- - {params} - {params_chunks} - """ - applychunks = ApplyChunksCompiler( - func, incols, outcols, kwargs, pessimistic_nulls, cache_key=None - ) - return applychunks.run(df, chunks=chunks, tpb=tpb) - - -@acquire_spill_lock() -def make_aggregate_nullmask(df, columns=None, op="__and__"): - out_mask = None - for k in columns or df._data: - col = cudf.core.dataframe.extract_col(df, k) - if not col.nullable: - continue - nullmask = column.as_column(df[k]._column.nullmask) - - if out_mask is None: - out_mask = column.as_column( - nullmask.copy(), dtype=utils.mask_dtype - ) - else: - out_mask = libcudf.binaryop.binaryop( - nullmask, out_mask, op, out_mask.dtype - ) - - return out_mask - - -class ApplyKernelCompilerBase: - def __init__( - self, func, incols, outcols, kwargs, pessimistic_nulls, cache_key - ): - # Get signature of user function - sig = pysignature(func) - self.sig = sig - self.incols = incols - self.outcols = outcols - self.kwargs = kwargs - self.pessimistic_nulls = pessimistic_nulls - self.cache_key = cache_key - self.kernel = self.compile(func, sig.parameters.keys(), kwargs.keys()) - - @acquire_spill_lock() - def run(self, df, **launch_params): - # Get input columns - if isinstance(self.incols, dict): - inputs = { - v: df[k]._column.data_array_view(mode="read") - for (k, v) in self.incols.items() - } - else: - inputs = { - k: df[k]._column.data_array_view(mode="read") - for k in self.incols - } - # Allocate output columns - outputs = {} - for k, dt in self.outcols.items(): - outputs[k] = column.column_empty( - len(df), dt, False - ).data_array_view(mode="write") - # Bind 
argument - args = {} - for dct in [inputs, outputs, self.kwargs]: - args.update(dct) - bound = self.sig.bind(**args) - # Launch kernel - self.launch_kernel(df, bound.args, **launch_params) - # Prepare pessimistic nullmask - if self.pessimistic_nulls: - out_mask = make_aggregate_nullmask(df, columns=self.incols) - else: - out_mask = None - # Prepare output frame - outdf = df.copy() - for k in sorted(self.outcols): - outdf[k] = cudf.Series( - outputs[k], index=outdf.index, nan_as_null=False - ) - if out_mask is not None: - outdf._data[k] = outdf[k]._column.set_mask( - out_mask.data_array_view(mode="write") - ) - - return outdf - - -class ApplyRowsCompiler(ApplyKernelCompilerBase): - def compile(self, func, argnames, extra_argnames): - # Compile kernel - kernel = _load_cache_or_make_row_wise_kernel( - self.cache_key, func, argnames, extra_argnames - ) - return kernel - - def launch_kernel(self, df, args): - with _CUDFNumbaConfig(): - self.kernel.forall(len(df))(*args) - - -class ApplyChunksCompiler(ApplyKernelCompilerBase): - def compile(self, func, argnames, extra_argnames): - # Compile kernel - kernel = _load_cache_or_make_chunk_wise_kernel( - func, argnames, extra_argnames - ) - return kernel - - def launch_kernel(self, df, args, chunks, blkct=None, tpb=None): - chunks = self.normalize_chunks(len(df), chunks) - if blkct is None and tpb is None: - with _CUDFNumbaConfig(): - self.kernel.forall(len(df))(len(df), chunks, *args) - else: - assert tpb is not None - if blkct is None: - blkct = chunks.size - with _CUDFNumbaConfig(): - self.kernel[blkct, tpb](len(df), chunks, *args) - - def normalize_chunks(self, size, chunks): - if isinstance(chunks, int): - # *chunks* is the chunksize - return cuda.as_cuda_array( - cp.arange(start=0, stop=size, step=chunks) - ).view("int64") - else: - # *chunks* is an array of chunk leading offset - return cuda.as_cuda_array(cp.asarray(chunks)).view("int64") - - -def _make_row_wise_kernel(func, argnames, extras): - """ - Make a kernel that does a stride loop over the input rows. - - Each thread is responsible for a row in each iteration. - Several iteration may be needed to handling a large number of rows. - - The resulting kernel can be used with any 1D grid size and 1D block size. - """ - # Build kernel source - argnames = list(map(_mangle_user, argnames)) - extras = list(map(_mangle_user, extras)) - source = """ -def row_wise_kernel({args}): -{body} -""" - - args = ", ".join(argnames) - body = [] - - body.append("tid = cuda.grid(1)") - body.append("ntid = cuda.gridsize(1)") - - for a in argnames: - if a not in extras: - start = "tid" - stop = "" - stride = "ntid" - srcidx = "{a} = {a}[{start}:{stop}:{stride}]" - body.append( - srcidx.format(a=a, start=start, stop=stop, stride=stride) - ) - - body.append(f"inner({args})") - - indented = ["{}{}".format(" " * 4, ln) for ln in body] - # Finalize source - concrete = source.format(args=args, body="\n".join(indented)) - # Get bytecode - glbs = {"inner": cuda.jit(device=True)(func), "cuda": cuda} - exec(concrete, glbs) - # Compile as CUDA kernel - kernel = cuda.jit(glbs["row_wise_kernel"]) - return kernel - - -def _make_chunk_wise_kernel(func, argnames, extras): - """ - Make a kernel that does a stride loop over the input chunks. - - Each block is responsible for a chunk in each iteration. - Several iteration may be needed to handling a large number of chunks. - - The user function *func* will have all threads in the block for its - computation. 
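# A minimal usage sketch for the row-wise apply API documented above
# (editorial example, hedged: it assumes a CUDA-capable machine and a cudf
# release that still exposes DataFrame.apply_rows; the column names and the
# trivial add kernel are illustrative only, not part of this diff).
import numpy as np
import cudf

def add_cols(in1, in2, out1):
    # Each thread receives strided slices of the input columns.
    for i, (x, y) in enumerate(zip(in1, in2)):
        out1[i] = x + y

df = cudf.DataFrame({"in1": [1, 2, 3], "in2": [10, 20, 30]})
out = df.apply_rows(
    add_cols,
    incols=["in1", "in2"],
    outcols={"out1": np.float64},
    kwargs={},
)
print(out)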
- - The resulting kernel can be used with any 1D grid size and 1D block size. - """ - - # Build kernel source - argnames = list(map(_mangle_user, argnames)) - extras = list(map(_mangle_user, extras)) - source = """ -def chunk_wise_kernel(nrows, chunks, {args}): -{body} -""" - - args = ", ".join(argnames) - body = [] - - body.append("blkid = cuda.blockIdx.x") - body.append("nblkid = cuda.gridDim.x") - body.append("tid = cuda.threadIdx.x") - body.append("ntid = cuda.blockDim.x") - - # Stride loop over the block - body.append("for curblk in range(blkid, chunks.size, nblkid):") - indent = " " * 4 - - body.append(indent + "start = chunks[curblk]") - body.append( - indent - + "stop = chunks[curblk + 1]" - + " if curblk + 1 < chunks.size else nrows" - ) - - slicedargs = {} - for a in argnames: - if a not in extras: - slicedargs[a] = f"{a}[start:stop]" - else: - slicedargs[a] = str(a) - body.append( - "{}inner({})".format( - indent, ", ".join(slicedargs[k] for k in argnames) - ) - ) - - indented = ["{}{}".format(" " * 4, ln) for ln in body] - # Finalize source - concrete = source.format(args=args, body="\n".join(indented)) - # Get bytecode - glbs = {"inner": cuda.jit(device=True)(func), "cuda": cuda} - exec(concrete, glbs) - # Compile as CUDA kernel - kernel = cuda.jit(glbs["chunk_wise_kernel"]) - return kernel - - -_cache: dict[Any, Any] = dict() - - -@functools.wraps(_make_row_wise_kernel) -def _load_cache_or_make_row_wise_kernel(cache_key, func, *args, **kwargs): - """Caching version of ``_make_row_wise_kernel``.""" - if cache_key is None: - cache_key = func - try: - out = _cache[cache_key] - # print("apply cache loaded", cache_key) - return out - except KeyError: - # print("apply cache NOT loaded", cache_key) - kernel = _make_row_wise_kernel(func, *args, **kwargs) - _cache[cache_key] = kernel - return kernel - - -@functools.wraps(_make_chunk_wise_kernel) -def _load_cache_or_make_chunk_wise_kernel(func, *args, **kwargs): - """Caching version of ``_make_row_wise_kernel``.""" - try: - return _cache[func] - except KeyError: - kernel = _make_chunk_wise_kernel(func, *args, **kwargs) - _cache[func] = kernel - return kernel - - -def _mangle_user(name): - """Mangle user variable name""" - return f"__user_{name}" diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py deleted file mode 100755 index 020c32de9f3..00000000000 --- a/python/cudf/cudf/utils/cudautils.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. 
- -from pickle import dumps - -import cachetools -from numba import cuda -from numba.np import numpy_support - -from cudf.utils._numba import _CUDFNumbaConfig - -# -# Misc kernels -# - - -@cuda.jit -def gpu_window_sizes_from_offset(arr, window_sizes, offset): - i = cuda.grid(1) - j = i - if i < arr.size: - while j > -1: - if (arr[i] - arr[j]) >= offset: - break - j -= 1 - window_sizes[i] = i - j - - -def window_sizes_from_offset(arr, offset): - window_sizes = cuda.device_array(shape=(arr.shape), dtype="int32") - if arr.size > 0: - with _CUDFNumbaConfig(): - gpu_window_sizes_from_offset.forall(arr.size)( - arr, window_sizes, offset - ) - return window_sizes - - -@cuda.jit -def gpu_grouped_window_sizes_from_offset( - arr, window_sizes, group_starts, offset -): - i = cuda.grid(1) - j = i - if i < arr.size: - while j > (group_starts[i] - 1): - if (arr[i] - arr[j]) >= offset: - break - j -= 1 - window_sizes[i] = i - j - - -def grouped_window_sizes_from_offset(arr, group_starts, offset): - window_sizes = cuda.device_array(shape=(arr.shape), dtype="int32") - if arr.size > 0: - with _CUDFNumbaConfig(): - gpu_grouped_window_sizes_from_offset.forall(arr.size)( - arr, window_sizes, group_starts, offset - ) - return window_sizes - - -# This cache is keyed on the (signature, code, closure variables) of UDFs, so -# it can hit for distinct functions that are similar. The lru_cache wrapping -# compile_udf misses for these similar functions, but doesn't need to serialize -# closure variables to check for a hit. -_udf_code_cache: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) - - -def make_cache_key(udf, sig): - """ - Build a cache key for a user defined function. Used to avoid - recompiling the same function for the same set of types - """ - codebytes = udf.__code__.co_code - constants = udf.__code__.co_consts - names = udf.__code__.co_names - - if udf.__closure__ is not None: - cvars = tuple(x.cell_contents for x in udf.__closure__) - cvarbytes = dumps(cvars) - else: - cvarbytes = b"" - - return names, constants, codebytes, cvarbytes, sig - - -def compile_udf(udf, type_signature): - """Compile ``udf`` with `numba` - - Compile a python callable function ``udf`` with - `numba.cuda.compile_ptx_for_current_device(device=True)` using - ``type_signature`` into CUDA PTX together with the generated output type. - - The output is expected to be passed to the PTX parser in `libcudf` - to generate a CUDA device function to be inlined into CUDA kernels, - compiled at runtime and launched. - - Parameters - ---------- - udf: - a python callable function - - type_signature: - a tuple that specifies types of each of the input parameters of ``udf``. - The types should be one in `numba.types` and could be converted from - numpy types with `numba.numpy_support.from_dtype(...)`. 
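# Hedged sketch of building the ``type_signature`` described above from numpy
# dtypes via numba's numpy_support helper; ``compile_udf`` refers to the
# deleted cudf.utils.cudautils helper, so its call is left as a comment.
import numpy as np
from numba.np import numpy_support

def my_udf(x, y):
    return x + y

type_signature = (
    numpy_support.from_dtype(np.dtype("float64")),
    numpy_support.from_dtype(np.dtype("float64")),
)
# ptx_code, output_type = compile_udf(my_udf, type_signature)
print(type_signature)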
- - Returns - ------- - ptx_code: - The compiled CUDA PTX - - output_type: - An numpy type - - """ - import cudf.core.udf - - key = make_cache_key(udf, type_signature) - res = _udf_code_cache.get(key) - if res: - return res - - # We haven't compiled a function like this before, so need to fall back to - # compilation with Numba - ptx_code, return_type = cuda.compile_ptx_for_current_device( - udf, type_signature, device=True - ) - if not isinstance(return_type, cudf.core.udf.masked_typing.MaskedType): - output_type = numpy_support.as_dtype(return_type).type - else: - output_type = return_type - - # Populate the cache for this function - res = (ptx_code, output_type) - _udf_code_cache[key] = res - - return res diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py deleted file mode 100644 index 336b92dba4f..00000000000 --- a/python/cudf/cudf/utils/docutils.py +++ /dev/null @@ -1,343 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -""" -Helper functions for parameterized docstring -""" - -import functools -import re -import string - -_regex_whitespaces = re.compile(r"^\s+$") - - -def _only_spaces(s): - return bool(_regex_whitespaces.match(s)) - - -_wrapopts = {"width": 78, "replace_whitespace": False} - - -def docfmt(**kwargs): - """Format docstring. - - Similar to saving the result of ``__doc__.format(**kwargs)`` as the - function's docstring. - """ - kwargs = {k: v.lstrip() for k, v in kwargs.items()} - - def outer(fn): - buf = [] - if fn.__doc__ is None: - return fn - formatsiter = string.Formatter().parse(fn.__doc__) - for literal, field, fmtspec, conv in formatsiter: - assert conv is None - assert not fmtspec - buf.append(literal) - if field is not None: - # get indentation - lines = literal.rsplit("\n", 1) - if _only_spaces(lines[-1]): - indent = " " * len(lines[-1]) - valuelines = kwargs[field].splitlines(True) - # first line - buf.append(valuelines[0]) - # subsequent lines are indented - buf.extend([indent + ln for ln in valuelines[1:]]) - else: - buf.append(kwargs[field]) - fn.__doc__ = "".join(buf) - return fn - - return outer - - -def docfmt_partial(**kwargs): - return functools.partial(docfmt, **kwargs) - - -def copy_docstring(other): - """ - Decorator that sets ``__doc__`` to ``other.__doc___``. - """ - - def wrapper(func): - func.__doc__ = other.__doc__ - return func - - return wrapper - - -def doc_apply(doc): - """Set `__doc__` attribute of `func` to `doc`.""" - - def wrapper(func): - func.__doc__ = doc - return func - - return wrapper - - -doc_describe = docfmt_partial( - docstring=""" - Generate descriptive statistics. - - Descriptive statistics include those that summarize the - central tendency, dispersion and shape of a dataset's - distribution, excluding ``NaN`` values. - - Analyzes both numeric and object series, as well as - ``DataFrame`` column sets of mixed data types. The - output will vary depending on what is provided. - Refer to the notes below for more detail. - - Parameters - ---------- - percentiles : list-like of numbers, optional - The percentiles to include in the output. - All should fall between 0 and 1. The default is - ``[.25, .5, .75]``, which returns the 25th, 50th, - and 75th percentiles. - - include : 'all', list-like of dtypes or None(default), optional - A list of data types to include in the result. - Ignored for ``Series``. Here are the options: - - - 'all' : All columns of the input will be included in the output. - - A list-like of dtypes : Limits the results to the - provided data types. 
- To limit the result to numeric types submit - ``numpy.number``. To limit it instead to object columns submit - the ``numpy.object`` data type. Strings - can also be used in the style of - ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To - select pandas categorical columns, use ``'category'`` - - None (default) : The result will include all numeric columns. - - exclude : list-like of dtypes or None (default), optional, - A list of data types to omit from the result. Ignored - for ``Series``. Here are the options: - - - A list-like of dtypes : Excludes the provided data types - from the result. To exclude numeric types submit - ``numpy.number``. To exclude object columns submit the data - type ``numpy.object``. Strings can also be used in the style of - ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To - exclude pandas categorical columns, use ``'category'`` - - None (default) : The result will exclude nothing. - - Returns - ------- - output_frame : Series or DataFrame - Summary statistics of the Series or Dataframe provided. - - Notes - ----- - For numeric data, the result's index will include ``count``, - ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and - upper percentiles. By default the lower percentile is ``25`` and the - upper percentile is ``75``. The ``50`` percentile is the - same as the median. - - For strings dtype or datetime dtype, the result's index - will include ``count``, ``unique``, ``top``, and ``freq``. The ``top`` - is the most common value. The ``freq`` is the most common value's - frequency. Timestamps also include the ``first`` and ``last`` items. - - If multiple object values have the highest count, then the - ``count`` and ``top`` results will be arbitrarily chosen from - among those with the highest count. - - For mixed data types provided via a ``DataFrame``, the default is to - return only an analysis of numeric columns. If the dataframe consists - only of object and categorical data without any numeric columns, the - default is to return an analysis of both the object and categorical - columns. If ``include='all'`` is provided as an option, the result - will include a union of attributes of each type. - - The ``include`` and ``exclude`` parameters can be used to limit - which columns in a ``DataFrame`` are analyzed for the output. - The parameters are ignored when analyzing a ``Series``. - - Examples - -------- - Describing a ``Series`` containing numeric values. - - >>> import cudf - >>> s = cudf.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - >>> s - 0 1 - 1 2 - 2 3 - 3 4 - 4 5 - 5 6 - 6 7 - 7 8 - 8 9 - 9 10 - dtype: int64 - >>> s.describe() - count 10.00000 - mean 5.50000 - std 3.02765 - min 1.00000 - 25% 3.25000 - 50% 5.50000 - 75% 7.75000 - max 10.00000 - dtype: float64 - - Describing a categorical ``Series``. - - >>> s = cudf.Series(['a', 'b', 'a', 'b', 'c', 'a'], dtype='category') - >>> s - 0 a - 1 b - 2 a - 3 b - 4 c - 5 a - dtype: category - Categories (3, object): ['a', 'b', 'c'] - >>> s.describe() - count 6 - unique 3 - top a - freq 3 - dtype: object - - Describing a timestamp ``Series``. - - >>> s = cudf.Series([ - ... "2000-01-01", - ... "2010-01-01", - ... "2010-01-01" - ... ], dtype="datetime64[s]") - >>> s - 0 2000-01-01 - 1 2010-01-01 - 2 2010-01-01 - dtype: datetime64[s] - >>> s.describe() - count 3 - mean 2006-09-01 08:00:00 - min 2000-01-01 00:00:00 - 25% 2004-12-31 12:00:00 - 50% 2010-01-01 00:00:00 - 75% 2010-01-01 00:00:00 - max 2010-01-01 00:00:00 - dtype: object - - Describing a ``DataFrame``. 
By default only numeric fields are - returned. - - >>> df = cudf.DataFrame({"categorical": cudf.Series(['d', 'e', 'f'], - ... dtype='category'), - ... "numeric": [1, 2, 3], - ... "object": ['a', 'b', 'c'] - ... }) - >>> df - categorical numeric object - 0 d 1 a - 1 e 2 b - 2 f 3 c - >>> df.describe() - numeric - count 3.0 - mean 2.0 - std 1.0 - min 1.0 - 25% 1.5 - 50% 2.0 - 75% 2.5 - max 3.0 - - Describing all columns of a ``DataFrame`` regardless of data type. - - >>> df.describe(include='all') - categorical numeric object - count 3 3.0 3 - unique 3 3 - top d a - freq 1 1 - mean 2.0 - std 1.0 - min 1.0 - 25% 1.5 - 50% 2.0 - 75% 2.5 - max 3.0 - - Describing a column from a ``DataFrame`` by accessing it as an - attribute. - - >>> df.numeric.describe() - count 3.0 - mean 2.0 - std 1.0 - min 1.0 - 25% 1.5 - 50% 2.0 - 75% 2.5 - max 3.0 - Name: numeric, dtype: float64 - - Including only numeric columns in a ``DataFrame`` description. - - >>> df.describe(include=[np.number]) - numeric - count 3.0 - mean 2.0 - std 1.0 - min 1.0 - 25% 1.5 - 50% 2.0 - 75% 2.5 - max 3.0 - - Including only string columns in a ``DataFrame`` description. - - >>> df.describe(include=[object]) - object - count 3 - unique 3 - top a - freq 1 - - Including only categorical columns from a ``DataFrame`` description. - - >>> df.describe(include=['category']) - categorical - count 3 - unique 3 - top d - freq 1 - - Excluding numeric columns from a ``DataFrame`` description. - - >>> df.describe(exclude=[np.number]) - categorical object - count 3 3 - unique 3 3 - top d a - freq 1 1 - - Excluding object columns from a ``DataFrame`` description. - - >>> df.describe(exclude=[object]) - categorical numeric - count 3 3.0 - unique 3 - top d - freq 1 - mean 2.0 - std 1.0 - min 1.0 - 25% 1.5 - 50% 2.0 - 75% 2.5 - max 3.0 -""" -) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py deleted file mode 100644 index b0788bcc0fc..00000000000 --- a/python/cudf/cudf/utils/dtypes.py +++ /dev/null @@ -1,705 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import datetime -from decimal import Decimal -from typing import TYPE_CHECKING - -import cupy as cp -import numpy as np -import pandas as pd -import pyarrow as pa -from pandas.core.dtypes.common import infer_dtype_from_object - -import cudf - -if TYPE_CHECKING: - from cudf._typing import DtypeObj - -"""Map numpy dtype to pyarrow types. -Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special -handling is required when converting a Boolean column into arrow. 
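# Small, hedged check of the bitwidth mismatch noted above: numpy stores one
# byte per boolean value, while pyarrow's bool_ type is bit-packed.
import numpy as np
import pyarrow as pa

print(np.dtype(np.bool_).itemsize * 8)  # 8 bits per value in numpy
print(pa.bool_().bit_width)             # 1 bit per value in arrow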
-""" -_np_pa_dtypes = { - np.float64: pa.float64(), - np.float32: pa.float32(), - np.int64: pa.int64(), - np.longlong: pa.int64(), - np.int32: pa.int32(), - np.int16: pa.int16(), - np.int8: pa.int8(), - np.bool_: pa.bool_(), - np.uint64: pa.uint64(), - np.uint32: pa.uint32(), - np.uint16: pa.uint16(), - np.uint8: pa.uint8(), - np.datetime64: pa.date64(), - np.object_: pa.string(), - np.str_: pa.string(), -} - -np_dtypes_to_pandas_dtypes = { - np.dtype("uint8"): pd.UInt8Dtype(), - np.dtype("uint16"): pd.UInt16Dtype(), - np.dtype("uint32"): pd.UInt32Dtype(), - np.dtype("uint64"): pd.UInt64Dtype(), - np.dtype("int8"): pd.Int8Dtype(), - np.dtype("int16"): pd.Int16Dtype(), - np.dtype("int32"): pd.Int32Dtype(), - np.dtype("int64"): pd.Int64Dtype(), - np.dtype("bool_"): pd.BooleanDtype(), - np.dtype("object"): pd.StringDtype(), - np.dtype("float32"): pd.Float32Dtype(), - np.dtype("float64"): pd.Float64Dtype(), -} -pandas_dtypes_to_np_dtypes = { - pd_dtype: np_dtype - for np_dtype, pd_dtype in np_dtypes_to_pandas_dtypes.items() -} - -pyarrow_dtypes_to_pandas_dtypes = { - pa.uint8(): pd.UInt8Dtype(), - pa.uint16(): pd.UInt16Dtype(), - pa.uint32(): pd.UInt32Dtype(), - pa.uint64(): pd.UInt64Dtype(), - pa.int8(): pd.Int8Dtype(), - pa.int16(): pd.Int16Dtype(), - pa.int32(): pd.Int32Dtype(), - pa.int64(): pd.Int64Dtype(), - pa.bool_(): pd.BooleanDtype(), - pa.string(): pd.StringDtype(), -} - - -SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"} -UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"} -INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES -FLOAT_TYPES = {"float32", "float64"} -SIGNED_TYPES = SIGNED_INTEGER_TYPES | FLOAT_TYPES -NUMERIC_TYPES = SIGNED_TYPES | UNSIGNED_TYPES -DATETIME_TYPES = { - "datetime64[s]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[ns]", -} -TIMEDELTA_TYPES = { - "timedelta64[s]", - "timedelta64[ms]", - "timedelta64[us]", - "timedelta64[ns]", -} -OTHER_TYPES = {"bool", "category", "str"} -STRING_TYPES = {"object"} -BOOL_TYPES = {"bool"} -ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES - - -def np_to_pa_dtype(dtype): - """Util to convert numpy dtype to PyArrow dtype.""" - # special case when dtype is np.datetime64 - if dtype.kind == "M": - time_unit, _ = np.datetime_data(dtype) - if time_unit in ("s", "ms", "us", "ns"): - # return a pa.Timestamp of the appropriate unit - return pa.timestamp(time_unit) - # default is int64_t UNIX ms - return pa.date64() - elif dtype.kind == "m": - time_unit, _ = np.datetime_data(dtype) - if time_unit in ("s", "ms", "us", "ns"): - # return a pa.Duration of the appropriate unit - return pa.duration(time_unit) - # default fallback unit is ns - return pa.duration("ns") - return _np_pa_dtypes[cudf.dtype(dtype).type] - - -def _find_common_type_decimal(dtypes): - # Find the largest scale and the largest difference between - # precision and scale of the columns to be concatenated - s = max(dtype.scale for dtype in dtypes) - lhs = max(dtype.precision - dtype.scale for dtype in dtypes) - # Combine to get the necessary precision and clip at the maximum - # precision - p = s + lhs - - if p > cudf.Decimal64Dtype.MAX_PRECISION: - return cudf.Decimal128Dtype( - min(cudf.Decimal128Dtype.MAX_PRECISION, p), s - ) - elif p > cudf.Decimal32Dtype.MAX_PRECISION: - return cudf.Decimal64Dtype( - min(cudf.Decimal64Dtype.MAX_PRECISION, p), s - ) - else: - return cudf.Decimal32Dtype( - min(cudf.Decimal32Dtype.MAX_PRECISION, p), s - ) - - -def cudf_dtype_from_pydata_dtype(dtype): - """Given a numpy or pandas dtype, 
converts it into the equivalent cuDF - Python dtype. - """ - - if cudf.api.types._is_categorical_dtype(dtype): - return cudf.core.dtypes.CategoricalDtype - elif cudf.api.types.is_decimal32_dtype(dtype): - return cudf.core.dtypes.Decimal32Dtype - elif cudf.api.types.is_decimal64_dtype(dtype): - return cudf.core.dtypes.Decimal64Dtype - elif cudf.api.types.is_decimal128_dtype(dtype): - return cudf.core.dtypes.Decimal128Dtype - elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES: - return dtype.type - - return infer_dtype_from_object(dtype) - - -def cudf_dtype_to_pa_type(dtype): - """Given a cuDF or pandas dtype, converts it into the equivalent - pyarrow type. - """ - if isinstance(dtype, cudf.CategoricalDtype): - raise NotImplementedError( - "No conversion from Categorical to pyarrow type" - ) - elif isinstance( - dtype, - (cudf.StructDtype, cudf.ListDtype, cudf.core.dtypes.DecimalDtype), - ): - return dtype.to_arrow() - else: - return np_to_pa_dtype(cudf.dtype(dtype)) - - -def cudf_dtype_from_pa_type(typ): - """Given a pyarrow dtype, converts it into the equivalent - cuDF or pandas dtype. - """ - if pa.types.is_list(typ): - return cudf.core.dtypes.ListDtype.from_arrow(typ) - elif pa.types.is_struct(typ): - return cudf.core.dtypes.StructDtype.from_arrow(typ) - elif pa.types.is_decimal(typ): - return cudf.core.dtypes.Decimal128Dtype.from_arrow(typ) - elif pa.types.is_large_string(typ): - return cudf.dtype("str") - else: - return cudf.api.types.pandas_dtype(typ.to_pandas_dtype()) - - -def to_cudf_compatible_scalar(val, dtype=None): - """ - Converts the value `val` to a numpy/Pandas scalar, - optionally casting to `dtype`. - - If `val` is None, returns None. - """ - - if cudf._lib.scalar._is_null_host_scalar(val) or isinstance( - val, cudf.Scalar - ): - return val - - if not cudf.api.types._is_scalar_or_zero_d_array(val): - raise ValueError( - f"Cannot convert value of type {type(val).__name__} " - "to cudf scalar" - ) - - if isinstance(val, Decimal): - return val - - if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0: - val = val.item() - - if ( - (dtype is None) and isinstance(val, str) - ) or cudf.api.types.is_string_dtype(dtype): - dtype = "str" - - if isinstance(val, str) and val.endswith("\x00"): - # Numpy string dtypes are fixed width and use NULL to - # indicate the end of the string, so they cannot - # distinguish between "abc\x00" and "abc". - # https://github.com/numpy/numpy/issues/20118 - # In this case, don't try going through numpy and just use - # the string value directly (cudf.DeviceScalar will DTRT) - return val - - tz_error_msg = ( - "Cannot convert a timezone-aware timestamp to timezone-naive scalar."
- ) - if isinstance(val, pd.Timestamp): - if val.tz is not None: - raise NotImplementedError(tz_error_msg) - - val = val.to_datetime64() - elif isinstance(val, pd.Timedelta): - val = val.to_timedelta64() - elif isinstance(val, datetime.datetime): - if val.tzinfo is not None: - raise NotImplementedError(tz_error_msg) - val = np.datetime64(val) - elif isinstance(val, datetime.timedelta): - val = np.timedelta64(val) - - if dtype is not None: - dtype = np.dtype(dtype) - if isinstance(val, str) and dtype.kind == "M": - # pd.Timestamp can handle str, but not np.str_ - val = pd.Timestamp(str(val)).to_datetime64().astype(dtype) - else: - # At least datetimes cannot be converted to scalar via dtype.type: - val = np.array(val, dtype)[()] - else: - val = _maybe_convert_to_default_type( - cudf.api.types.pandas_dtype(type(val)) - ).type(val) - - if val.dtype.type is np.datetime64: - time_unit, _ = np.datetime_data(val.dtype) - if time_unit in ("D", "W", "M", "Y"): - val = val.astype("datetime64[s]") - elif val.dtype.type is np.timedelta64: - time_unit, _ = np.datetime_data(val.dtype) - if time_unit in ("D", "W", "M", "Y"): - val = val.astype("timedelta64[ns]") - - return val - - -def is_column_like(obj): - """ - This function checks if the given `obj` - is a column-like (Series, Index...) - type or not. - - Parameters - ---------- - obj : object of any type which needs to be validated. - - Returns - ------- - Boolean: True or False depending on whether the - input `obj` is column-like or not. - """ - return ( - isinstance( - obj, - ( - cudf.core.column.ColumnBase, - cudf.Series, - cudf.Index, - pd.Series, - pd.Index, - ), - ) - or ( - hasattr(obj, "__cuda_array_interface__") - and len(obj.__cuda_array_interface__["shape"]) == 1 - ) - or ( - hasattr(obj, "__array_interface__") - and len(obj.__array_interface__["shape"]) == 1 - ) - ) - - -def can_convert_to_column(obj): - """ - This function checks if the given `obj` - can be used to create a column or not. - - Parameters - ---------- - obj : object of any type which needs to be validated. - - Returns - ------- - Boolean: True or False depending on whether the - input `obj` is column-compatible or not. 
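# Hedged, simplified re-implementation of the "column-like" test described
# above: an object qualifies if it is a Series/Index type or exposes a 1-D
# (CUDA) array interface; plain lists are merely list-like.
import numpy as np
import pandas as pd

def looks_column_like(obj) -> bool:
    if isinstance(obj, (pd.Series, pd.Index)):
        return True
    for iface in ("__cuda_array_interface__", "__array_interface__"):
        meta = getattr(obj, iface, None)
        if meta is not None and len(meta["shape"]) == 1:
            return True
    return False

print(looks_column_like(pd.Series([1, 2, 3])))  # True
print(looks_column_like(np.arange(3)))          # True, 1-D array interface
print(looks_column_like(np.ones((2, 2))))       # False, not 1-D
print(looks_column_like([1, 2, 3]))             # False, though still list-like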
- """ - return is_column_like(obj) or cudf.api.types.is_list_like(obj) - - -def min_signed_type(x: int, min_size: int = 8) -> np.dtype: - """ - Return the smallest *signed* integer dtype - that can represent the integer ``x`` - """ - for int_dtype in (np.int8, np.int16, np.int32, np.int64): - if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: - if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max: - return np.dtype(int_dtype) - # resort to using `int64` and let numpy raise appropriate exception: - return np.int64(x).dtype - - -def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype: - """ - Return the smallest *unsigned* integer dtype - that can represent the integer ``x`` - """ - for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64): - if (cudf.dtype(int_dtype).itemsize * 8) >= min_size: - if 0 <= x <= np.iinfo(int_dtype).max: - return np.dtype(int_dtype) - # resort to using `uint64` and let numpy raise appropriate exception: - return np.uint64(x).dtype - - -def min_column_type(x, expected_type): - """ - Return the smallest dtype which can represent all - elements of the `NumericalColumn` `x` - If the column is not a subtype of `np.signedinteger` or `np.floating` - returns the same dtype as the dtype of `x` without modification - """ - - if not isinstance(x, cudf.core.column.NumericalColumn): - raise TypeError("Argument x must be of type column.NumericalColumn") - if x.null_count == len(x): - return x.dtype - - if x.dtype.kind == "f": - return get_min_float_dtype(x) - - elif cudf.dtype(expected_type).kind in "iu": - max_bound_dtype = np.min_scalar_type(x.max()) - min_bound_dtype = np.min_scalar_type(x.min()) - result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - else: - result_type = x.dtype - - return cudf.dtype(result_type) - - -def get_min_float_dtype(col): - max_bound_dtype = np.min_scalar_type(float(col.max())) - min_bound_dtype = np.min_scalar_type(float(col.min())) - result_type = np.promote_types( - "float32", np.promote_types(max_bound_dtype, min_bound_dtype) - ) - return cudf.dtype(result_type) - - -def is_mixed_with_object_dtype(lhs, rhs): - if isinstance(lhs.dtype, cudf.CategoricalDtype): - return is_mixed_with_object_dtype(lhs.dtype.categories, rhs) - elif isinstance(rhs.dtype, cudf.CategoricalDtype): - return is_mixed_with_object_dtype(lhs, rhs.dtype.categories) - - return (lhs.dtype == "object" and rhs.dtype != "object") or ( - rhs.dtype == "object" and lhs.dtype != "object" - ) - - -def get_time_unit(obj): - if isinstance( - obj, - ( - cudf.core.column.datetime.DatetimeColumn, - cudf.core.column.timedelta.TimeDeltaColumn, - ), - ): - return obj.time_unit - - time_unit, _ = np.datetime_data(obj.dtype) - return time_unit - - -def _get_nan_for_dtype(dtype): - dtype = cudf.dtype(dtype) - if dtype.kind in "mM": - time_unit, _ = np.datetime_data(dtype) - return dtype.type("nat", time_unit) - elif dtype.kind == "f": - return dtype.type("nan") - else: - return np.float64("nan") - - -def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): - error = TypeError( - f"{op} not supported between {dtype_l} and {dtype_r} scalars" - ) - - to_numpy_ops = { - "__add__": _ADD_TYPES, - "__radd__": _ADD_TYPES, - "__sub__": _SUB_TYPES, - "__rsub__": _SUB_TYPES, - "__mul__": _MUL_TYPES, - "__rmul__": _MUL_TYPES, - "__floordiv__": _FLOORDIV_TYPES, - "__rfloordiv__": _FLOORDIV_TYPES, - "__truediv__": _TRUEDIV_TYPES, - "__rtruediv__": _TRUEDIV_TYPES, - "__mod__": _MOD_TYPES, - "__rmod__": _MOD_TYPES, - "__pow__": _POW_TYPES, - "__rpow__": _POW_TYPES, - } - 
allowed = to_numpy_ops.get(op, op) - - # special rules for string - if dtype_l == "object" or dtype_r == "object": - if (dtype_l == dtype_r == "object") and op == "__add__": - return "str" - else: - raise error - - # Check if we can directly operate - - for valid_combo in allowed: - ltype, rtype, outtype = valid_combo - if np.can_cast(dtype_l.char, ltype) and np.can_cast( - dtype_r.char, rtype - ): - return outtype - - raise error - - -def find_common_type(dtypes): - """ - Wrapper over np.find_common_type to handle special cases - - Corner cases: - 1. "M8", "M8" -> "M8" | "m8", "m8" -> "m8" - - Parameters - ---------- - dtypes : iterable, sequence of dtypes to find common types - - Returns - ------- - dtype : np.dtype optional, the result from np.find_common_type, - None if input is empty - - """ - - if len(dtypes) == 0: - return None - - # Early exit for categoricals since they're not hashable and therefore - # can't be put in a set. - if any(cudf.api.types._is_categorical_dtype(dtype) for dtype in dtypes): - if all( - ( - cudf.api.types._is_categorical_dtype(dtype) - and (not dtype.ordered if hasattr(dtype, "ordered") else True) - ) - for dtype in dtypes - ): - if len({dtype._categories.dtype for dtype in dtypes}) == 1: - return cudf.CategoricalDtype( - cudf.core.column.concat_columns( - [dtype._categories for dtype in dtypes] - ).unique() - ) - else: - raise ValueError( - "Only unordered categories of the same underlying type " - "may be coerced to a common type." - ) - else: - # TODO: Should this be an error case (mixing categorical with other - # dtypes) or should this return object? Unclear if we have enough - # information to decide right now, may have to come back to this as - # usage of find_common_type increases. - return cudf.dtype("O") - - # Aggregate same types - dtypes = {cudf.dtype(dtype) for dtype in dtypes} - if len(dtypes) == 1: - return dtypes.pop() - - if any( - isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes - ): - if all(cudf.api.types.is_numeric_dtype(dtype) for dtype in dtypes): - return _find_common_type_decimal( - [ - dtype - for dtype in dtypes - if cudf.api.types.is_decimal_dtype(dtype) - ] - ) - else: - return cudf.dtype("O") - elif any( - isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)) - for dtype in dtypes - ): - # TODO: As list dtypes allow casting - # to identical types, improve this logic of returning a - # common dtype, for example: - # ListDtype(int64) & ListDtype(int32) common - # dtype could be ListDtype(int64). - raise NotImplementedError( - "Finding a common type for `ListDtype` or `StructDtype` is currently " - "not supported" - ) - - # Corner case 1: - # Resort to np.result_type to handle "M" and "m" types separately - dt_dtypes = set(filter(lambda t: t.kind == "M", dtypes)) - if len(dt_dtypes) > 0: - dtypes = dtypes - dt_dtypes - dtypes.add(np.result_type(*dt_dtypes)) - - td_dtypes = set(filter(lambda t: t.kind == "m", dtypes)) - if len(td_dtypes) > 0: - dtypes = dtypes - td_dtypes - dtypes.add(np.result_type(*td_dtypes)) - - common_dtype = np.result_type(*dtypes) - if common_dtype == np.dtype("float16"): - return cudf.dtype("float32") - return cudf.dtype(common_dtype) - - -def _dtype_pandas_compatible(dtype): - """ - A utility function, that returns `str` instead of `object` - dtype when pandas compatibility mode is enabled. 
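# Hedged illustration of the promotion corner cases handled above:
# datetime/timedelta dtypes are resolved with np.result_type on their own,
# and a float16 result would be widened to float32 by find_common_type.
import numpy as np

print(np.result_type(np.dtype("datetime64[ms]"), np.dtype("datetime64[ns]")))  # datetime64[ns]
print(np.result_type(np.dtype("int8"), np.dtype("uint8")))                     # int16
print(np.promote_types(np.dtype("float16"), np.dtype("int8")))                 # float16 (mapped to float32)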
- """ - if cudf.get_option("mode.pandas_compatible") and dtype == cudf.dtype("O"): - return "str" - return dtype - - -def _maybe_convert_to_default_type(dtype: DtypeObj) -> DtypeObj: - """Convert `dtype` to default if specified by user. - - If not specified, return as is. - """ - if ib := cudf.get_option("default_integer_bitwidth"): - if dtype.kind == "i": - return cudf.dtype(f"i{ib//8}") - elif dtype.kind == "u": - return cudf.dtype(f"u{ib//8}") - if (fb := cudf.get_option("default_float_bitwidth")) and dtype.kind == "f": - return cudf.dtype(f"f{fb//8}") - return dtype - - -def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: - # TODO: replace the use of this function with just `dtype.base` - # when Pandas 2.1.0 is the minimum version we support: - # https://github.com/pandas-dev/pandas/pull/52706 - if isinstance(dtype, pd.DatetimeTZDtype): - return np.dtype(f" 0: - # Cupy throws RunTimeException to get GPU count, - # hence obtaining GPU count by in-house cpp api above - - major_version = getDeviceAttribute( - cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, 0 - ) - - if major_version < 7: - # A GPU with NVIDIA Volta™ architecture or newer is required. - # Reference: https://developer.nvidia.com/cuda-gpus - # Hardware Generation Compute Capability - # Hopper 9.x - # Ampere 8.x - # Turing 7.5 - # Volta 7.0, 7.2 - # Pascal 6.x - # Maxwell 5.x - # Kepler 3.x - # Fermi 2.x - device_name = deviceGetName(0) - minor_version = getDeviceAttribute( - cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, 0 - ) - raise UnsupportedCUDAError( - "A GPU with NVIDIA Volta™ (Compute Capability 7.0) " - "or newer architecture is required.\n" - f"Detected GPU 0: {device_name}\n" - f"Detected Compute Capability: {major_version}.{minor_version}" - ) - - cuda_runtime_version = runtimeGetVersion() - - if cuda_runtime_version < 11000: - # Require CUDA Runtime version 11.0 or greater. - major_version = cuda_runtime_version // 1000 - minor_version = (cuda_runtime_version % 1000) // 10 - raise UnsupportedCUDAError( - "Detected CUDA Runtime version is " - f"{major_version}.{minor_version}. " - "Please update your CUDA Runtime to 11.0 or above." - ) - - cuda_driver_supported_rt_version = driverGetVersion() - - # Though Yes, Externally driver version is represented like `418.39` - # and cuda runtime version like `10.1`. It is not the similar case - # at cuda api's level. Coming down to APIs they follow a uniform - # convention of an integer which corresponds to the versioning - # like (1000 major + 10 minor) for 10.1 Driver version API doesn't - # actually indicate driver version, it indicates only the latest - # CUDA version supported by the driver. - # For reference : - # https://docs.nvidia.com/deploy/cuda-compatibility/index.html - - if cuda_driver_supported_rt_version == 0: - raise UnsupportedCUDAError( - "We couldn't detect the GPU driver properly. Please follow " - "the installation guide to ensure your driver is properly " - "installed: " - "https://docs.nvidia.com/cuda/cuda-installation-guide-linux/" - ) - elif cuda_driver_supported_rt_version >= cuda_runtime_version: - # CUDA Driver Version Check: - # Driver Runtime version is >= Runtime version - pass - elif ( - cuda_driver_supported_rt_version >= 11000 - and cuda_runtime_version >= 11000 - ): - # With cuda enhanced compatibility any code compiled - # with 11.x version of cuda can now run on any - # driver >= 450.80.02. 11000 is the minimum cuda - # version 450.80.02 supports. 
- pass - else: - raise UnsupportedCUDAError( - "Please update your NVIDIA GPU Driver to support CUDA " - "Runtime.\n" - f"Detected CUDA Runtime version : {cuda_runtime_version}\n" - "Latest version of CUDA supported by current " - f"NVIDIA GPU Driver : {cuda_driver_supported_rt_version}" - ) - else: - warnings.warn("No NVIDIA GPU detected") diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py deleted file mode 100644 index babe4be2715..00000000000 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ /dev/null @@ -1,297 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -# This function is from the rapidsai/clx repo at below link -# https://github.com/rapidsai/clx/blob/267c6d30805c9dcbf80840f222bf31c5c4b7068a/python/clx/analytics/_perfect_hash.py -import numpy as np - -PRIME = np.uint64(281474976710677) - -# Coefficients ranges for inner hash - This are important to set to be -# large so that we have randomness in the bottom bits when modding -A_SECOND_LEVEL_POW = np.uint64(48) -B_SECOND_LEVEL_POW = np.uint64(7) - -A_LBOUND_SECOND_LEVEL_HASH = 2**16 -A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW - -B_LBOUND_SECOND_LEVEL_HASH = 0 -B_HBOUND_SECOND_LEVEL_HASH = 2**B_SECOND_LEVEL_POW - -# Extremely generous and should not ever happen. This limit is imposed -# To ensure we can bit pack all the information needed for the bin hash -# functions - a, b and table size -MAX_SIZE_FOR_INITIAL_BIN = 2**8 - 1 - - -# Shifts for bit packing -A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW) -B_SECOND_LEVEL_SHIFT_AMT = np.uint64( - 64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW -) -BITS_FOR_INNER_TABLE_SIZE = np.uint64(8) - -NOT_FOUND = -1 - - -def _sdbm_hash(string): - hv = 0 - mask = (1 << 48) - 1 - for c in string: - hv = ord(c) + (hv << 6) + (hv << 16) - hv - hv &= mask - return hv - - -def _hash_func(k, a, b, size): - k = np.uint64(k) - a = np.uint64(a) - b = np.uint64(b) - size = np.uint64(size) - return ((a * k + b) % PRIME) % size - - -def _longest_bin_length(bins): - return len(max(bins, key=len)) - - -def _make_bins(data, num_bins, a, b): - bins = [[] for i in range(num_bins)] - - for item in data: - bins[_hash_func(item, a, b, num_bins)].append(item) - return bins - - -def _new_bin_length(orig_length): - return int(orig_length) - - -def _get_space_util(bins, init_bins): - return sum(_new_bin_length(len(b)) for b in bins) + 2 * init_bins - - -def _pick_initial_a_b(data, max_constant, init_bins): - while True: - a = np.random.randint(2**12, 2**15) - b = np.random.randint(2**12, 2**15) - bins = _make_bins(data, init_bins, a, b) - score = _get_space_util(bins, init_bins) / len(data) - - longest = _new_bin_length(_longest_bin_length(bins)) - - if score <= max_constant and longest <= MAX_SIZE_FOR_INITIAL_BIN: - print(f"Attempting to build table using {score:.6f}n space") - print(f"Longest bin was {longest}") - break - - return bins, a, b - - -def _find_hash_for_internal(hash_bin): - if not hash_bin: - return [[], 0, 0] - - new_length = _new_bin_length(len(hash_bin)) - - while True: - a = np.random.randint( - A_LBOUND_SECOND_LEVEL_HASH, - A_HBOUND_SECOND_LEVEL_HASH, - ) - b = np.random.randint( - B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH - ) - bins = _make_bins(hash_bin, new_length, a, b) - - max_length = len(max(bins, key=len)) - if max_length == 1: - bins = [b[0] if b else 0 for b in bins] - return bins, a, b - - -def _perfect_hash(integers, max_constant): - num_top_level_bins = len(integers) // 4 - - init_bins, init_a, 
init_b = _pick_initial_a_b( - integers, max_constant, num_top_level_bins - ) - flattened_bins = [] - - internal_table_coeffs = np.zeros( - shape=[num_top_level_bins], dtype=np.uint64 - ) - offset_into_flattened_table = np.zeros( - shape=[num_top_level_bins + 1], dtype=np.uint64 - ) - - max_bin_length = 0 - for i, b in enumerate(init_bins): - if i % 500 == 0: - print(f"Processing bin {i} / {len(init_bins)} of size = {len(b)}") - internal_table, coeff_a, coeff_b = _find_hash_for_internal(b) - bin_length = len(internal_table) - max_bin_length = max(bin_length, max_bin_length) - internal_table_coeffs[i] = ( - np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT - | np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT - | np.uint64(bin_length) - ) - offset_into_flattened_table[i + 1] = offset_into_flattened_table[ - i - ] + np.uint64(bin_length) - flattened_bins.extend(internal_table) - - print( - "Final table size {} elements compared to {} for original".format( - len(flattened_bins), len(integers) - ) - ) - - print("Max bin length was", max_bin_length) - - return ( - init_a, - init_b, - num_top_level_bins, - flattened_bins, - internal_table_coeffs, - offset_into_flattened_table, - ) - - -def _pack_keys_and_values(flattened_hash_table, original_dict): - for i in range(len(flattened_hash_table)): - if flattened_hash_table[i] in original_dict: - value = original_dict[flattened_hash_table[i]] - flattened_hash_table[i] <<= 16 - flattened_hash_table[i] |= value - - -def _load_vocab_dict(path): - vocab = {} - with open(path, encoding="utf-8") as f: - counter = 0 - for line in f: - vocab[line.strip()] = counter - counter += 1 - - return vocab - - -def _store_func( - out_name, - outer_a, - outer_b, - num_outer_bins, - hash_table, - inner_table_coeffs, - offsets_into_ht, - unk_tok_id, - first_token_id, - sep_token_id, -): - with open(out_name, mode="w+") as f: - f.write(f"{outer_a}\n") - f.write(f"{outer_b}\n") - f.write(f"{num_outer_bins}\n") - f.writelines( - f"{coeff} {offset}\n" - for coeff, offset in zip(inner_table_coeffs, offsets_into_ht) - ) - f.write(f"{len(hash_table)}\n") - f.writelines(f"{kv}\n" for kv in hash_table) - f.writelines( - f"{tok_id}\n" - for tok_id in [unk_tok_id, first_token_id, sep_token_id] - ) - - -def _retrieve( - k, - outer_a, - outer_b, - num_outer_bins, - hash_table, - inner_table_coeffs, - offsets_into_ht, -): - bin_hash = _hash_func(k, outer_a, outer_b, num_outer_bins) - start_offset_in_ht = offsets_into_ht[bin_hash] - inner_table_values = inner_table_coeffs[bin_hash] - - one = np.uint64(1) - - inner_a = inner_table_values >> A_SECOND_LEVEL_SHIFT_AMT - inner_b = (inner_table_values >> B_SECOND_LEVEL_SHIFT_AMT) & ( - (one << B_SECOND_LEVEL_POW) - one - ) - size = inner_table_values & ((one << BITS_FOR_INNER_TABLE_SIZE) - one) - - inner_offset = _hash_func(k, inner_a, inner_b, size) - kv = hash_table[start_offset_in_ht + inner_offset] - - key, value = kv >> 16, kv & ((1 << 16) - 1) - indicator = key == k - - return indicator * value + (not indicator) * NOT_FOUND - - -def hash_vocab( - vocab_path, - output_path, - unk_tok="[UNK]", - first_token="[CLS]", - sep_token="[SEP]", -): - """ - Write the vocab vocabulary hashtable to the output_path - """ - np.random.seed(1243342) - vocab = _load_vocab_dict(vocab_path) - keys = list(map(_sdbm_hash, vocab.keys())) - - hashed_vocab = {_sdbm_hash(key): value for key, value in vocab.items()} - - error_message = ( - "A collision occurred and only sdbm token hash is currently " - "supported. This can be extended to use random hashes if needed." 
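# Hedged sketch of the two-level hash scheme used above: an outer
# ((a*k + b) % PRIME) % num_bins picks a bin, and a per-bin (a, b, size)
# triple bit-packed into one uint64 resolves the final slot. PRIME mirrors
# the constant above; the (a, b) pairs here are arbitrary picks for
# illustration, whereas the real builder searches until a bin is collision-free.
import numpy as np

PRIME = np.uint64(281474976710677)

def hash_func(k, a, b, size):
    k, a, b, size = map(np.uint64, (k, a, b, size))
    return ((a * k + b) % PRIME) % size

k = np.uint64(123456789)          # sdbm hash of some token
bin_idx = hash_func(k, 40503, 12345, 100)  # outer level: choose a bin
slot = hash_func(k, 131075, 41, 7)         # inner level: slot within the bin
print(int(bin_idx), int(slot))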
- ) - assert len(hashed_vocab) == len(vocab), error_message - - ( - outer_a, - outer_b, - num_outer_bins, - hash_table, - inner_table_coeffs, - offsets_into_ht, - ) = _perfect_hash(keys, 10) - - _pack_keys_and_values(hash_table, hashed_vocab) - _store_func( - output_path, - outer_a, - outer_b, - num_outer_bins, - hash_table, - inner_table_coeffs, - offsets_into_ht, - vocab[unk_tok], - vocab[first_token], - vocab[sep_token], - ) - - for key, value in hashed_vocab.items(): - val = _retrieve( - key, - outer_a, - outer_b, - num_outer_bins, - hash_table, - inner_table_coeffs, - offsets_into_ht, - ) - assert ( - val == value - ), f"Incorrect value found. Got {val} expected {value}" - - print("All present tokens return correct value.") diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py deleted file mode 100644 index d636f36f282..00000000000 --- a/python/cudf/cudf/utils/ioutils.py +++ /dev/null @@ -1,2206 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -import datetime -import functools -import operator -import os -import urllib -import warnings -from collections.abc import Callable -from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper -from threading import Thread - -import fsspec -import fsspec.implementations.local -import numpy as np -import pandas as pd -from fsspec.core import expand_paths_if_needed, get_fs_token_paths - -from cudf.api.types import is_list_like -from cudf.core._compat import PANDAS_LT_300 -from cudf.utils.docutils import docfmt_partial - -try: - import fsspec.parquet as fsspec_parquet - -except ImportError: - fsspec_parquet = None - -_BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024 -_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max - -_docstring_remote_sources = """ -- cuDF supports local and remote data stores. See configuration details for - available sources - `here `__. -""" - -_docstring_read_avro = """ -Load an Avro dataset into a DataFrame - -Parameters ----------- -filepath_or_buffer : str, path object, bytes, or file-like object - Either a path to a file (a `str`, `pathlib.Path`, or - `py._path.local.LocalPath`), URL (including http, ftp, and S3 locations), - Python bytes of raw binary data, or any object with a `read()` method - (such as builtin `open()` file handler function or `BytesIO`). -columns : list, default None - If not None, only these columns will be read. -skiprows : int, default None - If not None, the number of rows to skip from the start of the file. -num_rows : int, default None - If not None, the total number of rows to read. -storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value - pairs are forwarded to ``urllib.request.Request`` as header options. - For other URLs (e.g. starting with "s3://", and "gcs://") the key-value - pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and - ``urllib`` for more details. 
- -Returns -------- -DataFrame - -Notes ------ -{remote_data_sources} - -Examples --------- ->>> import pandavro ->>> import pandas as pd ->>> import cudf ->>> pandas_df = pd.DataFrame() ->>> pandas_df['numbers'] = [10, 20, 30] ->>> pandas_df['text'] = ["hello", "rapids", "ai"] ->>> pandas_df - numbers text -0 10 hello -1 20 rapids -2 30 ai ->>> pandavro.to_avro("data.avro", pandas_df) ->>> cudf.read_avro("data.avro") - numbers text -0 10 hello -1 20 rapids -2 30 ai -""".format(remote_data_sources=_docstring_remote_sources) -doc_read_avro: Callable = docfmt_partial(docstring=_docstring_read_avro) - -_docstring_read_parquet_metadata = """ -Read a Parquet file's metadata and schema - -Parameters ----------- -path : string or path object - Path of file to be read - -Returns -------- -Total number of rows -Number of row groups -List of column names -Number of columns -List of metadata of row groups - -Examples --------- ->>> import cudf ->>> num_rows, num_row_groups, names, num_columns, row_group_metadata = cudf.io.read_parquet_metadata(filename) ->>> df = [cudf.read_parquet(fname, row_group=i) for i in range(row_groups)] ->>> df = cudf.concat(df) ->>> df - num1 datetime text -0 123 2018-11-13T12:00:00.000 5451 -1 456 2018-11-14T12:35:01.000 5784 -2 789 2018-11-15T18:02:59.000 6117 - -See Also --------- -cudf.read_parquet -""" -doc_read_parquet_metadata = docfmt_partial( - docstring=_docstring_read_parquet_metadata -) - -_docstring_read_parquet = """ -Load a Parquet dataset into a DataFrame - -Parameters ----------- -filepath_or_buffer : str, path object, bytes, file-like object, or a list - of such objects. - Contains one or more of the following: either a path to a file (a `str`, - `pathlib.Path`, or `py._path.local.LocalPath`), URL (including http, ftp, - and S3 locations), Python bytes of raw binary data, or any object with a - `read()` method (such as builtin `open()` file handler function or - `BytesIO`). -engine : {{ 'cudf', 'pyarrow' }}, default 'cudf' - Parser engine to use. -columns : list, default None - If not None, only these columns will be read. -storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value - pairs are forwarded to ``urllib.request.Request`` as header options. - For other URLs (e.g. starting with "s3://", and "gcs://") the key-value - pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and - ``urllib`` for more details. -filesystem : fsspec.AbstractFileSystem, default None - Filesystem object to use when reading the parquet data. This argument - should not be used at the same time as `storage_options`. -filters : list of tuple, list of lists of tuples, default None - If not None, specifies a filter predicate used to filter out row groups - using statistics stored for each row group as Parquet metadata. Row groups - that do not match the given filter predicate are not read. The filters - will also be applied to the rows of the in-memory DataFrame after IO. - The predicate is expressed in disjunctive normal form (DNF) like - `[[('x', '=', 0), ...], ...]`. DNF allows arbitrary boolean logical - combinations of single column predicates. The innermost tuples each - describe a single column predicate. The list of inner predicates is - interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the most outer list combines - these filters as a disjunction (OR). 
Predicates may also be passed - as a list of tuples. This form is interpreted as a single conjunction. - To express OR in predicates, one must use the (preferred) notation of - list of lists of tuples. -row_groups : int, or list, or a list of lists default None - If not None, specifies, for each input file, which row groups to read. - If reading multiple inputs, a list of lists should be passed, one list - for each input. -categorical_partitions : boolean, default True - Whether directory-partitioned columns should be interpreted as categorical - or raw dtypes. -use_pandas_metadata : boolean, default True - If True and dataset has custom PANDAS schema metadata, ensure that index - columns are also loaded. -bytes_per_thread : int, default None - Determines the number of bytes to be allocated per thread to read the - files in parallel. When there is a file of large size, we get slightly - better throughput by decomposing it and transferring multiple "blocks" - in parallel (using a python thread pool). Default allocation is - {bytes_per_thread} bytes. -skiprows : int, default None - If not None, the number of rows to skip from the start of the file. - - .. note:: - This option is not supported when the low-memory mode is on. -nrows : int, default None - If not None, the total number of rows to read. - - .. note: - This option is not supported when the low-memory mode is on. -allow_mismatched_pq_schemas : boolean, default False - If True, enables reading (matching) columns specified in `columns` and `filters` - options from the input files with otherwise mismatched schemas. -prefetch_options : dict, default None - WARNING: This is an experimental feature and may be removed at any - time without warning or deprecation period. - Dictionary of options to use to prefetch bytes from remote storage. - These options are passed through to `get_reader_filepath_or_buffer`. - -Returns -------- -DataFrame - -Notes ------ -{remote_data_sources} - -Examples --------- ->>> import cudf ->>> df = cudf.read_parquet(filename) ->>> df - num1 datetime text -0 123 2018-11-13T12:00:00.000 5451 -1 456 2018-11-14T12:35:01.000 5784 -2 789 2018-11-15T18:02:59.000 6117 - -See Also --------- -cudf.io.parquet.read_parquet_metadata -cudf.DataFrame.to_parquet -cudf.read_orc -""".format( - remote_data_sources=_docstring_remote_sources, - bytes_per_thread=_BYTES_PER_THREAD_DEFAULT, -) -doc_read_parquet = docfmt_partial(docstring=_docstring_read_parquet) - -_docstring_to_parquet = """ -Write a DataFrame to the parquet format. - -Parameters ----------- -path : str or list of str - File path or Root Directory path. Will be used as Root Directory path - while writing a partitioned dataset. Use list of str with partition_offsets - to write parts of the dataframe to different files. -compression : {{'snappy', 'ZSTD', 'LZ4', None}}, default 'snappy' - Name of the compression to use; case insensitive. - Use ``None`` for no compression. -index : bool, default None - If ``True``, include the dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. - If ``None``, similar to ``True`` the dataframe's index(es) will - be saved, however, instead of being saved as values any - ``RangeIndex`` will be stored as a range in the metadata so it - doesn't require much space and is faster. Other indexes will - be included as columns in the file output. 
-partition_cols : list, optional, default None - Column names by which to partition the dataset - Columns are partitioned in the order they are given -partition_file_name : str, optional, default None - File name to use for partitioned datasets. Different partitions - will be written to different directories, but all files will - have this name. If nothing is specified, a random uuid4 hex string - will be used for each file. This parameter is only supported by 'cudf' - engine, and will be ignored by other engines. -partition_offsets : list, optional, default None - Offsets to partition the dataframe by. Should be used when path is list - of str. Should be a list of integers of size ``len(path) + 1`` -statistics : {{'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}}, default 'ROWGROUP' - Level at which column statistics should be included in file. -metadata_file_path : str, optional, default None - If specified, this function will return a binary blob containing the footer - metadata of the written parquet file. The returned blob will have the - ``chunk.file_path`` field set to the ``metadata_file_path`` for each chunk. - When using with ``partition_offsets``, should be same size as ``len(path)`` -int96_timestamps : bool, default False - If ``True``, write timestamps in int96 format. This will convert - timestamps from timestamp[ns], timestamp[ms], timestamp[s], and - timestamp[us] to the int96 format, which is the number of Julian - days and the number of nanoseconds since midnight of 1970-01-01. - If ``False``, timestamps will not be altered. -row_group_size_bytes: integer, default None - Maximum size of each stripe of the output. - If None, no limit on row group stripe size will be used. -row_group_size_rows: integer or None, default None - Maximum number of rows of each stripe of the output. - If None, 1000000 will be used. -max_page_size_bytes: integer or None, default None - Maximum uncompressed size of each page of the output. - If None, 524288 (512KB) will be used. -max_page_size_rows: integer or None, default None - Maximum number of rows of each page of the output. - If None, 20000 will be used. -max_dictionary_size: integer or None, default None - Maximum size of the dictionary page for each output column chunk. Dictionary - encoding for column chunks that exceeds this limit will be disabled. - If None, 1048576 (1MB) will be used. -storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value - pairs are forwarded to ``urllib.request.Request`` as header options. - For other URLs (e.g. starting with "s3://", and "gcs://") the key-value - pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and - ``urllib`` for more details. -return_metadata : bool, default False - Return parquet metadata for written data. Returned metadata will - include the file path metadata (relative to `root_path`). - To request metadata binary blob when using with ``partition_cols``, Pass - ``return_metadata=True`` instead of specifying ``metadata_file_path`` -use_dictionary : bool, default True - When ``False``, prevents the use of dictionary encoding for Parquet page - data. When ``True``, dictionary encoding is preferred subject to - ``max_dictionary_size`` constraints. -header_version : {{'1.0', '2.0'}}, default "1.0" - Controls whether to use version 1.0 or version 2.0 page headers when - encoding. 
Version 1.0 is more portable, but version 2.0 enables the - use of newer encoding schemes. -force_nullable_schema : bool, default False. - If True, writes all columns as `null` in schema. - If False, columns are written as `null` if they contain null values, - otherwise as `not null`. -skip_compression : set, optional, default None - If a column name is present in the set, that column will not be compressed, - regardless of the ``compression`` setting. -column_encoding : dict, optional, default None - Sets the page encoding to use on a per-column basis. The key is a column - name, and the value is one of: 'PLAIN', 'DICTIONARY', 'DELTA_BINARY_PACKED', - 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY', 'BYTE_STREAM_SPLIT', or - 'USE_DEFAULT'. -column_type_length : dict, optional, default None - Specifies the width in bytes of ``FIXED_LEN_BYTE_ARRAY`` column elements. - The key is a column name and the value is an integer. The named column - will be output as unannotated binary (i.e. the column will behave as if - ``output_as_binary`` was set). -output_as_binary : set, optional, default None - If a column name is present in the set, that column will be output as - unannotated binary, rather than the default 'UTF-8'. -store_schema : bool, default False - If ``True``, writes arrow schema to Parquet file footer's key-value - metadata section to faithfully round-trip ``duration`` types with arrow. - This cannot be used with ``int96_timestamps`` enabled as int96 timestamps - are deprecated in arrow. Also, all decimal32 and decimal64 columns will be - converted to decimal128 as arrow only supports decimal128 and decimal256 types. -**kwargs - Additional parameters will be passed to execution engines other - than ``cudf``. - - -See Also --------- -cudf.read_parquet -""" -doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet) - -_docstring_merge_parquet_filemetadata = """ -Merge multiple parquet metadata blobs - -Parameters ----------- -metadata_list : list - List of buffers returned by to_parquet - -Returns -------- -Combined parquet metadata blob - -See Also --------- -cudf.DataFrame.to_parquet -""" -doc_merge_parquet_filemetadata = docfmt_partial( - docstring=_docstring_merge_parquet_filemetadata -) - - -_docstring_read_orc_metadata = """ -Read an ORC file's metadata and schema - -Parameters ----------- -path : string or path object - Path of file to be read - -Returns -------- -Total number of rows -Number of stripes -List of column names - -Notes ------ -{remote_data_sources} - -Examples --------- ->>> import cudf ->>> num_rows, stripes, names = cudf.io.read_orc_metadata(filename) ->>> df = [cudf.read_orc(fname, stripes=i) for i in range(stripes)] ->>> df = cudf.concat(df) ->>> df - num1 datetime text -0 123 2018-11-13T12:00:00.000 5451 -1 456 2018-11-14T12:35:01.000 5784 -2 789 2018-11-15T18:02:59.000 6117 - -See Also --------- -cudf.read_orc -""" -doc_read_orc_metadata = docfmt_partial(docstring=_docstring_read_orc_metadata) - - -_docstring_read_orc_statistics = """ -Read an ORC file's file-level and stripe-level statistics - -Parameters ----------- -filepath_or_buffer : str, path object, bytes, or file-like object - Either a path to a file (a `str`, `pathlib.Path`, or - `py._path.local.LocalPath`), URL (including http, ftp, and S3 locations), - Python bytes of raw binary data, or any object with a `read()` method - (such as builtin `open()` file handler function or `BytesIO`). -columns : list, default None - If not None, statistics for only these columns will be read from the file. 
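The ``metadata_file_path`` / ``merge_parquet_filemetadata`` pair above exists to build a dataset-level ``_metadata`` footer for files written separately. A rough sketch, assuming ``merge_parquet_filemetadata`` is importable from ``cudf.io.parquet`` (file names are hypothetical):

import cudf
from cudf.io.parquet import merge_parquet_filemetadata

df = cudf.DataFrame({"a": [1, 2, 3, 4, 5, 6]})

# Each write returns a footer blob whose chunk.file_path is the given name.
meta0 = df.iloc[:3].to_parquet("part-0.parquet", metadata_file_path="part-0.parquet")
meta1 = df.iloc[3:].to_parquet("part-1.parquet", metadata_file_path="part-1.parquet")

# Combine the per-file footers; the result can be written out next to the
# parts as the dataset's ``_metadata`` file.
combined = merge_parquet_filemetadata([meta0, meta1])
print(len(combined))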
- - -Returns -------- -Statistics for each column of given file -Statistics for each column for each stripe of given file - -See Also --------- -cudf.read_orc -""" -doc_read_orc_statistics = docfmt_partial( - docstring=_docstring_read_orc_statistics -) - -_docstring_read_orc = """ -Load an ORC dataset into a DataFrame - -Parameters ----------- -filepath_or_buffer : str, path object, bytes, or file-like object - Either a path to a file (a `str`, `pathlib.Path`, or - `py._path.local.LocalPath`), URL (including http, ftp, and S3 locations), - Python bytes of raw binary data, or any object with a `read()` method - (such as builtin `open()` file handler function or `BytesIO`). -engine : {{ 'cudf', 'pyarrow' }}, default 'cudf' - Parser engine to use. -columns : list, default None - If not None, only these columns will be read from the file. -filters : list of tuple, list of lists of tuples default None - If not None, specifies a filter predicate used to filter out row groups - using statistics stored for each row group as Parquet metadata. Row groups - that do not match the given filter predicate are not read. The - predicate is expressed in disjunctive normal form (DNF) like - `[[('x', '=', 0), ...], ...]`. DNF allows arbitrary boolean logical - combinations of single column predicates. The innermost tuples each - describe a single column predicate. The list of inner predicates is - interpreted as a conjunction (AND), forming a more selective and - multiple column predicate. Finally, the outermost list combines - these filters as a disjunction (OR). Predicates may also be passed - as a list of tuples. This form is interpreted as a single conjunction. - To express OR in predicates, one must use the (preferred) notation of - list of lists of tuples. -stripes: list, default None - If not None, only these stripe will be read from the file. Stripes are - concatenated with index ignored. -skiprows : int, default None - If not None, the number of rows to skip from the start of the file. - This parameter is deprecated. -num_rows : int, default None - If not None, the total number of rows to read. - This parameter is deprecated. -use_index : bool, default True - If True, use row index if available for faster seeking. -storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value - pairs are forwarded to ``urllib.request.Request`` as header options. - For other URLs (e.g. starting with "s3://", and "gcs://") the key-value - pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and - ``urllib`` for more details. -bytes_per_thread : int, default None - Determines the number of bytes to be allocated per thread to read the - files in parallel. When there is a file of large size, we get slightly - better throughput by decomposing it and transferring multiple "blocks" - in parallel (using a python thread pool). Default allocation is - {bytes_per_thread} bytes. 
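The same DNF predicate form applies to ORC: stripes whose statistics cannot satisfy the predicate are skipped. A minimal sketch (the file name ``sales.orc`` is hypothetical), also selecting a column subset:

import cudf

df = cudf.DataFrame({"year": [2022, 2022, 2023], "amount": [10.0, 20.0, 30.0]})
df.to_orc("sales.orc")

out = cudf.read_orc(
    "sales.orc",
    columns=["year", "amount"],
    filters=[[("year", "==", 2023)]],
)
print(out)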
- -Returns -------- -DataFrame - -Notes ------ -{remote_data_sources} - -Examples --------- ->>> import cudf ->>> df = cudf.read_orc(filename) ->>> df - num1 datetime text -0 123 2018-11-13T12:00:00.000 5451 -1 456 2018-11-14T12:35:01.000 5784 -2 789 2018-11-15T18:02:59.000 6117 - -See Also --------- -cudf.DataFrame.to_orc -""".format( - remote_data_sources=_docstring_remote_sources, - bytes_per_thread=_BYTES_PER_THREAD_DEFAULT, -) -doc_read_orc = docfmt_partial(docstring=_docstring_read_orc) - -_docstring_to_orc = """ -Write a DataFrame to the ORC format. - -Parameters ----------- -fname : str - File path or object where the ORC dataset will be stored. -compression : {{ 'snappy', 'ZSTD', 'ZLIB', 'LZ4', None }}, default 'snappy' - Name of the compression to use; case insensitive. - Use ``None`` for no compression. -statistics: str {{ "ROWGROUP", "STRIPE", None }}, default "ROWGROUP" - The granularity with which column statistics must - be written to the file. -stripe_size_bytes: integer or None, default None - Maximum size of each stripe of the output. - If None, 67108864 (64MB) will be used. -stripe_size_rows: integer or None, default None - Maximum number of rows of each stripe of the output. - If None, 1000000 will be used. -row_index_stride: integer or None, default None - Row index stride (maximum number of rows in each row group). - If None, 10000 will be used. -cols_as_map_type : list of column names or None, default None - A list of column names which should be written as map type in the ORC file. - Note that this option only affects columns of ListDtype. Names of other - column types will be ignored. -storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value - pairs are forwarded to ``urllib.request.Request`` as header options. - For other URLs (e.g. starting with "s3://", and "gcs://") the key-value - pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and - ``urllib`` for more details. -index : bool, default None - If ``True``, include the dataframe's index(es) in the file output. - If ``False``, they will not be written to the file. - If ``None``, similar to ``True`` the dataframe's index(es) will - be saved, however, instead of being saved as values any - ``RangeIndex`` will be stored as a range in the metadata so it - doesn't require much space and is faster. Other indexes will - be included as columns in the file output. - -See Also --------- -cudf.read_orc -""" -doc_to_orc = docfmt_partial(docstring=_docstring_to_orc) - -_docstring_read_json = r""" -Load a JSON dataset into a DataFrame - -Parameters ----------- -path_or_buf : list, str, path object, or file-like object - Either JSON data in a `str`, path to a file (a `str`, `pathlib.Path`, or - `py._path.local.LocalPath`), URL (including http, ftp, and S3 locations), - or any object with a `read()` method (such as builtin `open()` file handler - function or `StringIO`). Multiple inputs may be provided as a list. If a - list is specified each list entry may be of a different input type as long - as each input is of a valid type and all input JSON schema(s) match. -engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto' - Parser engine to use. If 'auto' is passed, the engine will be - automatically selected based on the other parameters. See notes below. -orient : string - - .. admonition:: Not GPU-accelerated - - This parameter is only supported with ``engine='pandas'``. 
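A short sketch tying the writer options above to the metadata reader documented earlier (the file name ``keys.orc`` is hypothetical): write with explicit compression and stripe-level statistics, then inspect the result with ``cudf.io.read_orc_metadata``.

import cudf

df = cudf.DataFrame({"key": [1, 2, 3, 4], "val": [0.1, 0.2, 0.3, 0.4]})
df.to_orc("keys.orc", compression="ZSTD", statistics="STRIPE")

# read_orc_metadata reports (total rows, stripe count, column names).
num_rows, num_stripes, col_names = cudf.io.read_orc_metadata("keys.orc")
print(num_rows, num_stripes, col_names)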
- - Indication of expected JSON string format. - Compatible JSON strings can be produced by ``to_json()`` with a - corresponding orient value. - The set of possible orients is: - - - ``'split'`` : dict like - ``{index -> [index], columns -> [columns], data -> [values]}`` - - ``'records'`` : list like - ``[{column -> value}, ... , {column -> value}]`` - - ``'index'`` : dict like ``{index -> {column -> value}}`` - - ``'columns'`` : dict like ``{column -> {index -> value}}`` - - ``'values'`` : just the values array - - The allowed and default values depend on the value - of the `typ` parameter. - - * when ``typ == 'series'``, - - - allowed orients are ``{'split','records','index'}`` - - default is ``'index'`` - - The Series index must be unique for orient ``'index'``. - * when ``typ == 'frame'``, - - - allowed orients are ``{'split','records','index', - 'columns','values', 'table'}`` - - default is ``'columns'`` - - The DataFrame index must be unique for orients ``'index'`` and - ``'columns'``. - - The DataFrame columns must be unique for orients ``'index'``, - ``'columns'``, and ``'records'``. -typ : type of object to recover (series or frame), default 'frame' - With cudf engine, only frame output is supported. -dtype : boolean or dict, default None - If True, infer dtypes for all columns; if False, then don't infer dtypes at all, - if a dict, provide a mapping from column names to their respective dtype (any missing - columns will have their dtype inferred). Applies only to the data. - For all ``orient`` values except ``'table'``, default is ``True``. -convert_axes : boolean, default True - - .. admonition:: Not GPU-accelerated - - This parameter is only supported with ``engine='pandas'``. - - Try to convert the axes to the proper dtypes. -convert_dates : boolean, default True - - .. admonition:: Not GPU-accelerated - - This parameter is only supported with ``engine='pandas'``. - - List of columns to parse for dates; If True, then try - to parse datelike columns default is True; a column label is datelike if - - * it ends with ``'_at'``, - * it ends with ``'_time'``, - * it begins with ``'timestamp'``, - * it is ``'modified'``, or - * it is ``'date'`` -keep_default_dates : boolean, default True - - .. admonition:: Not GPU-accelerated - - This parameter is only supported with ``engine='pandas'``. - - If parsing dates, parse the default datelike columns. -numpy : boolean, default False - - .. admonition:: Not GPU-accelerated - - This parameter is only supported with ``engine='pandas'``. - - Direct decoding to numpy arrays. Supports numeric - data only, but non-numeric column and index labels are supported. Note - also that the JSON ordering MUST be the same for each term if numpy=True. -precise_float : boolean, default False - - .. admonition:: Not GPU-accelerated - - This parameter is only supported with ``engine='pandas'``. - - Set to enable usage of higher precision (strtod) function when - decoding string to double values (pandas engine only). Default (False) - is to use fast but less precise builtin functionality -date_unit : string, default None - - .. admonition:: Not GPU-accelerated - - This parameter is only supported with ``engine='pandas'``. - - The timestamp unit to detect if converting dates. - The default behavior is to try and detect the correct precision, but if - this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force - parsing only seconds, milliseconds, microseconds or nanoseconds. -encoding : str, default is 'utf-8' - - .. 
admonition:: Not GPU-accelerated - - This parameter is only supported with ``engine='pandas'``. - - The encoding to use to decode py3 bytes. - With cudf engine, only utf-8 is supported. -lines : boolean, default False - Read the file as a json object per line. -chunksize : integer, default None - - .. admonition:: Not GPU-accelerated - - This parameter is only supported with ``engine='pandas'``. - - Return JsonReader object for iteration. - See the `line-delimited json docs - `_ - for more information on ``chunksize``. - This can only be passed if `lines=True`. - If this is None, the file will be read into memory all at once. -compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, zip or xz if path_or_buf is a string ending in - '.gz', '.bz2', '.zip', or 'xz', respectively, and no decompression - otherwise. If using 'zip', the ZIP file must contain only one data - file to be read in. Set to None for no decompression. -byte_range : list or tuple, default None - - .. admonition:: GPU-accelerated - - This parameter is only supported with ``engine='cudf'``. - - Byte range within the input file to be read. - The first number is the offset in bytes, the second number is the range - size in bytes. Set the size to zero to read all data after the offset - location. Reads the row that starts before or at the end of the range, - even if it ends after the end of the range. -keep_quotes : bool, default False - - .. admonition:: GPU-accelerated feature - - This parameter is only supported with ``engine='cudf'``. - - If `True`, any string values are read literally (and wrapped in an - additional set of quotes). - If `False` string values are parsed into Python strings. -storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value - pairs are forwarded to ``urllib.request.Request`` as header options. - For other URLs (e.g. starting with "s3://", and "gcs://") the key-value - pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and - ``urllib`` for more details. -mixed_types_as_string : bool, default False - - .. admonition:: GPU-accelerated feature - - This parameter is only supported with ``engine='cudf'``. - - If True, mixed type columns are returned as string columns. - If `False` parsing mixed type columns will thrown an error. -prune_columns : bool, default False - - .. admonition:: GPU-accelerated feature - - This parameter is only supported with ``engine='cudf'``. - - If True, only return those columns mentioned in the dtype argument. - If `False` dtype argument is used a type inference suggestion. -on_bad_lines : {'error', 'recover'}, default 'error' - Specifies what to do upon encountering a bad line. Allowed values are : - - - ``'error'``, raise an Exception when a bad line is encountered. - - ``'recover'``, fills the row with when a bad line is encountered. -Returns -------- -result : Series or DataFrame, depending on the value of `typ`. - -Notes ------ -When `engine='auto'`, and `line=False`, the `pandas` json -reader will be used. To override the selection, please -use `engine='cudf'`. 
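A short sketch of two cudf-engine options above that the examples below do not cover: ``on_bad_lines='recover'`` turns a malformed JSON Lines row into nulls instead of raising, and ``prune_columns=True`` restricts the output to the keys named in ``dtype``.

import cudf

good = '{"a": 1, "b": "x"}\n{"a": 2, "b": "y"}\n'
bad = '{"a": 1, "b": "x"}\n{"a": 2, "b" "y"}\n'  # second row is malformed

# The malformed row becomes a row of nulls rather than raising an error.
print(cudf.read_json(bad, engine="cudf", lines=True, on_bad_lines="recover"))

# Only the columns named in ``dtype`` are materialized.
print(
    cudf.read_json(
        good, engine="cudf", lines=True, dtype={"a": "int64"}, prune_columns=True
    ).columns
)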
- -See Also --------- -cudf.DataFrame.to_json - -Examples --------- ->>> import cudf ->>> df = cudf.DataFrame({'a': ["hello", "rapids"], 'b': ["hello", "worlds"]}) ->>> df - a b -0 hello hello -1 rapids worlds ->>> json_str = df.to_json(orient='records', lines=True) ->>> json_str -'{"a":"hello","b":"hello"}\n{"a":"rapids","b":"worlds"}\n' ->>> cudf.read_json(json_str, engine="cudf", lines=True) - a b -0 hello hello -1 rapids worlds - -To read the strings with additional set of quotes: - ->>> cudf.read_json(json_str, engine="cudf", lines=True, -... keep_quotes=True) - a b -0 "hello" "hello" -1 "rapids" "worlds" - -Reading a JSON string containing ordered lists and name/value pairs: - ->>> json_str = '[{"list": [0,1,2], "struct": {"k":"v1"}}, {"list": [3,4,5], "struct": {"k":"v2"}}]' ->>> cudf.read_json(json_str, engine='cudf') - list struct -0 [0, 1, 2] {'k': 'v1'} -1 [3, 4, 5] {'k': 'v2'} - -Reading JSON Lines data containing ordered lists and name/value pairs: - ->>> json_str = '{"a": [{"k1": "v1"}]}\n{"a": [{"k1":"v2"}]}' ->>> cudf.read_json(json_str, engine='cudf', lines=True) - a -0 [{'k1': 'v1'}] -1 [{'k1': 'v2'}] - -Using the `dtype` argument to specify type casting: - ->>> json_str = '{"k1": 1, "k2":[1.5]}' ->>> cudf.read_json(json_str, engine='cudf', lines=True, dtype={'k1':float, 'k2':cudf.ListDtype(int)}) - k1 k2 -0 1.0 [1] -""" # noqa: E501 -doc_read_json: Callable = docfmt_partial(docstring=_docstring_read_json) - -_docstring_to_json = """ -Convert the cuDF object to a JSON string. -Note nulls and NaNs will be converted to null and datetime objects -will be converted to UNIX timestamps. - -Parameters ----------- -path_or_buf : string or file handle, optional - File path or object. If not specified, the result is returned as a string. -engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto' - Parser engine to use. If 'auto' is passed, the `pandas` engine - will be selected. -orient : string - Indication of expected JSON string format. - - * Series - - default is 'index' - - allowed values are: {'split','records','index','table'} - * DataFrame - - default is 'columns' - - allowed values are: - {'split','records','index','columns','values','table'} - * The format of the JSON string - - 'split' : dict like {'index' -> [index], - 'columns' -> [columns], 'data' -> [values]} - - 'records' : list like - [{column -> value}, ... , {column -> value}] - - 'index' : dict like {index -> {column -> value}} - - 'columns' : dict like {column -> {index -> value}} - - 'values' : just the values array - - 'table' : dict like {'schema': {schema}, 'data': {data}} - describing the data, and the data component is - like ``orient='records'``. -date_format : {None, 'epoch', 'iso'} - Type of date conversion. 'epoch' = epoch milliseconds, - 'iso' = ISO8601. The default depends on the `orient`. For - ``orient='table'``, the default is 'iso'. For all other orients, - the default is 'epoch'. -double_precision : int, default 10 - The number of decimal places to use when encoding - floating point values. -force_ascii : bool, default True - Force encoded string to be ASCII. -date_unit : string, default 'ms' (milliseconds) - The time unit to encode to, governs timestamp and ISO8601 - precision. One of 's', 'ms', 'us', 'ns' for second, millisecond, - microsecond, and nanosecond respectively. -default_handler : callable, default None - Handler to call if object cannot otherwise be converted to a - suitable format for JSON. 
Should receive a single argument which is - the object to convert and return a serializable object. -lines : bool, default False - If 'orient' is 'records' write out line delimited json format. Will - throw ValueError if incorrect 'orient' since others are not list - like. -compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} - A string representing the compression to use in the output file, - only used when the first argument is a filename. By default, the - compression is inferred from the filename. -index : bool, default True - Whether to include the index values in the JSON string. Not - including the index (``index=False``) is only supported when - orient is 'split' or 'table'. - -See Also --------- -cudf.read_json -""" -doc_to_json: Callable = docfmt_partial(docstring=_docstring_to_json) - -_docstring_read_hdf = """ -Read from the store, close it if we opened it. - -Retrieve pandas object stored in file, optionally based on where -criteria - -Parameters ----------- -path_or_buf : string, buffer or path object - Path to the file to open, or an open `HDFStore - `_. - object. - Supports any object implementing the ``__fspath__`` protocol. - This includes :class:`pathlib.Path` and py._path.local.LocalPath - objects. -key : object, optional - The group identifier in the store. Can be omitted if the HDF file - contains a single pandas object. -mode : {'r', 'r+', 'a'}, optional - Mode to use when opening the file. Ignored if path_or_buf is a - `Pandas HDFS - `_. - Default is 'r'. -where : list, optional - A list of Term (or convertible) objects. -start : int, optional - Row number to start selection. -stop : int, optional - Row number to stop selection. -columns : list, optional - A list of columns names to return. -iterator : bool, optional - Return an iterator object. -chunksize : int, optional - Number of rows to include in an iteration when using an iterator. -errors : str, default 'strict' - Specifies how encoding and decoding errors are to be handled. - See the errors argument for :func:`open` for a full list - of options. -**kwargs - Additional keyword arguments passed to HDFStore. - -Returns -------- -item : object - The selected object. Return type depends on the object stored. - -See Also --------- -cudf.DataFrame.to_hdf : Write a HDF file from a DataFrame. -""" -doc_read_hdf: Callable = docfmt_partial(docstring=_docstring_read_hdf) - -_docstring_to_hdf = """ -Write the contained data to an HDF5 file using HDFStore. - -Hierarchical Data Format (HDF) is self-describing, allowing an -application to interpret the structure and contents of a file with -no outside information. One HDF file can hold a mix of related objects -which can be accessed as a group or as individual objects. - -In order to add another DataFrame or Series to an existing HDF file -please use append mode and a different a key. - -For more information see the `user guide -`_. - -Parameters ----------- -path_or_buf : str or pandas.HDFStore - File path or HDFStore object. -key : str - Identifier for the group in the store. -mode : {'a', 'w', 'r+'}, default 'a' - Mode to open file: - - - 'w': write, a new file is created (an existing file with the same name - would be deleted). - - 'a': append, an existing file is opened for reading and writing, and if - the file does not exist it is created. - - 'r+': similar to 'a', but the file must already exist. -format : {'fixed', 'table'}, default 'fixed' - Possible values: - - - 'fixed': Fixed format. Fast writing/reading. Not-appendable, - nor searchable. 
- - 'table': Table format. Write as a PyTables Table structure - which may perform worse but allow more flexible operations - like searching / selecting subsets of the data. -append : bool, default False - For Table formats, append the input data to the existing. -data_columns : list of columns or True, optional - List of columns to create as indexed data columns for on-disk - queries, or True to use all columns. By default only the axes - of the object are indexed. See `Query via Data Columns - `_. - Applicable only to format='table'. -complevel : {0-9}, optional - Specifies a compression level for data. - A value of 0 disables compression. -complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' - Specifies the compression library to be used. - As of v0.20.2 these additional compressors for Blosc are supported - (default if no compressor specified: 'blosc:blosclz'): - {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', - 'blosc:zlib', 'blosc:zstd'}. - Specifying a compression library which is not available issues - a ValueError. -fletcher32 : bool, default False - If applying compression use the fletcher32 checksum. -dropna : bool, default False - If true, ALL nan rows will not be written to store. -errors : str, default 'strict' - Specifies how encoding and decoding errors are to be handled. - See the errors argument for :func:`open` for a full list - of options. - -See Also --------- -cudf.read_hdf : Read from HDF file. -cudf.DataFrame.to_parquet : Write a DataFrame to the binary parquet format. -cudf.DataFrame.to_feather : Write out feather-format for DataFrames. -""" -doc_to_hdf: Callable = docfmt_partial(docstring=_docstring_to_hdf) - -_docstring_read_feather = """ -Load an feather object from the file path, returning a DataFrame. - -Parameters ----------- -path : string - File path -columns : list, default=None - If not None, only these columns will be read from the file. - -Returns -------- -DataFrame - -Examples --------- ->>> import cudf ->>> df = cudf.read_feather(filename) ->>> df - num1 datetime text -0 123 2018-11-13T12:00:00.000 5451 -1 456 2018-11-14T12:35:01.000 5784 -2 789 2018-11-15T18:02:59.000 6117 - -See Also --------- -cudf.DataFrame.to_feather -""" -doc_read_feather = docfmt_partial(docstring=_docstring_read_feather) - -_docstring_to_feather = """ -Write a DataFrame to the feather format. - -Parameters ----------- -path : str - File path - -See Also --------- -cudf.read_feather -""" -doc_to_feather = docfmt_partial(docstring=_docstring_to_feather) - -_docstring_to_dlpack = """ -Converts a cuDF object into a DLPack tensor. - -DLPack is an open-source memory tensor structure: -`dmlc/dlpack `_. - -This function takes a cuDF object and converts it to a PyCapsule object -which contains a pointer to a DLPack tensor. This function deep copies the -data into the DLPack tensor from the cuDF object. - -Parameters ----------- -cudf_obj : DataFrame, Series, Index, or Column - -Returns -------- -pycapsule_obj : PyCapsule - Output DLPack tensor pointer which is encapsulated in a PyCapsule - object. 
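Two quick sketches for the converters above (the file name ``frame.feather`` is hypothetical, and the ``DataFrame.to_dlpack`` method is assumed to wrap the converter documented here): a Feather round trip with column selection on read, and a DLPack round trip, which requires homogeneously typed numeric columns and deep-copies the data as noted.

import cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})

# Feather round trip, optionally selecting columns on read.
df.to_feather("frame.feather")
print(cudf.read_feather("frame.feather", columns=["a"]))

# DLPack round trip: cast to a single numeric dtype first.
capsule = df.astype("float64").to_dlpack()
print(cudf.from_dlpack(capsule))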
-""" -doc_to_dlpack = docfmt_partial(docstring=_docstring_to_dlpack) - -_docstring_read_csv = """ -Load a comma-separated-values (CSV) dataset into a DataFrame - -Parameters ----------- -filepath_or_buffer : str, path object, or file-like object - Either a path to a file (a `str`, `pathlib.Path`, or - `py._path.local.LocalPath`), URL (including http, ftp, and S3 locations), - or any object with a `read()` method (such as builtin `open()` file handler - function or `StringIO`). -sep : char, default ',' - Delimiter to be used. -delimiter : char, default None - Alternative argument name for sep. -header : int, default 'infer' - Row number to use as the column names. Default behavior is to infer - the column names: if no names are passed, header=0; - if column names are passed explicitly, header=None. -names : list of str, default None - List of column names to be used. Needs to include names of all columns in - the file, or names of all columns selected using `usecols` (only when - `usecols` holds integer indices). When `usecols` is not used to select - column indices, `names` can contain more names than there are columns i.n - the file. In this case the extra columns will only contain null rows. -index_col : int, string or False, default None - Column to use as the row labels of the DataFrame. Passing `index_col=False` - explicitly disables index column inference and discards the last column. -usecols : list of int or str, default None - Returns subset of the columns given in the list. All elements must be - either integer indices (column number) or strings that correspond to - column names. When an integer index is passed for each name in the `names` - parameter, the names are interpreted as names in the output table, not as - names in the input file. -prefix : str, default None - Prefix to add to column numbers when parsing without a header row. -mangle_dupe_cols : boolean, default True - Duplicate columns will be specified as 'X','X.1',...'X.N'. -dtype : type, str, list of types, or dict of column -> type, default None - Data type(s) for data or columns. If `dtype` is a type/str, all columns - are mapped to the particular type passed. If list, types are applied in - the same order as the column names. If dict, types are mapped to the - column names. - E.g. {{'a': np.float64, 'b': int32, 'c': 'float'}} - If `None`, dtypes are inferred from the dataset. Use `str` to preserve data - and not infer or interpret to dtype. -true_values : list, default None - Values to consider as boolean True -false_values : list, default None - Values to consider as boolean False -skipinitialspace : bool, default False - Skip spaces after delimiter. -skiprows : int, default 0 - Number of rows to be skipped from the start of file. -skipfooter : int, default 0 - Number of rows to be skipped at the bottom of file. -nrows : int, default None - If specified, maximum number of rows to read -na_values : scalar, str, or list-like, optional - Additional strings to recognize as nulls. - By default the following values are interpreted as - nulls: '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', - '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', - '', 'N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', - 'null'. -keep_default_na : bool, default True - Whether or not to include the default NA values when parsing the data. -na_filter : bool, default True - Detect missing values (empty strings and the values in na_values). - Passing False can improve performance. 
-skip_blank_lines : bool, default True - If True, discard and do not parse empty lines - If False, interpret empty lines as NaN values -parse_dates : list of int or names, default None - If list of columns, then attempt to parse each entry as a date. - Columns may not always be recognized as dates, for instance due to - unusual or non-standard formats. To guarantee a date and increase parsing - speed, explicitly specify `dtype='date'` for the desired columns. -dayfirst : bool, default False - DD/MM format dates, international and European format. -compression : {{'infer', 'gzip', 'zip', None}}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer', then detect - compression from the following extensions: '.gz','.zip' (otherwise no - decompression). If using 'zip', the ZIP file must contain only one - data file to be read in, otherwise the first non-zero-sized file will - be used. Set to None for no decompression. -thousands : char, default None - Character used as a thousands delimiter. -decimal : char, default '.' - Character used as a decimal point. -lineterminator : char, default '\\n' - Character to indicate end of line. -quotechar : char, default '"' - Character to indicate start and end of quote item. -quoting : str or int, default 0 - Controls quoting behavior. Set to one of - 0 (csv.QUOTE_MINIMAL), 1 (csv.QUOTE_ALL), - 2 (csv.QUOTE_NONNUMERIC) or 3 (csv.QUOTE_NONE). - Quoting is enabled with all values except 3. -doublequote : bool, default True - When quoting is enabled, indicates whether to interpret two - consecutive quotechar inside fields as single quotechar -comment : char, default None - Character used as a comments indicator. If found at the beginning of a - line, the line will be ignored altogether. -delim_whitespace : bool, default False - Determines whether to use whitespace as delimiter. -byte_range : list or tuple, default None - Byte range within the input file to be read. The first number is the - offset in bytes, the second number is the range size in bytes. Set the - size to zero to read all data after the offset location. Reads the row - that starts before or at the end of the range, even if it ends after - the end of the range. -storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value - pairs are forwarded to ``urllib.request.Request`` as header options. - For other URLs (e.g. starting with "s3://", and "gcs://") the key-value - pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and - ``urllib`` for more details. -bytes_per_thread : int, default None - Determines the number of bytes to be allocated per thread to read the - files in parallel. When there is a file of large size, we get slightly - better throughput by decomposing it and transferring multiple "blocks" - in parallel (using a python thread pool). Default allocation is - {bytes_per_thread} bytes. -Returns -------- -GPU ``DataFrame`` object. - -Notes ------ -{remote_data_sources} - -Examples --------- - -Create a test csv file - ->>> import cudf ->>> filename = 'foo.csv' ->>> lines = [ -... "num1,datetime,text", -... "123,2018-11-13T12:00:00,abc", -... "456,2018-11-14T12:35:01,def", -... "789,2018-11-15T18:02:59,ghi" -... ] ->>> with open(filename, 'w') as fp: -... 
fp.write('\\n'.join(lines)+'\\n') - -Read the file with ``cudf.read_csv`` - ->>> cudf.read_csv(filename) - num1 datetime text -0 123 2018-11-13T12:00:00.000 5451 -1 456 2018-11-14T12:35:01.000 5784 -2 789 2018-11-15T18:02:59.000 6117 - -See Also --------- -cudf.DataFrame.to_csv -""".format( - remote_data_sources=_docstring_remote_sources, - bytes_per_thread=_BYTES_PER_THREAD_DEFAULT, -) -doc_read_csv = docfmt_partial(docstring=_docstring_read_csv) - -_to_csv_example = """ - -Write a dataframe to csv. - ->>> import cudf ->>> filename = 'foo.csv' ->>> df = cudf.DataFrame({'x': [0, 1, 2, 3], -... 'y': [1.0, 3.3, 2.2, 4.4], -... 'z': ['a', 'b', 'c', 'd']}) ->>> df = df.set_index(cudf.Series([3, 2, 1, 0])) ->>> df.to_csv(filename) - -""" -_docstring_to_csv = """ - -Write a dataframe to csv file format. - -Parameters ----------- -{df_param} -path_or_buf : str or file handle, default None - File path or object, if None is provided - the result is returned as a string. -sep : char, default ',' - Delimiter to be used. -na_rep : str, default '' - String to use for null entries -columns : list of str, optional - Columns to write -header : bool, default True - Write out the column names -index : bool, default True - Write out the index as a column -encoding : str, default 'utf-8' - A string representing the encoding to use in the output file - Only 'utf-8' is currently supported -compression : str, None - A string representing the compression scheme to use in the output file - Compression while writing csv is not supported currently -lineterminator : str, optional - The newline character or character sequence to use in the output file. - Defaults to :data:`os.linesep`. -chunksize : int or None, default None - Rows to write at a time -storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value - pairs are forwarded to ``urllib.request.Request`` as header options. - For other URLs (e.g. starting with "s3://", and "gcs://") the key-value - pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and - ``urllib`` for more details. - -Returns -------- -None or str - If `path_or_buf` is None, returns the resulting csv format as a string. - Otherwise returns None. - -Notes ------ -- Follows the standard of Pandas csv.QUOTE_NONNUMERIC for all output. -- The default behaviour is to write all rows of the dataframe at once. - This can lead to memory or overflow errors for large tables. If this - happens, consider setting the ``chunksize`` argument to some - reasonable fraction of the total rows in the dataframe. - -Examples --------- -{example} - -See Also --------- -cudf.read_csv -""" -doc_to_csv = docfmt_partial( - docstring=_docstring_to_csv.format( - df_param=""" -df : DataFrame - DataFrame object to be written to csv -""", - example=_to_csv_example, - ) -) - -doc_dataframe_to_csv = docfmt_partial( - docstring=_docstring_to_csv.format(df_param="", example=_to_csv_example) -) - -_docstring_kafka_datasource = """ -Configuration object for a Kafka Datasource - -Parameters ----------- -kafka_configs : dict, key/value pairs of librdkafka configuration values. - The complete list of valid configurations can be found at - https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md -topic : string, case sensitive name of the Kafka topic that contains the - source data. 
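As the ``to_csv`` notes above suggest, writing all rows at once can exhaust memory for large frames; a rough sketch of chunked writing (the file name ``big.csv`` is hypothetical):

import cudf

n = 100_000
df = cudf.DataFrame({"x": list(range(n)), "y": [0.5] * n})

# Convert and append 25k rows at a time instead of materializing the whole
# CSV text in one pass.
df.to_csv("big.csv", chunksize=25_000, index=False)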
-partition : int, - Zero-based identifier of the Kafka partition that the underlying consumer - should consume messages from. Valid values are 0 - (N-1) -start_offset : int, Kafka Topic/Partition offset that consumption - should begin at. Inclusive. -end_offset : int, Kafka Topic/Partition offset that consumption - should end at. Inclusive. -batch_timeout : int, default 10000 - Maximum number of milliseconds that will be spent trying to - consume messages between the specified 'start_offset' and 'end_offset'. -delimiter : string, default None, optional delimiter to insert into the - output between kafka messages, Ex: "\n" - -""" -doc_kafka_datasource = docfmt_partial(docstring=_docstring_kafka_datasource) - - -_docstring_text_datasource = """ -Configuration object for a text Datasource - -Parameters ----------- -filepath_or_buffer : str, path object, or file-like object - Either a path to a file (a `str`, `pathlib.Path`, or - `py._path.local.LocalPath`), URL (including http, ftp, and S3 locations), - or any object with a `read()` method (such as builtin `open()` file handler - function or `StringIO`). -delimiter : string, default None - The delimiter that should be used for splitting text chunks into - separate cudf column rows. The delimiter may be one or more characters. -byte_range : list or tuple, default None - Byte range within the input file to be read. The first number is the - offset in bytes, the second number is the range size in bytes. - The output contains all rows that start inside the byte range - (i.e. at or after the offset, and before the end at `offset + size`), - which may include rows that continue past the end. -strip_delimiters : boolean, default False - Unlike the `str.split()` function, `read_text` preserves the delimiter - at the end of a field in output by default, meaning `a;b;c` will turn into - `['a;','b;','c']` when using `;` as a delimiter. - Setting this option to `True` will strip these trailing delimiters, - leaving only the contents between delimiters in the resulting column: - `['a','b','c']` -compression : string, default None - Which compression type is the input compressed with. - Currently supports only `bgzip`, and requires the path to a file as input. -compression_offsets: list or tuple, default None - The virtual begin and end offset associated with the provided compression. - For `bgzip`, they are composed of a local uncompressed offset inside a - BGZIP block (lower 16 bits) and the start offset of this BGZIP block in the - compressed file (upper 48 bits). - The start offset points to the first byte to be read, the end offset points - one past the last byte to be read. -storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value - pairs are forwarded to ``urllib.request.Request`` as header options. - For other URLs (e.g. starting with "s3://", and "gcs://") the key-value - pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and - ``urllib`` for more details. - -Returns -------- -result : Series - -""" -doc_read_text: Callable = docfmt_partial(docstring=_docstring_text_datasource) - - -_docstring_get_reader_filepath_or_buffer = """ -Return either a filepath string to data, or a memory buffer of data. -If filepath, then the source filepath is expanded to user's environment. -If buffer, then data is returned in-memory as bytes or a ByteIO object. 
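A minimal sketch of the text datasource described above: split an input on a delimiter into one row per field, with and without ``strip_delimiters``.

import cudf
from io import StringIO

# Default behaviour keeps the trailing delimiter on each field: ['a;', 'b;', 'c']
print(cudf.read_text(StringIO("a;b;c"), delimiter=";"))

# strip_delimiters=True keeps only the field contents: ['a', 'b', 'c']
print(cudf.read_text(StringIO("a;b;c"), delimiter=";", strip_delimiters=True))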
-This function is designed to process multiple data sources of the same -type at once. If path_or_data is a list, the output will also be a list. - -Parameters ----------- -path_or_data : str, file-like object, bytes, ByteIO, list - Path to data or the data itself. Pass in a list to process multiple - sources of the same type at once. -mode : str - Mode in which file is opened -iotypes : (), default (BytesIO) - Object type to exclude from file-like check -allow_raw_text_input : boolean, default False - If True, this indicates the input `path_or_data` could be a raw text - input and will not check for its existence in the filesystem. If False, - the input must be a path and an error will be raised if it does not - exist. -storage_options : dict, optional - Extra options that make sense for a particular storage connection, e.g. - host, port, username, password, etc. For HTTP(S) URLs the key-value - pairs are forwarded to ``urllib.request.Request`` as header options. - For other URLs (e.g. starting with "s3://", and "gcs://") the key-value - pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and - ``urllib`` for more details, and for more examples on storage options - refer `here `__. -bytes_per_thread : int, default None - Determines the number of bytes to be allocated per thread to read the - files in parallel. When there is a file of large size, we get slightly - better throughput by decomposing it and transferring multiple "blocks" - in parallel (using a Python thread pool). Default allocation is - {bytes_per_thread} bytes. -expand_dir_pattern : str, default None - Glob pattern to use when expanding directories into file paths - (e.g. "*.json"). If this parameter is not specified, directories - will not be expanded. -prefetch_options : dict, default None - WARNING: This is an experimental feature and may be removed at any - time without warning or deprecation period. - Dictionary of options to use to prefetch bytes from remote storage. - These options are only used when `path_or_data` is a list of remote - paths. If 'method' is set to 'all' (the default), the only supported - option is 'blocksize' (default 256 MB). If method is set to 'parquet', - 'columns' and 'row_groups' are also supported (default None). - -Returns -------- -List[str, bytes, BytesIO] - List of filepath strings or in-memory data buffers. - """.format(bytes_per_thread=_BYTES_PER_THREAD_DEFAULT) - - -doc_get_reader_filepath_or_buffer = docfmt_partial( - docstring=_docstring_get_reader_filepath_or_buffer -) - - -def is_url(url): - """Check if a string is a valid URL to a network location. - - Parameters - ---------- - url : str - String containing a possible URL - - Returns - ------- - bool : bool - If `url` has a valid protocol return True otherwise False. - """ - # Do not include the empty ('') scheme in the check - schemes = urllib.parse.uses_netloc[1:] - try: - return urllib.parse.urlparse(url).scheme in schemes - except Exception: - return False - - -def is_file_like(obj): - """Check if the object is a file-like object, per PANDAS' definition. - An object is considered file-like if it has an iterator AND has a either or - both `read()` / `write()` methods as attributes. 
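These two predicates drive the source-type dispatch in ``get_reader_filepath_or_buffer``. A small sketch, assuming both helpers are importable from ``cudf.utils.ioutils`` as defined in the module shown in this diff:

from io import BytesIO

from cudf.utils.ioutils import is_file_like, is_url

print(is_url("https://example.com/data.parquet"))  # True: scheme is a known netloc protocol
print(is_url("/home/user/data.parquet"))           # False: the empty scheme is excluded

print(is_file_like(BytesIO(b"abc")))  # True: iterable and has read()/write()
print(is_file_like(b"abc"))           # False: bytes has neither read() nor write()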
- - Parameters - ---------- - obj : object - Object to check for file-like properties - - Returns - ------- - is_file_like : bool - If `obj` is file-like returns True otherwise False - """ - if not (hasattr(obj, "read") or hasattr(obj, "write")): - return False - elif not hasattr(obj, "__iter__"): - return False - else: - return True - - -def _is_local_filesystem(fs): - return isinstance(fs, fsspec.implementations.local.LocalFileSystem) - - -def _select_single_source(sources: list, caller: str): - """Select the first element from a list of sources. - Raise an error if sources contains multiple elements - """ - if len(sources) > 1: - raise ValueError( - f"{caller} does not support multiple sources, got: {sources}" - ) - return sources[0] - - -def is_directory(path_or_data, storage_options=None): - """Returns True if the provided filepath is a directory""" - path_or_data = stringify_pathlike(path_or_data) - if isinstance(path_or_data, str): - path_or_data = os.path.expanduser(path_or_data) - try: - fs = get_fs_token_paths( - path_or_data, mode="rb", storage_options=storage_options - )[0] - except ValueError as e: - if str(e).startswith("Protocol not known"): - return False - else: - raise e - - return fs.isdir(path_or_data) - - return False - - -def _get_filesystem_and_paths( - path_or_data, - storage_options, - *, - filesystem=None, -): - # Returns a filesystem object and the filesystem-normalized - # paths. If `path_or_data` does not correspond to a path or - # list of paths (or if the protocol is not supported), the - # return will be `None` for the fs and `[]` for the paths. - # If a filesystem object is already available, it can be - # passed with the `filesystem` argument. - - fs = None - return_paths = path_or_data - if isinstance(path_or_data, str) or ( - isinstance(path_or_data, list) - and isinstance(stringify_pathlike(path_or_data[0]), str) - ): - # Ensure we are always working with a list - if isinstance(path_or_data, list): - path_or_data = [ - os.path.expanduser(stringify_pathlike(source)) - for source in path_or_data - ] - else: - path_or_data = [path_or_data] - - if filesystem is None: - try: - fs, _, fs_paths = get_fs_token_paths( - path_or_data, mode="rb", storage_options=storage_options - ) - return_paths = fs_paths - except ValueError as e: - if str(e).startswith("Protocol not known"): - return None, [] - else: - raise e - else: - if not isinstance(filesystem, fsspec.AbstractFileSystem): - raise ValueError( - f"Expected fsspec.AbstractFileSystem. Got {filesystem}" - ) - - if storage_options: - raise ValueError( - f"Cannot specify storage_options when an explicit " - f"filesystem object is specified. Got: {storage_options}" - ) - - fs = filesystem - return_paths = [ - fs._strip_protocol(u) - for u in expand_paths_if_needed( - path_or_data, "rb", 1, fs, None - ) - ] - - return fs, return_paths - - -def _maybe_expand_directories(paths, glob_pattern, fs): - # Expand directory paths using a glob pattern. 
- # This is a no-op if either glob_pattern or fs are None - if fs is None or glob_pattern is None: - return paths - expanded_paths = [] - for path in paths: - if fs.isdir(path): - expanded_paths.extend(fs.glob(fs.sep.join([path, glob_pattern]))) - else: - expanded_paths.append(path) - return expanded_paths - - -@doc_get_reader_filepath_or_buffer() -def get_reader_filepath_or_buffer( - path_or_data, - *, - mode="rb", - fs=None, - iotypes=(BytesIO,), - allow_raw_text_input=False, - storage_options=None, - bytes_per_thread=_BYTES_PER_THREAD_DEFAULT, - warn_on_raw_text_input=None, - warn_meta=None, - expand_dir_pattern=None, - prefetch_options=None, -): - """{docstring}""" - - # Convert path_or_data to a list of input data sources - input_sources = [ - stringify_pathlike(source) - for source in ( - path_or_data if is_list_like(path_or_data) else [path_or_data] - ) - ] - if not input_sources: - raise ValueError("Empty input source list: {input_sources}.") - - filepaths_or_buffers = [] - string_paths = [isinstance(source, str) for source in input_sources] - if any(string_paths): - # Sources are all strings. Thes strings are typically - # file paths, but they may also be raw text strings. - - # Don't allow a mix of source types - if not all(string_paths): - raise ValueError("Invalid input source list: {input_sources}.") - - # Make sure we define a filesystem (if possible) - paths = input_sources - raw_text_input = False - if fs is None: - fs, paths = _get_filesystem_and_paths(paths, storage_options) - - # Expand directories (if necessary) - paths = _maybe_expand_directories(paths, expand_dir_pattern, fs) - - if _is_local_filesystem(fs): - # Doing this as `read_json` accepts a json string - # path_or_data need not be a filepath like string - - # helper for checking if raw text looks like a json filename - compression_extensions = [ - ".tar", - ".tar.gz", - ".tar.bz2", - ".tar.xz", - ".gz", - ".bz2", - ".zip", - ".xz", - ".zst", - "", - ] - - if len(paths): - if fs.exists(paths[0]): - filepaths_or_buffers = paths - - # raise FileNotFound if path looks like json - # following pandas - # see - # https://github.com/pandas-dev/pandas/pull/46718/files#diff-472ce5fe087e67387942e1e1c409a5bc58dde9eb8a2db6877f1a45ae4974f694R724-R729 - elif not allow_raw_text_input or paths[0].lower().endswith( - tuple(f".json{c}" for c in compression_extensions) - ): - raise FileNotFoundError( - f"{input_sources} could not be resolved to any files" - ) - else: - raw_text_input = True - else: - raw_text_input = True - - elif fs is not None: - if len(paths) == 0: - raise FileNotFoundError( - f"{input_sources} could not be resolved to any files" - ) - filepaths_or_buffers = _prefetch_remote_buffers( - paths, - fs, - **(prefetch_options or {}), - ) - else: - raw_text_input = True - - if raw_text_input: - filepaths_or_buffers = input_sources - if warn_on_raw_text_input: - # Do not remove until pandas 3.0 support is added. - assert ( - PANDAS_LT_300 - ), "Need to drop after pandas-3.0 support is added." - warnings.warn( - f"Passing literal {warn_meta[0]} to {warn_meta[1]} is " - "deprecated and will be removed in a future version. 
" - "To read from a literal string, wrap it in a " - "'StringIO' object.", - FutureWarning, - ) - - else: - # Sources are already buffers or file-like objects - for source in input_sources: - if not isinstance(source, iotypes) and is_file_like(source): - if isinstance(source, TextIOWrapper): - source = source.buffer - filepaths_or_buffers.append( - BytesIO( - _fsspec_data_transfer( - source, - mode=mode, - bytes_per_thread=bytes_per_thread, - ) - ) - ) - else: - filepaths_or_buffers.append(source) - - return filepaths_or_buffers - - -def get_writer_filepath_or_buffer(path_or_data, mode, storage_options=None): - """ - Return either a filepath string to data, - or a open file object to the output filesystem - - Parameters - ---------- - path_or_data : str, file-like object, bytes, ByteIO - Path to data or the data itself. - mode : str - Mode in which file is opened - storage_options : dict, optional, default None - Extra options that make sense for a particular storage connection, - e.g. host, port, username, password, etc. For HTTP(S) URLs the - key-value pairs are forwarded to ``urllib.request.Request`` as - header options. For other URLs (e.g. starting with "s3://", and - "gcs://") the key-value pairs are forwarded to ``fsspec.open``. - Please see ``fsspec`` and ``urllib`` for more details. - - Returns - ------- - filepath_or_buffer : str, - Filepath string or buffer of data - """ - if storage_options is None: - storage_options = {} - - if isinstance(path_or_data, str): - path_or_data = os.path.expanduser(path_or_data) - fs = get_fs_token_paths( - path_or_data, mode=mode or "w", storage_options=storage_options - )[0] - - if not _is_local_filesystem(fs): - filepath_or_buffer = fsspec.open( - path_or_data, mode=mode or "w", **(storage_options) - ) - return filepath_or_buffer - - return path_or_data - - -def get_IOBase_writer(file_obj): - """ - Parameters - ---------- - file_obj : file-like object - Open file object for writing to any filesystem - - Returns - ------- - iobase_file_obj : file-like object - Open file object inheriting from io.IOBase - """ - if not isinstance(file_obj, IOBase): - if "b" in file_obj.mode: - iobase_file_obj = BufferedWriter(file_obj) - else: - iobase_file_obj = TextIOWrapper(file_obj) - return iobase_file_obj - - return file_obj - - -def is_fsspec_open_file(file_obj): - if isinstance(file_obj, fsspec.core.OpenFile): - return True - return False - - -def stringify_pathlike(pathlike): - """ - Convert any object that implements the fspath protocol - to a string. Leaves other objects unchanged - - Parameters - ---------- - pathlike - Pathlike object that implements the fspath protocol - - Returns - ------- - maybe_pathlike_str - String version of the object if possible - """ - maybe_pathlike_str = ( - pathlike.__fspath__() if hasattr(pathlike, "__fspath__") else pathlike - ) - - return maybe_pathlike_str - - -def buffer_write_lines(buf, lines): - """ - Appends lines to a buffer. - - Parameters - ---------- - buf - The buffer to write to - lines - The lines to append. 
- """ - if any(isinstance(x, str) for x in lines): - lines = [str(x) for x in lines] - buf.write("\n".join(lines)) - - -def _apply_filter_bool_eq(val, col_stats): - if "true_count" in col_stats and "false_count" in col_stats: - if val is True: - if (col_stats["true_count"] == 0) or ( - col_stats["false_count"] == col_stats["number_of_values"] - ): - return False - elif val is False: - if (col_stats["false_count"] == 0) or ( - col_stats["true_count"] == col_stats.number_of_values - ): - return False - return True - - -def _apply_filter_not_eq(val, col_stats): - return ("minimum" in col_stats and val < col_stats["minimum"]) or ( - "maximum" in col_stats and val > col_stats["maximum"] - ) - - -def _apply_predicate(op, val, col_stats): - # Sanitize operator - if op not in {"=", "==", "!=", "<", "<=", ">", ">=", "in", "not in"}: - raise ValueError(f"'{op}' is not a valid operator in predicates.") - - col_min = col_stats.get("minimum", None) - col_max = col_stats.get("maximum", None) - col_sum = col_stats.get("sum", None) - - # Apply operator - if op == "=" or op == "==": - if _apply_filter_not_eq(val, col_stats): - return False - # TODO: Replace pd.isnull with - # cudf.isnull once it is implemented - if pd.isnull(val) and not col_stats.has_null: - return False - if not _apply_filter_bool_eq(val, col_stats): - return False - elif op == "!=": - if ( - col_min is not None - and col_max is not None - and val == col_min - and val == col_max - ): - return False - if _apply_filter_bool_eq(val, col_stats): - return False - elif col_min is not None and ( - (op == "<" and val <= col_min) or (op == "<=" and val < col_min) - ): - return False - elif col_max is not None and ( - (op == ">" and val >= col_max) or (op == ">=" and val > col_max) - ): - return False - elif ( - col_sum is not None - and op == ">" - and ( - (col_min is not None and col_min >= 0 and col_sum <= val) - or (col_max is not None and col_max <= 0 and col_sum >= val) - ) - ): - return False - elif ( - col_sum is not None - and op == ">=" - and ( - (col_min is not None and col_min >= 0 and col_sum < val) - or (col_max is not None and col_max <= 0 and col_sum > val) - ) - ): - return False - elif op == "in": - if (col_max is not None and col_max < min(val)) or ( - col_min is not None and col_min > max(val) - ): - return False - if all(_apply_filter_not_eq(elem, col_stats) for elem in val): - return False - elif op == "not in" and col_min is not None and col_max is not None: - if any(elem == col_min == col_max for elem in val): - return False - col_range = None - if isinstance(col_min, int): - col_range = range(col_min, col_max) - elif isinstance(col_min, datetime.datetime): - col_range = pd.date_range(col_min, col_max) - if col_range and all(elem in val for elem in col_range): - return False - return True - - -def _apply_filters(filters, stats): - for conjunction in filters: - if all( - _apply_predicate(op, val, stats[col]) - for col, op, val in conjunction - ): - return True - return False - - -def _prepare_filters(filters): - # Coerce filters into list of lists of tuples - if isinstance(filters[0][0], str): - filters = [filters] - - return filters - - -def _ensure_filesystem(passed_filesystem, path, storage_options): - if passed_filesystem is None: - return get_fs_token_paths( - path[0] if isinstance(path, list) else path, - storage_options={} if storage_options is None else storage_options, - )[0] - return passed_filesystem - - -# -# Fsspec Data-transfer Optimization Code -# - - -def _fsspec_data_transfer( - path_or_fob, - fs=None, - 
file_size=None, - bytes_per_thread=_BYTES_PER_THREAD_DEFAULT, - max_gap=64_000, - mode="rb", -): - if bytes_per_thread is None: - bytes_per_thread = _BYTES_PER_THREAD_DEFAULT - - # Require `fs` if `path_or_fob` is not file-like - file_like = is_file_like(path_or_fob) - if fs is None and not file_like: - raise ValueError( - "fs must be defined if `path_or_fob` is not file-like" - ) - - # Calculate total file size - if file_like: - try: - file_size = path_or_fob.size - except AttributeError: - # If we cannot find the size of path_or_fob - # just read it. - return path_or_fob.read() - file_size = file_size or fs.size(path_or_fob) - - # Check if a direct read makes the most sense - if bytes_per_thread >= file_size: - if file_like: - return path_or_fob.read() - else: - return fs.open(path_or_fob, mode=mode, cache_type="all").read() - - # Threaded read into "local" buffer - buf = np.zeros(file_size, dtype="b") - - byte_ranges = [ - (b, min(bytes_per_thread, file_size - b)) - for b in range(0, file_size, bytes_per_thread) - ] - _read_byte_ranges( - path_or_fob, - byte_ranges, - buf, - fs=fs, - ) - - return buf.tobytes() - - -def _merge_ranges(byte_ranges, max_block=256_000_000, max_gap=64_000): - # Simple utility to merge small/adjacent byte ranges - new_ranges = [] - if not byte_ranges: - # Early return - return new_ranges - - offset, size = byte_ranges[0] - for new_offset, new_size in byte_ranges[1:]: - gap = new_offset - (offset + size) - if gap > max_gap or (size + new_size + gap) > max_block: - # Gap is too large or total read is too large - new_ranges.append((offset, size)) - offset = new_offset - size = new_size - continue - size += new_size + gap - new_ranges.append((offset, size)) - return new_ranges - - -def _assign_block(fs, path_or_fob, local_buffer, offset, nbytes): - if fs is None: - # We have an open fsspec file object - path_or_fob.seek(offset) - local_buffer[offset : offset + nbytes] = np.frombuffer( - path_or_fob.read(nbytes), - dtype="b", - ) - else: - # We have an fsspec filesystem and a path - with fs.open(path_or_fob, mode="rb", cache_type="none") as fob: - fob.seek(offset) - local_buffer[offset : offset + nbytes] = np.frombuffer( - fob.read(nbytes), - dtype="b", - ) - - -def _read_byte_ranges( - path_or_fob, - ranges, - local_buffer, - fs=None, -): - # Simple utility to copy remote byte ranges - # into a local buffer for IO in libcudf - workers = [] - for offset, nbytes in ranges: - if len(ranges) > 1: - workers.append( - Thread( - target=_assign_block, - args=(fs, path_or_fob, local_buffer, offset, nbytes), - ) - ) - workers[-1].start() - else: - _assign_block(fs, path_or_fob, local_buffer, offset, nbytes) - - for worker in workers: - worker.join() - - -def _get_remote_bytes_all( - remote_paths, fs, *, blocksize=_BYTES_PER_THREAD_DEFAULT -): - # TODO: Experiment with a heuristic to avoid the fs.sizes - # call when we are reading many files at once (the latency - # of collecting the file sizes is unnecessary in this case) - if max(sizes := fs.sizes(remote_paths)) <= blocksize: - # Don't bother breaking up individual files - return fs.cat_ranges(remote_paths, None, None) - else: - # Construct list of paths, starts, and ends - paths, starts, ends = map( - list, - zip( - *( - (r, j, min(j + blocksize, s)) - for r, s in zip(remote_paths, sizes) - for j in range(0, s, blocksize) - ) - ), - ) - - # Collect the byte ranges - chunks = fs.cat_ranges(paths, starts, ends) - - # Construct local byte buffers - # (Need to make sure path offsets are ordered correctly) - unique_count = 
dict(zip(*np.unique(paths, return_counts=True))) - offset = np.cumsum([0] + [unique_count[p] for p in remote_paths]) - buffers = [ - functools.reduce(operator.add, chunks[offset[i] : offset[i + 1]]) - for i in range(len(remote_paths)) - ] - return buffers - - -def _get_remote_bytes_parquet( - remote_paths, - fs, - *, - columns=None, - row_groups=None, - blocksize=_BYTES_PER_THREAD_DEFAULT, -): - if fsspec_parquet is None or (columns is None and row_groups is None): - return _get_remote_bytes_all(remote_paths, fs, blocksize=blocksize) - - sizes = fs.sizes(remote_paths) - data = fsspec_parquet._get_parquet_byte_ranges( - remote_paths, - fs, - columns=columns, - row_groups=row_groups, - max_block=blocksize, - ) - - buffers = [] - for size, path in zip(sizes, remote_paths): - path_data = data[path] - buf = np.empty(size, dtype="b") - for range_offset in path_data.keys(): - chunk = path_data[range_offset] - buf[range_offset[0] : range_offset[1]] = np.frombuffer( - chunk, dtype="b" - ) - buffers.append(buf.tobytes()) - return buffers - - -def _prefetch_remote_buffers( - paths, - fs, - *, - method="all", - **prefetch_options, -): - # Gather bytes ahead of time for remote filesystems - if fs and paths and not _is_local_filesystem(fs): - try: - prefetcher = { - "parquet": _get_remote_bytes_parquet, - "all": _get_remote_bytes_all, - }[method] - except KeyError: - raise ValueError( - f"{method} is not a supported remote-data prefetcher." - " Expected 'parquet' or 'all'." - ) - return prefetcher( - paths, - fs, - **prefetch_options, - ) - - else: - return paths diff --git a/python/cudf/cudf/utils/performance_tracking.py b/python/cudf/cudf/utils/performance_tracking.py deleted file mode 100644 index 30c891d0d5a..00000000000 --- a/python/cudf/cudf/utils/performance_tracking.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from __future__ import annotations - -import contextlib -import functools -import hashlib -import sys - -import nvtx - -import rmm.statistics - -from cudf.options import get_option - -_NVTX_COLORS = ["green", "blue", "purple", "rapids"] - - -def _get_color_for_nvtx(name): - m = hashlib.sha256() - m.update(name.encode()) - hash_value = int(m.hexdigest(), 16) - idx = hash_value % len(_NVTX_COLORS) - return _NVTX_COLORS[idx] - - -def _performance_tracking(func, domain="cudf_python"): - """Decorator for applying performance tracking (if enabled).""" - - @functools.wraps(func) - def wrapper(*args, **kwargs): - with contextlib.ExitStack() as stack: - if get_option("memory_profiling"): - # NB: the user still needs to call `rmm.statistics.enable_statistics()` - # to enable memory profiling. - stack.enter_context( - rmm.statistics.profiler( - name=rmm.statistics._get_descriptive_name_of_object( - func - ) - ) - ) - if nvtx.enabled(): - stack.enter_context( - nvtx.annotate( - message=func.__qualname__, - color=_get_color_for_nvtx(func.__qualname__), - domain=domain, - ) - ) - return func(*args, **kwargs) - - return wrapper - - -_dask_cudf_performance_tracking = functools.partial( - _performance_tracking, domain="dask_cudf_python" -) - - -def get_memory_records() -> ( - dict[str, rmm.statistics.ProfilerRecords.MemoryRecord] -): - """Get the memory records from the memory profiling - - Returns - ------- - Dict that maps function names to memory records. 
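For context: the profiling hooks removed in this hunk need two switches, as the decorator's comment notes. RMM statistics must be enabled and the cudf "memory_profiling" option must be set. A minimal usage sketch (assumes a CUDA-capable environment with the pre-change cudf installed; illustrative only, not part of this diff):

import cudf
import rmm.statistics
from cudf.utils.performance_tracking import print_memory_report

# Both switches are required: RMM collects the per-function memory records,
# and the cudf option makes _performance_tracking enter the profiler context.
rmm.statistics.enable_statistics()
cudf.set_option("memory_profiling", True)

df = cudf.DataFrame({"a": [1, 2, 3]})
df.merge(df, on="a")  # any tracked cudf API call populates the records

print_memory_report()  # formats rmm.statistics.default_profiler_records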
Empty if - memory profiling is disabled - """ - return rmm.statistics.default_profiler_records.records - - -def print_memory_report(file=sys.stdout) -> None: - """Pretty print the result of the memory profiling - - Parameters - ---------- - file - The output stream - """ - print(rmm.statistics.default_profiler_records.report(), file=file) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py deleted file mode 100644 index 78aeac425f7..00000000000 --- a/python/cudf/cudf/utils/queryutils.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import ast -import datetime -from typing import Any - -import numpy as np -from numba import cuda - -import cudf -from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import column_empty -from cudf.utils import applyutils -from cudf.utils._numba import _CUDFNumbaConfig -from cudf.utils.dtypes import ( - BOOL_TYPES, - DATETIME_TYPES, - NUMERIC_TYPES, - TIMEDELTA_TYPES, -) - -ENVREF_PREFIX = "__CUDF_ENVREF__" - -SUPPORTED_QUERY_TYPES = { - np.dtype(dt) - for dt in NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | BOOL_TYPES -} - - -class QuerySyntaxError(ValueError): - pass - - -class _NameExtractor(ast.NodeVisitor): - def __init__(self): - self.colnames = set() - self.refnames = set() - - def visit_Name(self, node): - if not isinstance(node.ctx, ast.Load): - raise QuerySyntaxError("assignment is not allowed") - - name = node.id - chosen = ( - self.refnames if name.startswith(ENVREF_PREFIX) else self.colnames - ) - chosen.add(name) - - -def query_parser(text): - """The query expression parser. - - See https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.query.html - - * names with '@' prefix are global reference. - * other names must be column names of the dataframe. - - Parameters - ---------- - text: str - The query string - - Returns - ------- - info: a `dict` of the parsed info - """ # noqa - # convert any '@' to - text = text.replace("@", ENVREF_PREFIX) - tree = ast.parse(text) - _check_error(tree) - [expr] = tree.body - extractor = _NameExtractor() - extractor.visit(expr) - colnames = sorted(extractor.colnames) - refnames = sorted(extractor.refnames) - info = { - "source": text, - "args": colnames + refnames, - "colnames": colnames, - "refnames": refnames, - } - return info - - -def query_builder(info, funcid): - """Function builder for the query expression - - Parameters - ---------- - info: dict - From the `query_parser()` - funcid: str - The name for the function being generated - - Returns - ------- - func: a python function of the query - """ - args = info["args"] - def_line = "def {funcid}({args}):".format( - funcid=funcid, args=", ".join(args) - ) - lines = [def_line, " return {}".format(info["source"])] - source = "\n".join(lines) - glbs = {} - exec(source, glbs) - return glbs[funcid] - - -def _check_error(tree): - if not isinstance(tree, ast.Module): - raise QuerySyntaxError("top level should be of ast.Module") - if len(tree.body) != 1: - raise QuerySyntaxError("too many expressions") - - -_cache: dict[Any, Any] = {} - - -def query_compile(expr): - """Compile the query expression. - - This generates a CUDA Kernel for the query expression. The kernel is - cached for reuse. All variable names, including both references to - columns and references to variables in the calling environment, in the - expression are passed as argument to the kernel. 
Thus, the kernel is - reusable on any dataframe and in any environment. - - Parameters - ---------- - expr : str - The boolean expression - - Returns - ------- - compiled: dict - key "kernel" is the cuda kernel for the query. - key "args" is a sequence of name of the arguments. - """ - - # hash returns in the semi-open interval [-2**63, 2**63) - funcid = f"queryexpr_{(hash(expr) + 2**63):x}" - # Load cache - compiled = _cache.get(funcid) - # Cache not found - if compiled is None: - info = query_parser(expr) - fn = query_builder(info, funcid) - args = info["args"] - # compile - devicefn = cuda.jit(device=True)(fn) - - kernelid = f"kernel_{funcid}" - kernel = _wrap_query_expr(kernelid, devicefn, args) - - compiled = info.copy() - compiled["kernel"] = kernel - # Store cache - _cache[funcid] = compiled - return compiled - - -_kernel_source = """ -@cuda.jit -def {kernelname}(out, {args}): - idx = cuda.grid(1) - if idx < out.size: - out[idx] = queryfn({indiced_args}) -""" - - -def _wrap_query_expr(name, fn, args): - """Wrap the query expression in a cuda kernel.""" - - def _add_idx(arg): - if arg.startswith(ENVREF_PREFIX): - return arg - else: - return f"{arg}[idx]" - - def _add_prefix(arg): - return f"_args_{arg}" - - glbls = {"queryfn": fn, "cuda": cuda} - kernargs = map(_add_prefix, args) - indiced_args = map(_add_prefix, map(_add_idx, args)) - src = _kernel_source.format( - kernelname=name, - args=", ".join(kernargs), - indiced_args=", ".join(indiced_args), - ) - exec(src, glbls) - kernel = glbls[name] - return kernel - - -@acquire_spill_lock() -def query_execute(df, expr, callenv): - """Compile & execute the query expression - - Note: the expression is compiled and cached for future reuse. - - Parameters - ---------- - df : DataFrame - expr : str - boolean expression - callenv : dict - Contains keys 'local_dict', 'locals' and 'globals' which are all dict. - They represent the arg, local and global dictionaries of the caller. - """ - - # compile - compiled = query_compile(expr) - columns = compiled["colnames"] - - # prepare col args - colarrays = [cudf.core.dataframe.extract_col(df, col) for col in columns] - - # wait to check the types until we know which cols are used - if any(col.dtype not in SUPPORTED_QUERY_TYPES for col in colarrays): - raise TypeError( - "query only supports numeric, datetime, timedelta, " - "or bool dtypes." - ) - - colarrays = [col.data_array_view(mode="read") for col in colarrays] - - kernel = compiled["kernel"] - # process env args - envargs = [] - envdict = callenv["globals"].copy() - envdict.update(callenv["locals"]) - envdict.update(callenv["local_dict"]) - for name in compiled["refnames"]: - name = name[len(ENVREF_PREFIX) :] - try: - val = envdict[name] - if isinstance(val, datetime.datetime): - val = np.datetime64(val) - except KeyError: - msg = "{!r} not defined in the calling environment" - raise NameError(msg.format(name)) - else: - envargs.append(val) - - # allocate output buffer - nrows = len(df) - out = column_empty(nrows, dtype=np.bool_) - # run kernel - args = [out] + colarrays + envargs - with _CUDFNumbaConfig(): - kernel.forall(nrows)(*args) - out_mask = applyutils.make_aggregate_nullmask(df, columns=columns) - return out.set_mask(out_mask).fillna(False) diff --git a/python/cudf/cudf/utils/string.py b/python/cudf/cudf/utils/string.py deleted file mode 100644 index 9c02d1d6b34..00000000000 --- a/python/cudf/cudf/utils/string.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. 
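To make the removed query machinery above concrete: query_parser rewrites '@' environment references to a prefixed identifier and collects names from the AST, query_builder exec's a plain row-wise function, and query_compile then cuda.jit-compiles that function and wraps it in an elementwise kernel. A CPU-only sketch of the first two steps; parse_and_build is an illustrative stand-in, not a cudf function:

import ast

ENVREF_PREFIX = "__CUDF_ENVREF__"

def parse_and_build(text, funcid="queryexpr_demo"):
    # '@name' environment references become prefixed identifiers so that they
    # can be passed as ordinary arguments alongside column names.
    text = text.replace("@", ENVREF_PREFIX)
    tree = ast.parse(text)
    names = sorted(
        {
            node.id
            for node in ast.walk(tree)
            if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load)
        }
    )
    # Build a plain Python function from the expression; the deleted code then
    # compiles a device version of this and wraps it in a per-row kernel.
    src = "def {f}({args}):\n    return {expr}".format(
        f=funcid, args=", ".join(names), expr=text
    )
    glbs = {}
    exec(src, glbs)
    return glbs[funcid], names

fn, names = parse_and_build("a > @threshold")
print(names)                                 # ['__CUDF_ENVREF__threshold', 'a']
print(fn(a=5, __CUDF_ENVREF__threshold=3))   # True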
- - -def format_bytes(nbytes: int) -> str: - """Format `nbytes` to a human readable string""" - n = float(nbytes) - for unit in ["B", "KiB", "MiB", "GiB", "TiB"]: - if abs(n) < 1024: - if n.is_integer(): - return f"{int(n)}{unit}" - return f"{n:.2f}{unit}" - n /= 1024 - return f"{n:.2f} PiB" diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py deleted file mode 100644 index 7347ec7866a..00000000000 --- a/python/cudf/cudf/utils/utils.py +++ /dev/null @@ -1,405 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -import decimal -import functools -import os -import traceback -import warnings - -import numpy as np -import pandas as pd - -import rmm - -import cudf -import cudf.api.types -from cudf.core import column -from cudf.core.buffer import as_buffer - -# The size of the mask in bytes -mask_dtype = cudf.api.types.dtype(np.int32) -mask_bitsize = mask_dtype.itemsize * 8 - -# Mapping from ufuncs to the corresponding binary operators. -_ufunc_binary_operations = { - # Arithmetic binary operations. - "add": "add", - "subtract": "sub", - "multiply": "mul", - "matmul": "matmul", - "divide": "truediv", - "true_divide": "truediv", - "floor_divide": "floordiv", - "power": "pow", - "float_power": "pow", - "remainder": "mod", - "mod": "mod", - "fmod": "mod", - # Bitwise binary operations. - "bitwise_and": "and", - "bitwise_or": "or", - "bitwise_xor": "xor", - # Comparison binary operators - "greater": "gt", - "greater_equal": "ge", - "less": "lt", - "less_equal": "le", - "not_equal": "ne", - "equal": "eq", -} - -# These operators need to be mapped to their inverses when performing a -# reflected ufunc operation because no reflected version of the operators -# themselves exist. When these operators are invoked directly (not via -# __array_ufunc__) Python takes care of calling the inverse operation. -_ops_without_reflection = { - "gt": "lt", - "ge": "le", - "lt": "gt", - "le": "ge", - # ne and eq are symmetric, so they are their own inverse op - "ne": "ne", - "eq": "eq", -} - - -# This is the implementation of __array_ufunc__ used for Frame and Column. -# For more detail on this function and how it should work, see -# https://numpy.org/doc/stable/reference/ufuncs.html -def _array_ufunc(obj, ufunc, method, inputs, kwargs): - # We don't currently support reduction, accumulation, etc. We also - # don't support any special kwargs or higher arity ufuncs than binary. - if method != "__call__" or kwargs or ufunc.nin > 2: - return NotImplemented - - fname = ufunc.__name__ - if fname in _ufunc_binary_operations: - reflect = obj is not inputs[0] - other = inputs[0] if reflect else inputs[1] - - op = _ufunc_binary_operations[fname] - if reflect and op in _ops_without_reflection: - op = _ops_without_reflection[op] - reflect = False - op = f"__{'r' if reflect else ''}{op}__" - - # float_power returns float irrespective of the input type. - # TODO: Do not get the attribute directly, get from the operator module - # so that we can still exploit reflection. - if fname == "float_power": - return getattr(obj, op)(other).astype(float) - return getattr(obj, op)(other) - - # Special handling for various unary operations. - if fname == "negative": - return obj * -1 - if fname == "positive": - return obj.copy(deep=True) - if fname == "invert": - return ~obj - if fname == "absolute": - # TODO: Make sure all obj (mainly Column) implement abs. 
- return abs(obj) - if fname == "fabs": - return abs(obj).astype(np.float64) - - # None is a sentinel used by subclasses to trigger cupy dispatch. - return None - - -_EQUALITY_OPS = { - "__eq__", - "__ne__", - "__lt__", - "__gt__", - "__le__", - "__ge__", -} - -# The test root is set by pytest to support situations where tests are run from -# a source tree on a built version of cudf. -NO_EXTERNAL_ONLY_APIS = os.getenv("NO_EXTERNAL_ONLY_APIS") - -_cudf_root = os.path.dirname(cudf.__file__) -# If the environment variable for the test root is not set, we default to -# using the path relative to the cudf root directory. -_tests_root = os.getenv("_CUDF_TEST_ROOT") or os.path.join(_cudf_root, "tests") - - -def _external_only_api(func, alternative=""): - """Decorator to indicate that a function should not be used internally. - - cudf contains many APIs that exist for pandas compatibility but are - intrinsically inefficient. For some of these cudf has internal - equivalents that are much faster. Usage of the slow public APIs inside - our implementation can lead to unnecessary performance bottlenecks. - Applying this decorator to such functions and setting the environment - variable NO_EXTERNAL_ONLY_APIS will cause such functions to raise - exceptions if they are called from anywhere inside cudf, making it easy - to identify and excise such usage. - - The `alternative` should be a complete phrase or sentence since it will - be used verbatim in error messages. - """ - - # If the first arg is a string then an alternative function to use in - # place of this API was provided, so we pass that to a subsequent call. - # It would be cleaner to implement this pattern by using a class - # decorator with a factory method, but there is no way to generically - # wrap docstrings on a class (we would need the docstring to be on the - # class itself, not instances, because that's what `help` looks at) and - # there is also no way to make mypy happy with that approach. - if isinstance(func, str): - return lambda actual_func: _external_only_api(actual_func, func) - - if not NO_EXTERNAL_ONLY_APIS: - return func - - @functools.wraps(func) - def wrapper(*args, **kwargs): - # Check the immediately preceding frame to see if it's in cudf. - pre_frame = traceback.extract_stack(limit=2)[0] - fn = pre_frame.filename - lineno = pre_frame.lineno - if _cudf_root in fn and _tests_root not in fn: - raise RuntimeError( - f"External-only API called in {fn} at line {lineno}. " - f"{alternative}" - ) - return func(*args, **kwargs) - - return wrapper - - -def initfunc(f): - """ - Decorator for initialization functions that should - be run exactly once. - """ - - @functools.wraps(f) - def wrapper(*args, **kwargs): - if wrapper.initialized: - return - wrapper.initialized = True - return f(*args, **kwargs) - - wrapper.initialized = False - return wrapper - - -def clear_cache(): - """Clear all internal caches""" - cudf.Scalar._clear_instance_cache() - - -class GetAttrGetItemMixin: - """This mixin changes `__getattr__` to attempt a `__getitem__` call. - - Classes that include this mixin gain enhanced functionality for the - behavior of attribute access like `obj.foo`: if `foo` is not an attribute - of `obj`, obj['foo'] will be attempted, and the result returned. To make - this behavior safe, classes that include this mixin must define a class - attribute `_PROTECTED_KEYS` that defines the attributes that are accessed - within `__getitem__`. 
For example, if `__getitem__` is defined as - `return self._data[key]`, we must define `_PROTECTED_KEYS={'_data'}`. - """ - - # Tracking of protected keys by each subclass is necessary to make the - # `__getattr__`->`__getitem__` call safe. See - # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # noqa: E501 - # for an explanation. In brief, defining the `_PROTECTED_KEYS` allows this - # class to avoid calling `__getitem__` inside `__getattr__` when - # `__getitem__` will internally again call `__getattr__`, resulting in an - # infinite recursion. - # This problem only arises when the copy protocol is invoked (e.g. by - # `copy.copy` or `pickle.dumps`), and could also be avoided by redefining - # methods involved with the copy protocol such as `__reduce__` or - # `__setstate__`, but this class may be used in complex multiple - # inheritance hierarchies that might also override serialization. The - # solution here is a minimally invasive change that avoids such conflicts. - _PROTECTED_KEYS: frozenset[str] | set[str] = frozenset() - - def __getattr__(self, key): - if key in self._PROTECTED_KEYS: - raise AttributeError - try: - return self[key] - except KeyError: - raise AttributeError( - f"{type(self).__name__} object has no attribute {key}" - ) - - -class NotIterable: - def __iter__(self): - """ - Iteration is unsupported. - - See :ref:`iteration ` for more - information. - """ - raise TypeError( - f"{self.__class__.__name__} object is not iterable. " - f"Consider using `.to_arrow()`, `.to_pandas()` or `.values_host` " - f"if you wish to iterate over the values." - ) - - -def pa_mask_buffer_to_mask(mask_buf, size): - """ - Convert PyArrow mask buffer to cuDF mask buffer - """ - mask_size = cudf._lib.null_mask.bitmask_allocation_size_bytes(size) - if mask_buf.size < mask_size: - dbuf = rmm.DeviceBuffer(size=mask_size) - dbuf.copy_from_host(np.asarray(mask_buf).view("u1")) - return as_buffer(dbuf) - return as_buffer(mask_buf) - - -def _isnat(val): - """Wraps np.isnat to return False instead of error on invalid inputs.""" - if val is pd.NaT: - return True - elif not isinstance(val, (np.datetime64, np.timedelta64, str)): - return False - else: - try: - return val in {"NaT", "NAT"} or np.isnat(val) - except TypeError: - return False - - -def search_range(x: int, ri: range, *, side: str) -> int: - """ - - Find insertion point in a range to maintain sorted order - - Parameters - ---------- - x - Integer to insert - ri - Range to insert into - side - Tie-breaking decision for the case that `x` is a member of the - range. If `"left"` then the insertion point is before the - entry, otherwise it is after. 
- - Returns - ------- - int - The insertion point - - See Also - -------- - numpy.searchsorted - - Notes - ----- - Let ``p`` be the return value, then if ``side="left"`` the - following invariants are maintained:: - - all(x < n for n in ri[:p]) - all(x >= n for n in ri[p:]) - - Conversely, if ``side="right"`` then we have:: - - all(x <= n for n in ri[:p]) - all(x > n for n in ri[p:]) - - Examples - -------- - For series: 1 4 7 - >>> search_range(4, range(1, 10, 3), side="left") - 1 - >>> search_range(4, range(1, 10, 3), side="right") - 2 - """ - assert side in {"left", "right"} - if flip := (ri.step < 0): - ri = ri[::-1] - shift = int(side == "right") - else: - shift = int(side == "left") - - offset = (x - ri.start - shift) // ri.step + 1 - if flip: - offset = len(ri) - offset - return max(min(len(ri), offset), 0) - - -def is_na_like(obj): - """ - Check if `obj` is a cudf NA value, - i.e., None, cudf.NA or cudf.NaT - """ - return obj is None or obj is cudf.NA or obj is cudf.NaT - - -def _warn_no_dask_cudf(fn): - @functools.wraps(fn) - def wrapper(self): - # try import - try: - # Import dask_cudf (if available) in case - # this is being called within Dask Dataframe - import dask_cudf # noqa: F401 - - except ImportError: - warnings.warn( - f"Using dask to tokenize a {type(self)} object, " - "but `dask_cudf` is not installed. Please install " - "`dask_cudf` for proper dispatching." - ) - return fn(self) - - return wrapper - - -def _is_same_name(left_name, right_name): - # Internal utility to compare if two names are same. - with warnings.catch_warnings(): - # numpy throws warnings while comparing - # NaT values with non-NaT values. - warnings.simplefilter("ignore") - try: - same = (left_name is right_name) or (left_name == right_name) - if not same: - if isinstance(left_name, decimal.Decimal) and isinstance( - right_name, decimal.Decimal - ): - return left_name.is_nan() and right_name.is_nan() - if isinstance(left_name, float) and isinstance( - right_name, float - ): - return np.isnan(left_name) and np.isnan(right_name) - if isinstance(left_name, np.datetime64) and isinstance( - right_name, np.datetime64 - ): - return np.isnan(left_name) and np.isnan(right_name) - return same - except TypeError: - return False - - -def _all_bools_with_nulls(lhs, rhs, bool_fill_value): - # Internal utility to construct a boolean column - # by combining nulls from `lhs` & `rhs`. 
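# Illustrative note (not in the deleted source): with three rows where lhs is
# null at row 1 and rhs is null at row 2, and bool_fill_value=True, the result
# below is [True, <NA>, <NA>]; every value is the fill value, and the output
# validity mask is the AND of the two input masks (or the single available
# mask when only one side has nulls).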
- if lhs.has_nulls() and rhs.has_nulls(): - result_mask = lhs._get_mask_as_column() & rhs._get_mask_as_column() - elif lhs.has_nulls(): - result_mask = lhs._get_mask_as_column() - elif rhs.has_nulls(): - result_mask = rhs._get_mask_as_column() - else: - result_mask = None - - result_col = column.as_column( - bool_fill_value, dtype=cudf.dtype(np.bool_), length=len(lhs) - ) - if result_mask is not None: - result_col = result_col.set_mask(result_mask.as_mask()) - return result_col diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index a7cb66d7b16..9c550cd0d60 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -13,41 +13,17 @@ # ============================================================================= set(cython_sources - aggregation.pyx binaryop.pyx column.pyx column_factories.pyx concatenate.pyx - copying.pyx datetime.pyx - experimental.pyx - expressions.pyx filling.pyx gpumemoryview.pyx - groupby.pyx interop.pyx - join.pyx - labeling.pyx - lists.pyx - merge.pyx - null_mask.pyx - partitioning.pyx - quantiles.pyx - reduce.pyx - replace.pyx - reshape.pyx - rolling.pyx - round.pyx scalar.pyx - search.pyx - stream_compaction.pyx - sorting.pyx table.pyx - traits.pyx - transform.pyx - transpose.pyx types.pyx - unary.pyx utils.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 2a5365e8fad..b96028b1959 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -45,51 +45,8 @@ types, unary, ) -from .column import Column -from .gpumemoryview import gpumemoryview -from .scalar import Scalar -from .table import Table -from .types import DataType, MaskState, TypeId - -__all__ = [ - "Column", - "DataType", - "MaskState", - "Scalar", - "Table", - "TypeId", - "aggregation", - "binaryop", - "column_factories", - "concatenate", - "copying", - "datetime", - "experimental", - "expressions", - "filling", - "gpumemoryview", - "groupby", - "interop", - "io", - "join", - "labeling", - "lists", - "merge", - "null_mask", - "partitioning", - "quantiles", - "reduce", - "replace", - "reshape", - "rolling", - "round", - "search", - "stream_compaction", - "strings", - "sorting", - "traits", - "transform", - "transpose", - "types", - "unary", -] +# from .column import Column +# from .gpumemoryview import gpumemoryview +# from .scalar import Scalar +# from .table import Table +# from .types import DataType, MaskState, TypeId
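The stripe-pruning helpers deleted from the ORC reader utilities earlier in this diff (_prepare_filters, _apply_predicate, _apply_filters) take filters in disjunctive normal form: an outer list of OR'd conjunctions, each a list of (column, op, value) tuples, evaluated against per-stripe column statistics. A simplified, self-contained sketch of the idea, restricted to min/max checks (the removed code also handles null counts, sums, bools, and set membership); names here are illustrative:

def predicate_may_match(op, val, stats):
    # Return True unless the column statistics prove the predicate can never hold.
    lo, hi = stats.get("minimum"), stats.get("maximum")
    if op in ("=", "=="):
        return not ((lo is not None and val < lo) or (hi is not None and val > hi))
    if op == "<":
        return lo is None or val > lo
    if op == ">":
        return hi is None or val < hi
    return True  # be permissive for operators not modelled in this sketch

def stripe_may_match(filters, stripe_stats):
    # DNF semantics: keep the stripe if ANY conjunction passes ALL its predicates.
    return any(
        all(predicate_may_match(op, val, stripe_stats[col]) for col, op, val in conj)
        for conj in filters
    )

stats = {"x": {"minimum": 10, "maximum": 20}}
print(stripe_may_match([[("x", ">", 15)]], stats))  # True  -> stripe kept
print(stripe_may_match([[("x", ">", 25)]], stats))  # False -> stripe skipped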
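The threaded fsspec transfer path earlier in this diff coalesces small or nearby byte ranges before issuing reads, so remote filesystems see fewer, larger requests. A standalone restatement of that _merge_ranges rule with a worked example (same logic, default 64 kB gap; not part of the change):

def merge_ranges(byte_ranges, max_block=256_000_000, max_gap=64_000):
    # Extend the current (offset, size) window unless the gap to the next
    # range is too large or the merged read would exceed max_block.
    new_ranges = []
    if not byte_ranges:
        return new_ranges
    offset, size = byte_ranges[0]
    for new_offset, new_size in byte_ranges[1:]:
        gap = new_offset - (offset + size)
        if gap > max_gap or (size + new_size + gap) > max_block:
            new_ranges.append((offset, size))
            offset, size = new_offset, new_size
            continue
        size += new_size + gap
    new_ranges.append((offset, size))
    return new_ranges

# 0-100 and 150-250 are within 64 kB of each other, so they merge into a single
# 0-250 read; the range starting at 1_000_000 is too far away and stays separate.
print(merge_ranges([(0, 100), (150, 100), (1_000_000, 100)]))
# [(0, 250), (1000000, 100)]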
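The insertion-point helper search_range removed in the utils.py hunk can be checked in isolation. This restates the same logic, verifies the docstring's examples against numpy.searchsorted, and spot-checks a descending range; illustrative only:

import numpy as np

def search_range(x: int, ri: range, *, side: str) -> int:
    # Same logic as the deleted cudf.utils.utils.search_range: descending
    # ranges are flipped, searched ascending, and the offset is mirrored back.
    assert side in {"left", "right"}
    if flip := (ri.step < 0):
        ri = ri[::-1]
        shift = int(side == "right")
    else:
        shift = int(side == "left")
    offset = (x - ri.start - shift) // ri.step + 1
    if flip:
        offset = len(ri) - offset
    return max(min(len(ri), offset), 0)

vals = [1, 4, 7]  # range(1, 10, 3) materialized
assert search_range(4, range(1, 10, 3), side="left") == np.searchsorted(vals, 4, "left") == 1
assert search_range(4, range(1, 10, 3), side="right") == np.searchsorted(vals, 4, "right") == 2
# Descending range 9, 6, 3: the value 6 sits at position 1, so "left" inserts
# before it and "right" inserts after it.
assert search_range(6, range(9, 0, -3), side="left") == 1
assert search_range(6, range(9, 0, -3), side="right") == 2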