From 4dc8300c6104697b1d9313a48d1c4c7f5dabf81a Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 16 Nov 2023 16:22:44 -0600 Subject: [PATCH 1/2] Raise error in `reindex` when `index` is not unique (#14400) (#14429) Backport of #14400 Fixes: #14398 This PR raises an error in `reindex` API when reindexing is performed on a non-unique index column. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14400 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) - Ashwin Srinath (https://github.com/shwina) - Ray Douglass (https://github.com/raydouglass) --- python/cudf/cudf/core/indexed_frame.py | 4 ++++ python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++ python/cudf/cudf/tests/test_series.py | 12 ++++++++++++ python/dask_cudf/dask_cudf/backends.py | 13 ++++--------- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 219b8021241..fef62594fb8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2606,6 +2606,10 @@ def _reindex( df = self if index is not None: + if not df._index.is_unique: + raise ValueError( + "cannot reindex on an axis with duplicate labels" + ) index = cudf.core.index.as_index( index, name=getattr(index, "name", self._index.name) ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d44cf594e8b..5677f97408a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10723,3 +10723,15 @@ def test_dataframe_series_dot(): expected = gser @ [12, 13] assert_eq(expected, actual) + + +def test_dataframe_duplicate_index_reindex(): + gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, 
index=[0, 0, 1, 1]) + pdf = gdf.to_pandas() + + assert_exceptions_equal( + gdf.reindex, + pdf.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8f8f87c20e0..c15a797713f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype(): s = cudf.Series([True, False, True]) with pytest.raises(TypeError): s[0] = 10 + + +def test_series_duplicate_index_reindex(): + gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1]) + ps = gs.to_pandas() + + assert_exceptions_equal( + gs.reindex, + ps.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 344b03c631d..2be256f85e8 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -437,17 +437,12 @@ def union_categoricals_cudf( ) -@_dask_cudf_nvtx_annotate -def safe_hash(frame): - return cudf.Series(frame.hash_values(), index=frame.index) - - @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) @_dask_cudf_nvtx_annotate def hash_object_cudf(frame, index=True): if index: - return safe_hash(frame.reset_index()) - return safe_hash(frame) + frame = frame.reset_index() + return frame.hash_values() @hash_object_dispatch.register(cudf.BaseIndex) @@ -455,10 +450,10 @@ def hash_object_cudf(frame, index=True): def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): - return safe_hash(ind.to_frame(index=False)) + return ind.to_frame(index=False).hash_values() col = cudf.core.column.as_column(ind) - return safe_hash(cudf.Series(col)) + return cudf.Series(col).hash_values() @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) From 
fc8c81f3d4bde674d4123ae4848c578bcc7158b6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 28 Nov 2023 14:13:20 -0600 Subject: [PATCH 2/2] Fix function name typo in `cudf.pandas` profiler (#14514) Fixes: #14512 This PR fixes a function name typo in `cudf.pandas` profiler. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) --- python/cudf/cudf/pandas/__main__.py | 2 +- .../cudf_pandas_tests/data/profile_basic.py | 13 ++++++ .../cudf/cudf_pandas_tests/test_profiler.py | 41 +++++++++++++++++++ 3 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf_pandas_tests/data/profile_basic.py diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py index 02e8e960678..fb8569fa1d0 100644 --- a/python/cudf/cudf/pandas/__main__.py +++ b/python/cudf/cudf/pandas/__main__.py @@ -33,7 +33,7 @@ def profile(function_profile, line_profile, fn): elif function_profile: with Profiler() as profiler: yield fn - profiler.print_per_func_stats() + profiler.print_per_function_stats() else: yield fn diff --git a/python/cudf/cudf_pandas_tests/data/profile_basic.py b/python/cudf/cudf_pandas_tests/data/profile_basic.py new file mode 100644 index 00000000000..f7b4ba89ce7 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/data/profile_basic.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +import pandas as pd + +df = pd.DataFrame( + { + "size": [10, 11, 12, 10, 11, 12, 10, 6, 11, 10], + "total_bill": [100, 200, 100, 200, 100, 100, 200, 50, 10, 560], + } +) +df["size"].value_counts() +df.groupby("size").total_bill.mean() +df.apply(list, axis=1) diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index a947d67b724..4921446ab6b 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -2,6 +2,9 @@ # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +import os +import subprocess + from cudf.pandas import LOADED, Profiler if not LOADED: @@ -68,3 +71,41 @@ def test_profiler_fast_slow_name_mismatch(): with Profiler(): df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) df.iloc[0, 1] = "foo" + + +def test_profiler_commandline(): + data_directory = os.path.dirname(os.path.abspath(__file__)) + # Create a copy of the current environment variables + env = os.environ.copy() + # Setting the 'COLUMNS' environment variable to a large number + # because the terminal output shouldn't be compressed for + # text validations below. + env["COLUMNS"] = "10000" + + sp_completed = subprocess.run( + [ + "python", + "-m", + "cudf.pandas", + "--profile", + data_directory + "/data/profile_basic.py", + ], + capture_output=True, + text=True, + env=env, + ) + assert sp_completed.returncode == 0 + output = sp_completed.stdout + + for string in [ + "Total time", + "Stats", + "Function", + "GPU ncalls", + "GPU cumtime", + "GPU percall", + "CPU ncalls", + "CPU cumtime", + "CPU percall", + ]: + assert string in output