From 4dc8300c6104697b1d9313a48d1c4c7f5dabf81a Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 16 Nov 2023 16:22:44 -0600
Subject: [PATCH 1/2] Raise error in `reindex` when `index` is not unique
 (#14400) (#14429)

Backport of #14400
Fixes: #14398
This PR raises a `ValueError` in the `reindex` API when reindexing is performed on an object whose existing index contains duplicate labels.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14400

Authors:
   - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
   - Richard (Rick) Zamora (https://github.com/rjzamora)
   - Ashwin Srinath (https://github.com/shwina)
   - Ray Douglass (https://github.com/raydouglass)
---
 python/cudf/cudf/core/indexed_frame.py   |  4 ++++
 python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++
 python/cudf/cudf/tests/test_series.py    | 12 ++++++++++++
 python/dask_cudf/dask_cudf/backends.py   | 13 ++++---------
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 219b8021241..fef62594fb8 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -2606,6 +2606,10 @@ def _reindex(
 
         df = self
         if index is not None:
+            if not df._index.is_unique:
+                raise ValueError(
+                    "cannot reindex on an axis with duplicate labels"
+                )
             index = cudf.core.index.as_index(
                 index, name=getattr(index, "name", self._index.name)
             )
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d44cf594e8b..5677f97408a 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10723,3 +10723,15 @@ def test_dataframe_series_dot():
     expected = gser @ [12, 13]
 
     assert_eq(expected, actual)
+
+
+def test_dataframe_duplicate_index_reindex():
+    gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1])
+    pdf = gdf.to_pandas()
+
+    assert_exceptions_equal(
+        gdf.reindex,
+        pdf.reindex,
+        lfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+        rfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+    )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 8f8f87c20e0..c15a797713f 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype():
     s = cudf.Series([True, False, True])
     with pytest.raises(TypeError):
         s[0] = 10
+
+
+def test_series_duplicate_index_reindex():
+    gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1])
+    ps = gs.to_pandas()
+
+    assert_exceptions_equal(
+        gs.reindex,
+        ps.reindex,
+        lfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+        rfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+    )
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 344b03c631d..2be256f85e8 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -437,17 +437,12 @@ def union_categoricals_cudf(
     )
 
 
-@_dask_cudf_nvtx_annotate
-def safe_hash(frame):
-    return cudf.Series(frame.hash_values(), index=frame.index)
-
-
 @hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
 @_dask_cudf_nvtx_annotate
 def hash_object_cudf(frame, index=True):
     if index:
-        return safe_hash(frame.reset_index())
-    return safe_hash(frame)
+        frame = frame.reset_index()
+    return frame.hash_values()
 
 
 @hash_object_dispatch.register(cudf.BaseIndex)
@@ -455,10 +450,10 @@ def hash_object_cudf(frame, index=True):
 def hash_object_cudf_index(ind, index=None):
 
     if isinstance(ind, cudf.MultiIndex):
-        return safe_hash(ind.to_frame(index=False))
+        return ind.to_frame(index=False).hash_values()
 
     col = cudf.core.column.as_column(ind)
-    return safe_hash(cudf.Series(col))
+    return cudf.Series(col).hash_values()
 
 
 @group_split_dispatch.register((cudf.Series, cudf.DataFrame))

From fc8c81f3d4bde674d4123ae4848c578bcc7158b6 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 28 Nov 2023 14:13:20 -0600
Subject: [PATCH 2/2] Fix function name typo in `cudf.pandas` profiler (#14514)

Fixes: #14512

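This PR fixes a function name typo in the `cudf.pandas` profiler: the command-line entry point called `print_per_func_stats` instead of `print_per_function_stats`.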

Authors:
   - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
   - Bradley Dice (https://github.com/bdice)
---
 python/cudf/cudf/pandas/__main__.py           |  2 +-
 .../cudf_pandas_tests/data/profile_basic.py   | 13 ++++++
 .../cudf/cudf_pandas_tests/test_profiler.py   | 41 +++++++++++++++++++
 3 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf_pandas_tests/data/profile_basic.py

diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py
index 02e8e960678..fb8569fa1d0 100644
--- a/python/cudf/cudf/pandas/__main__.py
+++ b/python/cudf/cudf/pandas/__main__.py
@@ -33,7 +33,7 @@ def profile(function_profile, line_profile, fn):
     elif function_profile:
         with Profiler() as profiler:
             yield fn
-        profiler.print_per_func_stats()
+        profiler.print_per_function_stats()
     else:
         yield fn
 
diff --git a/python/cudf/cudf_pandas_tests/data/profile_basic.py b/python/cudf/cudf_pandas_tests/data/profile_basic.py
new file mode 100644
index 00000000000..f7b4ba89ce7
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/data/profile_basic.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+import pandas as pd
+
+df = pd.DataFrame(
+    {
+        "size": [10, 11, 12, 10, 11, 12, 10, 6, 11, 10],
+        "total_bill": [100, 200, 100, 200, 100, 100, 200, 50, 10, 560],
+    }
+)
+df["size"].value_counts()
+df.groupby("size").total_bill.mean()
+df.apply(list, axis=1)
diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py
index a947d67b724..4921446ab6b 100644
--- a/python/cudf/cudf_pandas_tests/test_profiler.py
+++ b/python/cudf/cudf_pandas_tests/test_profiler.py
@@ -2,6 +2,9 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import os
+import subprocess
+
 from cudf.pandas import LOADED, Profiler
 
 if not LOADED:
@@ -68,3 +71,41 @@ def test_profiler_fast_slow_name_mismatch():
     with Profiler():
         df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
         df.iloc[0, 1] = "foo"
+
+
+def test_profiler_commandline():
+    data_directory = os.path.dirname(os.path.abspath(__file__))
+    # Create a copy of the current environment variables
+    env = os.environ.copy()
+    # Setting the 'COLUMNS' environment variable to a large number
+    # because the terminal output shouldn't be compressed for
+    # text validations below.
+    env["COLUMNS"] = "10000"
+
+    sp_completed = subprocess.run(
+        [
+            "python",
+            "-m",
+            "cudf.pandas",
+            "--profile",
+            data_directory + "/data/profile_basic.py",
+        ],
+        capture_output=True,
+        text=True,
+        env=env,
+    )
+    assert sp_completed.returncode == 0
+    output = sp_completed.stdout
+
+    for string in [
+        "Total time",
+        "Stats",
+        "Function",
+        "GPU ncalls",
+        "GPU cumtime",
+        "GPU percall",
+        "CPU ncalls",
+        "CPU cumtime",
+        "CPU percall",
+    ]:
+        assert string in output