From dafb3e7559710d5af7118a206312f250eb671558 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 19 Sep 2024 12:06:53 -0500 Subject: [PATCH] Generate GPU vs CPU usage metrics per pytest file in pandas testsuite for `cudf.pandas` (#16739) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR introduces GPU and CPU usage reporting to cudf.pandas pytest suite and the generated metrics will be available for viewing in the existing pandas pytest summary page: https://github.com/rapidsai/cudf/actions/runs/10886370333/attempts/1#summary-30220192117 ![Screenshot 2024-09-16 at 2 39 07 PM](https://github.com/user-attachments/assets/6d31c7d2-8a27-4f02-bf9d-c1b40ad1d756) Note: I'm aware of cases of where both GPU and CPU usage show 0%, which is due to various reasons that I'm working on addressing in a follow-up PR. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/16739 --- .../pandas-tests/job-summary.py | 14 ++++- python/cudf/cudf/pandas/fast_slow_proxy.py | 16 +++++ .../cudf/pandas/scripts/conftest-patch.py | 59 ++++++++++++++++++- .../cudf/pandas/scripts/run-pandas-tests.sh | 5 +- .../pandas/scripts/summarize-test-results.py | 40 +++++++++++++ 5 files changed, 128 insertions(+), 6 deletions(-) diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index 93a815838b7..7a12db927e5 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -68,8 +68,18 @@ def emoji_failed(x): pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index() main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index() diff_df = pr_df - main_df +total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call'] +pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1) +pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1) -pr_df = pr_df[["total", "passed", "failed", "skipped"]] +cpu_usage_mean = pr_df['CPU Usage'].mean().round(2) +gpu_usage_mean = pr_df['GPU Usage'].mean().round(2) + +# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns +pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%' +pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%' + +pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']] diff_df = diff_df[["total", "passed", "failed", "skipped"]] diff_df.columns = diff_df.columns + "_diff" diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed) @@ -95,6 +105,8 @@ def emoji_failed(x): print(comment) print() +print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%") +print() print("Here are the results of running the Pandas tests against this PR:") print() print(df.to_markdown()) diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index afa1ce5f86c..bf2ee6ae624 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -881,6 +881,20 @@ def _assert_fast_slow_eq(left, right): assert_eq(left, right) +def _fast_function_call(): + """ + Placeholder fast function for pytest profiling purposes. + """ + return None + + +def _slow_function_call(): + """ + Placeholder slow function for pytest profiling purposes. + """ + return None + + def _fast_slow_function_call( func: Callable, /, @@ -910,6 +924,7 @@ def _fast_slow_function_call( # try slow path raise Exception() fast = True + _fast_function_call() if _env_get_bool("CUDF_PANDAS_DEBUGGING", False): try: with nvtx.annotate( @@ -952,6 +967,7 @@ def _fast_slow_function_call( from ._logger import log_fallback log_fallback(slow_args, slow_kwargs, err) + _slow_function_call() with disable_module_accelerator(): result = func(*slow_args, **slow_kwargs) return _maybe_wrap_result(result, func, *args, **kwargs), fast diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 505a40b0bfa..d12d2697729 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -1,10 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 import contextlib +import json import os import sys +import traceback +from collections import defaultdict from functools import wraps import pytest @@ -36,4 +39,58 @@ def patch_testing_functions(): pytest.raises = replace_kwargs({"match": None})(pytest.raises) +# Dictionary to store function call counts +function_call_counts = {} # type: ignore + +# The specific functions to track +FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"} + + +def find_pytest_file(frame): + stack = traceback.extract_stack() + absolute_paths = [frame.filename for frame in stack] + for file in absolute_paths: + if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[ + -1 + ].startswith("test_"): + return str(file).rsplit("pandas-tests/", 1)[-1] + return None + + +def trace_calls(frame, event, arg): + if event != "call": + return + code = frame.f_code + func_name = code.co_name + + if func_name in FUNCTION_NAME: + filename = find_pytest_file(frame) + if filename is None: + return + if filename not in function_call_counts: + function_call_counts[filename] = defaultdict(int) + function_call_counts[filename][func_name] += 1 + + +def pytest_sessionstart(session): + # Set the profile function to trace calls + sys.setprofile(trace_calls) + + +def pytest_sessionfinish(session, exitstatus): + # Remove the profile function + sys.setprofile(None) + + +@pytest.hookimpl(trylast=True) +def pytest_unconfigure(config): + if hasattr(config, "workerinput"): + # Running in xdist worker, write the counts before exiting + worker_id = config.workerinput["workerid"] + output_file = f"function_call_counts_worker_{worker_id}.json" + with open(output_file, "w") as f: + json.dump(function_call_counts, f, indent=4) + print(f"Function call counts have been written to {output_file}") + + sys.path.append(os.path.dirname(__file__)) diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh index 9c65b74d081..9b9ce026571 100755 --- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh +++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh @@ -64,8 +64,6 @@ markers = [ "skip_ubsan: Tests known to fail UBSAN check", ] EOF - # append the contents of patch-confest.py to conftest.py - cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py # Substitute `pandas.tests` with a relative import. # This will depend on the location of the test module relative to @@ -137,7 +135,7 @@ and not test_eof_states \ and not test_array_tz" # TODO: Remove "not db" once a postgres & mysql container is set up on the CI -PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ +PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \ -v -m "not single_cpu and not db" \ -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \ --import-mode=importlib \ @@ -146,5 +144,4 @@ PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \ mv *.json .. cd .. - rm -rf pandas-testing/pandas-tests/ diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index ffd2abb960d..4ea0b3b4413 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -12,7 +12,9 @@ """ import argparse +import glob import json +import os from rich.console import Console from rich.table import Table @@ -57,6 +59,44 @@ def get_per_module_results(log_file_name): per_module_results[module_name].setdefault(outcome, 0) per_module_results[module_name]["total"] += 1 per_module_results[module_name][outcome] += 1 + + directory = os.path.dirname(log_file_name) + pattern = os.path.join(directory, "function_call_counts_worker_*.json") + matching_files = glob.glob(pattern) + function_call_counts = {} + + for file in matching_files: + with open(file) as f: + function_call_count = json.load(f) + if not function_call_counts: + function_call_counts.update(function_call_count) + else: + for key, value in function_call_count.items(): + if key not in function_call_counts: + function_call_counts[key] = value + else: + if "_slow_function_call" not in function_call_counts[key]: + function_call_counts[key]["_slow_function_call"] = 0 + if "_fast_function_call" not in function_call_counts[key]: + function_call_counts[key]["_fast_function_call"] = 0 + function_call_counts[key]["_slow_function_call"] += ( + value.get("_slow_function_call", 0) + ) + function_call_counts[key]["_fast_function_call"] += ( + value.get("_fast_function_call", 0) + ) + + for key, value in per_module_results.items(): + if key in function_call_counts: + per_module_results[key]["_slow_function_call"] = ( + function_call_counts[key].get("_slow_function_call", 0) + ) + per_module_results[key]["_fast_function_call"] = ( + function_call_counts[key].get("_fast_function_call", 0) + ) + else: + per_module_results[key]["_slow_function_call"] = 0 + per_module_results[key]["_fast_function_call"] = 0 return per_module_results