Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Forward-merge branch-24.10 into branch-24.12 #16841

Merged
merged 1 commit into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion ci/cudf_pandas_scripts/pandas-tests/job-summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,18 @@ def emoji_failed(x):
pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
diff_df = pr_df - main_df
total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call']
pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1)
pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1)

pr_df = pr_df[["total", "passed", "failed", "skipped"]]
cpu_usage_mean = pr_df['CPU Usage'].mean().round(2)
gpu_usage_mean = pr_df['GPU Usage'].mean().round(2)

# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns
pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%'
pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%'

pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']]
diff_df = diff_df[["total", "passed", "failed", "skipped"]]
diff_df.columns = diff_df.columns + "_diff"
diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
Expand All @@ -95,6 +105,8 @@ def emoji_failed(x):

print(comment)
print()
print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%")
print()
print("Here are the results of running the Pandas tests against this PR:")
print()
print(df.to_markdown())
16 changes: 16 additions & 0 deletions python/cudf/cudf/pandas/fast_slow_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,6 +881,20 @@ def _assert_fast_slow_eq(left, right):
assert_eq(left, right)


def _fast_function_call():
"""
Placeholder fast function for pytest profiling purposes.
"""
return None


def _slow_function_call():
"""
Placeholder slow function for pytest profiling purposes.
"""
return None


def _fast_slow_function_call(
func: Callable,
/,
Expand Down Expand Up @@ -910,6 +924,7 @@ def _fast_slow_function_call(
# try slow path
raise Exception()
fast = True
_fast_function_call()
if _env_get_bool("CUDF_PANDAS_DEBUGGING", False):
try:
with nvtx.annotate(
Expand Down Expand Up @@ -952,6 +967,7 @@ def _fast_slow_function_call(
from ._logger import log_fallback

log_fallback(slow_args, slow_kwargs, err)
_slow_function_call()
with disable_module_accelerator():
result = func(*slow_args, **slow_kwargs)
return _maybe_wrap_result(result, func, *args, **kwargs), fast
Expand Down
59 changes: 58 additions & 1 deletion python/cudf/cudf/pandas/scripts/conftest-patch.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import contextlib
import json
import os
import sys
import traceback
from collections import defaultdict
from functools import wraps

import pytest
Expand Down Expand Up @@ -36,4 +39,58 @@ def patch_testing_functions():
pytest.raises = replace_kwargs({"match": None})(pytest.raises)


# Call counters gathered while the test session runs. Keys are test file
# paths; each value maps a tracked function name to the number of calls seen.
function_call_counts = {}  # type: ignore

# Names of the functions whose calls are counted by the profile hook
FUNCTION_NAME = {"_fast_function_call", "_slow_function_call"}


def find_pytest_file(frame):
    """
    Locate the pandas test file on the call stack that ends at ``frame``.

    The stack is searched from the outermost caller inwards. The first file
    whose path contains ``pandas-testing/pandas-tests/tests`` and whose
    base name starts with ``test_`` is selected.

    Parameters
    ----------
    frame : frame object or None
        Innermost frame of the stack to inspect. When ``None``, the
        caller's frame is used.

    Returns
    -------
    str or None
        The part of the matching path that follows the last
        ``pandas-tests/`` component, or ``None`` when no frame matches.
    """
    if frame is None:
        frame = sys._getframe(1)

    # Walk the frame chain directly: only file names are needed, so there is
    # no reason to build full traceback summaries (with source-line lookups)
    # on every tracked call.
    filenames = []
    current = frame
    while current is not None:
        filenames.append(current.f_code.co_filename)
        current = current.f_back

    # The chain was collected innermost-first; scan it outermost-first so the
    # earliest test file on the stack wins.
    for path in reversed(filenames):
        in_test_tree = "pandas-testing/pandas-tests/tests" in path
        if in_test_tree and path.rsplit("/", 1)[-1].startswith("test_"):
            return str(path).rsplit("pandas-tests/", 1)[-1]
    return None


def trace_calls(frame, event, arg):
    """
    Profile hook that counts calls to the tracked functions per test file.

    Only ``"call"`` events for functions whose name is in ``FUNCTION_NAME``
    are considered. The call is attributed to the test file reported by
    ``find_pytest_file``; calls with no such file are ignored. Counts are
    accumulated in ``function_call_counts``. Always returns ``None``.
    """
    if event == "call":
        name = frame.f_code.co_name
        if name in FUNCTION_NAME:
            test_file = find_pytest_file(frame)
            if test_file is not None:
                # Create the per-file counter on first sight of this file
                per_file = function_call_counts.setdefault(
                    test_file, defaultdict(int)
                )
                per_file[name] += 1


def pytest_sessionstart(session):
    """Install ``trace_calls`` as the interpreter's profile function."""
    sys.setprofile(trace_calls)


def pytest_sessionfinish(session, exitstatus):
    """Uninstall the profile function by resetting it to ``None``."""
    sys.setprofile(None)


@pytest.hookimpl(trylast=True)
def pytest_unconfigure(config):
    """
    Dump the collected call counts to a per-worker JSON file.

    Nothing is written unless ``config`` has a ``workerinput`` attribute
    (i.e. the process is an xdist worker). The file is named
    ``function_call_counts_worker_<workerid>.json`` and is created in the
    current working directory.
    """
    if not hasattr(config, "workerinput"):
        return

    # Running in xdist worker, write the counts before exiting
    worker_id = config.workerinput["workerid"]
    output_file = f"function_call_counts_worker_{worker_id}.json"
    with open(output_file, "w") as f:
        json.dump(function_call_counts, f, indent=4)
    print(f"Function call counts have been written to {output_file}")


# Put this file's own directory on the import path so that modules located
# next to it can be imported.
sys.path.append(os.path.dirname(__file__))
5 changes: 1 addition & 4 deletions python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,6 @@ markers = [
"skip_ubsan: Tests known to fail UBSAN check",
]
EOF
# append the contents of conftest-patch.py to conftest.py
cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py

# Substitute `pandas.tests` with a relative import.
# This will depend on the location of the test module relative to
Expand Down Expand Up @@ -137,7 +135,7 @@ and not test_eof_states \
and not test_array_tz"

# TODO: Remove "not db" once a postgres & mysql container is set up on the CI
PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \
PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \
-v -m "not single_cpu and not db" \
-k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \
--import-mode=importlib \
Expand All @@ -146,5 +144,4 @@ PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \

mv *.json ..
cd ..

rm -rf pandas-testing/pandas-tests/
40 changes: 40 additions & 0 deletions python/cudf/cudf/pandas/scripts/summarize-test-results.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
"""

import argparse
import glob
import json
import os

from rich.console import Console
from rich.table import Table
Expand Down Expand Up @@ -57,6 +59,44 @@ def get_per_module_results(log_file_name):
per_module_results[module_name].setdefault(outcome, 0)
per_module_results[module_name]["total"] += 1
per_module_results[module_name][outcome] += 1

directory = os.path.dirname(log_file_name)
pattern = os.path.join(directory, "function_call_counts_worker_*.json")
matching_files = glob.glob(pattern)
function_call_counts = {}

for file in matching_files:
with open(file) as f:
function_call_count = json.load(f)
if not function_call_counts:
function_call_counts.update(function_call_count)
else:
for key, value in function_call_count.items():
if key not in function_call_counts:
function_call_counts[key] = value
else:
if "_slow_function_call" not in function_call_counts[key]:
function_call_counts[key]["_slow_function_call"] = 0
if "_fast_function_call" not in function_call_counts[key]:
function_call_counts[key]["_fast_function_call"] = 0
function_call_counts[key]["_slow_function_call"] += (
value.get("_slow_function_call", 0)
)
function_call_counts[key]["_fast_function_call"] += (
value.get("_fast_function_call", 0)
)

for key, value in per_module_results.items():
if key in function_call_counts:
per_module_results[key]["_slow_function_call"] = (
function_call_counts[key].get("_slow_function_call", 0)
)
per_module_results[key]["_fast_function_call"] = (
function_call_counts[key].get("_fast_function_call", 0)
)
else:
per_module_results[key]["_slow_function_call"] = 0
per_module_results[key]["_fast_function_call"] = 0
return per_module_results


Expand Down
Loading