#4003: Look further back in time per op file to see if we had a prior failure
eyonland committed Jan 31, 2024
1 parent 8d1e89a commit 7051709
Showing 2 changed files with 138 additions and 79 deletions.
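At a high level, this change replaces the single recent-vs-prior comparison with a per-operation walk back through older workflow runs: for each op file, the script keeps looking further back until it finds a run that did not already show the new failures, which brackets the regression between a commit hash prior to the failure and the commit hash that carries it. The sketch below only illustrates that idea under simplifying assumptions (failures reduced to plain sets; find_regression_window, failed_by_run, and commit_by_run are hypothetical names); the committed implementation in diff_results below works on the per-run CSV artifacts instead.

def find_regression_window(failed_by_run, commit_by_run):
    """Sketch: failed_by_run[i] is the set of failing cases for run i (0 = newest)."""
    recent_failures = failed_by_run[0]
    for older in range(1, len(failed_by_run)):
        introduced = recent_failures - failed_by_run[older]
        if introduced:
            # The regression landed after commit_by_run[older] and at or
            # before commit_by_run[older - 1].
            return commit_by_run[older], commit_by_run[older - 1], introduced
    return None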
1 change: 1 addition & 0 deletions docs/requirements-docs.txt
@@ -8,3 +8,4 @@ nbsphinx==0.9.3
sphinxcontrib-jquery==4.1
ipython==8.12.3
pandoc==2.3
tabulate==0.9.0
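The new tabulate pin backs the switch from HTML tables to reStructuredText output in the sweep-results script below, which calls tabulate(df, headers="keys", tablefmt="rst"). A minimal, self-contained example of that call (the DataFrame contents are made up):

import pandas as pd
from tabulate import tabulate

# Render a DataFrame as an RST grid table, the same call the script uses.
df = pd.DataFrame({"status": ["passed", "failed"], "exception": ["", "RuntimeError"]})
print(tabulate(df, headers="keys", tablefmt="rst"))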
216 changes: 137 additions & 79 deletions tests/ttnn/sweep_tests/build_html_sweep_results.py
@@ -10,17 +10,19 @@
import zipfile
import pandas as pd
from loguru import logger
from dataclasses import dataclass
from tabulate import tabulate


def get_list_of_runs():
params = {"per_page": 3}
params = {"per_page": 15}
url = "https://api.github.com/repos/tenstorrent-metal/tt-metal/actions/workflows/ttnn-run-sweeps.yaml/runs"
headers = {"Accept": "application/vnd.github.v3+json"}
response = requests.get(url, headers=headers, params=params)
if response.status_code == 200:
runs = response.json()
else:
raise RuntimeError(f"Error fetching workflow runs: {response.status_code}")
raise RuntimeError(f"Error fetching workflow runs: {response.status_code}:{response.text}")

return runs

@@ -43,7 +45,7 @@ def download_artifacts(token, artifacts_url, output_path):
else:
raise RuntimeError("No artifacts found. Is there a run in progress?")
else:
raise RuntimeError("Failed to fetch artifacts list.")
raise RuntimeError(f"Failed to fetch artifacts list. {response.status_code}:{response.text}")


def read_csv_from_zip(zip_file, file_name):
@@ -81,75 +83,127 @@ def build_new_crashes(recent_df, prior_df):
return extract_only_recent_changes(failed_recent, failed_prior)


def diff_results(recent_zip, prior_zip, directory_for_html_pages, commit_hash):
@dataclass
class OperationFailure:
file_name: str
failure_file_name: str
commit_hash_with_failure: str
commit_hash_prior_to_failure: str
failures: int


def diff_results(temp_dir_path, most_recent_run_index, total_runs, directory_for_html_pages):
directory_for_html_pages = pathlib.Path(directory_for_html_pages)
html_files = []
html_failure_files = []
rst_failure_files = []
rst_files = []
failures_since_last_run = 0
with zipfile.ZipFile(recent_zip, "r") as zip1, zipfile.ZipFile(prior_zip, "r") as zip2:

recent_zip = temp_dir_path / str(most_recent_run_index) / "artifact.zip"
most_recent_commit_hash = ""
commit_hash_file = temp_dir_path / str(most_recent_run_index) / "commit_hash.txt"
with open(commit_hash_file, "r") as file:
most_recent_commit_hash = file.read()

new_failures = {}

with zipfile.ZipFile(recent_zip, "r") as zip1:
# We want to put the latest csv from the most recent run into html files
zip1_files = set(zip1.namelist())
zip2_files = set(zip2.namelist())
common_files = zip1_files.intersection(zip2_files)
# pd.set_option("display.max_rows", None)
# pd.set_option("display.max_columns", None)
# pd.set_option("display.width", None)
# pd.set_option("display.max_colwidth", 10)
for file_name in common_files:
for file_name in zip1_files:
test_name = pathlib.Path(file_name).stem
if file_name.endswith(".csv"):
recent_df = read_csv_from_zip(zip1, file_name)
html_table = recent_df.to_html()
html_page_name = directory_for_html_pages / f"{test_name}.html"
with open(html_page_name, "w") as f:
f.write(html_table)
html_files.append(f"{test_name}.html")
prior_df = read_csv_from_zip(zip2, file_name)
failures_df = build_new_failures(recent_df, prior_df)
crashes_df = build_new_crashes(recent_df, prior_df)
combined_test_resutls_df = pd.concat([failures_df, crashes_df])
if combined_test_resutls_df.size > 0:
failures_since_last_run = failures_since_last_run + combined_test_resutls_df.size
html_table = combined_test_resutls_df.to_html()
html_page_name = directory_for_html_pages / f"{test_name}_failure.html"
with open(html_page_name, "w") as f:
f.write(html_table)
html_failure_files.append(f"{test_name}_failure.html")

html_template = """
<!DOCTYPE html>
<html>
<head>
<title>Sweep Test Results</title>
<style>
iframe {{
width: 100%;
height: 300px;
border: none;
margin-bottom: 20px;
}}
</style>
</head>
<body>
<h1>Sweep Tests</h1>
<h2>We have had {failures_since_last_run} failures since the prior run.</h2>
<h2>Commit Hash: {commit_hash}</h2>
<br/>
{iframes}
</body>
</html>
"""

iframe_tags = "".join(
[f'<h3>{file.split(".")[0]}</h3><iframe src="{file}"></iframe>' for file in html_failure_files]
rst_table = tabulate(recent_df, headers="keys", tablefmt="rst")
rst_page_name = directory_for_html_pages / f"{test_name}.rst"
with open(rst_page_name, "w") as f:
f.write(rst_table)
new_failures[test_name] = OperationFailure(
f"{test_name}.rst", f"{test_name}_failure.rst", most_recent_commit_hash, "", 0
)
rst_files.append(test_name)

# Now we need to check and see which differences started showing up relative to the most recent run per operation file
for test_name in new_failures:
commit_hash = most_recent_commit_hash
prior_run_index = most_recent_run_index + 1
while new_failures[test_name].failures == 0 and prior_run_index < total_runs - 1:
prior_zip = temp_dir_path / str(prior_run_index) / "artifact.zip"
with zipfile.ZipFile(prior_zip, "r") as zip2:
for file_name in zip2.namelist():
if file_name.endswith(f"{test_name}.csv"):
test_name = pathlib.Path(file_name).stem
recent_df = read_csv_from_zip(zip1, file_name)
prior_df = read_csv_from_zip(zip2, file_name)
failures_df = build_new_failures(recent_df, prior_df)
crashes_df = build_new_crashes(recent_df, prior_df)
combined_test_results_df = pd.concat([failures_df, crashes_df])
if len(combined_test_results_df) > 0:
failures_since_last_run = failures_since_last_run + len(combined_test_results_df)
new_failures[test_name].failures = combined_test_results_df.size
new_failures[test_name].failure_file_name = f"{test_name}_failure.rst"
new_failures[test_name].commit_hash_with_failure = commit_hash

rst_table = tabulate(combined_test_results_df, headers="keys", tablefmt="rst")
rst_page_name = directory_for_html_pages / f"{test_name}_failure.rst"
with open(rst_page_name, "w") as f:
f.write(rst_table)
rst_failure_files.append(new_failures[test_name])

commit_hash_file = temp_dir_path / str(prior_run_index) / "commit_hash.txt"
with open(commit_hash_file, "r") as file:
commit_hash = file.read()
new_failures[test_name].commit_hash_prior_to_failure = commit_hash

prior_run_index = prior_run_index + 1

rst_template = """
Sweep Test Results
==================
Recent Failures
---------------
We have had {failures_since_last_run} failures since the prior run.
.. contents::
:local:
:depth: 1
{links}
All Sweep Tests
--------------------
These are the sweep tests for commit hash {most_recent_commit_hash}
{sweep_test_results}
"""

link_tags = "\n".join(
[
f"* `{op_failure.file_name.split('.')[0]} <{op_failure.failure_file_name}>`_ "
f"-> ( {op_failure.commit_hash_prior_to_failure} .. {op_failure.commit_hash_with_failure} ]"
for op_failure in rst_failure_files
]
)
complete_html = html_template.format(
commit_hash=commit_hash, failures_since_last_run=failures_since_last_run, iframes=iframe_tags
link_tags = link_tags.lstrip()

sweep_test_link_tags = "\n".join([f"* `{file} <{file}>`_" for file in rst_files])
sweep_test_link_tags = sweep_test_link_tags.lstrip()

complete_rst = rst_template.format(
most_recent_commit_hash=most_recent_commit_hash,
failures_since_last_run=failures_since_last_run,
links=link_tags,
sweep_test_results=sweep_test_link_tags,
)
html_page_name = directory_for_html_pages / f"index.html"
with open(html_page_name, "w") as file:
file.write(complete_html)

logger.info(f"Built {html_page_name}")
rst_page_name = directory_for_html_pages / "index.rst"
with open(rst_page_name, "w") as file:
file.write(complete_rst)

logger.info(f"Built {rst_page_name}")


def download_from_pipeline(token, directory_for_html_pages):
@@ -161,27 +215,29 @@ def download_from_pipeline(token, directory_for_html_pages):

runs = get_list_of_runs()
if len(runs["workflow_runs"]) < 3:
# Note that if the run is in progress, there will not be any artifacts avaiable yet on the most recent run.
# Note that if the run is in progress, there will not be any artifacts available yet on the most recent run.
raise RuntimeError("We need at least three runs to compare the changes in the sweep tests")

total_runs = len(runs["workflow_runs"])
if runs["workflow_runs"][0]["status"] == "completed":
most_recent_run = runs["workflow_runs"][0]
prior_run = runs["workflow_runs"][1]
else:
most_recent_run = runs["workflow_runs"][1]
prior_run = runs["workflow_runs"][2]

most_recent_artifact_url = most_recent_run["artifacts_url"]
commit_hash = most_recent_run["head_sha"]
prior_artifact_url = prior_run["artifacts_url"]
most_recent_run_index = 0
else: # a run is in progress so we just use the prior two for the first comparison
most_recent_run_index = 1

with tempfile.TemporaryDirectory() as temp_dir:
temp_dir_path = pathlib.Path(temp_dir)
recent_zip = temp_dir_path / "recent.zip"
prior_zip = temp_dir_path / "prior.zip"
download_artifacts(token, most_recent_artifact_url, output_path=recent_zip)
download_artifacts(token, prior_artifact_url, output_path=prior_zip)
diff_results(recent_zip, prior_zip, directory_for_html_pages, commit_hash)
for i in range(most_recent_run_index, total_runs):
most_recent_run = runs["workflow_runs"][i]
most_recent_artifact_url = most_recent_run["artifacts_url"]
(temp_dir_path / str(i)).mkdir(parents=True, exist_ok=True)
artifact_zip = temp_dir_path / str(i) / "artifact.zip"
commit_hash = most_recent_run["head_sha"]
download_artifacts(token, most_recent_artifact_url, output_path=artifact_zip)
commit_hash_file = temp_dir_path / str(i) / "commit_hash.txt"
with open(commit_hash_file, "w") as file:
file.write(commit_hash)

diff_results(temp_dir_path, most_recent_run_index, total_runs, directory_for_html_pages)


def main():
@@ -190,7 +246,9 @@ def main():
parser.add_argument("--dir")
token = parser.parse_args().token
directory_for_html_pages = parser.parse_args().dir
download_from_pipeline(token, directory_for_html_pages)

# download_from_pipeline(token, directory_for_html_pages)
diff_results(pathlib.Path("/tmp/saved_downloads"), 0, 10, directory_for_html_pages)


if __name__ == "__main__":

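For reference, a sketch of how the updated entry points can be driven; the module import path, token placeholder, and output directory are illustrative only (the script is normally invoked from the command line with --token and --dir):

import pathlib

from build_html_sweep_results import diff_results, download_from_pipeline

output_dir = pathlib.Path("sweep_pages")  # illustrative output directory
output_dir.mkdir(parents=True, exist_ok=True)

# Normal path: download artifacts for the recent ttnn-run-sweeps runs and build the pages.
download_from_pipeline(token="<github-token>", directory_for_html_pages=str(output_dir))

# Offline path: reuse artifacts already saved on disk for runs 0..9,
# mirroring the commented-out download in main() above.
diff_results(pathlib.Path("/tmp/saved_downloads"), 0, 10, str(output_dir))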