From 35236b6532f85b1b078b32318d0bf3686298cbae Mon Sep 17 00:00:00 2001
From: Eyon Land <41128502+eyonland@users.noreply.github.com>
Date: Wed, 22 May 2024 18:15:03 -0500
Subject: [PATCH] =?UTF-8?q?#4003:=20Look=20further=20back=20in=20time=20pe?=
 =?UTF-8?q?r=20op=20file=20to=20see=20if=20we=20had=20a=20prior=E2=80=A6?=
 =?UTF-8?q?=20(#5037)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* #4003: Look further back in time per op file to see if we had a prior failure

* #4003: placeholder ttnn_sweeps index.rst

* #4003: Integrate build_rst_sweep_results.py into build docs flow to generate automated RSTs during build time

---------

Co-authored-by: Vincent Tang
Co-authored-by: Raymond Kim <109366641+tt-rkim@users.noreply.github.com>
---
 .github/workflows/docs-latest-public.yaml  |  11 +-
 docs/Makefile                              |  14 +-
 docs/requirements-docs.txt                 |   1 +
 docs/source/ttnn/index.rst                 |   1 +
 docs/source/ttnn/ttnn/onboarding.rst       |   2 +-
 docs/source/ttnn/ttnn_sweeps/index.rst     |   6 +
 docs/spellcheck.sh                         |   4 +-
 tests/scripts/run_build_docs.sh            |   1 +
 .../sweep_tests/build_rst_sweep_results.py | 300 ++++++++++++++++++
 tt_metal/python_env/requirements-dev.txt   |   1 +
 10 files changed, 318 insertions(+), 23 deletions(-)
 create mode 100644 docs/source/ttnn/ttnn_sweeps/index.rst
 create mode 100644 tests/ttnn/sweep_tests/build_rst_sweep_results.py

diff --git a/.github/workflows/docs-latest-public.yaml b/.github/workflows/docs-latest-public.yaml
index 98263671de5..9aecbea9a08 100644
--- a/.github/workflows/docs-latest-public.yaml
+++ b/.github/workflows/docs-latest-public.yaml
@@ -49,20 +49,13 @@ jobs:
       - uses: ./.github/actions/install-python-deps
       - name: Build Docs
        timeout-minutes: 15
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
         run: |
           source ${{ github.workspace }}/python_env/bin/activate
           cd $TT_METAL_HOME
           export PYTHONPATH=$TT_METAL_HOME
           ./tests/scripts/run_build_docs.sh
-      - name: Build additional ttnn sweeps docs
-        env:
-          GITHUB_TOKEN: ${{ github.token }}
-        run: |
-          export PYTHONPATH=$(pwd)
-          source ${{ github.workspace }}/python_env/bin/activate
-          cd docs/
-          make ttnn_sweeps/check_directory
-          make ttnn_sweeps
       - name: Prepare artifact - move output
         run: |
           mkdir gh_pages
diff --git a/docs/Makefile b/docs/Makefile
index 367c9eb2c16..ee00c5343e5 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -15,13 +15,13 @@ PORT ?= 8888
 DOCS_VERSION ?= latest
 GITHUB_TOKEN ?= INSERT_TOKEN_HERE
 
-TTNN_SWEEPS_DIR = $(HTMLDIR)/ttnn/ttnn_sweeps
+TTNN_SWEEPS_DIR = source/ttnn/ttnn_sweeps
 
 # Put it first so that "make" without argument is like "make help".
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 
-.PHONY: help default clean html publish build_doxygen sphinx_build_dir html/ttnn html/tt-metalium ttnn_sweeps ttnn_sweeps/check_directory
+.PHONY: help default clean html publish build_doxygen sphinx_build_dir html/ttnn html/tt-metalium ttnn_sweeps
 
 default: html
 
@@ -49,17 +49,9 @@ html: html/tt-metalium html/ttnn
 	mv -f $(TTNN_BUILDDIR) $(HTMLDIR)/ttnn
 	cp source/index.html $(HTMLDIR)/
 
-ttnn_sweeps/check_directory:
-	@if [ -d "$(TTNN_SWEEPS_DIR)" ]; then \
-		echo "Error: ttnn sweeps dir $(TTNN_SWEEPS_DIR) exists already."; \
-		exit 1; \
-	else \
-		mkdir -p $(TTNN_SWEEPS_DIR); \
-	fi
-
 ttnn_sweeps:
 	@echo "Note that GITHUB_TOKEN must be set before calling this"
-	@cd .. && python tests/ttnn/sweep_tests/build_html_sweep_results.py --dir docs/$(TTNN_SWEEPS_DIR) --token $(GITHUB_TOKEN)
+	@cd .. && python tests/ttnn/sweep_tests/build_rst_sweep_results.py --dir docs/$(TTNN_SWEEPS_DIR) --token $(GITHUB_TOKEN)
 
 server:
 	@echo "Navigate to: \033[4;33mlocalhost:$(PORT)/index.html\033[0m"
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index 1459d6b63e8..fb3b9f3cf71 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -8,3 +8,4 @@ nbsphinx==0.9.3
 sphinxcontrib-jquery==4.1
 ipython==8.12.3
 pandoc==2.3
+tabulate==0.9.0
diff --git a/docs/source/ttnn/index.rst b/docs/source/ttnn/index.rst
index 3f6951b7bac..9712c351fcc 100644
--- a/docs/source/ttnn/index.rst
+++ b/docs/source/ttnn/index.rst
@@ -20,6 +20,7 @@ Welcome to TT-NN documentation!
    ttnn/profiling_ttnn_operations
    ttnn/dependencies/index.rst
    ttnn/demos
+   ttnn_sweeps/index.rst
 
 .. toctree::
    :caption: Models
diff --git a/docs/source/ttnn/ttnn/onboarding.rst b/docs/source/ttnn/ttnn/onboarding.rst
index 5e94bc8d76d..e2e7456fb6e 100644
--- a/docs/source/ttnn/ttnn/onboarding.rst
+++ b/docs/source/ttnn/ttnn/onboarding.rst
@@ -13,7 +13,7 @@ is that the long term benefits will help us maintain our objectives. Please foll
    * When creating the branch, please follow the pattern of 'TTNN--'. For example, if the issue is 4730, the branch name would be `TTNN-4730-concat-operation`
    * Use the `fallback` reference implementation for the operation and implement the functionality.
    * Add the documentation in the rst format for the operation under `ttnn documentation `_
-   * Add sweep tests to the branch using the fallback implementation under `ttnn sweep tests `_
+   * Add :ref:`sweep tests` to the branch using the fallback implementation under `ttnn sweep tests `_
 3. Update the issue referencing the pull requests after verifying that all the sweep tests run as expected. A TTNN CODEOWNERS will review the PR and verify that the API is acceptable and that the sweep tests reflect the intended functionality.
 4. If the pull request (PR) is accepted it will be merge into the main branch and a new branch should be created that adds the implementation.
    * The fallback implementation for the Operation should be left and will continue to be used for op-by-op PCC comparisons when debugging models (see `--ttnn-enable-debug-decorator`).
diff --git a/docs/source/ttnn/ttnn_sweeps/index.rst b/docs/source/ttnn/ttnn_sweeps/index.rst
new file mode 100644
index 00000000000..29212274b75
--- /dev/null
+++ b/docs/source/ttnn/ttnn_sweeps/index.rst
@@ -0,0 +1,6 @@
+.. _ttnn.sweep_tests:
+
+Placeholder title
+=================
+
+You must generate ttnn_sweeps here.
diff --git a/docs/spellcheck.sh b/docs/spellcheck.sh
index 350ec8d41d1..98fd29f3a85 100755
--- a/docs/spellcheck.sh
+++ b/docs/spellcheck.sh
@@ -12,7 +12,7 @@ then
     exit -1
 fi
 cd ${TT_METAL_HOME}/docs/
-for i in `find ./source/ -iname '*.rst'`;
+for i in `find ./source/ -type d -name 'sweeps' -prune -o -iname '*.rst'`;
 do
     echo "Checking $i"
     if [ $INTERACTIVE_MODE -gt 0 ];
@@ -27,7 +27,7 @@ do
             echo "-------------------------------"
            echo "There are typos in the file: $i"
             echo "Please update text in $i, or update personal dictionary as case maybe"
-            echo "-------------------------------" 
+            echo "-------------------------------"
             exit -1
         else
             echo "Skipping empty file $i"
diff --git a/tests/scripts/run_build_docs.sh b/tests/scripts/run_build_docs.sh
index 66dc0d36ceb..af7987df701 100755
--- a/tests/scripts/run_build_docs.sh
+++ b/tests/scripts/run_build_docs.sh
@@ -11,4 +11,5 @@ echo "Checking docs build..."
 cd $TT_METAL_HOME/docs
 python -m pip install -r requirements-docs.txt
 make clean
+make ttnn_sweeps
 make html
diff --git a/tests/ttnn/sweep_tests/build_rst_sweep_results.py b/tests/ttnn/sweep_tests/build_rst_sweep_results.py
new file mode 100644
index 00000000000..2041c7983bd
--- /dev/null
+++ b/tests/ttnn/sweep_tests/build_rst_sweep_results.py
@@ -0,0 +1,300 @@
+# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
+
+# SPDX-License-Identifier: Apache-2.0
+
+
+import argparse
+import requests
+import tempfile
+import pathlib
+import zipfile
+import pandas as pd
+from loguru import logger
+from dataclasses import dataclass
+from tabulate import tabulate
+import os
+import shutil
+
+
+def get_list_of_runs():
+    params = {"per_page": 15}
+    url = "https://api.github.com/repos/tenstorrent-metal/tt-metal/actions/workflows/ttnn-run-sweeps.yaml/runs"
+    headers = {"Accept": "application/vnd.github.v3+json"}
+    response = requests.get(url, headers=headers, params=params)
+    if response.status_code == 200:
+        runs = response.json()
+    else:
+        raise RuntimeError(f"Error fetching workflow runs: {response.status_code}:{response.text}")
+
+    return runs
+
+
+def download_artifacts(token, artifacts_url, temp_dir_path, directory_index):
+    response = requests.get(artifacts_url)
+    headers = {"Authorization": f"token {token}", "Accept": "application/vnd.github.v3+json"}
+    if response.status_code == 200:
+        artifacts_data = response.json()
+        if artifacts_data["artifacts"]:
+            artifact = artifacts_data["artifacts"][0]
+            artifact_download_url = artifact["archive_download_url"]
+            artifact_response = requests.get(artifact_download_url, headers=headers)
+            if artifact_response.status_code == 200:
+                (temp_dir_path / str(directory_index)).mkdir(parents=True, exist_ok=True)
+                artifact_zip = temp_dir_path / str(directory_index) / "artifact.zip"
+                with open(artifact_zip, "wb") as file:
+                    file.write(artifact_response.content)
+                logger.info(f"{artifacts_url} downloaded successfully.")
+                return True
+            else:
+                raise RuntimeError("Failed to download the artifact.")
+        else:
+            print(f"No artifacts found. Is there a run in progress for {artifacts_url} ?")
+    else:
+        raise RuntimeError(f"Failed to fetch artifacts list. {response.status_code}:{response.text}")
+    return False
+
+
+def read_csv_from_zip(zip_file, file_name):
+    with zip_file.open(file_name) as f:
+        df = pd.read_csv(f)
+        if not df.empty and len(df.columns) > 1:
+            # remove first unnamed column which is just the index.
+            # This will be displayed by tabulate.
+            df = df.iloc[:, 1:]
+    return df
+
+
+def trim_column(texte, longueur):
+    if len(texte) > longueur:
+        return texte[-longueur + 3 :]
+    return texte
+
+
+def get_subset_for_status(recent_df, prior_df, status):
+    failed_recent = recent_df[recent_df["status"] == status]
+    matching_prior_status = prior_df["status"] == status
+    failed_prior = prior_df[matching_prior_status]
+    return failed_recent, failed_prior
+
+
+def extract_only_recent_changes(failed_recent, failed_prior):
+    run_id_column_name = failed_recent.columns[0]
+    newly_failed = failed_recent[~failed_recent[run_id_column_name].isin(failed_prior[run_id_column_name])]
+    for column in newly_failed.columns:
+        newly_failed[column] = newly_failed[column].apply(lambda x: trim_column(str(x), 10))
+    return newly_failed
+
+
+def build_new_failures(recent_df, prior_df):
+    failed_recent, failed_prior = get_subset_for_status(recent_df, prior_df, "failed")
+    return extract_only_recent_changes(failed_recent, failed_prior)
+
+
+def build_new_crashes(recent_df, prior_df):
+    failed_recent, failed_prior = get_subset_for_status(recent_df, prior_df, "crashed")
+    return extract_only_recent_changes(failed_recent, failed_prior)
+
+
+def delete_directory_contents(dir_path):
+    for item in os.listdir(dir_path):
+        item_path = os.path.join(dir_path, item)
+        if os.path.isfile(item_path) or os.path.islink(item_path):
+            os.unlink(item_path)
+        elif os.path.isdir(item_path):
+            shutil.rmtree(item_path)
+
+
+@dataclass
+class OperationFailure:
+    file_name: str
+    failure_file_name: str
+    commit_hash_with_failure: str
+    commit_hash_prior_to_failure: str
+    failures: int
+
+
+def diff_results(temp_dir_path, most_recent_run_index, total_runs, directory_for_rst_pages):
+    directory_for_rst_pages = pathlib.Path(directory_for_rst_pages)
+    rst_failure_files = []
+    rst_files = []
+    failures_since_last_run = 0
+
+    recent_zip = temp_dir_path / str(most_recent_run_index) / "artifact.zip"
+    most_recent_commit_hash = ""
+    commit_hash_file = temp_dir_path / str(most_recent_run_index) / "commit_hash.txt"
+    with open(commit_hash_file, "r") as file:
+        most_recent_commit_hash = file.read()
+
+    new_failures = {}
+
+    with zipfile.ZipFile(recent_zip, "r") as zip1:
+        # We want to put the latest csv from the most recent run into rst files
+        zip1_files = set(zip1.namelist())
+        for file_name in zip1_files:
+            test_name = pathlib.Path(file_name).stem
+            if file_name.endswith(".csv"):
+                recent_df = read_csv_from_zip(zip1, file_name)
+                for col in recent_df.columns:
+                    recent_df[col] = recent_df[col].apply(lambda x: str(x).replace("\t", " ").replace("\n", " "))
+                rst_table = tabulate(recent_df, headers="keys", tablefmt="rst")
+                rst_page_name = directory_for_rst_pages / f"{test_name}.rst"
+                with open(rst_page_name, "w") as f:
+                    f.writelines(f".. _ttnn.sweep_test_{test_name}:\n")
+                    f.writelines("\n")
+                    f.writelines(f"{test_name}\n")
+                    f.writelines("====================================================================\n")
+                    f.write(rst_table)
+                new_failures[test_name] = OperationFailure(
+                    f"{test_name}.rst", f"{test_name}_failure.rst", most_recent_commit_hash, "", 0
+                )
+                rst_files.append(test_name)
+
+        # Now we need to check and see which differences started showing up relative to the most recent run per operation file
+        for test_name in new_failures:
+            commit_hash = most_recent_commit_hash
+            prior_run_index = most_recent_run_index + 1
+            while new_failures[test_name].failures == 0 and prior_run_index < total_runs - 1:
+                prior_zip = temp_dir_path / str(prior_run_index) / "artifact.zip"
+                with zipfile.ZipFile(prior_zip, "r") as zip2:
+                    for file_name in zip2.namelist():
+                        if file_name.endswith(f"{test_name}.csv"):
+                            test_name = pathlib.Path(file_name).stem
+                            recent_df = read_csv_from_zip(zip1, file_name)
+                            prior_df = read_csv_from_zip(zip2, file_name)
+                            failures_df = build_new_failures(recent_df, prior_df)
+                            crashes_df = build_new_crashes(recent_df, prior_df)
+                            combined_test_results_df = pd.concat([failures_df, crashes_df])
+                            if len(combined_test_results_df) > 0:
+                                failures_since_last_run = failures_since_last_run + len(combined_test_results_df)
+                                new_failures[test_name].failures = combined_test_results_df.size
+                                new_failures[test_name].failure_file_name = f"{test_name}_failure.rst"
+                                new_failures[test_name].commit_hash_with_failure = commit_hash
+
+                                rst_table = tabulate(combined_test_results_df, headers="keys", tablefmt="rst")
+                                rst_page_name = directory_for_rst_pages / f"{test_name}_failure.rst"
+                                with open(rst_page_name, "w") as f:
+                                    f.writelines(f".. _ttnn.sweep_test_failure_{test_name}:\n")
+                                    f.writelines("\n")
+                                    f.writelines(f"{test_name}\n")
+                                    f.writelines(
+                                        "====================================================================\n"
+                                    )
+                                    f.write(rst_table)
+                                rst_failure_files.append(new_failures[test_name])
+
+                commit_hash_file = temp_dir_path / str(prior_run_index) / "commit_hash.txt"
+                with open(commit_hash_file, "r") as file:
+                    commit_hash = file.read()
+                new_failures[test_name].commit_hash_prior_to_failure = commit_hash
+
+                prior_run_index = prior_run_index + 1
+
+    rst_template = """
+.. _ttnn.sweep_tests:
+
+Sweep Test Results
+==================
+
+Recent New Failures
+-------------------
+
+We have had {failures_since_last_run} new failures since the prior run.
+
+.. toctree::
+    :maxdepth: 2
+    :hidden:
+
+    {toctree_failure_filenames}
+
+{sweep_test_failure_entries}
+
+
+All Sweep Tests
+---------------
+
+These are the sweep tests for commit hash {most_recent_commit_hash}
+
+.. toctree::
+    :maxdepth: 2
+
+    {toctree_entries}
+"""
+
+    sweep_test_failure_entries = "\n".join(
+        [
+            f"* :ref:`{op_failure.file_name.split('.')[0]} ` "
+            f"-> ( {op_failure.commit_hash_prior_to_failure} .. {op_failure.commit_hash_with_failure} ]"
+            for op_failure in rst_failure_files
+        ]
+    )
+    sweep_test_failure_entries = sweep_test_failure_entries.lstrip()
+
+    toctree_failure_filenames = "\n    ".join(
+        [op_failure.failure_file_name.replace(".rst", "") for op_failure in rst_failure_files]
+    )
+
+    toctree_entries = "\n    ".join(sorted(rst_files))
+
+    complete_rst = rst_template.format(
+        most_recent_commit_hash=most_recent_commit_hash,
+        failures_since_last_run=failures_since_last_run,
+        toctree_failure_filenames=toctree_failure_filenames,
+        sweep_test_failure_entries=sweep_test_failure_entries,
+        toctree_entries=toctree_entries,
+    )
+
+    rst_page_name = directory_for_rst_pages / "index.rst"
+    with open(rst_page_name, "w") as file:
+        file.write(complete_rst)
+
+    logger.info(f"Built {rst_page_name}")
+
+
+def download_from_pipeline(token, directory_for_rst_pages):
+    """
+    Download the results of the sweeps from the GitHub pipeline.
+
+    :param token: Provide your GitHub token.
+    """
+
+    runs = get_list_of_runs()
+    if len(runs["workflow_runs"]) < 3:
+        # Note that if the run is in progress, there will not be any artifacts available yet on the most recent run.
+        raise RuntimeError("We need at least three runs to compare the changes in the sweep tests")
+
+    total_runs = len(runs["workflow_runs"])
+    if runs["workflow_runs"][0]["status"] == "completed":
+        most_recent_run_index = 0
+    else:  # a run is in progress so we just use the prior two for the first comparison
+        most_recent_run_index = 1
+
+    directory_index = 0
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_dir_path = pathlib.Path(temp_dir)
+        for i in range(most_recent_run_index, total_runs):
+            most_recent_run = runs["workflow_runs"][i]
+            most_recent_artifact_url = most_recent_run["artifacts_url"]
+            commit_hash = most_recent_run["head_sha"]
+            if download_artifacts(token, most_recent_artifact_url, temp_dir_path, directory_index):
+                commit_hash_file = temp_dir_path / str(directory_index) / "commit_hash.txt"
+                with open(commit_hash_file, "w") as file:
+                    file.write(commit_hash)
+                directory_index = directory_index + 1
+
+        total_runs = directory_index
+        delete_directory_contents(directory_for_rst_pages)
+        diff_results(temp_dir_path, 0, total_runs, directory_for_rst_pages)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--token")
+    parser.add_argument("--dir")
+    token = parser.parse_args().token
+    directory_for_rst_pages = parser.parse_args().dir
+
+    download_from_pipeline(token, directory_for_rst_pages)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tt_metal/python_env/requirements-dev.txt b/tt_metal/python_env/requirements-dev.txt
index b72e3580a50..502be296f77 100644
--- a/tt_metal/python_env/requirements-dev.txt
+++ b/tt_metal/python_env/requirements-dev.txt
@@ -50,3 +50,4 @@ multiprocess==0.70.14
 evaluate==0.4.0
 bert-score==0.3.12
 fsspec==2023.9.2 # Temporary pin to 2023.9.2: https://github.com/tenstorrent/tt-metal/issues/3314
+tabulate==0.9.0
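
For local verification, a minimal sketch of how the flow wired up above can be exercised from a tt-metal checkout with the docs Python environment active; the <your-token> placeholder is hypothetical, while the GITHUB_TOKEN variable, the ttnn_sweeps target, and the output paths come from the Makefile and run_build_docs.sh changes in this patch:

    cd $TT_METAL_HOME/docs
    export GITHUB_TOKEN=<your-token>   # must be able to read ttnn-run-sweeps workflow artifacts
    make clean
    make ttnn_sweeps                   # runs build_rst_sweep_results.py and writes source/ttnn/ttnn_sweeps/*.rst
    make html                          # builds the docs, including the generated sweep result pages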