From 961053cfdeb35ce09bb5e464598d7433fcff0961 Mon Sep 17 00:00:00 2001
From: Vincent Tang
Date: Thu, 13 Jun 2024 22:02:21 +0000
Subject: [PATCH] #8729: xdist + reset mechanism on fd nightly, model perf, all t3k (except profiler)

- enable the timeout mechanism by default when using xdist; use the 'metal-timeout' flag to enable it when not using xdist
- increase GH Actions timeouts to accommodate xdist (review)
- report per-test timings and set the global timeout to 5 minutes (review)
- add custom timeouts to nightly + t3k pipelines + post-commit (review)
---
 ...-dispatch-full-regressions-and-models.yaml |  2 +-
 .github/workflows/perf-models.yaml            |  2 +-
 .github/workflows/t3000-demo-tests.yaml       |  1 +
 .github/workflows/t3000-model-perf-tests.yaml |  2 +-
 conftest.py                                   | 51 ++++++++---------
 pytest.ini                                    |  4 +-
 tests/scripts/run_performance.sh              | 25 +++++----
 tests/scripts/run_tests.sh                    |  2 +-
 .../single_card/nightly/run_common_models.sh  |  9 ++-
 .../single_card/nightly/run_gs_only.sh        | 11 +++-
 tests/scripts/single_card/nightly/run_ttnn.sh |  9 ++-
 .../single_card/nightly/run_wh_b0_only.sh     | 11 +++-
 .../single_card/nightly/run_wh_b0_unstable.sh |  9 ++-
 tests/scripts/t3000/run_t3000_demo_tests.sh   | 37 ++++++++----
 .../scripts/t3000/run_t3000_frequent_tests.sh | 48 ++++++++--------
 .../t3000/run_t3000_model_perf_tests.sh       | 15 +++--
 tests/scripts/t3000/run_t3000_unit_tests.sh   | 56 ++++++++++---------
 tt_metal/python_env/requirements-dev.txt      |  1 +
 18 files changed, 178 insertions(+), 117 deletions(-)

diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml
index 115b9415452..b6dc4f619c5 100644
--- a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml
+++ b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml
@@ -26,7 +26,7 @@ jobs:
           { name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 30 },
           { name: "API tests GS", arch: grayskull, cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 },
           { name: "API tests N300 WH B0", arch: wormhole_b0, cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 },
-          { name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 35 },
+          { name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 45 },
         ]
     name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
     env:
diff --git a/.github/workflows/perf-models.yaml b/.github/workflows/perf-models.yaml
index 8c423e865c1..f5905175e7e 100644
--- a/.github/workflows/perf-models.yaml
+++ b/.github/workflows/perf-models.yaml
@@ -52,7 +52,7 @@ jobs:
       - uses: ./.github/actions/install-python-deps
       - name: Run performance regressions
         id: performance_tests
-        timeout-minutes: 30
+        timeout-minutes: 40
         run: |
           source ${{ github.workspace }}/python_env/bin/activate
           ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }}
diff --git a/.github/workflows/t3000-demo-tests.yaml b/.github/workflows/t3000-demo-tests.yaml
index c7c7d218806..bcaf4135df9 100644
--- a/.github/workflows/t3000-demo-tests.yaml
+++ b/.github/workflows/t3000-demo-tests.yaml
@@ -45,6 +45,7 @@ jobs:
         run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
       - uses: ./.github/actions/install-python-deps
       - name: Run demo regression tests
+        shell: bash {0}
         timeout-minutes: 180
         run: |
           source ${{ github.workspace }}/python_env/bin/activate
diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml
index 7a4ad87cd8c..a4588eaf22b 100644
--- a/.github/workflows/t3000-model-perf-tests.yaml
+++ b/.github/workflows/t3000-model-perf-tests.yaml
@@ -61,7 +61,7 @@ jobs:
         run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
       - uses: ./.github/actions/install-python-deps
       - name: Run model perf regression tests
-        timeout-minutes: 60
+        timeout-minutes: 75
         run: |
           source ${{ github.workspace }}/python_env/bin/activate
           cd $TT_METAL_HOME
diff --git a/conftest.py b/conftest.py
index c6339ee3ae1..2a8c0b44d5f 100644
--- a/conftest.py
+++ b/conftest.py
@@ -85,8 +85,6 @@ def device(request, device_params):
     import tt_lib as ttl
 
     device_id = request.config.getoption("device_id")
-
-    request.node.device_ids = [device_id]
     request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)]
 
     num_devices = ttl.device.GetNumPCIeDevices()
@@ -108,8 +106,6 @@ def pcie_devices(request, device_params):
     num_devices = ttl.device.GetNumPCIeDevices()
     device_ids = [i for i in range(num_devices)]
-
-    request.node.device_ids = device_ids
     request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids]
 
     # Get only physical devices
@@ -129,8 +125,6 @@ def all_devices(request, device_params):
     num_devices = ttl.device.GetNumAvailableDevices()
     device_ids = [i for i in range(num_devices)]
-
-    request.node.device_ids = device_ids
    request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids]
 
     # Get only physical devices
@@ -155,7 +149,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
     except (ValueError, AttributeError):
         num_devices_requested = len(device_ids)
 
-    request.node.device_ids = device_ids[:num_devices_requested]
     request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]]
 
     device_mesh = ttnn.open_device_mesh(
@@ -183,7 +176,6 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
     except (ValueError, AttributeError):
         num_pcie_devices_requested = len(device_ids)
 
-    request.node.device_ids = device_ids[:num_pcie_devices_requested]
     request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]]
 
     device_mesh = ttnn.open_device_mesh(
@@ -213,7 +205,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
     except (ValueError, AttributeError):
         num_devices_requested = len(device_ids)
 
-    request.node.device_ids = device_ids[:num_devices_requested]
     request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]]
 
     device_mesh = ttnn.open_device_mesh(
@@ -334,13 +325,18 @@ def pytest_addoption(parser):
     )
     parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli")
     parser.addoption(
-        "--metal-cleanup",
+        "--metal-timeout",
         action="store",
         default=None,
         help="Enable process timeout",
     )
 
 
+@pytest.fixture
+def input_path(request):
+    return request.config.getoption("--input-path")
+
+
 def pytest_generate_tests(metafunc):
     """
     This is not a standard docstring.
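The conftest.py hunks that follow gate both the new teardown reset and the pytest-timeout override on the same two signals: the explicit --metal-timeout option added above, or the presence of pytest-xdist. As a minimal standalone sketch of just that gating (the helper name is hypothetical and not part of the patch; PYTEST_XDIST_WORKER_COUNT is the environment variable pytest-xdist exports in each worker process):

import os

def custom_timeout_enabled(config) -> bool:
    # Explicit opt-in via --metal-timeout, or implicit whenever pytest-xdist
    # is driving the run (xdist exports PYTEST_XDIST_WORKER_COUNT to workers).
    explicit = config.getoption("--metal-timeout") is not None
    under_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0")) > 0
    return explicit or under_xdist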
@@ -473,14 +469,15 @@ def pytest_runtest_makereport(item, call):
 @pytest.hookimpl(hookwrapper=True)
 def pytest_runtest_teardown(item, nextitem):
     yield
-    metal_cleanup_enabled = item.config.getoption("--metal-cleanup")
-    if metal_cleanup_enabled is not None:
+    metal_timeout_enabled = item.config.getoption("--metal-timeout")
+    using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0"))
+
+    if metal_timeout_enabled is not None or using_xdist:
         report = item.stash[phase_report_key]
         test_failed = report.get("call", None) and report["call"].failed
         if test_failed:
-            logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}")
-            # reset_tensix(set(item.pci_ids))
-            reset_tensix()
+            logger.info(f"In custom teardown, open pci device ids: {set(item.pci_ids)}")
+            reset_tensix(set(item.pci_ids))
 
 
 # This is overriding the timer setup hook from pytest-timeout
@@ -488,10 +485,12 @@ def pytest_runtest_teardown(item, nextitem):
 # At timeout, the process kills it's parent (the test process) and then itself
 @pytest.hookimpl(tryfirst=True)
 def pytest_timeout_set_timer(item, settings):
-    metal_timeout_enabled = item.config.getoption("--metal-cleanup")
-    if metal_timeout_enabled is not None:
+    metal_timeout_enabled = item.config.getoption("--metal-timeout")
+    using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0"))
+
+    if metal_timeout_enabled is not None or using_xdist:
         parent_pid = os.getpid()
-        logger.info(f"Metal timeout {settings.timeout} seconds")
+        logger.info(f"Metal timeout of {settings.timeout} seconds (pid {parent_pid}) for {item.nodeid}")
 
         def get_parent_status():
             try:
@@ -501,12 +500,15 @@ def get_parent_status():
             return parent.status()
 
         def run_timer(settings):
+            logger.info(f"Timer started for {item.nodeid}")
             dead_status = ["zombie", "dead", "already dead"]
             timeout = settings.timeout
-            while get_parent_status() not in dead_status and timeout > 0:
-                time.sleep(1)
-                timeout -= 1
-            if get_parent_status() != "already dead":
+            parent_status = "running"
+            while parent_status not in dead_status and timeout > 0:
+                time.sleep(5)
+                timeout -= 5
+                parent_status = get_parent_status()
+            if parent_status != "already dead":
                 logger.info(f"Timing out test case")
                 os.kill(parent_pid, signal.SIGKILL)
             logger.info(f"Killing timer")
@@ -542,10 +544,9 @@ def reset_tensix(tt_open_devices=None):
         smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh")
     else:
         tt_open_devices_str = ",".join([str(i) for i in tt_open_devices])
-        check_smi = run_process_and_get_result("tt-smi-metal -h")
-        logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}")
+        check_smi_metal = run_process_and_get_result("tt-smi-metal -h")
         logger.info(f"Running reset for pci devices: {tt_open_devices_str}")
-        if check_smi.returncode > 0:
+        if check_smi_metal.returncode > 0:
             logger.info(f"Test failed - resetting {arch} with tt-smi")
             smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}")
         else:
diff --git a/pytest.ini b/pytest.ini
index 593b9ce3930..699ef215218 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,7 +1,7 @@
 [pytest]
-timeout = 2400
+timeout = 300
 minversion = 7.2
-addopts = --import-mode=importlib -vs -rA
+addopts = --import-mode=importlib -vvs -rA --durations=0
 empty_parameter_set_mark = skip
 markers =
     post_commit: mark tests to run on post-commit
diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh
index b07cc38166c..dc9e7c35ae7 100755
--- a/tests/scripts/run_performance.sh
+++ b/tests/scripts/run_performance.sh
@@ -1,6 +1,6 @@
 #/bin/bash
-set -eo pipefail
+# set -eo pipefail
 
 if [[ -z "$TT_METAL_HOME" ]]; then
     echo "Must provide TT_METAL_HOME in environment" 1>&2
     exit 1
 fi
@@ -11,19 +11,19 @@ run_perf_models_other() {
     local tt_arch=$1
     local test_marker=$2
 
-    env pytest tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker
+    env pytest -n auto tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker
 
-    env pytest tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker
+    env pytest -n auto tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker
 
-    env pytest models/demos/ttnn_falcon7b/tests -m $test_marker
+    env pytest -n auto models/demos/ttnn_falcon7b/tests -m $test_marker
 
     # Separate calls since we can't mix switching between number of cqs
-    env pytest models/demos/resnet/tests/test_perf_resnet.py -m $test_marker
-    env pytest models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker
+    env pytest -n auto models/demos/resnet/tests/test_perf_resnet.py -m $test_marker
+    env pytest -n auto models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker
 
-    env pytest tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker
+    env pytest -n auto tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker
 
-    env pytest models/demos/metal_BERT_large_11/tests -m $test_marker
+    env pytest -n auto models/demos/metal_BERT_large_11/tests -m $test_marker
 
     ## Merge all the generated reports
     env python models/perf/merge_perf_results.py
@@ -33,13 +33,13 @@ run_perf_models_llm_javelin() {
     local tt_arch=$1
     local test_marker=$2
 
-    env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m $test_marker
+    env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m $test_marker
 
     if [ "$tt_arch" == "wormhole_b0" ]; then
-        env pytest models/demos/mamba/tests -m $test_marker
+        env pytest -n auto models/demos/mamba/tests -m $test_marker
     fi
 
-    env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mistral7b/tests -m $test_marker
+    env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/mistral7b/tests -m $test_marker
 
     ## Merge all the generated reports
     env python models/perf/merge_perf_results.py
@@ -50,7 +50,7 @@ run_perf_models_cnn_javelin() {
     local test_marker=$2
 
     # Run tests
-    env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/device_perf_tests/stable_diffusion -m $test_marker
+    env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/device_perf_tests/stable_diffusion -m $test_marker
     #env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests -m $test_marker
 
     ## Merge all the generated reports
@@ -58,6 +58,7 @@ run_device_perf_models() {
+    set -eo pipefail
     local test_marker=$1
 
     env pytest tests/device_perf_tests/stable_diffusion -m $test_marker
diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh
index 334b68b71fd..ebd25264b9c 100755
--- a/tests/scripts/run_tests.sh
+++ b/tests/scripts/run_tests.sh
@@ -81,7 +81,7 @@ run_frequent_api_pipeline_tests() {
         ./tests/scripts/run_python_api_unit_tests.sh
     else
         if [[ $tt_arch == "wormhole_b0" ]]; then
-            pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly
+            pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly
         else
             echo "API tests are not available for fast dispatch because they're already covered in post-commit"
         fi
diff --git a/tests/scripts/single_card/nightly/run_common_models.sh b/tests/scripts/single_card/nightly/run_common_models.sh
index 17ca8c4d3cf..19e090065f3 100755
--- a/tests/scripts/single_card/nightly/run_common_models.sh
+++ b/tests/scripts/single_card/nightly/run_common_models.sh
@@ -1,12 +1,17 @@
 #/bin/bash
-set -eo pipefail
+# set -eo pipefail
 
 if [[ -z "$TT_METAL_HOME" ]]; then
   echo "Must provide TT_METAL_HOME in environment" 1>&2
   exit 1
 fi
 
+fail=0
 echo "Running common models for archs"
 
-env pytest tests/nightly/common_models/
+env pytest -n auto tests/nightly/common_models/ ; fail+=$?
+
+if [[ $fail -ne 0 ]]; then
+  exit 1
+fi
diff --git a/tests/scripts/single_card/nightly/run_gs_only.sh b/tests/scripts/single_card/nightly/run_gs_only.sh
index c5bcc9f9745..bad5b98ea40 100755
--- a/tests/scripts/single_card/nightly/run_gs_only.sh
+++ b/tests/scripts/single_card/nightly/run_gs_only.sh
@@ -1,14 +1,19 @@
 #/bin/bash
-set -eo pipefail
+# set -eo pipefail
 
 if [[ -z "$TT_METAL_HOME" ]]; then
   echo "Must provide TT_METAL_HOME in environment" 1>&2
   exit 1
 fi
 
+fail=0
 echo "Running model nightly tests for GS only"
 
-env pytest models/demos/resnet/tests/test_metal_resnet50_performant.py
+env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_performant.py ; fail+=$?
 
-env pytest models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py
+env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py ; fail+=$?
+
+if [[ $fail -ne 0 ]]; then
+  exit 1
+fi
diff --git a/tests/scripts/single_card/nightly/run_ttnn.sh b/tests/scripts/single_card/nightly/run_ttnn.sh
index f0bb3f9cadc..a41836173de 100755
--- a/tests/scripts/single_card/nightly/run_ttnn.sh
+++ b/tests/scripts/single_card/nightly/run_ttnn.sh
@@ -1,12 +1,17 @@
 #/bin/bash
-set -eo pipefail
+# set -eo pipefail
 
 if [[ -z "$TT_METAL_HOME" ]]; then
   echo "Must provide TT_METAL_HOME in environment" 1>&2
   exit 1
 fi
 
+fail=0
 echo "Running ttnn nightly tests for GS only"
 
-env pytest tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal"
+env pytest -n auto tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal" ; fail+=$?
+
+if [[ $fail -ne 0 ]]; then
+  exit 1
+fi
diff --git a/tests/scripts/single_card/nightly/run_wh_b0_only.sh b/tests/scripts/single_card/nightly/run_wh_b0_only.sh
index d30894713c1..5ae9f0657cb 100755
--- a/tests/scripts/single_card/nightly/run_wh_b0_only.sh
+++ b/tests/scripts/single_card/nightly/run_wh_b0_only.sh
@@ -1,12 +1,17 @@
 #/bin/bash
-set -eo pipefail
+# set -eo pipefail
 
 if [[ -z "$TT_METAL_HOME" ]]; then
   echo "Must provide TT_METAL_HOME in environment" 1>&2
   exit 1
 fi
 
+fail=0
 echo "Running nightly tests for WH B0 only"
 
-env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/nightly/wh_b0_only_eth
-env pytest tests/nightly/wh_b0_only
\ No newline at end of file
+env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/nightly/wh_b0_only_eth ; fail+=$?
+env pytest -n auto tests/nightly/wh_b0_only ; fail+=$?
+
+if [[ $fail -ne 0 ]]; then
+  exit 1
+fi
diff --git a/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh b/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh
index 079087d6e69..35895a64208 100755
--- a/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh
+++ b/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh
@@ -1,12 +1,17 @@
 #/bin/bash
-set -eo pipefail
+# set -eo pipefail
 
 if [[ -z "$TT_METAL_HOME" ]]; then
   echo "Must provide TT_METAL_HOME in environment" 1>&2
   exit 1
 fi
 
+fail=0
 echo "Running unstable nightly tests for WH B0 only"
 
-SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest tests/ttnn/integration_tests/stable_diffusion
+SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest -n auto tests/ttnn/integration_tests/stable_diffusion ; fail+=$?
+
+if [[ $fail -ne 0 ]]; then
+  exit 1
+fi
diff --git a/tests/scripts/t3000/run_t3000_demo_tests.sh b/tests/scripts/t3000/run_t3000_demo_tests.sh
index bb2a23d9efd..d0f984dcfcb 100755
--- a/tests/scripts/t3000/run_t3000_demo_tests.sh
+++ b/tests/scripts/t3000/run_t3000_demo_tests.sh
@@ -1,60 +1,72 @@
 #/bin/bash
-set -eo pipefail
+# set -eo pipefail
 
 run_t3000_falcon40b_tests() {
   # Record the start time
+  fail=0
   start_time=$(date +%s)
 
   echo "LOG_METAL: Running run_t3000_falcon40b_tests"
 
   # Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py ; fail+=$?
 
   # Falcon40B end to end demo (prefill + decode)
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_loops.py
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_loops.py ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
   duration=$((end_time - start_time))
   echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete"
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
 }
 
 run_t3000_falcon7b_tests(){
   # Record the start time
+  fail=0
   start_time=$(date +%s)
 
   echo "LOG_METAL: Running run_t3000_falcon7b_tests"
 
   # Falcon7B demo (perf verification for 128/1024/2048 seq lens and output token verification)
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_128_stochastic_verify]
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_1024_stochastic_verify]
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_2048_stochastic_verify]
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-default_mode_1024_greedy_verify]
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_128_stochastic_verify] ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_1024_stochastic_verify] ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_2048_stochastic_verify] ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-default_mode_1024_greedy_verify] ; fail+=$?
 
   # Falcon7B perplexity test (prefill and decode)
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-prefill_seq1024_dram]
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-decode_1024_l1_sharded]
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-prefill_seq1024_dram] ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-decode_1024_l1_sharded] ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
   duration=$((end_time - start_time))
   echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete"
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
 }
 
 run_t3000_mixtral_tests() {
   # Record the start time
+  fail=0
   start_time=$(date +%s)
 
   echo "LOG_METAL: Running run_t3000_mixtral8x7b_tests"
 
   # mixtral8x7b 8 chip demo test - 100 token generation with general weights (env flags set inside the test)
-  pytest models/demos/t3000/mixtral8x7b/demo/demo.py::test_mixtral8x7b_demo[wormhole_b0-True-general_weights]
+  pytest -n auto models/demos/t3000/mixtral8x7b/demo/demo.py::test_mixtral8x7b_demo[wormhole_b0-True-general_weights] ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
   duration=$((end_time - start_time))
   echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete"
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
 }
 
 run_t3000_tests() {
@@ -68,6 +80,7 @@ run_t3000_tests() {
   run_t3000_mixtral_tests
 }
 
+fail=0
 main() {
   if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then
     echo "Script is being sourced, not executing main function"
@@ -89,6 +102,10 @@ main() {
   export PYTHONPATH=$TT_METAL_HOME
 
   run_t3000_tests
+
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
 }
 
 main "$@"
diff --git a/tests/scripts/t3000/run_t3000_frequent_tests.sh b/tests/scripts/t3000/run_t3000_frequent_tests.sh
index 55c7bb20370..203ffd436d6 100755
--- a/tests/scripts/t3000/run_t3000_frequent_tests.sh
+++ b/tests/scripts/t3000/run_t3000_frequent_tests.sh
@@ -1,6 +1,6 @@
 #/bin/bash
-set -eo pipefail
+# set -eo pipefail
 
 run_t3000_ethernet_tests() {
   # Record the start time
@@ -8,8 +8,8 @@ run_t3000_ethernet_tests() {
 
   echo "LOG_METAL: Running run_t3000_ethernet_tests"
 
-  pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py
-  pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py
+  pytest -n auto tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py ; fail+=$?
+  pytest -n auto tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -28,10 +28,10 @@ run_t3000_llama2_70b_tests() {
   export LLAMA_TOKENIZER_PATH=/mnt/MLPerf/tt_dnn-models/llama-2/tokenizer.model
   export LLAMA_CACHE_PATH=/mnt/MLPerf/tt_dnn-models/llama-2/llama-data-cache/weights-cache-2
 
-  pytest models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py
-  pytest models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py
-  pytest models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py
-  pytest models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py
+  pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py ; fail+=$?
+  pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py ; fail+=$?
+  pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py ; fail+=$?
+  pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -48,8 +48,8 @@ run_t3000_llama2_70b_experimental_tests() {
   # Removing tests to reduce the time taken to run the tests
   # WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/llama2_70b/tests/test_llama_mlp_t3000.py
   # WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/llama2_70b/tests/test_llama_attention_t3000.py
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/llama2_70b/tests/test_llama_decoder_t3000.py
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/llama2_70b/tests/test_llama_model_t3000.py
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/experimental/llama2_70b/tests/test_llama_decoder_t3000.py ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/experimental/llama2_70b/tests/test_llama_model_t3000.py ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -64,7 +64,7 @@ run_t3000_mixtral_tests() {
   echo "LOG_METAL: Running run_t3000_mixtral_tests"
 
   # mixtral8x7b 8 chip decode model test (env flags set inside the test)
-  pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-10-1-pcc]
+  pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-10-1-pcc] ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -78,8 +78,8 @@ run_t3000_tteager_tests() {
 
   echo "LOG_METAL: Running run_t3000_tteager_tests"
 
-  pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit
-  pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_reduce_scatter_post_commit.py
+  pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit ; fail+=$?
+  pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_reduce_scatter_post_commit.py ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -91,9 +91,8 @@ run_t3000_trace_stress_tests() {
   start_time=$(date +%s)
 
   echo "LOG_METAL: Running run_t3000_trace_stress_tests"
-
-  NUM_TRACE_LOOPS=15 pytest tests/ttnn/unit_tests/test_multi_device_trace.py
-  NUM_TRACE_LOOPS=15 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py
+  NUM_TRACE_LOOPS=15 pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$?
+  NUM_TRACE_LOOPS=15 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -108,10 +107,10 @@ run_t3000_falcon40b_tests() {
 
   echo "LOG_METAL: Running run_t3000_falcon40b_tests"
 
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_mlp.py
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_attention.py
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_decoder.py
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_causallm.py
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_mlp.py ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_attention.py ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_decoder.py ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_causallm.py ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -126,9 +125,6 @@ run_t3000_tests() {
   # Run tteager tests
   run_t3000_tteager_tests
 
-  # Run trace tests
-  run_t3000_trace_stress_tests
-
   # Run llama2-70b experimental tests
   run_t3000_llama2_70b_experimental_tests
 
@@ -141,8 +137,12 @@ run_t3000_tests() {
   # Run mixtral tests
   run_t3000_mixtral_tests
 
+  # Run trace tests
+  run_t3000_trace_stress_tests
+
 }
 
+fail=0
 main() {
   if [[ -z "$TT_METAL_HOME" ]]; then
     echo "Must provide TT_METAL_HOME in environment" 1>&2
@@ -159,6 +159,10 @@ main() {
   export PYTHONPATH=$TT_METAL_HOME
 
   run_t3000_tests
+
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
 }
 
 main "$@"
diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh
index 11fe89f1f5c..4176d15afb5 100755
--- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh
+++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh
@@ -1,6 +1,6 @@
 #/bin/bash
-set -eo pipefail
+# set -eo pipefail
 
 run_t3000_falcon7b_tests() {
   # Record the start time
   start_time=$(date +%s)
 
   echo "LOG_METAL: Running run_t3000_falcon7b_tests"
 
-  env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m "model_perf_t3000"
+  env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m "model_perf_t3000" ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -22,7 +22,7 @@ run_t3000_mixtral_tests() {
 
   echo "LOG_METAL: Running run_t3000_mixtral_tests"
 
-  env pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py::test_mixtral_model_perf[wormhole_b0-True-2048-150-0.025] -m "model_perf_t3000"
+  env pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py::test_mixtral_model_perf[wormhole_b0-True-2048-150-0.025] -m "model_perf_t3000" ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -36,7 +36,7 @@ run_t3000_llama2_70b_tests() {
 
   echo "LOG_METAL: Running run_t3000_llama2_70b_tests"
 
-  env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000"
+  env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/experimental/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -50,7 +50,7 @@ run_t3000_falcon40b_tests() {
 
   echo "LOG_METAL: Running run_t3000_falcon40b_tests"
 
-  env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_perf_falcon.py -m "model_perf_t3000"
+  env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_perf_falcon.py -m "model_perf_t3000" ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -80,6 +80,7 @@ run_t3000_cnn_tests() {
   env python models/perf/merge_perf_results.py
 }
 
+fail=0
 main() {
   # Parse the arguments
   while [[ $# -gt 0 ]]; do
@@ -123,6 +124,10 @@ main() {
     echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device])" 2>&1
     exit 1
   fi
+
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
 }
 
 main "$@"
diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh
index a1ae782705e..358ef450520 100755
--- a/tests/scripts/t3000/run_t3000_unit_tests.sh
+++ b/tests/scripts/t3000/run_t3000_unit_tests.sh
@@ -1,6 +1,6 @@
 #/bin/bash
-set -eo pipefail
+# set -eo pipefail
 
 run_t3000_ttmetal_tests() {
   # Record the start time
   start_time=$(date +%s)
 
   echo "LOG_METAL: Running run_t3000_ttmetal_tests"
 
-  TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips"
-  TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips"
-  TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips"
-  TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips"
-  TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*"
-  ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*"
-  ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*"
+  TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" ; fail+=$?
+  TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" ; fail+=$?
+  TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" ; fail+=$?
+  TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" ; fail+=$?
+  TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" ; fail+=$?
+  ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" ; fail+=$?
+  ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -27,10 +27,10 @@ run_t3000_ttnn_tests() {
   start_time=$(date +%s)
 
   echo "LOG_METAL: Running run_t3000_ttnn_tests"
-  pytest tests/ttnn/unit_tests/test_multi_device_trace.py
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py
-  pytest tests/ttnn/unit_tests/test_multi_device.py
-  pytest tests/ttnn/unit_tests/test_multi_device_async.py
+  pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$?
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$?
+  pytest -n auto tests/ttnn/unit_tests/test_multi_device.py ; fail+=$?
+  pytest -n auto tests/ttnn/unit_tests/test_multi_device_async.py ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
   duration=$((end_time - start_time))
@@ -43,9 +43,9 @@ run_t3000_falcon7b_tests() {
 
   echo "LOG_METAL: Running run_t3000_falcon7b_tests"
 
-  pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py
-  pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py
-  pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py
+  pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py ; fail+=$?
+  pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py ; fail+=$?
+  pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py ; fail+=$?
   #pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py
 
   # Record the end time
@@ -60,7 +60,7 @@ run_t3000_falcon40b_tests() {
 
   echo "LOG_METAL: Running run_t3000_falcon40b_tests"
 
-  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py
+  WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -74,13 +74,13 @@ run_t3000_mixtral_tests() {
 
   echo "LOG_METAL: Running run_t3000_mixtral_tests"
 
-  pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py
-  pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py
-  pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py
-  pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py
-  pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py
-  pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py
-  pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-1-1-pcc]
+  pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py ; fail+=$?
+  pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py ; fail+=$?
+  pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py ; fail+=$?
+  pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py ; fail+=$?
+  pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py ; fail+=$?
+  pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py ; fail+=$?
+  pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-1-1-pcc] ; fail+=$?
 
   # Record the end time
   end_time=$(date +%s)
@@ -89,8 +89,6 @@ run_t3000_mixtral_tests() {
 
 run_t3000_tests() {
-  # Run ttmetal tests
-  run_t3000_ttmetal_tests
 
   # Run ttnn tests
   run_t3000_ttnn_tests
@@ -103,8 +101,12 @@ run_t3000_tests() {
 
   # Run mixtral tests
   run_t3000_mixtral_tests
+
+  # Run ttmetal tests
+  run_t3000_ttmetal_tests
 }
 
+fail=0
 main() {
   if [[ -z "$TT_METAL_HOME" ]]; then
     echo "Must provide TT_METAL_HOME in environment" 1>&2
@@ -121,6 +123,10 @@ main() {
   export PYTHONPATH=$TT_METAL_HOME
 
   run_t3000_tests
+
+  if [[ $fail -ne 0 ]]; then
+    exit 1
+  fi
 }
 
 main "$@"
diff --git a/tt_metal/python_env/requirements-dev.txt b/tt_metal/python_env/requirements-dev.txt
index f7f90202919..5a6cf7ebb88 100644
--- a/tt_metal/python_env/requirements-dev.txt
+++ b/tt_metal/python_env/requirements-dev.txt
@@ -21,6 +21,7 @@ mypy==1.9.0
 pytest==7.2.2
 pytest-timeout==2.2.0
 pytest-split==0.8.2
+pytest-xdist==3.6.1
 jsbeautifier==1.14.7
 datasets==2.9.0
 torch==2.2.1.0+cpu
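Taken together, the conftest.py hunks replace pytest-timeout's in-process timer with a watchdog that runs beside the test process (per the comment in the hook, it kills its parent, the test process, and then itself), polling in 5-second steps against the 300-second default now set in pytest.ini. Below is a consolidated sketch of that loop, not the patch itself: the function name is hypothetical, how the watchdog process is spawned lies outside the hunks shown, and the exception handling is assumed since the except clause is not visible in the diff. psutil and loguru are used as they are elsewhere in conftest.py.

import os
import signal
import time

import psutil
from loguru import logger

def watchdog_loop(test_pid, nodeid, timeout_s, poll_s=5):
    # Poll the test process; if it is still alive when the budget runs out,
    # SIGKILL it so the wrapper script records the failure and the teardown
    # path can reset the chips with tt-smi.
    dead_status = ["zombie", "dead", "already dead"]

    def test_status():
        try:
            return psutil.Process(test_pid).status()
        except psutil.NoSuchProcess:  # assumed; the except clause is elided in the diff
            return "already dead"

    status, remaining = "running", timeout_s
    while status not in dead_status and remaining > 0:
        time.sleep(poll_s)
        remaining -= poll_s
        status = test_status()
    if status != "already dead":
        logger.info(f"Timing out {nodeid} after {timeout_s} seconds")
        os.kill(test_pid, signal.SIGKILL)

This is also why the shell runners above trade set -eo pipefail for the fail accumulator: a timed-out test now surfaces as an ordinary nonzero pytest exit code, so the remaining suites still run and each script exits 1 at the end if anything failed.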