From d525b1710a5201c0329ffba5686c40217be0b414 Mon Sep 17 00:00:00 2001
From: Vincent Tang
Date: Thu, 13 Jun 2024 22:02:21 +0000
Subject: [PATCH] #8729: xdist + reset mechanism on fd nightly, model perf, all t3k (except profiler)

- enable the timeout mechanism by default when running under xdist; pass the
  '--metal-timeout' flag to enable it for non-xdist runs
- increase GH Actions timeout for xdist runs (review)
- report per-test timings and set the global timeout to 5 minutes (review)
- add custom timeouts to nightly + t3k pipelines + post-commit (review)
---
 ...-dispatch-full-regressions-and-models.yaml |  2 +-
 .github/workflows/perf-models.yaml            |  2 +-
 .github/workflows/t3000-demo-tests.yaml       |  3 +-
 .github/workflows/t3000-frequent-tests.yaml   |  1 +
 .github/workflows/t3000-model-perf-tests.yaml |  9 +--
 .github/workflows/t3000-unit-tests.yaml       |  1 +
 conftest.py                                   | 59 ++++++++--------
 pytest.ini                                    |  2 +-
 tests/scripts/run_performance.sh              | 25 +++----
 tests/scripts/run_tests.sh                    |  2 +-
 .../single_card/nightly/run_common_models.sh  |  9 ++-
 .../single_card/nightly/run_gs_only.sh        | 11 ++-
 tests/scripts/single_card/nightly/run_ttnn.sh |  9 ++-
 .../single_card/nightly/run_wh_b0_only.sh     | 11 ++-
 .../single_card/nightly/run_wh_b0_unstable.sh |  9 ++-
 tests/scripts/t3000/run_t3000_demo_tests.sh   | 37 +++++++---
 .../scripts/t3000/run_t3000_frequent_tests.sh | 68 +++++++++++++------
 .../t3000/run_t3000_model_perf_tests.sh       | 31 +++++++--
 tests/scripts/t3000/run_t3000_unit_tests.sh   | 66 ++++++++++++------
 tt_metal/python_env/requirements-dev.txt      |  1 +
 20 files changed, 238 insertions(+), 120 deletions(-)

diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml
index 115b9415452..b6dc4f619c5 100644
--- a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml
+++ b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml
@@ -26,7 +26,7 @@ jobs:
           { name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 30 },
           { name: "API tests GS", arch: grayskull, cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 },
           { name: "API tests N300 WH B0", arch: wormhole_b0, cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 },
-          { name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 35 },
+          { name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 45 },
         ]
     name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
     env:
diff --git a/.github/workflows/perf-models.yaml b/.github/workflows/perf-models.yaml
index 8c423e865c1..f5905175e7e 100644
--- a/.github/workflows/perf-models.yaml
+++ b/.github/workflows/perf-models.yaml
@@ -52,7 +52,7 @@ jobs:
       - uses: ./.github/actions/install-python-deps
       - name: Run performance regressions
         id: performance_tests
-        timeout-minutes: 30
+        timeout-minutes: 40
        run: |
          source ${{ github.workspace }}/python_env/bin/activate
          ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }}
diff --git a/.github/workflows/t3000-demo-tests.yaml b/.github/workflows/t3000-demo-tests.yaml
index a05a651f0c5..ca524dd3a8a 100644
--- a/.github/workflows/t3000-demo-tests.yaml
+++ b/.github/workflows/t3000-demo-tests.yaml
@@ -17,7 +17,7 @@ jobs:
      fail-fast: false
      matrix:
        test-group: [
-          { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 40, owner_id: U044T8U8DEF}, #Johanna Rock
+          { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 50, owner_id: U044T8U8DEF}, #Johanna Rock
           { name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich
           { name: "t3k falcon7b tests", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 90, owner_id: U05RWH3QUPM}, #Salar Hosseini
           { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 30, owner_id: U03PUAKE719}, # Miguel Tairum
@@ -46,6 +46,7 @@ jobs:
         run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
       - uses: ./.github/actions/install-python-deps
       - name: Run demo regression tests
+        shell: bash {0}
         timeout-minutes: ${{ matrix.test-group.timeout }}
         run: |
           source ${{ github.workspace }}/python_env/bin/activate
diff --git a/.github/workflows/t3000-frequent-tests.yaml b/.github/workflows/t3000-frequent-tests.yaml
index d6feebce9df..70a13c371f5 100644
--- a/.github/workflows/t3000-frequent-tests.yaml
+++ b/.github/workflows/t3000-frequent-tests.yaml
@@ -42,6 +42,7 @@ jobs:
         run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
       - uses: ./.github/actions/install-python-deps
       - name: Run frequent regression tests
+        shell: bash {0}
         timeout-minutes: ${{ matrix.test-group.timeout }}
         run: |
           source ${{ github.workspace }}/python_env/bin/activate
diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml
index 3edeb388469..4995b036238 100644
--- a/.github/workflows/t3000-model-perf-tests.yaml
+++ b/.github/workflows/t3000-model-perf-tests.yaml
@@ -17,10 +17,10 @@ jobs:
     fail-fast: false
     matrix:
       test-group: [
-        { name: "t3k LLM falcon7b model perf tests", model: "falcob7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 60, owner_id: S07AJBTLX2L}, #Model Falcon
-        { name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
-        { name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich
-        { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 60, owner_id: S07AJBTLX2L}, # Model Falcon
+        { name: "t3k LLM falcon7b model perf tests", model: "falcon7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 75, owner_id: S07AJBTLX2L}, #Model Falcon
+        { name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 75, owner_id: U03PUAKE719}, # Miguel Tairum
+        { name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 75, owner_id: U03FJB5TM5Y}, #Colman Glagovich
+        { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: S07AJBTLX2L}, # Model Falcon
        #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
] name: ${{ matrix.test-group.name }} @@ -52,6 +52,7 @@ jobs: run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run model perf regression tests + shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/.github/workflows/t3000-unit-tests.yaml b/.github/workflows/t3000-unit-tests.yaml index 5b9e99baaa2..297b863399f 100644 --- a/.github/workflows/t3000-unit-tests.yaml +++ b/.github/workflows/t3000-unit-tests.yaml @@ -43,6 +43,7 @@ jobs: run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run unit regression tests + shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/conftest.py b/conftest.py index c6339ee3ae1..cbbda1b9e72 100644 --- a/conftest.py +++ b/conftest.py @@ -85,8 +85,6 @@ def device(request, device_params): import tt_lib as ttl device_id = request.config.getoption("device_id") - - request.node.device_ids = [device_id] request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)] num_devices = ttl.device.GetNumPCIeDevices() @@ -108,9 +106,7 @@ def pcie_devices(request, device_params): num_devices = ttl.device.GetNumPCIeDevices() device_ids = [i for i in range(num_devices)] - - request.node.device_ids = device_ids - request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] + request.node.pci_ids = device_ids # Get only physical devices devices = ttl.device.CreateDevices(device_ids, **device_params) @@ -129,8 +125,6 @@ def all_devices(request, device_params): num_devices = ttl.device.GetNumAvailableDevices() device_ids = [i for i in range(num_devices)] - - request.node.device_ids = device_ids request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] # Get only physical devices @@ -155,7 +149,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par except (ValueError, AttributeError): num_devices_requested = len(device_ids) - request.node.device_ids = device_ids[:num_devices_requested] request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] device_mesh = ttnn.open_device_mesh( @@ -183,8 +176,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic except (ValueError, AttributeError): num_pcie_devices_requested = len(device_ids) - request.node.device_ids = device_ids[:num_pcie_devices_requested] - request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]] + request.node.pci_ids = device_ids[:num_pcie_devices_requested] device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested], **device_params @@ -213,7 +205,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device except (ValueError, AttributeError): num_devices_requested = len(device_ids) - request.node.device_ids = device_ids[:num_devices_requested] request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] device_mesh = ttnn.open_device_mesh( @@ -334,13 +325,18 @@ def pytest_addoption(parser): ) parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli") parser.addoption( - "--metal-cleanup", + "--metal-timeout", action="store", default=None, help="Enable process timeout", ) +@pytest.fixture +def 
input_path(request):
+    return request.config.getoption("--input-path")
+
+
 def pytest_generate_tests(metafunc):
     """
     This is not a standard docstring.
@@ -473,14 +469,15 @@ def pytest_runtest_makereport(item, call):
 @pytest.hookimpl(hookwrapper=True)
 def pytest_runtest_teardown(item, nextitem):
     yield
-    metal_cleanup_enabled = item.config.getoption("--metal-cleanup")
-    if metal_cleanup_enabled is not None:
+    metal_timeout_enabled = item.config.getoption("--metal-timeout")
+    using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0"))
+
+    if metal_timeout_enabled is not None or using_xdist:
         report = item.stash[phase_report_key]
         test_failed = report.get("call", None) and report["call"].failed
         if test_failed:
-            logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}")
-            # reset_tensix(set(item.pci_ids))
-            reset_tensix()
+            logger.info(f"In custom teardown, open device ids: {set(item.pci_ids)}")
+            reset_tensix(set(item.pci_ids))


 # This is overriding the timer setup hook from pytest-timeout
@@ -488,10 +485,12 @@ def pytest_runtest_teardown(item, nextitem):
 # At timeout, the process kills its parent (the test process) and then itself
 @pytest.hookimpl(tryfirst=True)
 def pytest_timeout_set_timer(item, settings):
-    metal_timeout_enabled = item.config.getoption("--metal-cleanup")
-    if metal_timeout_enabled is not None:
+    metal_timeout_enabled = item.config.getoption("--metal-timeout")
+    using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0"))
+
+    if metal_timeout_enabled is not None or using_xdist:
         parent_pid = os.getpid()
-        logger.info(f"Metal timeout {settings.timeout} seconds")
+        logger.info(f"Metal timeout {settings.timeout} seconds {parent_pid} for {item.nodeid}")

         def get_parent_status():
             try:
@@ -501,12 +500,15 @@ def get_parent_status():
             return parent.status()

         def run_timer(settings):
+            logger.info(f"Timer started for {item.nodeid}")
             dead_status = ["zombie", "dead", "already dead"]
             timeout = settings.timeout
-            while get_parent_status() not in dead_status and timeout > 0:
-                time.sleep(1)
-                timeout -= 1
-            if get_parent_status() != "already dead":
+            parent_status = "running"
+            while parent_status not in dead_status and timeout > 0:
+                time.sleep(5)
+                timeout -= 5
+                parent_status = get_parent_status()
+            if parent_status != "already dead":
                 logger.info(f"Timing out test case")
                 os.kill(parent_pid, signal.SIGKILL)
             logger.info(f"Killing timer")
@@ -519,13 +521,12 @@ def cancel():
     metal_timer = multiprocess.Process(target=run_timer, args=(settings,), daemon=True)
     item.cancel_timeout = cancel
     metal_timer.start()
-    # logger.info(f"parent and metal timer pid: {parent_pid} {metal_timer.pid}")
     return True


 # This is a hook used in pytest-xdist to handle when a worker crashes out
 # In our case, combined with the pytest-timeout thread method, the worker will crash out for a hang and
-# then it should get cleaned up by the controller through this fixture :fingers_crossed:
+# then it should get cleaned up by the controller through this fixture
 @pytest.hookimpl(tryfirst=True)
 def pytest_handlecrashitem(crashitem, report, sched):
     reset_tensix()
@@ -542,10 +543,9 @@ def reset_tensix(tt_open_devices=None):
         smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh")
     else:
         tt_open_devices_str = ",".join([str(i) for i in tt_open_devices])
-        check_smi = run_process_and_get_result("tt-smi-metal -h")
-        logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}")
+        check_smi_metal = run_process_and_get_result("tt-smi-metal -h")
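+        # Probe for the tt-smi-metal wrapper; a nonzero return code from the
+        # '-h' probe means it is absent, so we fall back to plain tt-smi below.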
logger.info(f"Running reset for pci devices: {tt_open_devices_str}") - if check_smi.returncode > 0: + if check_smi_metal.returncode > 0: logger.info(f"Test failed - resetting {arch} with tt-smi") smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}") else: @@ -555,5 +555,4 @@ def reset_tensix(tt_open_devices=None): @pytest.hookimpl(tryfirst=True) def pytest_xdist_auto_num_workers(config): - logger.info("getting num of xdist workers") return 1 diff --git a/pytest.ini b/pytest.ini index c8f8a206f75..699ef215218 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] timeout = 300 minversion = 7.2 -addopts = --import-mode=importlib -vs -rA +addopts = --import-mode=importlib -vvs -rA --durations=0 empty_parameter_set_mark = skip markers = post_commit: mark tests to run on post-commit diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index 754bcbc9ab1..91567864538 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -1,6 +1,6 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 @@ -11,19 +11,19 @@ run_perf_models_other() { local tt_arch=$1 local test_marker=$2 - env pytest tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker + env pytest -n auto tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker - env pytest tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker + env pytest -n auto tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker - env pytest models/demos/ttnn_falcon7b/tests -m $test_marker + env pytest -n auto models/demos/ttnn_falcon7b/tests -m $test_marker # Separate calls since we can't mix switching between number of cqs - env pytest models/demos/resnet/tests/test_perf_resnet.py -m $test_marker - env pytest models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker + env pytest -n auto models/demos/resnet/tests/test_perf_resnet.py -m $test_marker + env pytest -n auto models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker - env pytest tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker + env pytest -n auto tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker - env pytest models/demos/metal_BERT_large_11/tests -m $test_marker + env pytest -n auto models/demos/metal_BERT_large_11/tests -m $test_marker ## Merge all the generated reports env python models/perf/merge_perf_results.py @@ -33,13 +33,13 @@ run_perf_models_llm_javelin() { local tt_arch=$1 local test_marker=$2 - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m $test_marker + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m $test_marker if [ "$tt_arch" == "wormhole_b0" ]; then - env pytest models/demos/mamba/tests -m $test_marker --timeout=360 + env pytest -n auto models/demos/mamba/tests -m $test_marker --timeout=360 fi - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mistral7b/tests -m $test_marker --timeout=360 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/mistral7b/tests -m $test_marker --timeout=360 ## Merge all the generated reports env python models/perf/merge_perf_results.py @@ -50,7 +50,7 @@ run_perf_models_cnn_javelin() { local test_marker=$2 # Run tests - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest 
tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480 #env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests -m $test_marker ## Merge all the generated reports @@ -58,6 +58,7 @@ run_perf_models_cnn_javelin() { } run_device_perf_models() { + set -eo pipefail local test_marker=$1 env pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=600 diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 334b68b71fd..ebd25264b9c 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -81,7 +81,7 @@ run_frequent_api_pipeline_tests() { ./tests/scripts/run_python_api_unit_tests.sh else if [[ $tt_arch == "wormhole_b0" ]]; then - pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly + pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly else echo "API tests are not available for fast dispatch because they're already covered in post-commit" fi diff --git a/tests/scripts/single_card/nightly/run_common_models.sh b/tests/scripts/single_card/nightly/run_common_models.sh index 17ca8c4d3cf..19e090065f3 100755 --- a/tests/scripts/single_card/nightly/run_common_models.sh +++ b/tests/scripts/single_card/nightly/run_common_models.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running common models for archs" -env pytest tests/nightly/common_models/ +env pytest -n auto tests/nightly/common_models/ ; fail+=$? + +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_gs_only.sh b/tests/scripts/single_card/nightly/run_gs_only.sh index c5bcc9f9745..bad5b98ea40 100755 --- a/tests/scripts/single_card/nightly/run_gs_only.sh +++ b/tests/scripts/single_card/nightly/run_gs_only.sh @@ -1,14 +1,19 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running model nightly tests for GS only" -env pytest models/demos/resnet/tests/test_metal_resnet50_performant.py +env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_performant.py ; fail+=$? -env pytest models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py +env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py ; fail+=$? + +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_ttnn.sh b/tests/scripts/single_card/nightly/run_ttnn.sh index f0bb3f9cadc..a41836173de 100755 --- a/tests/scripts/single_card/nightly/run_ttnn.sh +++ b/tests/scripts/single_card/nightly/run_ttnn.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running ttnn nightly tests for GS only" -env pytest tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal" +env pytest -n auto tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal" ; fail+=$? 
+ +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_wh_b0_only.sh b/tests/scripts/single_card/nightly/run_wh_b0_only.sh index d30894713c1..5ae9f0657cb 100755 --- a/tests/scripts/single_card/nightly/run_wh_b0_only.sh +++ b/tests/scripts/single_card/nightly/run_wh_b0_only.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running nightly tests for WH B0 only" -env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/nightly/wh_b0_only_eth -env pytest tests/nightly/wh_b0_only \ No newline at end of file +env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/nightly/wh_b0_only_eth ; fail+=$? +env pytest -n auto tests/nightly/wh_b0_only ; fail+=$? + +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh b/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh index 079087d6e69..35895a64208 100755 --- a/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh +++ b/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running unstable nightly tests for WH B0 only" -SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest tests/ttnn/integration_tests/stable_diffusion +SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest -n auto tests/ttnn/integration_tests/stable_diffusion ; fail+=$? + +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/t3000/run_t3000_demo_tests.sh b/tests/scripts/t3000/run_t3000_demo_tests.sh index 96a05371beb..fa050429ddb 100755 --- a/tests/scripts/t3000/run_t3000_demo_tests.sh +++ b/tests/scripts/t3000/run_t3000_demo_tests.sh @@ -1,23 +1,27 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_falcon40b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" # Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py --timeout=720 + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py --timeout=720 ; fail+=$? # Falcon40B end to end demo (prefill + decode) - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_loops.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_loops.py ; fail+=$? 
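+
+    # With 'set -eo pipefail' disabled at the top of this script, each pytest
+    # exit status is accumulated into $fail so the remaining tests still run;
+    # the function exits nonzero at the end if any of them failed.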
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llama3_70b_tests() { @@ -38,39 +42,47 @@ run_t3000_llama3_70b_tests() { run_t3000_falcon7b_tests(){ # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon7b_tests" # Falcon7B demo (perf verification for 128/1024/2048 seq lens and output token verification) - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_128_stochastic_verify] - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_1024_stochastic_verify] - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_2048_stochastic_verify] - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-default_mode_1024_greedy_verify] + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_128_stochastic_verify] ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_1024_stochastic_verify] ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_2048_stochastic_verify] ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-default_mode_1024_greedy_verify] ; fail+=$? # Falcon7B perplexity test (prefill and decode) - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-prefill_seq1024_dram] --timeout=720 - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-decode_1024_l1_sharded] --timeout=720 + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-prefill_seq1024_dram] --timeout=720 ; fail+=$? 
+ WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-decode_1024_l1_sharded] --timeout=720 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral8x7b_tests" # mixtral8x7b 8 chip demo test - 100 token generation with general weights (env flags set inside the test) - pytest models/demos/t3000/mixtral8x7b/demo/demo.py::test_mixtral8x7b_demo[wormhole_b0-True-general_weights] --timeout=720 + pytest -n auto models/demos/t3000/mixtral8x7b/demo/demo.py::test_mixtral8x7b_demo[wormhole_b0-True-general_weights] --timeout=720 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tests() { @@ -87,6 +99,7 @@ run_t3000_tests() { run_t3000_mixtral_tests } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -109,6 +122,10 @@ main() { export PYTHONPATH=$TT_METAL_HOME run_t3000_tests + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tests/scripts/t3000/run_t3000_frequent_tests.sh b/tests/scripts/t3000/run_t3000_frequent_tests.sh index 37abf05e64d..cab852813ef 100755 --- a/tests/scripts/t3000/run_t3000_frequent_tests.sh +++ b/tests/scripts/t3000/run_t3000_frequent_tests.sh @@ -1,99 +1,122 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_ethernet_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_ethernet_tests" - pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py - pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py + pytest -n auto tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py ; fail+=$? + pytest -n auto tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_ethernet_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llama2_70b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_llama2_70b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py --timeout=900 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py ; fail+=$? + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py ; fail+=$? 
+ env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py ; fail+=$? + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py --timeout=900 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_llama2_70b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral_tests" # mixtral8x7b 8 chip decode model test (env flags set inside the test) - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-10-1-pcc] + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-10-1-pcc] ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tteager_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_tteager_tests" - pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit - pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_reduce_scatter_post_commit.py + pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit ; fail+=$? + pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_reduce_scatter_post_commit.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_tteager_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_trace_stress_tests() { + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_trace_stress_tests" - - NUM_TRACE_LOOPS=15 pytest tests/ttnn/unit_tests/test_multi_device_trace.py - NUM_TRACE_LOOPS=15 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py + NUM_TRACE_LOOPS=15 pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? + NUM_TRACE_LOOPS=15 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_trace_stress_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon40b_tests() { + fail=0 # Record the start time start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_mlp.py - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_attention.py --timeout=480 - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_decoder.py --timeout=480 - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_causallm.py --timeout=600 + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_mlp.py ; fail+=$? 
+ WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_attention.py --timeout=480 ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_decoder.py --timeout=480 ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_causallm.py --timeout=600 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tests() { @@ -103,9 +126,6 @@ run_t3000_tests() { # Run tteager tests run_t3000_tteager_tests - # Run trace tests - run_t3000_trace_stress_tests - # Run falcon40b tests run_t3000_falcon40b_tests @@ -115,8 +135,12 @@ run_t3000_tests() { # Run mixtral tests run_t3000_mixtral_tests + # Run trace tests + run_t3000_trace_stress_tests + } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -139,6 +163,10 @@ main() { export PYTHONPATH=$TT_METAL_HOME run_t3000_tests + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh index 6140b9efeaf..6f97b8e7636 100755 --- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -1,61 +1,77 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_falcon7b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon7b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m "model_perf_t3000" + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m "model_perf_t3000" ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral_tests" - env pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py -m "model_perf_t3000" + env pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py -m "model_perf_t3000" ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llama2_70b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_llama2_70b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" --timeout=600 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" --timeout=600 ; fail+=$? 
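+
+    # Note: --timeout=600 overrides the 300 second default from pytest.ini for
+    # this long-running perf test; the custom timer in conftest.py picks up the
+    # same per-test value.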
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_llama2_70b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon40b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_perf_falcon.py -m "model_perf_t3000" --timeout=600 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_perf_falcon.py -m "model_perf_t3000" --timeout=600 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llm_tests() { @@ -80,6 +96,7 @@ run_t3000_cnn_tests() { env python models/perf/merge_perf_results.py } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -129,6 +146,10 @@ main() { echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device])" 2>&1 exit 1 fi + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index a8019137642..64c23ed2b48 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -1,66 +1,79 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_ttmetal_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_ttmetal_tests" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" - TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" ; fail+=$? + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" ; fail+=$? + ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" ; fail+=$? 
+ ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_ttmetal_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_ttnn_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_ttnn_tests" - pytest tests/ttnn/unit_tests/test_multi_device_trace.py - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py - pytest tests/ttnn/unit_tests/test_multi_device.py - pytest tests/ttnn/unit_tests/test_multi_device_async.py + pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? + pytest -n auto tests/ttnn/unit_tests/test_multi_device.py ; fail+=$? + pytest -n auto tests/ttnn/unit_tests/test_multi_device_async.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_ttnn_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon7b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon7b_tests" - pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py - pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py - pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py + pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py ; fail+=$? + pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py ; fail+=$? + pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py ; fail+=$? #pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon40b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py ; fail+=$? # Record the end time end_time=$(date +%s) @@ -70,21 +83,25 @@ run_t3000_falcon40b_tests() { run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral_tests" - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py ; fail+=$? 
+ pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tests() { @@ -104,6 +121,7 @@ run_t3000_tests() { run_t3000_mixtral_tests } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -126,6 +144,10 @@ main() { export PYTHONPATH=$TT_METAL_HOME run_t3000_tests + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tt_metal/python_env/requirements-dev.txt b/tt_metal/python_env/requirements-dev.txt index f7f90202919..5a6cf7ebb88 100644 --- a/tt_metal/python_env/requirements-dev.txt +++ b/tt_metal/python_env/requirements-dev.txt @@ -21,6 +21,7 @@ mypy==1.9.0 pytest==7.2.2 pytest-timeout==2.2.0 pytest-split==0.8.2 +pytest-xdist==3.6.1 jsbeautifier==1.14.7 datasets==2.9.0 torch==2.2.1.0+cpu
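
--
Usage sketch (illustrative only, not part of the applied diff; the test paths
below are placeholders). With the conftest.py hooks above, the watchdog and
reset-on-failure behaviour is keyed off PYTEST_XDIST_WORKER_COUNT, which
pytest-xdist exports to its workers:

  # xdist run: the per-test timeout and tensix reset after a failed test
  # are enabled by default
  pytest -n auto tests/ttnn/integration_tests

  # serial run: opt in explicitly; any value passed to --metal-timeout enables it
  pytest --metal-timeout 1 tests/ttnn/integration_tests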