
#8729: xdist + reset mechanism on fd nightly, model perf, all t3k (except profiler)

- enable the timeout mechanism by default when using xdist; use the '--metal-timeout' flag to enable it when not running under xdist (see the usage sketch after this list)
- increase GH actions timeout for xdist (review)
- get timings of each test and set global timeout to 5 mins (review)
- add custom timeouts to nightly + t3k pipelines + post-commit (review)
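
How the opt-in works in practice (a hedged sketch; the test path is illustrative, while the flag and xdist behavior come from this commit's conftest.py changes):

    # Under pytest-xdist the timeout/reset mechanism is now on by default:
    pytest -n auto tests/ttnn/integration_tests

    # Without xdist, opt in explicitly; the hooks only check that the
    # option is not None, so any value enables it:
    pytest --metal-timeout=1 tests/ttnn/integration_tests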
vtangTT authored and TT-billteng committed Jun 28, 2024
1 parent d954e76 commit d525b17
Showing 20 changed files with 238 additions and 120 deletions.
@@ -26,7 +26,7 @@ jobs:
{ name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 30 },
{ name: "API tests GS", arch: grayskull, cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 },
{ name: "API tests N300 WH B0", arch: wormhole_b0, cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 },
{ name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 35 },
{ name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 45 },
]
name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
env:
2 changes: 1 addition & 1 deletion .github/workflows/perf-models.yaml
@@ -52,7 +52,7 @@ jobs:
- uses: ./.github/actions/install-python-deps
- name: Run performance regressions
id: performance_tests
-timeout-minutes: 30
+timeout-minutes: 40
run: |
source ${{ github.workspace }}/python_env/bin/activate
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }}
3 changes: 2 additions & 1 deletion .github/workflows/t3000-demo-tests.yaml
@@ -17,7 +17,7 @@ jobs:
fail-fast: false
matrix:
test-group: [
{ name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 40, owner_id: U044T8U8DEF}, #Johanna Rock
{ name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 50, owner_id: U044T8U8DEF}, #Johanna Rock
{ name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "t3k falcon7b tests", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 90, owner_id: U05RWH3QUPM}, #Salar Hosseini
{ name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 30, owner_id: U03PUAKE719}, # Miguel Tairum
@@ -46,6 +46,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
+shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
source ${{ github.workspace }}/python_env/bin/activate
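Note on the 'shell: bash {0}' line added above and in the t3000 workflows that follow: a plain 'run:' step on a Linux runner executes with 'bash -e {0}', so the first failing command aborts the step; overriding the shell to 'bash {0}' drops '-e', presumably so a failing test command still lets the reworked scripts' own failure accounting and the new timeout/reset teardown run to completion.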
1 change: 1 addition & 0 deletions .github/workflows/t3000-frequent-tests.yaml
@@ -42,6 +42,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run frequent regression tests
+shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
source ${{ github.workspace }}/python_env/bin/activate
9 changes: 5 additions & 4 deletions .github/workflows/t3000-model-perf-tests.yaml
@@ -17,10 +17,10 @@ jobs:
fail-fast: false
matrix:
test-group: [
{ name: "t3k LLM falcon7b model perf tests", model: "falcob7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 60, owner_id: S07AJBTLX2L}, #Model Falcon
{ name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum
{ name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 60, owner_id: S07AJBTLX2L}, # Model Falcon
{ name: "t3k LLM falcon7b model perf tests", model: "falcob7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 75, owner_id: S07AJBTLX2L}, #Model Falcon
{ name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 75, owner_id: U03PUAKE719}, # Miguel Tairum
{ name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 75, owner_id: U03FJB5TM5Y}, #Colman Glagovich
{ name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: S07AJBTLX2L}, # Model Falcon
#{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run?
]
name: ${{ matrix.test-group.name }}
@@ -52,6 +52,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
+shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
source ${{ github.workspace }}/python_env/bin/activate
1 change: 1 addition & 0 deletions .github/workflows/t3000-unit-tests.yaml
@@ -43,6 +43,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run unit regression tests
+shell: bash {0}
timeout-minutes: ${{ matrix.test-group.timeout }}
run: |
source ${{ github.workspace }}/python_env/bin/activate
59 changes: 29 additions & 30 deletions conftest.py
@@ -85,8 +85,6 @@ def device(request, device_params):
import tt_lib as ttl

device_id = request.config.getoption("device_id")

-request.node.device_ids = [device_id]
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)]

num_devices = ttl.device.GetNumPCIeDevices()
@@ -108,9 +106,7 @@ def pcie_devices(request, device_params):

num_devices = ttl.device.GetNumPCIeDevices()
device_ids = [i for i in range(num_devices)]

-request.node.device_ids = device_ids
-request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids]
+request.node.pci_ids = device_ids

# Get only physical devices
devices = ttl.device.CreateDevices(device_ids, **device_params)
@@ -129,8 +125,6 @@ def all_devices(request, device_params):

num_devices = ttl.device.GetNumAvailableDevices()
device_ids = [i for i in range(num_devices)]

-request.node.device_ids = device_ids
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids]

# Get only physical devices
@@ -155,7 +149,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
except (ValueError, AttributeError):
num_devices_requested = len(device_ids)

-request.node.device_ids = device_ids[:num_devices_requested]
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]]

device_mesh = ttnn.open_device_mesh(
@@ -183,8 +176,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
except (ValueError, AttributeError):
num_pcie_devices_requested = len(device_ids)

-request.node.device_ids = device_ids[:num_pcie_devices_requested]
-request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]]
+request.node.pci_ids = device_ids[:num_pcie_devices_requested]

device_mesh = ttnn.open_device_mesh(
ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested], **device_params
@@ -213,7 +205,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params):
except (ValueError, AttributeError):
num_devices_requested = len(device_ids)

-request.node.device_ids = device_ids[:num_devices_requested]
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]]

device_mesh = ttnn.open_device_mesh(
@@ -334,13 +325,18 @@ def pytest_addoption(parser):
)
parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli")
parser.addoption(
"--metal-cleanup",
"--metal-timeout",
action="store",
default=None,
help="Enable process timeout",
)


+@pytest.fixture
+def input_path(request):
+return request.config.getoption("--input-path")


def pytest_generate_tests(metafunc):
"""
This is not a standard docstring.
@@ -473,25 +469,28 @@ def pytest_runtest_makereport(item, call):
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_teardown(item, nextitem):
yield
-metal_cleanup_enabled = item.config.getoption("--metal-cleanup")
-if metal_cleanup_enabled is not None:
+metal_timeout_enabled = item.config.getoption("--metal-timeout")
+using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0"))
+
+if metal_timeout_enabled is not None or using_xdist:
report = item.stash[phase_report_key]
test_failed = report.get("call", None) and report["call"].failed
if test_failed:
logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}")
# reset_tensix(set(item.pci_ids))
reset_tensix()
logger.info(f"In custom teardown, open device ids: {set(item.pci_ids)}")
reset_tensix(set(item.pci_ids))


# This is overriding the timer setup hook from pytest-timeout
# If --metal-timeout is passed, we define a new timeout method that spawns a timer process
# At timeout, the process kills its parent (the test process) and then itself
@pytest.hookimpl(tryfirst=True)
def pytest_timeout_set_timer(item, settings):
metal_timeout_enabled = item.config.getoption("--metal-cleanup")
if metal_timeout_enabled is not None:
metal_timeout_enabled = item.config.getoption("--metal-timeout")
using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0"))

if metal_timeout_enabled is not None or using_xdist:
parent_pid = os.getpid()
logger.info(f"Metal timeout {settings.timeout} seconds")
logger.info(f"Metal timeout {settings.timeout} seconds {parent_pid} for {item.nodeid}")

def get_parent_status():
try:
@@ -501,12 +500,15 @@ def get_parent_status():
return parent.status()

def run_timer(settings):
logger.info(f"Timer started for {item.nodeid}")
dead_status = ["zombie", "dead", "already dead"]
timeout = settings.timeout
-while get_parent_status() not in dead_status and timeout > 0:
-time.sleep(1)
-timeout -= 1
-if get_parent_status() != "already dead":
+parent_status = "running"
+while parent_status not in dead_status and timeout > 0:
+time.sleep(5)
+timeout -= 5
+parent_status = get_parent_status()
+if parent_status != "already dead":
logger.info(f"Timing out test case")
os.kill(parent_pid, signal.SIGKILL)
logger.info(f"Killing timer")
@@ -519,13 +521,12 @@ def cancel():
metal_timer = multiprocess.Process(target=run_timer, args=(settings,), daemon=True)
item.cancel_timeout = cancel
metal_timer.start()
-# logger.info(f"parent and metal timer pid: {parent_pid} {metal_timer.pid}")
return True


# This is a hook used in pytest-xdist to handle when a worker crashes out
# In our case, combined with pytest-timeout thread method, the worker will crash out for a hang and
-# then it should get cleaned up by the controller through this fixture :fingers_crossed:
+# then it should get cleaned up by the controller through this fixture
@pytest.hookimpl(tryfirst=True)
def pytest_handlecrashitem(crashitem, report, sched):
reset_tensix()
@@ -542,10 +543,9 @@ def reset_tensix(tt_open_devices=None):
smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh")
else:
tt_open_devices_str = ",".join([str(i) for i in tt_open_devices])
-check_smi = run_process_and_get_result("tt-smi-metal -h")
-logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}")
+check_smi_metal = run_process_and_get_result("tt-smi-metal -h")
logger.info(f"Running reset for pci devices: {tt_open_devices_str}")
-if check_smi.returncode > 0:
+if check_smi_metal.returncode > 0:
logger.info(f"Test failed - resetting {arch} with tt-smi")
smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}")
else:
@@ -555,5 +555,4 @@

@pytest.hookimpl(tryfirst=True)
def pytest_xdist_auto_num_workers(config):
logger.info("getting num of xdist workers")
return 1
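
The timer hook above swaps pytest-timeout's in-process timer for a separate watchdog process that polls the test process and SIGKILLs it once the deadline passes. For intuition, the same pattern reads like this as a shell sketch (illustrative only; not code from this commit):

    # Poll the parent every 5 s; SIGKILL it if the deadline expires first.
    # 'kill -0' merely tests that the target process still exists.
    watchdog() {
      local parent=$1 deadline=$2
      while kill -0 "$parent" 2>/dev/null && (( deadline > 0 )); do
        sleep 5
        deadline=$((deadline - 5))
      done
      if kill -0 "$parent" 2>/dev/null; then
        kill -9 "$parent"   # the Python hook does os.kill(parent_pid, signal.SIGKILL)
      fi
    }
    watchdog $$ 300 &       # 300 s mirrors the global pytest timeout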
2 changes: 1 addition & 1 deletion pytest.ini
@@ -1,7 +1,7 @@
[pytest]
timeout = 300
minversion = 7.2
-addopts = --import-mode=importlib -vs -rA
+addopts = --import-mode=importlib -vvs -rA --durations=0
empty_parameter_set_mark = skip
markers =
post_commit: mark tests to run on post-commit
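
Two details here: '--durations=0' makes pytest report the duration of every test rather than only the slowest, which is the "get timings of each test" item from the commit message, and the 'timeout = 300' line above (unchanged in this hunk) is the 5-minute global timeout that message refers to.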
25 changes: 13 additions & 12 deletions tests/scripts/run_performance.sh
@@ -1,6 +1,6 @@
#/bin/bash

-set -eo pipefail
+# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
@@ -11,19 +11,19 @@ run_perf_models_other() {
local tt_arch=$1
local test_marker=$2

-env pytest tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker
+env pytest -n auto tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker

-env pytest tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker
+env pytest -n auto tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker

-env pytest models/demos/ttnn_falcon7b/tests -m $test_marker
+env pytest -n auto models/demos/ttnn_falcon7b/tests -m $test_marker

# Separate calls since we can't mix switching between number of cqs
-env pytest models/demos/resnet/tests/test_perf_resnet.py -m $test_marker
-env pytest models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker
+env pytest -n auto models/demos/resnet/tests/test_perf_resnet.py -m $test_marker
+env pytest -n auto models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker

-env pytest tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker
+env pytest -n auto tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker

-env pytest models/demos/metal_BERT_large_11/tests -m $test_marker
+env pytest -n auto models/demos/metal_BERT_large_11/tests -m $test_marker

## Merge all the generated reports
env python models/perf/merge_perf_results.py
@@ -33,13 +33,13 @@ run_perf_models_llm_javelin() {
local tt_arch=$1
local test_marker=$2

-env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m $test_marker
+env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m $test_marker

if [ "$tt_arch" == "wormhole_b0" ]; then
-env pytest models/demos/mamba/tests -m $test_marker --timeout=360
+env pytest -n auto models/demos/mamba/tests -m $test_marker --timeout=360
fi

-env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mistral7b/tests -m $test_marker --timeout=360
+env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/mistral7b/tests -m $test_marker --timeout=360

## Merge all the generated reports
env python models/perf/merge_perf_results.py
@@ -50,14 +50,15 @@ run_perf_models_cnn_javelin() {
local test_marker=$2

# Run tests
-env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480
+env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480
#env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests -m $test_marker

## Merge all the generated reports
env python models/perf/merge_perf_results.py
}

run_device_perf_models() {
+set -eo pipefail
local test_marker=$1

env pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=600
2 changes: 1 addition & 1 deletion tests/scripts/run_tests.sh
@@ -81,7 +81,7 @@ run_frequent_api_pipeline_tests() {
./tests/scripts/run_python_api_unit_tests.sh
else
if [[ $tt_arch == "wormhole_b0" ]]; then
-pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly
+pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly
else
echo "API tests are not available for fast dispatch because they're already covered in post-commit"
fi
9 changes: 7 additions & 2 deletions tests/scripts/single_card/nightly/run_common_models.sh
@@ -1,12 +1,17 @@
#/bin/bash

-set -eo pipefail
+# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
+fail=0

echo "Running common models for archs"

-env pytest tests/nightly/common_models/
+env pytest -n auto tests/nightly/common_models/ ; fail+=$?

+if [[ $fail -ne 0 ]]; then
+exit 1
+fi
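
A note on the 'fail=0 ... ; fail+=$?' pattern that replaces 'set -eo pipefail' in these nightly scripts: because 'fail' is never declared as an integer, '+=' appends each exit status as a string, and '[[ $fail -ne 0 ]]' then evaluates that string arithmetically, which is nonzero exactly when some suite failed (pytest's exit codes 0-5 keep the digits valid for bash arithmetic). The generic shape, with placeholder command names:

    fail=0
    run_suite_one ; fail+=$?    # keep going even if this suite fails
    run_suite_two ; fail+=$?
    # "000" still evaluates to 0; any nonzero status makes the test true
    if [[ $fail -ne 0 ]]; then
      exit 1
    fi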
11 changes: 8 additions & 3 deletions tests/scripts/single_card/nightly/run_gs_only.sh
@@ -1,14 +1,19 @@
#/bin/bash

-set -eo pipefail
+# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
+fail=0

echo "Running model nightly tests for GS only"

-env pytest models/demos/resnet/tests/test_metal_resnet50_performant.py
+env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_performant.py ; fail+=$?

-env pytest models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py
+env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py ; fail+=$?

+if [[ $fail -ne 0 ]]; then
+exit 1
+fi
9 changes: 7 additions & 2 deletions tests/scripts/single_card/nightly/run_ttnn.sh
@@ -1,12 +1,17 @@
#/bin/bash

-set -eo pipefail
+# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
+fail=0

echo "Running ttnn nightly tests for GS only"

-env pytest tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal"
+env pytest -n auto tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal" ; fail+=$?

+if [[ $fail -ne 0 ]]; then
+exit 1
+fi