Skip to content

Commit

Permalink
#8729: xdist + reset mechanism on fd nightly, model perf, all t3k (except profiler)
Browse files Browse the repository at this point in the history

- enable the timeout mechanism by default when running under xdist; pass the '--metal-timeout' flag to enable it when not using xdist
- increase GH actions timeout for xdist (review)
- get timings of each test and set global timeout to 5 mins (review)
- add custom timeouts to nightly + t3k pipelines + post-commit (review)
  • Loading branch information
vtangTT committed Jun 26, 2024
1 parent 79b36a1 commit 961053c
Show file tree
Hide file tree
Showing 18 changed files with 178 additions and 117 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
{ name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 30 },
{ name: "API tests GS", arch: grayskull, cmd: ./tests/scripts/run_tests.sh --tt-arch grayskull --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 },
{ name: "API tests N300 WH B0", arch: wormhole_b0, cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 },
{ name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 35 },
{ name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 45 },
]
name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }}
env:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/perf-models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
- uses: ./.github/actions/install-python-deps
- name: Run performance regressions
id: performance_tests
timeout-minutes: 30
timeout-minutes: 40
run: |
source ${{ github.workspace }}/python_env/bin/activate
./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/t3000-demo-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run demo regression tests
shell: bash {0}
timeout-minutes: 180
run: |
source ${{ github.workspace }}/python_env/bin/activate
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/t3000-model-perf-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:
run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar
- uses: ./.github/actions/install-python-deps
- name: Run model perf regression tests
timeout-minutes: 60
timeout-minutes: 75
run: |
source ${{ github.workspace }}/python_env/bin/activate
cd $TT_METAL_HOME
Expand Down
51 changes: 26 additions & 25 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,6 @@ def device(request, device_params):
import tt_lib as ttl

device_id = request.config.getoption("device_id")

request.node.device_ids = [device_id]
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)]

num_devices = ttl.device.GetNumPCIeDevices()
Expand All @@ -108,8 +106,6 @@ def pcie_devices(request, device_params):

num_devices = ttl.device.GetNumPCIeDevices()
device_ids = [i for i in range(num_devices)]

request.node.device_ids = device_ids
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids]

# Get only physical devices
Expand All @@ -129,8 +125,6 @@ def all_devices(request, device_params):

num_devices = ttl.device.GetNumAvailableDevices()
device_ids = [i for i in range(num_devices)]

request.node.device_ids = device_ids
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids]

# Get only physical devices
Expand All @@ -155,7 +149,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par
except (ValueError, AttributeError):
num_devices_requested = len(device_ids)

request.node.device_ids = device_ids[:num_devices_requested]
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]]

device_mesh = ttnn.open_device_mesh(
Expand Down Expand Up @@ -183,7 +176,6 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic
except (ValueError, AttributeError):
num_pcie_devices_requested = len(device_ids)

request.node.device_ids = device_ids[:num_pcie_devices_requested]
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]]

device_mesh = ttnn.open_device_mesh(
Expand Down Expand Up @@ -213,7 +205,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device
except (ValueError, AttributeError):
num_devices_requested = len(device_ids)

request.node.device_ids = device_ids[:num_devices_requested]
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]]

device_mesh = ttnn.open_device_mesh(
Expand Down Expand Up @@ -334,13 +325,18 @@ def pytest_addoption(parser):
)
parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli")
parser.addoption(
"--metal-cleanup",
"--metal-timeout",
action="store",
default=None,
help="Enable process timeout",
)


@pytest.fixture
def input_path(request):
    """Fixture exposing the value given for the ``--input-path`` command-line option."""
    pytest_config = request.config
    return pytest_config.getoption("--input-path")


def pytest_generate_tests(metafunc):
"""
This is not a standard docstring.
Expand Down Expand Up @@ -473,25 +469,28 @@ def pytest_runtest_makereport(item, call):
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_teardown(item, nextitem):
yield
metal_cleanup_enabled = item.config.getoption("--metal-cleanup")
if metal_cleanup_enabled is not None:
metal_timeout_enabled = item.config.getoption("--metal-timeout")
using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0"))

if metal_timeout_enabled is not None or using_xdist:
report = item.stash[phase_report_key]
test_failed = report.get("call", None) and report["call"].failed
if test_failed:
logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}")
# reset_tensix(set(item.pci_ids))
reset_tensix()
logger.info(f"In custom teardown, open device ids: {set(item.pci_ids)}")
reset_tensix(set(item.pci_ids))


# This overrides the timer setup hook from pytest-timeout.
# If --metal-timeout is passed, or the tests are running under pytest-xdist, we define a new timeout method that spawns a timer process.
# At timeout, the timer process kills its parent (the test process) and then itself.
@pytest.hookimpl(tryfirst=True)
def pytest_timeout_set_timer(item, settings):
metal_timeout_enabled = item.config.getoption("--metal-cleanup")
if metal_timeout_enabled is not None:
metal_timeout_enabled = item.config.getoption("--metal-timeout")
using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0"))

if metal_timeout_enabled is not None or using_xdist:
parent_pid = os.getpid()
logger.info(f"Metal timeout {settings.timeout} seconds")
logger.info(f"Metal timeout {settings.timeout} seconds {parent_pid} for {item.nodeid}")

def get_parent_status():
try:
Expand All @@ -501,12 +500,15 @@ def get_parent_status():
return parent.status()

def run_timer(settings):
logger.info(f"Timer started for {item.nodeid}")
dead_status = ["zombie", "dead", "already dead"]
timeout = settings.timeout
while get_parent_status() not in dead_status and timeout > 0:
time.sleep(1)
timeout -= 1
if get_parent_status() != "already dead":
parent_status = "running"
while parent_status not in dead_status and timeout > 0:
time.sleep(5)
timeout -= 5
parent_status = get_parent_status()
if parent_status != "already dead":
logger.info(f"Timing out test case")
os.kill(parent_pid, signal.SIGKILL)
logger.info(f"Killing timer")
Expand Down Expand Up @@ -542,10 +544,9 @@ def reset_tensix(tt_open_devices=None):
smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh")
else:
tt_open_devices_str = ",".join([str(i) for i in tt_open_devices])
check_smi = run_process_and_get_result("tt-smi-metal -h")
logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}")
check_smi_metal = run_process_and_get_result("tt-smi-metal -h")
logger.info(f"Running reset for pci devices: {tt_open_devices_str}")
if check_smi.returncode > 0:
if check_smi_metal.returncode > 0:
logger.info(f"Test failed - resetting {arch} with tt-smi")
smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}")
else:
Expand Down
4 changes: 2 additions & 2 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[pytest]
timeout = 2400
timeout = 300
minversion = 7.2
addopts = --import-mode=importlib -vs -rA
addopts = --import-mode=importlib -vvs -rA --durations=0
empty_parameter_set_mark = skip
markers =
post_commit: mark tests to run on post-commit
Expand Down
25 changes: 13 additions & 12 deletions tests/scripts/run_performance.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#/bin/bash

set -eo pipefail
# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
Expand All @@ -11,19 +11,19 @@ run_perf_models_other() {
local tt_arch=$1
local test_marker=$2

env pytest tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker
env pytest -n auto tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker

env pytest tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker
env pytest -n auto tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker

env pytest models/demos/ttnn_falcon7b/tests -m $test_marker
env pytest -n auto models/demos/ttnn_falcon7b/tests -m $test_marker

# Separate calls since we can't mix switching between number of cqs
env pytest models/demos/resnet/tests/test_perf_resnet.py -m $test_marker
env pytest models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker
env pytest -n auto models/demos/resnet/tests/test_perf_resnet.py -m $test_marker
env pytest -n auto models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker

env pytest tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker
env pytest -n auto tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker

env pytest models/demos/metal_BERT_large_11/tests -m $test_marker
env pytest -n auto models/demos/metal_BERT_large_11/tests -m $test_marker

## Merge all the generated reports
env python models/perf/merge_perf_results.py
Expand All @@ -33,13 +33,13 @@ run_perf_models_llm_javelin() {
local tt_arch=$1
local test_marker=$2

env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m $test_marker
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m $test_marker

if [ "$tt_arch" == "wormhole_b0" ]; then
env pytest models/demos/mamba/tests -m $test_marker
env pytest -n auto models/demos/mamba/tests -m $test_marker
fi

env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mistral7b/tests -m $test_marker
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/mistral7b/tests -m $test_marker

## Merge all the generated reports
env python models/perf/merge_perf_results.py
Expand All @@ -50,14 +50,15 @@ run_perf_models_cnn_javelin() {
local test_marker=$2

# Run tests
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/device_perf_tests/stable_diffusion -m $test_marker
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/device_perf_tests/stable_diffusion -m $test_marker
#env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests -m $test_marker

## Merge all the generated reports
env python models/perf/merge_perf_results.py
}

run_device_perf_models() {
set -eo pipefail
local test_marker=$1

env pytest tests/device_perf_tests/stable_diffusion -m $test_marker
Expand Down
2 changes: 1 addition & 1 deletion tests/scripts/run_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ run_frequent_api_pipeline_tests() {
./tests/scripts/run_python_api_unit_tests.sh
else
if [[ $tt_arch == "wormhole_b0" ]]; then
pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly
pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly
else
echo "API tests are not available for fast dispatch because they're already covered in post-commit"
fi
Expand Down
9 changes: 7 additions & 2 deletions tests/scripts/single_card/nightly/run_common_models.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
#/bin/bash

set -eo pipefail
# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
fail=0

echo "Running common models for archs"

env pytest tests/nightly/common_models/
env pytest -n auto tests/nightly/common_models/ ; fail+=$?

if [[ $fail -ne 0 ]]; then
exit 1
fi
11 changes: 8 additions & 3 deletions tests/scripts/single_card/nightly/run_gs_only.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
#/bin/bash

set -eo pipefail
# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
fail=0

echo "Running model nightly tests for GS only"

env pytest models/demos/resnet/tests/test_metal_resnet50_performant.py
env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_performant.py ; fail+=$?

env pytest models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py
env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py ; fail+=$?

if [[ $fail -ne 0 ]]; then
exit 1
fi
9 changes: 7 additions & 2 deletions tests/scripts/single_card/nightly/run_ttnn.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
#/bin/bash

set -eo pipefail
# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
fail=0

echo "Running ttnn nightly tests for GS only"

env pytest tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal"
env pytest -n auto tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal" ; fail+=$?

if [[ $fail -ne 0 ]]; then
exit 1
fi
11 changes: 8 additions & 3 deletions tests/scripts/single_card/nightly/run_wh_b0_only.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
#/bin/bash

set -eo pipefail
# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
fail=0

echo "Running nightly tests for WH B0 only"
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/nightly/wh_b0_only_eth
env pytest tests/nightly/wh_b0_only
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/nightly/wh_b0_only_eth ; fail+=$?
env pytest -n auto tests/nightly/wh_b0_only ; fail+=$?

if [[ $fail -ne 0 ]]; then
exit 1
fi
9 changes: 7 additions & 2 deletions tests/scripts/single_card/nightly/run_wh_b0_unstable.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
#/bin/bash

set -eo pipefail
# set -eo pipefail

if [[ -z "$TT_METAL_HOME" ]]; then
echo "Must provide TT_METAL_HOME in environment" 1>&2
exit 1
fi
fail=0

echo "Running unstable nightly tests for WH B0 only"

SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest tests/ttnn/integration_tests/stable_diffusion
SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest -n auto tests/ttnn/integration_tests/stable_diffusion ; fail+=$?

if [[ $fail -ne 0 ]]; then
exit 1
fi
Loading

0 comments on commit 961053c

Please sign in to comment.