From b5e0888819eb18fa15a8f639532bfcf662276b5d Mon Sep 17 00:00:00 2001 From: Akhmed Rakhmati Date: Thu, 27 Jun 2024 03:03:21 +0000 Subject: [PATCH 1/6] #9767: use reflect library in reflection.hpp --- CMakeLists.txt | 11 +-- cmake/dependencies.cmake | 10 ++ tests/ttnn/unit_tests/gtests/CMakeLists.txt | 2 +- tt_eager/tt_dnn/op_library/CMakeLists.txt | 2 +- tt_metal/tools/profiler/op_profiler.hpp | 18 ++-- tt_metal/tt_stl/reflection.hpp | 93 +++++++++++++++---- ttnn/CMakeLists.txt | 2 +- ttnn/cpp/ttnn/device_operation.hpp | 45 +-------- .../eltwise/binary/device/binary_op.hpp | 18 ---- 9 files changed, 99 insertions(+), 102 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 083b369517e8..ee649dc7d86b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,16 +96,6 @@ if (NOT NUMA_LIBRARY) message(FATAL_ERROR "NUMA library not found") endif() - -CPMAddPackage( - NAME reflect - GITHUB_REPOSITORY boost-ext/reflect - GIT_TAG v1.1.1 -) -add_library(reflect INTERFACE) -target_include_directories(reflect SYSTEM INTERFACE ${reflect_SOURCE_DIR}) -add_library(reflect::reflect ALIAS reflect) - ############################################################################################################################ # Constructing interface libs for common compiler flags, header directories, and libraries # These interface libs are linked with PUBLIC scope at lowest common target (tt_metal/common) and at tt_metal_libs level @@ -142,6 +132,7 @@ endif() add_library(metal_header_directories INTERFACE) target_include_directories(metal_header_directories INTERFACE ${PROJECT_SOURCE_DIR}/tt_metal/hw/inc) +target_include_directories(metal_header_directories SYSTEM INTERFACE ${reflect_SOURCE_DIR}) foreach(lib ${BoostPackages}) target_include_directories(metal_header_directories INTERFACE ${Boost${lib}_SOURCE_DIR}/include) endforeach() diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index 622489e7fdd9..025edeae1b89 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -48,3 +48,13 @@ if (googletest_ADDED) target_link_libraries(gtest PRIVATE c++ c++abi) target_link_libraries(gtest_main PRIVATE c++ c++abi) endif() + +############################################################################################################################ +# boost-ext reflect : https://github.com/boost-ext/reflect +############################################################################################################################ + +CPMAddPackage( + NAME reflect + GITHUB_REPOSITORY boost-ext/reflect + GIT_TAG v1.1.1 +) diff --git a/tests/ttnn/unit_tests/gtests/CMakeLists.txt b/tests/ttnn/unit_tests/gtests/CMakeLists.txt index 3bee41905b42..359a0301929b 100644 --- a/tests/ttnn/unit_tests/gtests/CMakeLists.txt +++ b/tests/ttnn/unit_tests/gtests/CMakeLists.txt @@ -9,7 +9,7 @@ set(TTNN_UNIT_TESTS_SRC add_executable(unit_tests_ttnn ${TTNN_UNIT_TESTS_SRC}) -target_link_libraries(unit_tests_ttnn PUBLIC test_common_libs ttnn_lib tt_metal tt_eager reflect::reflect) +target_link_libraries(unit_tests_ttnn PUBLIC test_common_libs ttnn_lib tt_metal tt_eager) target_include_directories(unit_tests_ttnn PRIVATE ${UMD_HOME} ${PROJECT_SOURCE_DIR} diff --git a/tt_eager/tt_dnn/op_library/CMakeLists.txt b/tt_eager/tt_dnn/op_library/CMakeLists.txt index e6be1fe00bc2..6a920b9dc29c 100644 --- a/tt_eager/tt_dnn/op_library/CMakeLists.txt +++ b/tt_eager/tt_dnn/op_library/CMakeLists.txt @@ -220,7 +220,7 @@ set(TT_DNN_SRCS add_library(tt_dnn OBJECT ${TT_DNN_SRCS}) -target_link_libraries(tt_dnn PUBLIC 
metal_header_directories compiler_flags umd_device reflect::reflect) +target_link_libraries(tt_dnn PUBLIC metal_header_directories compiler_flags umd_device) target_include_directories(tt_dnn PUBLIC ${UMD_HOME} ${PROJECT_SOURCE_DIR} diff --git a/tt_metal/tools/profiler/op_profiler.hpp b/tt_metal/tools/profiler/op_profiler.hpp index 09414d8c5878..79c231ea50f9 100644 --- a/tt_metal/tools/profiler/op_profiler.hpp +++ b/tt_metal/tools/profiler/op_profiler.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include @@ -274,17 +275,12 @@ inline json get_base_json( j["op_code"] = opName; json attributesObj; - constexpr auto& attribute_names = std::decay_t::attribute_names; - const auto attribute_values = operation_attributes.attribute_values(); - [&attributesObj, &attribute_names, &attribute_values](std::index_sequence) { - ( - [&attributesObj, &attribute_names, &attribute_values] { - const auto& attribute_name = std::get(attribute_names); - const auto& attribute = std::get(attribute_values); - attributesObj[attribute_name] = fmt::format("{}", attribute); - }(), - ...); - }(std::make_index_sequence>>{}); + reflect::for_each( + [&attributesObj, &operation_attributes](auto I) { + attributesObj[std::string{reflect::member_name(operation_attributes)}] = + fmt::format("{}", reflect::get(operation_attributes)); + }, + operation_attributes); j["attributes"] = attributesObj; std::vector input_tensors; diff --git a/tt_metal/tt_stl/reflection.hpp b/tt_metal/tt_stl/reflection.hpp index 3b225fe47d99..1deb133dfa72 100644 --- a/tt_metal/tt_stl/reflection.hpp +++ b/tt_metal/tt_stl/reflection.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -16,7 +17,6 @@ #include #include "third_party/magic_enum/magic_enum.hpp" - #include "type_name.hpp" namespace tt { @@ -37,9 +37,7 @@ concept IsVariant = requires { typename std::variant_size::type; }; template constexpr auto get_active_type_name_in_variant(const Variant& v) { - return std::visit([](auto&& arg) -> std::string_view { - return short_type_name>; - }, v); + return std::visit([](auto&& arg) -> std::string_view { return short_type_name>; }, v); } // Forward Declare hash_object @@ -397,46 +395,100 @@ std::ostream& operator<<(std::ostream& os, const std::set& set) { return os; } -template - requires std::same_as, to_visit_t> +template + requires std::same_as, object_t> constexpr auto visit_object_of_type(auto callback, T&& value) { callback(value); } -template +template constexpr auto visit_object_of_type(auto callback, const std::optional& value) { if (value.has_value()) { - visit_object_of_type(callback, value.value()); + visit_object_of_type(callback, value.value()); } } -template +template constexpr auto visit_object_of_type(auto callback, const std::vector& value) { for (auto& tensor : value) { - visit_object_of_type(callback, tensor); + visit_object_of_type(callback, tensor); } } -template +template constexpr auto visit_object_of_type(auto callback, const std::array& value) { for (auto& tensor : value) { - visit_object_of_type(callback, tensor); + visit_object_of_type(callback, tensor); } } -template +template constexpr auto visit_object_of_type(auto callback, const std::tuple& value) { constexpr auto num_attributes = sizeof...(Ts); [&callback, &value](std::index_sequence) { - (visit_object_of_type(callback, std::get(value)), ...); + (visit_object_of_type(callback, std::get(value)), ...); }(std::make_index_sequence{}); } -template - requires(not std::same_as, to_visit_t>) and requires { 
std::decay_t::attribute_names; } +template + requires(not std::same_as, object_t>) and requires { std::decay_t::attribute_names; } +constexpr auto visit_object_of_type(auto callback, T&& object) { + constexpr auto num_attributes = std::tuple_size_v::attribute_names)>; + visit_object_of_type(callback, object.attribute_values()); +} + +template + requires(not std::same_as, object_t>) and requires { std::is_aggregate_v>; } constexpr auto visit_object_of_type(auto callback, T&& object) { + reflect::for_each( + [&callback, &object](auto I) { visit_object_of_type(callback, reflect::get(object)); }, object); +} + +template + requires std::same_as, object_t> +constexpr auto get_first_object_of_type(T&& value) { + return std::cref(value); +} + +template +constexpr auto get_first_object_of_type(const std::optional& value) { + if (value.has_value()) { + const auto& tensor = value.value(); + return get_first_object_of_type(tensor); + } +} + +template +constexpr auto get_first_object_of_type(const std::vector& value) { + for (auto& tensor : value) { + return get_first_object_of_type(tensor); + } +} + +template +constexpr auto get_first_object_of_type(const std::array& value) { + for (auto& tensor : value) { + return get_first_object_of_type(tensor); + } +} + +template +constexpr auto get_first_object_of_type(const std::tuple& value) { + constexpr auto num_attributes = sizeof...(Ts); + return get_first_object_of_type(std::get<0>(value)); +} + +template + requires (not std::same_as, object_t>) and requires { std::decay_t::attribute_names; } +constexpr auto get_first_object_of_type(T&& object) { constexpr auto num_attributes = std::tuple_size_v::attribute_names)>; - visit_object_of_type(callback, object.attribute_values()); + return get_first_object_of_type(object.attribute_values()); +} + +template + requires (not std::same_as, object_t>) and requires { std::is_aggregate_v>; } +constexpr auto get_first_object_of_type(T&& object) { + return get_first_object_of_type(reflect::get<0>(object)); } } // namespace reflection @@ -694,6 +746,13 @@ inline hash_t hash_object(const T& object) noexcept { } else { return 0; } + } else if constexpr (std::is_aggregate_v) { + if constexpr (DEBUG_HASH_OBJECT_FUNCTION) { + fmt::print("Hashing struct {} using reflect library: {}\n", get_type_name(), object); + } + std::size_t hash = 0; + reflect::for_each([&hash, &object](auto I) { hash = hash_objects(hash, reflect::get(object)); }, object); + return hash; } else { static_assert(tt::stl::concepts::always_false_v, "Type doesn't support std::hash"); } diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 6d3b1549c428..c8262b3a3a27 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -17,7 +17,7 @@ set(TTNN_SRCS add_library(ttnn_lib OBJECT ${TTNN_SRCS}) target_compile_options(ttnn_lib PUBLIC -MP -Wno-int-to-pointer-cast -fno-var-tracking) target_link_libraries(ttnn_lib - PUBLIC compiler_flags metal_header_directories metal_common_libs reflect::reflect + PUBLIC compiler_flags metal_header_directories metal_common_libs ) target_include_directories(ttnn_lib PUBLIC ${UMD_HOME} diff --git a/ttnn/cpp/ttnn/device_operation.hpp b/ttnn/cpp/ttnn/device_operation.hpp index 652eb88d8d05..ec9b2a93434d 100644 --- a/ttnn/cpp/ttnn/device_operation.hpp +++ b/ttnn/cpp/ttnn/device_operation.hpp @@ -15,7 +15,6 @@ #include "tt_stl/concepts.hpp" #include "tt_stl/reflection.hpp" #include "tt_stl/unique_any.hpp" -#include namespace ttnn { @@ -96,47 +95,6 @@ template return table[i]; } -template - requires std::same_as, Tensor> 
-constexpr auto get_first_tensor(T&& value) { - return std::cref(value); -} - -template -constexpr auto get_first_tensor(const std::optional& value) { - if (value.has_value()) { - const auto& tensor = value.value(); - return get_first_tensor(tensor); - } -} - -template -constexpr auto get_first_tensor(const std::vector& value) { - for (auto& tensor : value) { - return get_first_tensor(tensor); - } -} - -template -constexpr auto get_first_tensor(const std::array& value) { - for (auto& tensor : value) { - return get_first_tensor(tensor); - } -} - -template -constexpr auto get_first_tensor(const std::tuple& value) { - constexpr auto num_attributes = sizeof...(Ts); - return get_first_tensor(std::get<0>(value)); -} - -template - requires requires { std::decay_t::attribute_names; } and (not std::same_as, Tensor>) -constexpr auto get_first_tensor(T&& object) { - constexpr auto num_attributes = std::tuple_size_v::attribute_names)>; - return get_first_tensor(object.attribute_values()); -} - inline const auto USE_FAST_DISPATCH = std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr; template @@ -231,7 +189,8 @@ typename device_operation_t::tensor_return_value_t run( using tensor_return_value_t = typename device_operation_t::tensor_return_value_t; static_assert(not std::same_as, "Operation cannot return type cannot be void"); - auto device = get_first_tensor(tensor_args).get().device(); + // TODO: support the case when tensor args are empty? Or add an overload for that case? + auto device = tt::stl::reflection::get_first_object_of_type(tensor_args).get().device(); auto& program_cache = device->program_cache; auto program_hash = compute_program_hash(operation_attributes, tensor_args); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_op.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_op.hpp index cc4906f4daae..c45bff9fde35 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_op.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_op.hpp @@ -65,29 +65,11 @@ struct Binary { const MemoryConfig memory_config; const DataType dtype; std::optional compute_kernel_config; - - static constexpr auto attribute_names = std::forward_as_tuple( - "binary_op_type", "in_place", "activations", "memory_config", "dtype", "compute_kernel_config"); - const auto attribute_values() const { - return std::forward_as_tuple( - this->binary_op_type, - this->in_place, - this->activations, - this->memory_config, - this->dtype, - this->compute_kernel_config); - } }; struct tensor_args_t { const Tensor& input_tensor_a; const Tensor& input_tensor_b; std::optional output_tensor; - - static constexpr auto attribute_names = - std::forward_as_tuple("input_tensor_a", "input_tensor_b", "output_tensor"); - const auto attribute_values() const { - return std::forward_as_tuple(this->input_tensor_a, this->input_tensor_b, this->output_tensor); - } }; using shape_return_value_t = ttnn::Shape; using tensor_return_value_t = Tensor; From 86ab828d505f4ec23aae119086df11a665808748 Mon Sep 17 00:00:00 2001 From: Raymond Kim <109366641+tt-rkim@users.noreply.github.com> Date: Fri, 28 Jun 2024 16:41:08 -0400 Subject: [PATCH 2/6] #0: Properly delete source folders for wheel testing (#9829) #0: Go to root directory to delete extraneous folders because before we weren't deleting anything --- tests/scripts/set_up_end_to_end_tests_env.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/scripts/set_up_end_to_end_tests_env.sh b/tests/scripts/set_up_end_to_end_tests_env.sh index 27c4d78d8f7a..9a7e1e6c3869 
100755 --- a/tests/scripts/set_up_end_to_end_tests_env.sh +++ b/tests/scripts/set_up_end_to_end_tests_env.sh @@ -21,6 +21,7 @@ set_up_end_to_end_tests_env() { python -m pip install -r requirements.txt python -m pip install ../../metal_libs-*.whl + cd ../../ rm -rf tt_metal tt_eager ttnn models echo "Showing current directory" ls -hal From 2cc5380e0c47556f6552085db159eb660274d986 Mon Sep 17 00:00:00 2001 From: mtairum Date: Fri, 28 Jun 2024 21:01:08 +0000 Subject: [PATCH 3/6] #9479: Update Mixtral perf estimates and clean mixtral unit test --- .../demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py | 10 +++++----- tests/scripts/t3000/run_t3000_model_perf_tests.sh | 2 +- tests/scripts/t3000/run_t3000_unit_tests.sh | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py b/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py index 3732cb21f0ae..0d9717160338 100644 --- a/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py +++ b/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py @@ -43,10 +43,10 @@ def forward(self, x): @pytest.mark.parametrize( "generation_start_pos, expected_compile_time, expected_inference_time", ( - (32, 150, 0.058), # FIXME: Perf regression (issue #9479) - (128, 150, 0.058), # FIXME: Perf regression (issue #9479) - (1024, 150, 0.058), # FIXME: Perf regression (issue #9479) - (2048, 150, 0.058), # FIXME: Perf regression (issue #9479) + (32, 150, 0.075), + (128, 150, 0.075), + (1024, 150, 0.075), + (2048, 150, 0.075), ), ) def test_mixtral_model_perf( @@ -61,7 +61,7 @@ def test_mixtral_model_perf( # Can use dummy_weights=True correctness is not tested, but it is much slower model_args = TtModelArgs(t3k_device_mesh.get_device(0), dummy_weights=False) - model_args.n_layers = 1 + model_args.n_layers = 32 # Clear global profiler state before starting measurements profiler.clear() diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh index 2cf1dc5dcc4e..6140b9efeafd 100755 --- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -22,7 +22,7 @@ run_t3000_mixtral_tests() { echo "LOG_METAL: Running run_t3000_mixtral_tests" - env pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py::test_mixtral_model_perf[wormhole_b0-True-2048-150-0.058] -m "model_perf_t3000" + env pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py -m "model_perf_t3000" # Record the end time end_time=$(date +%s) diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index ea092261a138..a8019137642b 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -80,7 +80,6 @@ run_t3000_mixtral_tests() { pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-1-1-pcc] # Record the end time end_time=$(date +%s) @@ -111,7 +110,7 @@ main() { echo "Script is being sourced, not executing main function" return 0 fi - + if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 From 29efc68bd87ff750679d7feedbc0ff9736316265 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev 
<169092593+ayerofieiev-tt@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:17:07 -0700 Subject: [PATCH 4/6] #0: Added github community issue workflow (#9833) * #0: Added github community issue workflow * #0: Update channel id * #0: Check that webhook works * #0: Try to log some more * #0: Change the check * #0: switched check to an org member * #0: Add label: community to non-org issues * #0: Switch to org check + labels addition --- .github/workflows/on-community-issue.yaml | 37 +++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/on-community-issue.yaml diff --git a/.github/workflows/on-community-issue.yaml b/.github/workflows/on-community-issue.yaml new file mode 100644 index 000000000000..63af9e0c1590 --- /dev/null +++ b/.github/workflows/on-community-issue.yaml @@ -0,0 +1,37 @@ +name: "Slack Notification on Community Issue" + +on: + issues: + types: [opened, labeled] + +jobs: + label-check: + runs-on: ubuntu-latest + + steps: + - name: Check if organization member + id: is_organization_member + uses: JamesSingleton/is-organization-member@1.0.0 + with: + organization: tenstorrent + username: ${{ github.event.issue.user.login }} + token: ${{ secrets.GITHUB_TOKEN }} + - name: Add community label + if: ${{ steps.is_organization_member.outputs.result == 'false' }} + run: gh issue edit "$NUMBER" --add-label "$LABELS" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_REPO: ${{ github.repository }} + NUMBER: ${{ github.event.issue.number }} + LABELS: community + - name: Send Slack Notification + if: ${{ steps.is_organization_member.outputs.result == 'false' }} + uses: slackapi/slack-github-action@v1.26.0 + with: + payload: | + { + "text": "A new issue has been created by a non-org member `${{ github.event.sender.login }}`: ${{ github.event.issue.html_url }}", + "channel": "C07AZJ5DLL8" + } + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_CHANNEL_WEBHOOK_URL }} From d954e7640847df125ce14d51b334c18270b70843 Mon Sep 17 00:00:00 2001 From: Vincent Tang Date: Wed, 22 May 2024 18:53:34 +0000 Subject: [PATCH 5/6] #8729: pytest xdist multiprocess reset mechanism - use custom pyhook from pytest-xdist to run cleanup after timeout - add a new process based timeout method to pytest-timeout - add reset after test fail, use '--metal-timeout' to enable reset mechanism - use pytest-xdist fixture to determine # of workers through '-n auto' - expose get_associated_mmio_device to python and only reset opened tt devices on fail --- conftest.py | 493 ++++++++++++++--------- tt_eager/tt_lib/csrc/tt_lib_bindings.cpp | 4 + tt_metal/host_api.hpp | 2 + tt_metal/tt_metal.cpp | 4 + 4 files changed, 307 insertions(+), 196 deletions(-) diff --git a/conftest.py b/conftest.py index f872bce2998f..c6339ee3ae1b 100644 --- a/conftest.py +++ b/conftest.py @@ -12,6 +12,11 @@ from operator import contains, eq, getitem from pathlib import Path import json +import copy +import multiprocess +import signal +import time +import psutil from loguru import logger @@ -70,191 +75,6 @@ def get_tt_cache_path_(model_version, model_subdir="", default_dir=""): return get_tt_cache_path_ -ALL_ARCHS = set( - [ - "grayskull", - "wormhole_b0", - ] -) - - -def pytest_addoption(parser): - parser.addoption( - "--tt-arch", - choices=[*ALL_ARCHS], - default=os.environ.get("ARCH_NAME", "grayskull"), - help="Target arch, ex. 
grayskull, wormhole_b0", - ) - parser.addoption( - "--device-id", - type=int, - default=0, - help="Target device id", - ) - parser.addoption( - "--input-method", - action="store", - choices=["json", "cli"], - default=None, - help="Choose input method: 1) json or 2) cli", - ) - parser.addoption( - "--input-path", - action="store", - default="", - help="Path to json file with inputs", - ) - parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli") - - -def pytest_generate_tests(metafunc): - """ - This is not a standard docstring. - - We will explain the non-standard fixtures that pytest_generate_tests is - creating here. - - silicon_arch_name and silicon_arch_ - ---------------------------------------------- - - This is how tests should be requesting accelerator architecture names. - Tests which aim to run on silicon should request a silicon_arch_name - fixture. Just that single fixture will parametrize the test to run on the - provided architecture name from the command line through the --tt-arch - option. The value of the fixture will be the string value of the - architecture name. For example, - - @pytest.mark.post_commit - def test_model_silicon(silicon_arch_name): - # silicon_arch_name will be one of grayskull, wormhole_b0 etc. - run_model_on_silicon(silicon_arch_name) - ... - - If you want to restrict a test to only a specific architecture, you can - provide an additional fixture in the form of silicon_arch_. This - will limit the range of possible values for silicon_arch_name to only be - ARCH_NAME. - - @pytest.mark.post_commit - def test_model_silicon_grayskull_only( - silicon_arch_name, - silicon_arch_grayskull, - ): - # silicon_arch_name can only be grayskull or empty - run_model_on_silicon(silicon_arch_name) - ... - - If --tt-arch specifies an architecture that's not ARCH_NAME, the test will - be skipped. We ensure skipping by providing an empty list parametrization - for silicon_arch_name, and with the empty_parameter_set_mark config option - for pytest, will skip any tests with an empty list parametrization. - - Note that you must provide silicon_arch_name as a fixture if you want to - use the silicon_arch_ fixture. - - Note that if tests want to use the ARCH value from the API, tests should - create their own separate fixture which will convert the string value - provided from silicon_arch_name into ARCH. We keep it as strings here - because these fixtures will be used in tests which do not have access to - any Python APIs. 
- """ - - tt_arch = metafunc.config.getoption("--tt-arch") - - silicon_arch_specific_fixture_name_to_avail_archs = { - "silicon_arch_grayskull": set( - [ - "grayskull", - ] - ), - "silicon_arch_wormhole_b0": set( - [ - "wormhole_b0", - ] - ), - } - - check_uses_silicon_arch_specific_fixture = partial(contains, silicon_arch_specific_fixture_name_to_avail_archs) - test_requested_silicon_arch_fixtures = tuple( - filter(check_uses_silicon_arch_specific_fixture, metafunc.fixturenames) - ) - is_test_requesting_specific_silicon_archs = len(test_requested_silicon_arch_fixtures) > 0 - get_archs_for_silicon_arch_specific_fixture = partial(getitem, silicon_arch_specific_fixture_name_to_avail_archs) - test_requested_silicon_archs = ALL_ARCHS.intersection( - *map( - get_archs_for_silicon_arch_specific_fixture, - test_requested_silicon_arch_fixtures, - ) - ) - - available_archs = test_requested_silicon_archs if is_test_requesting_specific_silicon_archs else ALL_ARCHS - matches_user_requested_silicon_arch = partial(eq, tt_arch) - available_archs = tuple(filter(matches_user_requested_silicon_arch, available_archs)) - - uses_silicon_arch = "silicon_arch_name" in metafunc.fixturenames - - # sanity - if is_test_requesting_specific_silicon_archs and not uses_silicon_arch: - raise Exception( - f"{metafunc.function} requesting a specific silicon target, but doesn't use silicon_arch_name fixture" - ) - - if uses_silicon_arch: - metafunc.parametrize("silicon_arch_name", available_archs, scope="session") - for test_requested_silicon_arch_fixture in test_requested_silicon_arch_fixtures: - # The values of these arch-specific fixtures should not be used in - # the test function, so use any parameters, like [True] - metafunc.parametrize(test_requested_silicon_arch_fixture, [True], scope="session") - - input_method = metafunc.config.getoption("--input-method") - if input_method == "json": - json_path = metafunc.config.getoption("--input-path") - if not json_path: - raise ValueError("Please provide a valid JSON path using --input-path option.") - with open(json_path, "r") as f: - data = json.load(f) - metafunc.parametrize("user_input", [data]) - elif input_method == "cli": - cli_input = metafunc.config.getoption("--cli-input") - if not cli_input: - raise ValueError("Please provide input using --cli-input option.") - metafunc.parametrize("user_input", [[cli_input]]) - - -# Report stashing to get outcomes etc -phase_report_key = pytest.StashKey() - - -@pytest.hookimpl(tryfirst=True, hookwrapper=True) -def pytest_runtest_makereport(item, call): - # execute all other hooks to obtain the report object - outcome = yield - rep = outcome.get_result() - - # store test results for each phase of a call, which can - # be "setup", "call", "teardown" - item.stash.setdefault(phase_report_key, {})[rep.when] = rep - - -@pytest.fixture(scope="function") -def reset_tensix(request, silicon_arch_name): - yield - - report = request.node.stash[phase_report_key] - - test_failed = ("call" not in report) or report["call"].failed - - if test_failed: - logger.debug("Test failed - resetting with smi") - if silicon_arch_name == "grayskull": - result = run_process_and_get_result("tt-smi -tr all") - elif silicon_arch_name == "wormhole_b0": - result = run_process_and_get_result("tt-smi -wr all") - else: - raise Exception(f"Unrecognized arch for tensix-reset: {silicon_arch_name}") - assert result.returncode == 0, "Tensix reset script raised error" - - @pytest.fixture(scope="function") def device_params(request): return getattr(request, "param", {}) @@ 
-266,6 +86,9 @@ def device(request, device_params): device_id = request.config.getoption("device_id") + request.node.device_ids = [device_id] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)] + num_devices = ttl.device.GetNumPCIeDevices() assert device_id < num_devices, "CreateDevice not supported for non-mmio device" device = ttl.device.CreateDevice(device_id=device_id, **device_params) @@ -284,9 +107,13 @@ def pcie_devices(request, device_params): import tt_lib as ttl num_devices = ttl.device.GetNumPCIeDevices() + device_ids = [i for i in range(num_devices)] + + request.node.device_ids = device_ids + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] # Get only physical devices - devices = ttl.device.CreateDevices(device_ids=[i for i in range(num_devices)], **device_params) + devices = ttl.device.CreateDevices(device_ids, **device_params) yield [devices[i] for i in range(num_devices)] @@ -301,9 +128,13 @@ def all_devices(request, device_params): import tt_lib as ttl num_devices = ttl.device.GetNumAvailableDevices() + device_ids = [i for i in range(num_devices)] + + request.node.device_ids = device_ids + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] # Get only physical devices - devices = ttl.device.CreateDevices(device_ids=[i for i in range(num_devices)], **device_params) + devices = ttl.device.CreateDevices(device_ids, **device_params) yield [devices[i] for i in range(num_devices)] @@ -316,6 +147,7 @@ def all_devices(request, device_params): @pytest.fixture(scope="function") def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params): import ttnn + import tt_lib as ttl device_ids = ttnn.get_device_ids() try: @@ -323,6 +155,9 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par except (ValueError, AttributeError): num_devices_requested = len(device_ids) + request.node.device_ids = device_ids[:num_devices_requested] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] + device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested], **device_params ) @@ -330,8 +165,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created") yield device_mesh - import tt_lib as ttl - for device in device_mesh.get_devices(): ttl.device.DumpDeviceProfiler(device) @@ -342,6 +175,7 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par @pytest.fixture(scope="function") def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params): import ttnn + import tt_lib as ttl device_ids = ttnn.get_pcie_device_ids() try: @@ -349,6 +183,9 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic except (ValueError, AttributeError): num_pcie_devices_requested = len(device_ids) + request.node.device_ids = device_ids[:num_pcie_devices_requested] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]] + device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested], **device_params ) @@ -356,8 +193,6 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created") yield device_mesh - import tt_lib as ttl 
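# Illustrative note, not part of the diff: GetPCIeDeviceID is the new binding this patch adds
# in tt_lib_bindings.cpp further down (it wraps tt::Cluster::get_associated_mmio_device), so
# each fixture can record which PCIe/MMIO chip services the chips it opened and a failed test
# resets only those boards. Sketch only; the variable names here are made up:
#   import tt_lib as ttl
#   pci_ids = {ttl.device.GetPCIeDeviceID(i) for i in opened_device_ids}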
- for device in device_mesh.get_devices(): ttl.device.DumpDeviceProfiler(device) @@ -368,6 +203,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic @pytest.fixture(scope="function") def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params): import ttnn + import tt_lib as ttl if ttnn.get_num_devices() < 8: pytest.skip() @@ -377,6 +213,9 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device except (ValueError, AttributeError): num_devices_requested = len(device_ids) + request.node.device_ids = device_ids[:num_devices_requested] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] + device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested], **device_params ) @@ -384,8 +223,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created") yield device_mesh - import tt_lib as ttl - for device in device_mesh.get_devices(): ttl.device.DumpDeviceProfiler(device) @@ -453,6 +290,270 @@ def tracy_profile(): profiler.disable() -@pytest.fixture -def input_path(request): - return request.config.getoption("--input-path") +############################### +# Modifying pytest hooks +############################### +ALL_ARCHS = set( + [ + "grayskull", + "wormhole_b0", + ] +) + + +def pytest_addoption(parser): + parser.addoption( + "--tt-arch", + choices=[*ALL_ARCHS], + default=os.environ.get("ARCH_NAME", "grayskull"), + help="Target arch, ex. grayskull, wormhole_b0", + ) + parser.addoption( + "--pipeline-type", + default="", + help="Only `models_device_performance_bare_metal` should run `pytest_runtest_teardown`", + ) + parser.addoption( + "--device-id", + type=int, + default=0, + help="Target device id", + ) + parser.addoption( + "--input-method", + action="store", + choices=["json", "cli"], + default=None, + help="Choose input method: 1) json or 2) cli", + ) + parser.addoption( + "--input-path", + action="store", + default="", + help="Path to json file with inputs", + ) + parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli") + parser.addoption( + "--metal-cleanup", + action="store", + default=None, + help="Enable process timeout", + ) + + +def pytest_generate_tests(metafunc): + """ + This is not a standard docstring. + + We will explain the non-standard fixtures that pytest_generate_tests is + creating here. + + silicon_arch_name and silicon_arch_ + ---------------------------------------------- + + This is how tests should be requesting accelerator architecture names. + Tests which aim to run on silicon should request a silicon_arch_name + fixture. Just that single fixture will parametrize the test to run on the + provided architecture name from the command line through the --tt-arch + option. The value of the fixture will be the string value of the + architecture name. For example, + + @pytest.mark.post_commit + def test_model_silicon(silicon_arch_name): + # silicon_arch_name will be one of grayskull, wormhole_b0 etc. + run_model_on_silicon(silicon_arch_name) + ... + + If you want to restrict a test to only a specific architecture, you can + provide an additional fixture in the form of silicon_arch_. This + will limit the range of possible values for silicon_arch_name to only be + ARCH_NAME. 
+ + @pytest.mark.post_commit + def test_model_silicon_grayskull_only( + silicon_arch_name, + silicon_arch_grayskull, + ): + # silicon_arch_name can only be grayskull or empty + run_model_on_silicon(silicon_arch_name) + ... + + If --tt-arch specifies an architecture that's not ARCH_NAME, the test will + be skipped. We ensure skipping by providing an empty list parametrization + for silicon_arch_name, and with the empty_parameter_set_mark config option + for pytest, will skip any tests with an empty list parametrization. + + Note that you must provide silicon_arch_name as a fixture if you want to + use the silicon_arch_ fixture. + + Note that if tests want to use the ARCH value from the API, tests should + create their own separate fixture which will convert the string value + provided from silicon_arch_name into ARCH. We keep it as strings here + because these fixtures will be used in tests which do not have access to + any Python APIs. + """ + + tt_arch = metafunc.config.getoption("--tt-arch") + + silicon_arch_specific_fixture_name_to_avail_archs = { + "silicon_arch_grayskull": set( + [ + "grayskull", + ] + ), + "silicon_arch_wormhole_b0": set( + [ + "wormhole_b0", + ] + ), + } + + check_uses_silicon_arch_specific_fixture = partial(contains, silicon_arch_specific_fixture_name_to_avail_archs) + test_requested_silicon_arch_fixtures = tuple( + filter(check_uses_silicon_arch_specific_fixture, metafunc.fixturenames) + ) + is_test_requesting_specific_silicon_archs = len(test_requested_silicon_arch_fixtures) > 0 + get_archs_for_silicon_arch_specific_fixture = partial(getitem, silicon_arch_specific_fixture_name_to_avail_archs) + test_requested_silicon_archs = ALL_ARCHS.intersection( + *map( + get_archs_for_silicon_arch_specific_fixture, + test_requested_silicon_arch_fixtures, + ) + ) + + available_archs = test_requested_silicon_archs if is_test_requesting_specific_silicon_archs else ALL_ARCHS + matches_user_requested_silicon_arch = partial(eq, tt_arch) + available_archs = tuple(filter(matches_user_requested_silicon_arch, available_archs)) + + uses_silicon_arch = "silicon_arch_name" in metafunc.fixturenames + + # sanity + if is_test_requesting_specific_silicon_archs and not uses_silicon_arch: + raise Exception( + f"{metafunc.function} requesting a specific silicon target, but doesn't use silicon_arch_name fixture" + ) + + if uses_silicon_arch: + metafunc.parametrize("silicon_arch_name", available_archs) + for test_requested_silicon_arch_fixture in test_requested_silicon_arch_fixtures: + # The values of these arch-specific fixtures should not be used in + # the test function, so use any parameters, like [True] + metafunc.parametrize(test_requested_silicon_arch_fixture, [True]) + + input_method = metafunc.config.getoption("--input-method") + if input_method == "json": + json_path = metafunc.config.getoption("--input-path") + if not json_path: + raise ValueError("Please provide a valid JSON path using --input-path option.") + with open(json_path, "r") as f: + data = json.load(f) + metafunc.parametrize("user_input", [data]) + elif input_method == "cli": + cli_input = metafunc.config.getoption("--cli-input") + if not cli_input: + raise ValueError("Please provide input using --cli-input option.") + metafunc.parametrize("user_input", [[cli_input]]) + + +# Report stashing to get outcomes etc +phase_report_key = pytest.StashKey() + + +@pytest.hookimpl(tryfirst=True, hookwrapper=True) +def pytest_runtest_makereport(item, call): + # execute all other hooks to obtain the report object + outcome = yield + rep = 
outcome.get_result() + + # store test results for each phase of a call, which can + # be "setup", "call", "teardown" + item.stash.setdefault(phase_report_key, {})[rep.when] = rep + + +@pytest.hookimpl(hookwrapper=True) +def pytest_runtest_teardown(item, nextitem): + yield + metal_cleanup_enabled = item.config.getoption("--metal-cleanup") + if metal_cleanup_enabled is not None: + report = item.stash[phase_report_key] + test_failed = report.get("call", None) and report["call"].failed + if test_failed: + logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}") + # reset_tensix(set(item.pci_ids)) + reset_tensix() + + +# This is overriding the timer setup hook from pytest-timeout +# If --metal-timeout is passed, we define a new timeout method that spawns a timer process +# At timeout, the process kills it's parent (the test process) and then itself +@pytest.hookimpl(tryfirst=True) +def pytest_timeout_set_timer(item, settings): + metal_timeout_enabled = item.config.getoption("--metal-cleanup") + if metal_timeout_enabled is not None: + parent_pid = os.getpid() + logger.info(f"Metal timeout {settings.timeout} seconds") + + def get_parent_status(): + try: + parent = psutil.Process(parent_pid) + except: + return "already dead" + return parent.status() + + def run_timer(settings): + dead_status = ["zombie", "dead", "already dead"] + timeout = settings.timeout + while get_parent_status() not in dead_status and timeout > 0: + time.sleep(1) + timeout -= 1 + if get_parent_status() != "already dead": + logger.info(f"Timing out test case") + os.kill(parent_pid, signal.SIGKILL) + logger.info(f"Killing timer") + os._exit(1) + + def cancel(): + logger.info(f"Cancelling timer") + metal_timer.terminate() + + metal_timer = multiprocess.Process(target=run_timer, args=(settings,), daemon=True) + item.cancel_timeout = cancel + metal_timer.start() + # logger.info(f"parent and metal timer pid: {parent_pid} {metal_timer.pid}") + return True + + +# This is a hook used in pytest-xdist to handle when a worker crashes out +# In our case, combined with pytest-timeout thread method, the worker will crash out for a hang and +# then it should get cleaned up by the controller through this fixture :fingers_crossed: +@pytest.hookimpl(tryfirst=True) +def pytest_handlecrashitem(crashitem, report, sched): + reset_tensix() + + +def reset_tensix(tt_open_devices=None): + metal_env = copy.deepcopy(os.environ) + arch = metal_env.get("ARCH_NAME") + if arch != "grayskull" and arch != "wormhole_b0": + raise Exception(f"Unrecognized arch for tensix-reset: {arch}") + + if tt_open_devices is None: + logger.info(f"Running reset with reset script: /opt/tt_metal_infra/scripts/ci/{arch}/reset.sh") + smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh") + else: + tt_open_devices_str = ",".join([str(i) for i in tt_open_devices]) + check_smi = run_process_and_get_result("tt-smi-metal -h") + logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}") + logger.info(f"Running reset for pci devices: {tt_open_devices_str}") + if check_smi.returncode > 0: + logger.info(f"Test failed - resetting {arch} with tt-smi") + smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}") + else: + smi_reset_result = run_process_and_get_result(f"tt-smi-metal -r {tt_open_devices_str}") + logger.info(f"tt-smi reset status: {smi_reset_result.returncode}") + + +@pytest.hookimpl(tryfirst=True) +def pytest_xdist_auto_num_workers(config): + logger.info("getting num of 
xdist workers") + return 1 diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp index 915276dfa393..d03205be995b 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp @@ -136,6 +136,10 @@ void DeviceModule(py::module &m_device) { Returns number of Tenstorrent devices that are connected to host via PCIe and can be targeted. )doc"); + m_device.def("GetPCIeDeviceID", &GetPCIeDeviceID, R"doc( + Returns associated mmio device of give device id. + )doc"); + m_device.def("SetDefaultDevice", &AutoFormat::SetDefaultDevice, R"doc( Sets the default device to use for ops when inputs aren't on device. diff --git a/tt_metal/host_api.hpp b/tt_metal/host_api.hpp index a2a490c345cc..5572bad808e6 100644 --- a/tt_metal/host_api.hpp +++ b/tt_metal/host_api.hpp @@ -55,6 +55,8 @@ size_t GetNumAvailableDevices(); */ size_t GetNumPCIeDevices(); +chip_id_t GetPCIeDeviceID(chip_id_t device_id); + /** * Instantiates a device object. * diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index a1ec68fb1bb3..dc529c1256eb 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -730,6 +730,10 @@ size_t GetNumPCIeDevices() { #endif } +chip_id_t GetPCIeDeviceID(chip_id_t device_id){ + return tt::Cluster::instance().get_associated_mmio_device(device_id); +} + Device *CreateDevice( chip_id_t device_id, const uint8_t num_hw_cqs, From d525b1710a5201c0329ffba5686c40217be0b414 Mon Sep 17 00:00:00 2001 From: Vincent Tang Date: Thu, 13 Jun 2024 22:02:21 +0000 Subject: [PATCH 6/6] #8729: xdist + reset mechanism on fd nightly, model perf, all t3k (except profiler) - enable timeout mechanism by default if using xdist, use 'metal-timeout' flag to enable if not using xdist - increase GH actions timeout for xdist (review) - get timings of each test and set global timeout to 5 mins (review) - add custom timeouts to nightly + t3k pipelines + post-commit (review) --- ...-dispatch-full-regressions-and-models.yaml | 2 +- .github/workflows/perf-models.yaml | 2 +- .github/workflows/t3000-demo-tests.yaml | 3 +- .github/workflows/t3000-frequent-tests.yaml | 1 + .github/workflows/t3000-model-perf-tests.yaml | 9 +-- .github/workflows/t3000-unit-tests.yaml | 1 + conftest.py | 59 ++++++++-------- pytest.ini | 2 +- tests/scripts/run_performance.sh | 25 +++---- tests/scripts/run_tests.sh | 2 +- .../single_card/nightly/run_common_models.sh | 9 ++- .../single_card/nightly/run_gs_only.sh | 11 ++- tests/scripts/single_card/nightly/run_ttnn.sh | 9 ++- .../single_card/nightly/run_wh_b0_only.sh | 11 ++- .../single_card/nightly/run_wh_b0_unstable.sh | 9 ++- tests/scripts/t3000/run_t3000_demo_tests.sh | 37 +++++++--- .../scripts/t3000/run_t3000_frequent_tests.sh | 68 +++++++++++++------ .../t3000/run_t3000_model_perf_tests.sh | 31 +++++++-- tests/scripts/t3000/run_t3000_unit_tests.sh | 66 ++++++++++++------ tt_metal/python_env/requirements-dev.txt | 1 + 20 files changed, 238 insertions(+), 120 deletions(-) diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml index 115b94154522..b6dc4f619c55 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml @@ -26,7 +26,7 @@ jobs: { name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 30 }, { name: "API tests GS", arch: grayskull, cmd: ./tests/scripts/run_tests.sh --tt-arch 
grayskull --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 }, { name: "API tests N300 WH B0", arch: wormhole_b0, cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 }, - { name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 35 }, + { name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 45 }, ] name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }} env: diff --git a/.github/workflows/perf-models.yaml b/.github/workflows/perf-models.yaml index 8c423e865c1f..f5905175e7e1 100644 --- a/.github/workflows/perf-models.yaml +++ b/.github/workflows/perf-models.yaml @@ -52,7 +52,7 @@ jobs: - uses: ./.github/actions/install-python-deps - name: Run performance regressions id: performance_tests - timeout-minutes: 30 + timeout-minutes: 40 run: | source ${{ github.workspace }}/python_env/bin/activate ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }} diff --git a/.github/workflows/t3000-demo-tests.yaml b/.github/workflows/t3000-demo-tests.yaml index a05a651f0c5b..ca524dd3a8ae 100644 --- a/.github/workflows/t3000-demo-tests.yaml +++ b/.github/workflows/t3000-demo-tests.yaml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: test-group: [ - { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 40, owner_id: U044T8U8DEF}, #Johanna Rock + { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 50, owner_id: U044T8U8DEF}, #Johanna Rock { name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich { name: "t3k falcon7b tests", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 90, owner_id: U05RWH3QUPM}, #Salar Hosseini { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 30, owner_id: U03PUAKE719}, # Miguel Tairum @@ -46,6 +46,7 @@ jobs: run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run demo regression tests + shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/.github/workflows/t3000-frequent-tests.yaml b/.github/workflows/t3000-frequent-tests.yaml index d6feebce9dfa..70a13c371f58 100644 --- a/.github/workflows/t3000-frequent-tests.yaml +++ b/.github/workflows/t3000-frequent-tests.yaml @@ -42,6 +42,7 @@ jobs: run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run frequent regression tests + shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml index 3edeb3884699..4995b036238d 100644 --- a/.github/workflows/t3000-model-perf-tests.yaml +++ b/.github/workflows/t3000-model-perf-tests.yaml @@ -17,10 +17,10 @@ jobs: fail-fast: false matrix: test-group: [ - { name: "t3k LLM falcon7b model perf tests", model: "falcob7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 60, owner_id: S07AJBTLX2L}, #Model Falcon - { name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: 
wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum - { name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich - { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 60, owner_id: S07AJBTLX2L}, # Model Falcon + { name: "t3k LLM falcon7b model perf tests", model: "falcob7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 75, owner_id: S07AJBTLX2L}, #Model Falcon + { name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 75, owner_id: U03PUAKE719}, # Miguel Tairum + { name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 75, owner_id: U03FJB5TM5Y}, #Colman Glagovich + { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: S07AJBTLX2L}, # Model Falcon #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run? ] name: ${{ matrix.test-group.name }} @@ -52,6 +52,7 @@ jobs: run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run model perf regression tests + shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/.github/workflows/t3000-unit-tests.yaml b/.github/workflows/t3000-unit-tests.yaml index 5b9e99baaa21..297b863399f0 100644 --- a/.github/workflows/t3000-unit-tests.yaml +++ b/.github/workflows/t3000-unit-tests.yaml @@ -43,6 +43,7 @@ jobs: run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run unit regression tests + shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/conftest.py b/conftest.py index c6339ee3ae1b..cbbda1b9e72b 100644 --- a/conftest.py +++ b/conftest.py @@ -85,8 +85,6 @@ def device(request, device_params): import tt_lib as ttl device_id = request.config.getoption("device_id") - - request.node.device_ids = [device_id] request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)] num_devices = ttl.device.GetNumPCIeDevices() @@ -108,9 +106,7 @@ def pcie_devices(request, device_params): num_devices = ttl.device.GetNumPCIeDevices() device_ids = [i for i in range(num_devices)] - - request.node.device_ids = device_ids - request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] + request.node.pci_ids = device_ids # Get only physical devices devices = ttl.device.CreateDevices(device_ids, **device_params) @@ -129,8 +125,6 @@ def all_devices(request, device_params): num_devices = ttl.device.GetNumAvailableDevices() device_ids = [i for i in range(num_devices)] - - request.node.device_ids = device_ids request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] # Get only physical devices @@ -155,7 +149,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par except (ValueError, AttributeError): num_devices_requested = len(device_ids) - request.node.device_ids = device_ids[:num_devices_requested] 
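# Illustrative cross-reference, not part of the diff: the pci_ids stashed on request.node in
# these fixtures are what the pytest_runtest_teardown hook defined later in this file reads
# back when a test fails, so only the boards the test actually opened are reset:
#   def pytest_runtest_teardown(item, nextitem):
#       ...
#       if test_failed:
#           reset_tensix(set(item.pci_ids))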
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] device_mesh = ttnn.open_device_mesh( @@ -183,8 +176,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic except (ValueError, AttributeError): num_pcie_devices_requested = len(device_ids) - request.node.device_ids = device_ids[:num_pcie_devices_requested] - request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]] + request.node.pci_ids = device_ids[:num_pcie_devices_requested] device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested], **device_params @@ -213,7 +205,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device except (ValueError, AttributeError): num_devices_requested = len(device_ids) - request.node.device_ids = device_ids[:num_devices_requested] request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] device_mesh = ttnn.open_device_mesh( @@ -334,13 +325,18 @@ def pytest_addoption(parser): ) parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli") parser.addoption( - "--metal-cleanup", + "--metal-timeout", action="store", default=None, help="Enable process timeout", ) +@pytest.fixture +def input_path(request): + return request.config.getoption("--input-path") + + def pytest_generate_tests(metafunc): """ This is not a standard docstring. @@ -473,14 +469,15 @@ def pytest_runtest_makereport(item, call): @pytest.hookimpl(hookwrapper=True) def pytest_runtest_teardown(item, nextitem): yield - metal_cleanup_enabled = item.config.getoption("--metal-cleanup") - if metal_cleanup_enabled is not None: + metal_timeout_enabled = item.config.getoption("--metal-timeout") + using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0")) + + if metal_timeout_enabled is not None or using_xdist: report = item.stash[phase_report_key] test_failed = report.get("call", None) and report["call"].failed if test_failed: - logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}") - # reset_tensix(set(item.pci_ids)) - reset_tensix() + logger.info(f"In custom teardown, open device ids: {set(item.pci_ids)}") + reset_tensix(set(item.pci_ids)) # This is overriding the timer setup hook from pytest-timeout @@ -488,10 +485,12 @@ def pytest_runtest_teardown(item, nextitem): # At timeout, the process kills it's parent (the test process) and then itself @pytest.hookimpl(tryfirst=True) def pytest_timeout_set_timer(item, settings): - metal_timeout_enabled = item.config.getoption("--metal-cleanup") - if metal_timeout_enabled is not None: + metal_timeout_enabled = item.config.getoption("--metal-timeout") + using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0")) + + if metal_timeout_enabled is not None or using_xdist: parent_pid = os.getpid() - logger.info(f"Metal timeout {settings.timeout} seconds") + logger.info(f"Metal timeout {settings.timeout} seconds {parent_pid} for {item.nodeid}") def get_parent_status(): try: @@ -501,12 +500,15 @@ def get_parent_status(): return parent.status() def run_timer(settings): + logger.info(f"Timer started for {item.nodeid}") dead_status = ["zombie", "dead", "already dead"] timeout = settings.timeout - while get_parent_status() not in dead_status and timeout > 0: - time.sleep(1) - timeout -= 1 - if get_parent_status() != "already dead": + parent_status = "running" + while parent_status not in 
dead_status and timeout > 0: + time.sleep(5) + timeout -= 5 + parent_status = get_parent_status() + if parent_status != "already dead": logger.info(f"Timing out test case") os.kill(parent_pid, signal.SIGKILL) logger.info(f"Killing timer") @@ -519,13 +521,12 @@ def cancel(): metal_timer = multiprocess.Process(target=run_timer, args=(settings,), daemon=True) item.cancel_timeout = cancel metal_timer.start() - # logger.info(f"parent and metal timer pid: {parent_pid} {metal_timer.pid}") return True # This is a hook used in pytest-xdist to handle when a worker crashes out # In our case, combined with pytest-timeout thread method, the worker will crash out for a hang and -# then it should get cleaned up by the controller through this fixture :fingers_crossed: +# then it should get cleaned up by the controller through this fixture @pytest.hookimpl(tryfirst=True) def pytest_handlecrashitem(crashitem, report, sched): reset_tensix() @@ -542,10 +543,9 @@ def reset_tensix(tt_open_devices=None): smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh") else: tt_open_devices_str = ",".join([str(i) for i in tt_open_devices]) - check_smi = run_process_and_get_result("tt-smi-metal -h") - logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}") + check_smi_metal = run_process_and_get_result("tt-smi-metal -h") logger.info(f"Running reset for pci devices: {tt_open_devices_str}") - if check_smi.returncode > 0: + if check_smi_metal.returncode > 0: logger.info(f"Test failed - resetting {arch} with tt-smi") smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}") else: @@ -555,5 +555,4 @@ def reset_tensix(tt_open_devices=None): @pytest.hookimpl(tryfirst=True) def pytest_xdist_auto_num_workers(config): - logger.info("getting num of xdist workers") return 1 diff --git a/pytest.ini b/pytest.ini index c8f8a206f754..699ef215218e 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] timeout = 300 minversion = 7.2 -addopts = --import-mode=importlib -vs -rA +addopts = --import-mode=importlib -vvs -rA --durations=0 empty_parameter_set_mark = skip markers = post_commit: mark tests to run on post-commit diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index 754bcbc9ab1e..91567864538d 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -1,6 +1,6 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 @@ -11,19 +11,19 @@ run_perf_models_other() { local tt_arch=$1 local test_marker=$2 - env pytest tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker + env pytest -n auto tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker - env pytest tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker + env pytest -n auto tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker - env pytest models/demos/ttnn_falcon7b/tests -m $test_marker + env pytest -n auto models/demos/ttnn_falcon7b/tests -m $test_marker # Separate calls since we can't mix switching between number of cqs - env pytest models/demos/resnet/tests/test_perf_resnet.py -m $test_marker - env pytest models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker + env pytest -n auto models/demos/resnet/tests/test_perf_resnet.py -m $test_marker + env pytest -n auto models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker - env pytest 
tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker + env pytest -n auto tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker - env pytest models/demos/metal_BERT_large_11/tests -m $test_marker + env pytest -n auto models/demos/metal_BERT_large_11/tests -m $test_marker ## Merge all the generated reports env python models/perf/merge_perf_results.py @@ -33,13 +33,13 @@ run_perf_models_llm_javelin() { local tt_arch=$1 local test_marker=$2 - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m $test_marker + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m $test_marker if [ "$tt_arch" == "wormhole_b0" ]; then - env pytest models/demos/mamba/tests -m $test_marker --timeout=360 + env pytest -n auto models/demos/mamba/tests -m $test_marker --timeout=360 fi - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mistral7b/tests -m $test_marker --timeout=360 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/mistral7b/tests -m $test_marker --timeout=360 ## Merge all the generated reports env python models/perf/merge_perf_results.py @@ -50,7 +50,7 @@ run_perf_models_cnn_javelin() { local test_marker=$2 # Run tests - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480 #env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests -m $test_marker ## Merge all the generated reports @@ -58,6 +58,7 @@ run_perf_models_cnn_javelin() { } run_device_perf_models() { + set -eo pipefail local test_marker=$1 env pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=600 diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 334b68b71fd0..ebd25264b9ce 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -81,7 +81,7 @@ run_frequent_api_pipeline_tests() { ./tests/scripts/run_python_api_unit_tests.sh else if [[ $tt_arch == "wormhole_b0" ]]; then - pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly + pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly else echo "API tests are not available for fast dispatch because they're already covered in post-commit" fi diff --git a/tests/scripts/single_card/nightly/run_common_models.sh b/tests/scripts/single_card/nightly/run_common_models.sh index 17ca8c4d3cf3..19e090065f3f 100755 --- a/tests/scripts/single_card/nightly/run_common_models.sh +++ b/tests/scripts/single_card/nightly/run_common_models.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running common models for archs" -env pytest tests/nightly/common_models/ +env pytest -n auto tests/nightly/common_models/ ; fail+=$? 
+ +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_gs_only.sh b/tests/scripts/single_card/nightly/run_gs_only.sh index c5bcc9f97452..bad5b98ea404 100755 --- a/tests/scripts/single_card/nightly/run_gs_only.sh +++ b/tests/scripts/single_card/nightly/run_gs_only.sh @@ -1,14 +1,19 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running model nightly tests for GS only" -env pytest models/demos/resnet/tests/test_metal_resnet50_performant.py +env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_performant.py ; fail+=$? -env pytest models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py +env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py ; fail+=$? + +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_ttnn.sh b/tests/scripts/single_card/nightly/run_ttnn.sh index f0bb3f9cadc3..a41836173deb 100755 --- a/tests/scripts/single_card/nightly/run_ttnn.sh +++ b/tests/scripts/single_card/nightly/run_ttnn.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running ttnn nightly tests for GS only" -env pytest tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal" +env pytest -n auto tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal" ; fail+=$? + +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_wh_b0_only.sh b/tests/scripts/single_card/nightly/run_wh_b0_only.sh index d30894713c13..5ae9f0657cb1 100755 --- a/tests/scripts/single_card/nightly/run_wh_b0_only.sh +++ b/tests/scripts/single_card/nightly/run_wh_b0_only.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running nightly tests for WH B0 only" -env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/nightly/wh_b0_only_eth -env pytest tests/nightly/wh_b0_only \ No newline at end of file +env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/nightly/wh_b0_only_eth ; fail+=$? +env pytest -n auto tests/nightly/wh_b0_only ; fail+=$? + +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh b/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh index 079087d6e690..35895a64208b 100755 --- a/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh +++ b/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running unstable nightly tests for WH B0 only" -SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest tests/ttnn/integration_tests/stable_diffusion +SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest -n auto tests/ttnn/integration_tests/stable_diffusion ; fail+=$? 
+ +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/t3000/run_t3000_demo_tests.sh b/tests/scripts/t3000/run_t3000_demo_tests.sh index 96a05371beb1..fa050429ddb2 100755 --- a/tests/scripts/t3000/run_t3000_demo_tests.sh +++ b/tests/scripts/t3000/run_t3000_demo_tests.sh @@ -1,23 +1,27 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_falcon40b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" # Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py --timeout=720 + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py --timeout=720 ; fail+=$? # Falcon40B end to end demo (prefill + decode) - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_loops.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_loops.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llama3_70b_tests() { @@ -38,39 +42,47 @@ run_t3000_llama3_70b_tests() { run_t3000_falcon7b_tests(){ # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon7b_tests" # Falcon7B demo (perf verification for 128/1024/2048 seq lens and output token verification) - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_128_stochastic_verify] - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_1024_stochastic_verify] - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_2048_stochastic_verify] - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-default_mode_1024_greedy_verify] + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_128_stochastic_verify] ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_1024_stochastic_verify] ; fail+=$? 
+ WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_2048_stochastic_verify] ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-default_mode_1024_greedy_verify] ; fail+=$? # Falcon7B perplexity test (prefill and decode) - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-prefill_seq1024_dram] --timeout=720 - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-decode_1024_l1_sharded] --timeout=720 + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-prefill_seq1024_dram] --timeout=720 ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-decode_1024_l1_sharded] --timeout=720 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral8x7b_tests" # mixtral8x7b 8 chip demo test - 100 token generation with general weights (env flags set inside the test) - pytest models/demos/t3000/mixtral8x7b/demo/demo.py::test_mixtral8x7b_demo[wormhole_b0-True-general_weights] --timeout=720 + pytest -n auto models/demos/t3000/mixtral8x7b/demo/demo.py::test_mixtral8x7b_demo[wormhole_b0-True-general_weights] --timeout=720 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tests() { @@ -87,6 +99,7 @@ run_t3000_tests() { run_t3000_mixtral_tests } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -109,6 +122,10 @@ main() { export PYTHONPATH=$TT_METAL_HOME run_t3000_tests + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tests/scripts/t3000/run_t3000_frequent_tests.sh b/tests/scripts/t3000/run_t3000_frequent_tests.sh index 37abf05e64d3..cab852813ef9 100755 --- a/tests/scripts/t3000/run_t3000_frequent_tests.sh +++ b/tests/scripts/t3000/run_t3000_frequent_tests.sh @@ -1,99 +1,122 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_ethernet_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_ethernet_tests" - pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py - pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py + pytest -n auto tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py ; fail+=$? 
+ pytest -n auto tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_ethernet_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llama2_70b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_llama2_70b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py --timeout=900 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py ; fail+=$? + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py ; fail+=$? + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py ; fail+=$? + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py --timeout=900 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_llama2_70b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral_tests" # mixtral8x7b 8 chip decode model test (env flags set inside the test) - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-10-1-pcc] + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-10-1-pcc] ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tteager_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_tteager_tests" - pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit - pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_reduce_scatter_post_commit.py + pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit ; fail+=$? + pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_reduce_scatter_post_commit.py ; fail+=$? 
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_tteager_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_trace_stress_tests() { + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_trace_stress_tests" - - NUM_TRACE_LOOPS=15 pytest tests/ttnn/unit_tests/test_multi_device_trace.py - NUM_TRACE_LOOPS=15 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py + NUM_TRACE_LOOPS=15 pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? + NUM_TRACE_LOOPS=15 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_trace_stress_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon40b_tests() { + fail=0 # Record the start time start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_mlp.py - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_attention.py --timeout=480 - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_decoder.py --timeout=480 - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_causallm.py --timeout=600 + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_mlp.py ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_attention.py --timeout=480 ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_decoder.py --timeout=480 ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_causallm.py --timeout=600 ; fail+=$? 
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tests() { @@ -103,9 +126,6 @@ run_t3000_tests() { # Run tteager tests run_t3000_tteager_tests - # Run trace tests - run_t3000_trace_stress_tests - # Run falcon40b tests run_t3000_falcon40b_tests @@ -115,8 +135,12 @@ run_t3000_tests() { # Run mixtral tests run_t3000_mixtral_tests + # Run trace tests + run_t3000_trace_stress_tests + } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -139,6 +163,10 @@ main() { export PYTHONPATH=$TT_METAL_HOME run_t3000_tests + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh index 6140b9efeafd..6f97b8e76368 100755 --- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -1,61 +1,77 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_falcon7b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon7b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m "model_perf_t3000" + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m "model_perf_t3000" ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral_tests" - env pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py -m "model_perf_t3000" + env pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py -m "model_perf_t3000" ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llama2_70b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_llama2_70b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" --timeout=600 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" --timeout=600 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_llama2_70b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon40b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_perf_falcon.py -m "model_perf_t3000" --timeout=600 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_perf_falcon.py -m "model_perf_t3000" --timeout=600 ; fail+=$? 
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llm_tests() { @@ -80,6 +96,7 @@ run_t3000_cnn_tests() { env python models/perf/merge_perf_results.py } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -129,6 +146,10 @@ main() { echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device])" 2>&1 exit 1 fi + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index a8019137642b..64c23ed2b488 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -1,66 +1,79 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_ttmetal_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_ttmetal_tests" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" - TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" ; fail+=$? + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" ; fail+=$? + ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" ; fail+=$? + ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$? 
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_ttmetal_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_ttnn_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_ttnn_tests" - pytest tests/ttnn/unit_tests/test_multi_device_trace.py - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py - pytest tests/ttnn/unit_tests/test_multi_device.py - pytest tests/ttnn/unit_tests/test_multi_device_async.py + pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? + pytest -n auto tests/ttnn/unit_tests/test_multi_device.py ; fail+=$? + pytest -n auto tests/ttnn/unit_tests/test_multi_device_async.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_ttnn_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon7b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon7b_tests" - pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py - pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py - pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py + pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py ; fail+=$? + pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py ; fail+=$? + pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py ; fail+=$? #pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon40b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py ; fail+=$? # Record the end time end_time=$(date +%s) @@ -70,21 +83,25 @@ run_t3000_falcon40b_tests() { run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral_tests" - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py ; fail+=$? 
+ pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tests() { @@ -104,6 +121,7 @@ run_t3000_tests() { run_t3000_mixtral_tests } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -126,6 +144,10 @@ main() { export PYTHONPATH=$TT_METAL_HOME run_t3000_tests + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tt_metal/python_env/requirements-dev.txt b/tt_metal/python_env/requirements-dev.txt index f7f902029195..5a6cf7ebb885 100644 --- a/tt_metal/python_env/requirements-dev.txt +++ b/tt_metal/python_env/requirements-dev.txt @@ -21,6 +21,7 @@ mypy==1.9.0 pytest==7.2.2 pytest-timeout==2.2.0 pytest-split==0.8.2 +pytest-xdist==3.6.1 jsbeautifier==1.14.7 datasets==2.9.0 torch==2.2.1.0+cpu
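For readers following the conftest.py hunks above: the pytest_timeout_set_timer override replaces pytest-timeout's in-process timer with a separate watchdog process that polls the test process and SIGKILLs it once the timeout elapses, so a hung device test cannot stall a pytest-xdist worker indefinitely. The sketch below only illustrates that pattern and is not code from the patch; it assumes the psutil and multiprocess packages the conftest already uses, and the helper names (should_arm_watchdog, watch_parent, start_watchdog) are invented for illustration.

    import os
    import signal
    import time

    import psutil
    from multiprocess import Process


    def should_arm_watchdog(config) -> bool:
        # Armed when --metal-timeout is passed explicitly or when pytest-xdist workers are active.
        return (
            config.getoption("--metal-timeout") is not None
            or int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0")) > 0
        )


    def watch_parent(parent_pid: int, timeout: float, poll_interval: float = 5.0) -> None:
        # Poll the test process; if it is still alive when the timeout runs out, SIGKILL it so the
        # xdist controller sees a crashed worker and can trigger a device reset.
        dead_status = {"zombie", "dead", "already dead"}
        status = "running"
        remaining = timeout
        while status not in dead_status and remaining > 0:
            time.sleep(poll_interval)
            remaining -= poll_interval
            try:
                status = psutil.Process(parent_pid).status()
            except psutil.NoSuchProcess:
                status = "already dead"
        if status not in dead_status:
            os.kill(parent_pid, signal.SIGKILL)


    def start_watchdog(timeout: float) -> Process:
        # Daemonized so the watchdog never outlives the pytest session after a clean exit.
        timer = Process(target=watch_parent, args=(os.getpid(), timeout), daemon=True)
        timer.start()
        return timer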
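The custom teardown reads item.stash[phase_report_key] to decide whether the call phase failed before resetting devices. phase_report_key itself is not shown in these hunks; presumably the existing pytest_runtest_makereport hookwrapper populates it along the lines of the standard pattern from the pytest documentation, sketched here with no claim to match the repo's exact code:

    import pytest

    phase_report_key = pytest.StashKey[dict]()


    @pytest.hookimpl(tryfirst=True, hookwrapper=True)
    def pytest_runtest_makereport(item, call):
        outcome = yield
        report = outcome.get_result()
        # Record the report for each phase ("setup", "call", "teardown") so teardown hooks can
        # ask whether the call phase failed.
        item.stash.setdefault(phase_report_key, {})[report.when] = report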