From b5e0888819eb18fa15a8f639532bfcf662276b5d Mon Sep 17 00:00:00 2001 From: Akhmed Rakhmati Date: Thu, 27 Jun 2024 03:03:21 +0000 Subject: [PATCH 1/6] #9767: use reflect library in reflection.hpp --- CMakeLists.txt | 11 +-- cmake/dependencies.cmake | 10 ++ tests/ttnn/unit_tests/gtests/CMakeLists.txt | 2 +- tt_eager/tt_dnn/op_library/CMakeLists.txt | 2 +- tt_metal/tools/profiler/op_profiler.hpp | 18 ++-- tt_metal/tt_stl/reflection.hpp | 93 +++++++++++++++---- ttnn/CMakeLists.txt | 2 +- ttnn/cpp/ttnn/device_operation.hpp | 45 +-------- .../eltwise/binary/device/binary_op.hpp | 18 ---- 9 files changed, 99 insertions(+), 102 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 083b369517e8..ee649dc7d86b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,16 +96,6 @@ if (NOT NUMA_LIBRARY) message(FATAL_ERROR "NUMA library not found") endif() - -CPMAddPackage( - NAME reflect - GITHUB_REPOSITORY boost-ext/reflect - GIT_TAG v1.1.1 -) -add_library(reflect INTERFACE) -target_include_directories(reflect SYSTEM INTERFACE ${reflect_SOURCE_DIR}) -add_library(reflect::reflect ALIAS reflect) - ############################################################################################################################ # Constructing interface libs for common compiler flags, header directories, and libraries # These interface libs are linked with PUBLIC scope at lowest common target (tt_metal/common) and at tt_metal_libs level @@ -142,6 +132,7 @@ endif() add_library(metal_header_directories INTERFACE) target_include_directories(metal_header_directories INTERFACE ${PROJECT_SOURCE_DIR}/tt_metal/hw/inc) +target_include_directories(metal_header_directories SYSTEM INTERFACE ${reflect_SOURCE_DIR}) foreach(lib ${BoostPackages}) target_include_directories(metal_header_directories INTERFACE ${Boost${lib}_SOURCE_DIR}/include) endforeach() diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index 622489e7fdd9..025edeae1b89 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -48,3 +48,13 @@ if (googletest_ADDED) target_link_libraries(gtest PRIVATE c++ c++abi) target_link_libraries(gtest_main PRIVATE c++ c++abi) endif() + +############################################################################################################################ +# boost-ext reflect : https://github.com/boost-ext/reflect +############################################################################################################################ + +CPMAddPackage( + NAME reflect + GITHUB_REPOSITORY boost-ext/reflect + GIT_TAG v1.1.1 +) diff --git a/tests/ttnn/unit_tests/gtests/CMakeLists.txt b/tests/ttnn/unit_tests/gtests/CMakeLists.txt index 3bee41905b42..359a0301929b 100644 --- a/tests/ttnn/unit_tests/gtests/CMakeLists.txt +++ b/tests/ttnn/unit_tests/gtests/CMakeLists.txt @@ -9,7 +9,7 @@ set(TTNN_UNIT_TESTS_SRC add_executable(unit_tests_ttnn ${TTNN_UNIT_TESTS_SRC}) -target_link_libraries(unit_tests_ttnn PUBLIC test_common_libs ttnn_lib tt_metal tt_eager reflect::reflect) +target_link_libraries(unit_tests_ttnn PUBLIC test_common_libs ttnn_lib tt_metal tt_eager) target_include_directories(unit_tests_ttnn PRIVATE ${UMD_HOME} ${PROJECT_SOURCE_DIR} diff --git a/tt_eager/tt_dnn/op_library/CMakeLists.txt b/tt_eager/tt_dnn/op_library/CMakeLists.txt index e6be1fe00bc2..6a920b9dc29c 100644 --- a/tt_eager/tt_dnn/op_library/CMakeLists.txt +++ b/tt_eager/tt_dnn/op_library/CMakeLists.txt @@ -220,7 +220,7 @@ set(TT_DNN_SRCS add_library(tt_dnn OBJECT ${TT_DNN_SRCS}) -target_link_libraries(tt_dnn PUBLIC 
metal_header_directories compiler_flags umd_device reflect::reflect) +target_link_libraries(tt_dnn PUBLIC metal_header_directories compiler_flags umd_device) target_include_directories(tt_dnn PUBLIC ${UMD_HOME} ${PROJECT_SOURCE_DIR} diff --git a/tt_metal/tools/profiler/op_profiler.hpp b/tt_metal/tools/profiler/op_profiler.hpp index 09414d8c5878..79c231ea50f9 100644 --- a/tt_metal/tools/profiler/op_profiler.hpp +++ b/tt_metal/tools/profiler/op_profiler.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include @@ -274,17 +275,12 @@ inline json get_base_json( j["op_code"] = opName; json attributesObj; - constexpr auto& attribute_names = std::decay_t::attribute_names; - const auto attribute_values = operation_attributes.attribute_values(); - [&attributesObj, &attribute_names, &attribute_values](std::index_sequence) { - ( - [&attributesObj, &attribute_names, &attribute_values] { - const auto& attribute_name = std::get(attribute_names); - const auto& attribute = std::get(attribute_values); - attributesObj[attribute_name] = fmt::format("{}", attribute); - }(), - ...); - }(std::make_index_sequence>>{}); + reflect::for_each( + [&attributesObj, &operation_attributes](auto I) { + attributesObj[std::string{reflect::member_name(operation_attributes)}] = + fmt::format("{}", reflect::get(operation_attributes)); + }, + operation_attributes); j["attributes"] = attributesObj; std::vector input_tensors; diff --git a/tt_metal/tt_stl/reflection.hpp b/tt_metal/tt_stl/reflection.hpp index 3b225fe47d99..1deb133dfa72 100644 --- a/tt_metal/tt_stl/reflection.hpp +++ b/tt_metal/tt_stl/reflection.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -16,7 +17,6 @@ #include #include "third_party/magic_enum/magic_enum.hpp" - #include "type_name.hpp" namespace tt { @@ -37,9 +37,7 @@ concept IsVariant = requires { typename std::variant_size::type; }; template constexpr auto get_active_type_name_in_variant(const Variant& v) { - return std::visit([](auto&& arg) -> std::string_view { - return short_type_name>; - }, v); + return std::visit([](auto&& arg) -> std::string_view { return short_type_name>; }, v); } // Forward Declare hash_object @@ -397,46 +395,100 @@ std::ostream& operator<<(std::ostream& os, const std::set& set) { return os; } -template - requires std::same_as, to_visit_t> +template + requires std::same_as, object_t> constexpr auto visit_object_of_type(auto callback, T&& value) { callback(value); } -template +template constexpr auto visit_object_of_type(auto callback, const std::optional& value) { if (value.has_value()) { - visit_object_of_type(callback, value.value()); + visit_object_of_type(callback, value.value()); } } -template +template constexpr auto visit_object_of_type(auto callback, const std::vector& value) { for (auto& tensor : value) { - visit_object_of_type(callback, tensor); + visit_object_of_type(callback, tensor); } } -template +template constexpr auto visit_object_of_type(auto callback, const std::array& value) { for (auto& tensor : value) { - visit_object_of_type(callback, tensor); + visit_object_of_type(callback, tensor); } } -template +template constexpr auto visit_object_of_type(auto callback, const std::tuple& value) { constexpr auto num_attributes = sizeof...(Ts); [&callback, &value](std::index_sequence) { - (visit_object_of_type(callback, std::get(value)), ...); + (visit_object_of_type(callback, std::get(value)), ...); }(std::make_index_sequence{}); } -template - requires(not std::same_as, to_visit_t>) and requires { 
std::decay_t::attribute_names; } +template + requires(not std::same_as, object_t>) and requires { std::decay_t::attribute_names; } +constexpr auto visit_object_of_type(auto callback, T&& object) { + constexpr auto num_attributes = std::tuple_size_v::attribute_names)>; + visit_object_of_type(callback, object.attribute_values()); +} + +template + requires(not std::same_as, object_t>) and requires { std::is_aggregate_v>; } constexpr auto visit_object_of_type(auto callback, T&& object) { + reflect::for_each( + [&callback, &object](auto I) { visit_object_of_type(callback, reflect::get(object)); }, object); +} + +template + requires std::same_as, object_t> +constexpr auto get_first_object_of_type(T&& value) { + return std::cref(value); +} + +template +constexpr auto get_first_object_of_type(const std::optional& value) { + if (value.has_value()) { + const auto& tensor = value.value(); + return get_first_object_of_type(tensor); + } +} + +template +constexpr auto get_first_object_of_type(const std::vector& value) { + for (auto& tensor : value) { + return get_first_object_of_type(tensor); + } +} + +template +constexpr auto get_first_object_of_type(const std::array& value) { + for (auto& tensor : value) { + return get_first_object_of_type(tensor); + } +} + +template +constexpr auto get_first_object_of_type(const std::tuple& value) { + constexpr auto num_attributes = sizeof...(Ts); + return get_first_object_of_type(std::get<0>(value)); +} + +template + requires (not std::same_as, object_t>) and requires { std::decay_t::attribute_names; } +constexpr auto get_first_object_of_type(T&& object) { constexpr auto num_attributes = std::tuple_size_v::attribute_names)>; - visit_object_of_type(callback, object.attribute_values()); + return get_first_object_of_type(object.attribute_values()); +} + +template + requires (not std::same_as, object_t>) and requires { std::is_aggregate_v>; } +constexpr auto get_first_object_of_type(T&& object) { + return get_first_object_of_type(reflect::get<0>(object)); } } // namespace reflection @@ -694,6 +746,13 @@ inline hash_t hash_object(const T& object) noexcept { } else { return 0; } + } else if constexpr (std::is_aggregate_v) { + if constexpr (DEBUG_HASH_OBJECT_FUNCTION) { + fmt::print("Hashing struct {} using reflect library: {}\n", get_type_name(), object); + } + std::size_t hash = 0; + reflect::for_each([&hash, &object](auto I) { hash = hash_objects(hash, reflect::get(object)); }, object); + return hash; } else { static_assert(tt::stl::concepts::always_false_v, "Type doesn't support std::hash"); } diff --git a/ttnn/CMakeLists.txt b/ttnn/CMakeLists.txt index 6d3b1549c428..c8262b3a3a27 100644 --- a/ttnn/CMakeLists.txt +++ b/ttnn/CMakeLists.txt @@ -17,7 +17,7 @@ set(TTNN_SRCS add_library(ttnn_lib OBJECT ${TTNN_SRCS}) target_compile_options(ttnn_lib PUBLIC -MP -Wno-int-to-pointer-cast -fno-var-tracking) target_link_libraries(ttnn_lib - PUBLIC compiler_flags metal_header_directories metal_common_libs reflect::reflect + PUBLIC compiler_flags metal_header_directories metal_common_libs ) target_include_directories(ttnn_lib PUBLIC ${UMD_HOME} diff --git a/ttnn/cpp/ttnn/device_operation.hpp b/ttnn/cpp/ttnn/device_operation.hpp index 652eb88d8d05..ec9b2a93434d 100644 --- a/ttnn/cpp/ttnn/device_operation.hpp +++ b/ttnn/cpp/ttnn/device_operation.hpp @@ -15,7 +15,6 @@ #include "tt_stl/concepts.hpp" #include "tt_stl/reflection.hpp" #include "tt_stl/unique_any.hpp" -#include namespace ttnn { @@ -96,47 +95,6 @@ template return table[i]; } -template - requires std::same_as, Tensor> 
-constexpr auto get_first_tensor(T&& value) { - return std::cref(value); -} - -template -constexpr auto get_first_tensor(const std::optional& value) { - if (value.has_value()) { - const auto& tensor = value.value(); - return get_first_tensor(tensor); - } -} - -template -constexpr auto get_first_tensor(const std::vector& value) { - for (auto& tensor : value) { - return get_first_tensor(tensor); - } -} - -template -constexpr auto get_first_tensor(const std::array& value) { - for (auto& tensor : value) { - return get_first_tensor(tensor); - } -} - -template -constexpr auto get_first_tensor(const std::tuple& value) { - constexpr auto num_attributes = sizeof...(Ts); - return get_first_tensor(std::get<0>(value)); -} - -template - requires requires { std::decay_t::attribute_names; } and (not std::same_as, Tensor>) -constexpr auto get_first_tensor(T&& object) { - constexpr auto num_attributes = std::tuple_size_v::attribute_names)>; - return get_first_tensor(object.attribute_values()); -} - inline const auto USE_FAST_DISPATCH = std::getenv("TT_METAL_SLOW_DISPATCH_MODE") == nullptr; template @@ -231,7 +189,8 @@ typename device_operation_t::tensor_return_value_t run( using tensor_return_value_t = typename device_operation_t::tensor_return_value_t; static_assert(not std::same_as, "Operation cannot return type cannot be void"); - auto device = get_first_tensor(tensor_args).get().device(); + // TODO: support the case when tensor args are empty? Or add an overload for that case? + auto device = tt::stl::reflection::get_first_object_of_type(tensor_args).get().device(); auto& program_cache = device->program_cache; auto program_hash = compute_program_hash(operation_attributes, tensor_args); diff --git a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_op.hpp b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_op.hpp index cc4906f4daae..c45bff9fde35 100644 --- a/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_op.hpp +++ b/ttnn/cpp/ttnn/operations/eltwise/binary/device/binary_op.hpp @@ -65,29 +65,11 @@ struct Binary { const MemoryConfig memory_config; const DataType dtype; std::optional compute_kernel_config; - - static constexpr auto attribute_names = std::forward_as_tuple( - "binary_op_type", "in_place", "activations", "memory_config", "dtype", "compute_kernel_config"); - const auto attribute_values() const { - return std::forward_as_tuple( - this->binary_op_type, - this->in_place, - this->activations, - this->memory_config, - this->dtype, - this->compute_kernel_config); - } }; struct tensor_args_t { const Tensor& input_tensor_a; const Tensor& input_tensor_b; std::optional output_tensor; - - static constexpr auto attribute_names = - std::forward_as_tuple("input_tensor_a", "input_tensor_b", "output_tensor"); - const auto attribute_values() const { - return std::forward_as_tuple(this->input_tensor_a, this->input_tensor_b, this->output_tensor); - } }; using shape_return_value_t = ttnn::Shape; using tensor_return_value_t = Tensor; From 86ab828d505f4ec23aae119086df11a665808748 Mon Sep 17 00:00:00 2001 From: Raymond Kim <109366641+tt-rkim@users.noreply.github.com> Date: Fri, 28 Jun 2024 16:41:08 -0400 Subject: [PATCH 2/6] #0: Properly delete source folders for wheel testing (#9829) #0: Go to root directory to delete extraneous folders because before we weren't deleting anything --- tests/scripts/set_up_end_to_end_tests_env.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/scripts/set_up_end_to_end_tests_env.sh b/tests/scripts/set_up_end_to_end_tests_env.sh index 27c4d78d8f7a..9a7e1e6c3869 
100755 --- a/tests/scripts/set_up_end_to_end_tests_env.sh +++ b/tests/scripts/set_up_end_to_end_tests_env.sh @@ -21,6 +21,7 @@ set_up_end_to_end_tests_env() { python -m pip install -r requirements.txt python -m pip install ../../metal_libs-*.whl + cd ../../ rm -rf tt_metal tt_eager ttnn models echo "Showing current directory" ls -hal From 2cc5380e0c47556f6552085db159eb660274d986 Mon Sep 17 00:00:00 2001 From: mtairum Date: Fri, 28 Jun 2024 21:01:08 +0000 Subject: [PATCH 3/6] #9479: Update Mixtral perf estimates and clean mixtral unit test --- .../demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py | 10 +++++----- tests/scripts/t3000/run_t3000_model_perf_tests.sh | 2 +- tests/scripts/t3000/run_t3000_unit_tests.sh | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py b/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py index 3732cb21f0ae..0d9717160338 100644 --- a/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py +++ b/models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py @@ -43,10 +43,10 @@ def forward(self, x): @pytest.mark.parametrize( "generation_start_pos, expected_compile_time, expected_inference_time", ( - (32, 150, 0.058), # FIXME: Perf regression (issue #9479) - (128, 150, 0.058), # FIXME: Perf regression (issue #9479) - (1024, 150, 0.058), # FIXME: Perf regression (issue #9479) - (2048, 150, 0.058), # FIXME: Perf regression (issue #9479) + (32, 150, 0.075), + (128, 150, 0.075), + (1024, 150, 0.075), + (2048, 150, 0.075), ), ) def test_mixtral_model_perf( @@ -61,7 +61,7 @@ def test_mixtral_model_perf( # Can use dummy_weights=True correctness is not tested, but it is much slower model_args = TtModelArgs(t3k_device_mesh.get_device(0), dummy_weights=False) - model_args.n_layers = 1 + model_args.n_layers = 32 # Clear global profiler state before starting measurements profiler.clear() diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh index 2cf1dc5dcc4e..6140b9efeafd 100755 --- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -22,7 +22,7 @@ run_t3000_mixtral_tests() { echo "LOG_METAL: Running run_t3000_mixtral_tests" - env pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py::test_mixtral_model_perf[wormhole_b0-True-2048-150-0.058] -m "model_perf_t3000" + env pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py -m "model_perf_t3000" # Record the end time end_time=$(date +%s) diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index ea092261a138..a8019137642b 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -80,7 +80,6 @@ run_t3000_mixtral_tests() { pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-1-1-pcc] # Record the end time end_time=$(date +%s) @@ -111,7 +110,7 @@ main() { echo "Script is being sourced, not executing main function" return 0 fi - + if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 From 29efc68bd87ff750679d7feedbc0ff9736316265 Mon Sep 17 00:00:00 2001 From: Artem Yerofieiev 
<169092593+ayerofieiev-tt@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:17:07 -0700 Subject: [PATCH 4/6] #0: Added github community issue workflow (#9833) * #0: Added github community issue workflow * #0: Update channel id * #0: Check that webhook works * #0: Try to log some more * #0: Change the check * #0: switched check to an org member * #0: Add label: community to non-org issues * #0: Switch to org check + labels addition --- .github/workflows/on-community-issue.yaml | 37 +++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/on-community-issue.yaml diff --git a/.github/workflows/on-community-issue.yaml b/.github/workflows/on-community-issue.yaml new file mode 100644 index 000000000000..63af9e0c1590 --- /dev/null +++ b/.github/workflows/on-community-issue.yaml @@ -0,0 +1,37 @@ +name: "Slack Notification on Community Issue" + +on: + issues: + types: [opened, labeled] + +jobs: + label-check: + runs-on: ubuntu-latest + + steps: + - name: Check if organization member + id: is_organization_member + uses: JamesSingleton/is-organization-member@1.0.0 + with: + organization: tenstorrent + username: ${{ github.event.issue.user.login }} + token: ${{ secrets.GITHUB_TOKEN }} + - name: Add community label + if: ${{ steps.is_organization_member.outputs.result == 'false' }} + run: gh issue edit "$NUMBER" --add-label "$LABELS" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_REPO: ${{ github.repository }} + NUMBER: ${{ github.event.issue.number }} + LABELS: community + - name: Send Slack Notification + if: ${{ steps.is_organization_member.outputs.result == 'false' }} + uses: slackapi/slack-github-action@v1.26.0 + with: + payload: | + { + "text": "A new issue has been created by a non-org member `${{ github.event.sender.login }}`: ${{ github.event.issue.html_url }}", + "channel": "C07AZJ5DLL8" + } + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_CHANNEL_WEBHOOK_URL }} From d954e7640847df125ce14d51b334c18270b70843 Mon Sep 17 00:00:00 2001 From: Vincent Tang Date: Wed, 22 May 2024 18:53:34 +0000 Subject: [PATCH 5/6] #8729: pytest xdist multiprocess reset mechanism - use custom pyhook from pytest-xdist to run cleanup after timeout - add a new process based timeout method to pytest-timeout - add reset after test fail, use '--metal-timeout' to enable reset mechanism - use pytest-xdist fixture to determine # of workers through '-n auto' - expose get_associated_mmio_device to python and only reset opened tt devices on fail --- conftest.py | 493 ++++++++++++++--------- tt_eager/tt_lib/csrc/tt_lib_bindings.cpp | 4 + tt_metal/host_api.hpp | 2 + tt_metal/tt_metal.cpp | 4 + 4 files changed, 307 insertions(+), 196 deletions(-) diff --git a/conftest.py b/conftest.py index f872bce2998f..c6339ee3ae1b 100644 --- a/conftest.py +++ b/conftest.py @@ -12,6 +12,11 @@ from operator import contains, eq, getitem from pathlib import Path import json +import copy +import multiprocess +import signal +import time +import psutil from loguru import logger @@ -70,191 +75,6 @@ def get_tt_cache_path_(model_version, model_subdir="", default_dir=""): return get_tt_cache_path_ -ALL_ARCHS = set( - [ - "grayskull", - "wormhole_b0", - ] -) - - -def pytest_addoption(parser): - parser.addoption( - "--tt-arch", - choices=[*ALL_ARCHS], - default=os.environ.get("ARCH_NAME", "grayskull"), - help="Target arch, ex. 
grayskull, wormhole_b0", - ) - parser.addoption( - "--device-id", - type=int, - default=0, - help="Target device id", - ) - parser.addoption( - "--input-method", - action="store", - choices=["json", "cli"], - default=None, - help="Choose input method: 1) json or 2) cli", - ) - parser.addoption( - "--input-path", - action="store", - default="", - help="Path to json file with inputs", - ) - parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli") - - -def pytest_generate_tests(metafunc): - """ - This is not a standard docstring. - - We will explain the non-standard fixtures that pytest_generate_tests is - creating here. - - silicon_arch_name and silicon_arch_ - ---------------------------------------------- - - This is how tests should be requesting accelerator architecture names. - Tests which aim to run on silicon should request a silicon_arch_name - fixture. Just that single fixture will parametrize the test to run on the - provided architecture name from the command line through the --tt-arch - option. The value of the fixture will be the string value of the - architecture name. For example, - - @pytest.mark.post_commit - def test_model_silicon(silicon_arch_name): - # silicon_arch_name will be one of grayskull, wormhole_b0 etc. - run_model_on_silicon(silicon_arch_name) - ... - - If you want to restrict a test to only a specific architecture, you can - provide an additional fixture in the form of silicon_arch_. This - will limit the range of possible values for silicon_arch_name to only be - ARCH_NAME. - - @pytest.mark.post_commit - def test_model_silicon_grayskull_only( - silicon_arch_name, - silicon_arch_grayskull, - ): - # silicon_arch_name can only be grayskull or empty - run_model_on_silicon(silicon_arch_name) - ... - - If --tt-arch specifies an architecture that's not ARCH_NAME, the test will - be skipped. We ensure skipping by providing an empty list parametrization - for silicon_arch_name, and with the empty_parameter_set_mark config option - for pytest, will skip any tests with an empty list parametrization. - - Note that you must provide silicon_arch_name as a fixture if you want to - use the silicon_arch_ fixture. - - Note that if tests want to use the ARCH value from the API, tests should - create their own separate fixture which will convert the string value - provided from silicon_arch_name into ARCH. We keep it as strings here - because these fixtures will be used in tests which do not have access to - any Python APIs. 
- """ - - tt_arch = metafunc.config.getoption("--tt-arch") - - silicon_arch_specific_fixture_name_to_avail_archs = { - "silicon_arch_grayskull": set( - [ - "grayskull", - ] - ), - "silicon_arch_wormhole_b0": set( - [ - "wormhole_b0", - ] - ), - } - - check_uses_silicon_arch_specific_fixture = partial(contains, silicon_arch_specific_fixture_name_to_avail_archs) - test_requested_silicon_arch_fixtures = tuple( - filter(check_uses_silicon_arch_specific_fixture, metafunc.fixturenames) - ) - is_test_requesting_specific_silicon_archs = len(test_requested_silicon_arch_fixtures) > 0 - get_archs_for_silicon_arch_specific_fixture = partial(getitem, silicon_arch_specific_fixture_name_to_avail_archs) - test_requested_silicon_archs = ALL_ARCHS.intersection( - *map( - get_archs_for_silicon_arch_specific_fixture, - test_requested_silicon_arch_fixtures, - ) - ) - - available_archs = test_requested_silicon_archs if is_test_requesting_specific_silicon_archs else ALL_ARCHS - matches_user_requested_silicon_arch = partial(eq, tt_arch) - available_archs = tuple(filter(matches_user_requested_silicon_arch, available_archs)) - - uses_silicon_arch = "silicon_arch_name" in metafunc.fixturenames - - # sanity - if is_test_requesting_specific_silicon_archs and not uses_silicon_arch: - raise Exception( - f"{metafunc.function} requesting a specific silicon target, but doesn't use silicon_arch_name fixture" - ) - - if uses_silicon_arch: - metafunc.parametrize("silicon_arch_name", available_archs, scope="session") - for test_requested_silicon_arch_fixture in test_requested_silicon_arch_fixtures: - # The values of these arch-specific fixtures should not be used in - # the test function, so use any parameters, like [True] - metafunc.parametrize(test_requested_silicon_arch_fixture, [True], scope="session") - - input_method = metafunc.config.getoption("--input-method") - if input_method == "json": - json_path = metafunc.config.getoption("--input-path") - if not json_path: - raise ValueError("Please provide a valid JSON path using --input-path option.") - with open(json_path, "r") as f: - data = json.load(f) - metafunc.parametrize("user_input", [data]) - elif input_method == "cli": - cli_input = metafunc.config.getoption("--cli-input") - if not cli_input: - raise ValueError("Please provide input using --cli-input option.") - metafunc.parametrize("user_input", [[cli_input]]) - - -# Report stashing to get outcomes etc -phase_report_key = pytest.StashKey() - - -@pytest.hookimpl(tryfirst=True, hookwrapper=True) -def pytest_runtest_makereport(item, call): - # execute all other hooks to obtain the report object - outcome = yield - rep = outcome.get_result() - - # store test results for each phase of a call, which can - # be "setup", "call", "teardown" - item.stash.setdefault(phase_report_key, {})[rep.when] = rep - - -@pytest.fixture(scope="function") -def reset_tensix(request, silicon_arch_name): - yield - - report = request.node.stash[phase_report_key] - - test_failed = ("call" not in report) or report["call"].failed - - if test_failed: - logger.debug("Test failed - resetting with smi") - if silicon_arch_name == "grayskull": - result = run_process_and_get_result("tt-smi -tr all") - elif silicon_arch_name == "wormhole_b0": - result = run_process_and_get_result("tt-smi -wr all") - else: - raise Exception(f"Unrecognized arch for tensix-reset: {silicon_arch_name}") - assert result.returncode == 0, "Tensix reset script raised error" - - @pytest.fixture(scope="function") def device_params(request): return getattr(request, "param", {}) @@ 
-266,6 +86,9 @@ def device(request, device_params): device_id = request.config.getoption("device_id") + request.node.device_ids = [device_id] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)] + num_devices = ttl.device.GetNumPCIeDevices() assert device_id < num_devices, "CreateDevice not supported for non-mmio device" device = ttl.device.CreateDevice(device_id=device_id, **device_params) @@ -284,9 +107,13 @@ def pcie_devices(request, device_params): import tt_lib as ttl num_devices = ttl.device.GetNumPCIeDevices() + device_ids = [i for i in range(num_devices)] + + request.node.device_ids = device_ids + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] # Get only physical devices - devices = ttl.device.CreateDevices(device_ids=[i for i in range(num_devices)], **device_params) + devices = ttl.device.CreateDevices(device_ids, **device_params) yield [devices[i] for i in range(num_devices)] @@ -301,9 +128,13 @@ def all_devices(request, device_params): import tt_lib as ttl num_devices = ttl.device.GetNumAvailableDevices() + device_ids = [i for i in range(num_devices)] + + request.node.device_ids = device_ids + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] # Get only physical devices - devices = ttl.device.CreateDevices(device_ids=[i for i in range(num_devices)], **device_params) + devices = ttl.device.CreateDevices(device_ids, **device_params) yield [devices[i] for i in range(num_devices)] @@ -316,6 +147,7 @@ def all_devices(request, device_params): @pytest.fixture(scope="function") def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params): import ttnn + import tt_lib as ttl device_ids = ttnn.get_device_ids() try: @@ -323,6 +155,9 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par except (ValueError, AttributeError): num_devices_requested = len(device_ids) + request.node.device_ids = device_ids[:num_devices_requested] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] + device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested], **device_params ) @@ -330,8 +165,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created") yield device_mesh - import tt_lib as ttl - for device in device_mesh.get_devices(): ttl.device.DumpDeviceProfiler(device) @@ -342,6 +175,7 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par @pytest.fixture(scope="function") def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params): import ttnn + import tt_lib as ttl device_ids = ttnn.get_pcie_device_ids() try: @@ -349,6 +183,9 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic except (ValueError, AttributeError): num_pcie_devices_requested = len(device_ids) + request.node.device_ids = device_ids[:num_pcie_devices_requested] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]] + device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested], **device_params ) @@ -356,8 +193,6 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created") yield device_mesh - import tt_lib as ttl 
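# Illustrative note, not part of the diff: GetPCIeDeviceID is the new binding this patch adds
# in tt_lib_bindings.cpp further down (it wraps tt::Cluster::get_associated_mmio_device), so
# each fixture can record which PCIe/MMIO chip services the chips it opened and a failed test
# resets only those boards. Sketch only; the variable names here are made up:
#   import tt_lib as ttl
#   pci_ids = {ttl.device.GetPCIeDeviceID(i) for i in opened_device_ids}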
- for device in device_mesh.get_devices(): ttl.device.DumpDeviceProfiler(device) @@ -368,6 +203,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic @pytest.fixture(scope="function") def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_params): import ttnn + import tt_lib as ttl if ttnn.get_num_devices() < 8: pytest.skip() @@ -377,6 +213,9 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device except (ValueError, AttributeError): num_devices_requested = len(device_ids) + request.node.device_ids = device_ids[:num_devices_requested] + request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] + device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_devices_requested), device_ids[:num_devices_requested], **device_params ) @@ -384,8 +223,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device logger.debug(f"multidevice with {device_mesh.get_num_devices()} devices is created") yield device_mesh - import tt_lib as ttl - for device in device_mesh.get_devices(): ttl.device.DumpDeviceProfiler(device) @@ -453,6 +290,270 @@ def tracy_profile(): profiler.disable() -@pytest.fixture -def input_path(request): - return request.config.getoption("--input-path") +############################### +# Modifying pytest hooks +############################### +ALL_ARCHS = set( + [ + "grayskull", + "wormhole_b0", + ] +) + + +def pytest_addoption(parser): + parser.addoption( + "--tt-arch", + choices=[*ALL_ARCHS], + default=os.environ.get("ARCH_NAME", "grayskull"), + help="Target arch, ex. grayskull, wormhole_b0", + ) + parser.addoption( + "--pipeline-type", + default="", + help="Only `models_device_performance_bare_metal` should run `pytest_runtest_teardown`", + ) + parser.addoption( + "--device-id", + type=int, + default=0, + help="Target device id", + ) + parser.addoption( + "--input-method", + action="store", + choices=["json", "cli"], + default=None, + help="Choose input method: 1) json or 2) cli", + ) + parser.addoption( + "--input-path", + action="store", + default="", + help="Path to json file with inputs", + ) + parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli") + parser.addoption( + "--metal-cleanup", + action="store", + default=None, + help="Enable process timeout", + ) + + +def pytest_generate_tests(metafunc): + """ + This is not a standard docstring. + + We will explain the non-standard fixtures that pytest_generate_tests is + creating here. + + silicon_arch_name and silicon_arch_ + ---------------------------------------------- + + This is how tests should be requesting accelerator architecture names. + Tests which aim to run on silicon should request a silicon_arch_name + fixture. Just that single fixture will parametrize the test to run on the + provided architecture name from the command line through the --tt-arch + option. The value of the fixture will be the string value of the + architecture name. For example, + + @pytest.mark.post_commit + def test_model_silicon(silicon_arch_name): + # silicon_arch_name will be one of grayskull, wormhole_b0 etc. + run_model_on_silicon(silicon_arch_name) + ... + + If you want to restrict a test to only a specific architecture, you can + provide an additional fixture in the form of silicon_arch_. This + will limit the range of possible values for silicon_arch_name to only be + ARCH_NAME. 
+ + @pytest.mark.post_commit + def test_model_silicon_grayskull_only( + silicon_arch_name, + silicon_arch_grayskull, + ): + # silicon_arch_name can only be grayskull or empty + run_model_on_silicon(silicon_arch_name) + ... + + If --tt-arch specifies an architecture that's not ARCH_NAME, the test will + be skipped. We ensure skipping by providing an empty list parametrization + for silicon_arch_name, and with the empty_parameter_set_mark config option + for pytest, will skip any tests with an empty list parametrization. + + Note that you must provide silicon_arch_name as a fixture if you want to + use the silicon_arch_ fixture. + + Note that if tests want to use the ARCH value from the API, tests should + create their own separate fixture which will convert the string value + provided from silicon_arch_name into ARCH. We keep it as strings here + because these fixtures will be used in tests which do not have access to + any Python APIs. + """ + + tt_arch = metafunc.config.getoption("--tt-arch") + + silicon_arch_specific_fixture_name_to_avail_archs = { + "silicon_arch_grayskull": set( + [ + "grayskull", + ] + ), + "silicon_arch_wormhole_b0": set( + [ + "wormhole_b0", + ] + ), + } + + check_uses_silicon_arch_specific_fixture = partial(contains, silicon_arch_specific_fixture_name_to_avail_archs) + test_requested_silicon_arch_fixtures = tuple( + filter(check_uses_silicon_arch_specific_fixture, metafunc.fixturenames) + ) + is_test_requesting_specific_silicon_archs = len(test_requested_silicon_arch_fixtures) > 0 + get_archs_for_silicon_arch_specific_fixture = partial(getitem, silicon_arch_specific_fixture_name_to_avail_archs) + test_requested_silicon_archs = ALL_ARCHS.intersection( + *map( + get_archs_for_silicon_arch_specific_fixture, + test_requested_silicon_arch_fixtures, + ) + ) + + available_archs = test_requested_silicon_archs if is_test_requesting_specific_silicon_archs else ALL_ARCHS + matches_user_requested_silicon_arch = partial(eq, tt_arch) + available_archs = tuple(filter(matches_user_requested_silicon_arch, available_archs)) + + uses_silicon_arch = "silicon_arch_name" in metafunc.fixturenames + + # sanity + if is_test_requesting_specific_silicon_archs and not uses_silicon_arch: + raise Exception( + f"{metafunc.function} requesting a specific silicon target, but doesn't use silicon_arch_name fixture" + ) + + if uses_silicon_arch: + metafunc.parametrize("silicon_arch_name", available_archs) + for test_requested_silicon_arch_fixture in test_requested_silicon_arch_fixtures: + # The values of these arch-specific fixtures should not be used in + # the test function, so use any parameters, like [True] + metafunc.parametrize(test_requested_silicon_arch_fixture, [True]) + + input_method = metafunc.config.getoption("--input-method") + if input_method == "json": + json_path = metafunc.config.getoption("--input-path") + if not json_path: + raise ValueError("Please provide a valid JSON path using --input-path option.") + with open(json_path, "r") as f: + data = json.load(f) + metafunc.parametrize("user_input", [data]) + elif input_method == "cli": + cli_input = metafunc.config.getoption("--cli-input") + if not cli_input: + raise ValueError("Please provide input using --cli-input option.") + metafunc.parametrize("user_input", [[cli_input]]) + + +# Report stashing to get outcomes etc +phase_report_key = pytest.StashKey() + + +@pytest.hookimpl(tryfirst=True, hookwrapper=True) +def pytest_runtest_makereport(item, call): + # execute all other hooks to obtain the report object + outcome = yield + rep = 
outcome.get_result() + + # store test results for each phase of a call, which can + # be "setup", "call", "teardown" + item.stash.setdefault(phase_report_key, {})[rep.when] = rep + + +@pytest.hookimpl(hookwrapper=True) +def pytest_runtest_teardown(item, nextitem): + yield + metal_cleanup_enabled = item.config.getoption("--metal-cleanup") + if metal_cleanup_enabled is not None: + report = item.stash[phase_report_key] + test_failed = report.get("call", None) and report["call"].failed + if test_failed: + logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}") + # reset_tensix(set(item.pci_ids)) + reset_tensix() + + +# This is overriding the timer setup hook from pytest-timeout +# If --metal-timeout is passed, we define a new timeout method that spawns a timer process +# At timeout, the process kills it's parent (the test process) and then itself +@pytest.hookimpl(tryfirst=True) +def pytest_timeout_set_timer(item, settings): + metal_timeout_enabled = item.config.getoption("--metal-cleanup") + if metal_timeout_enabled is not None: + parent_pid = os.getpid() + logger.info(f"Metal timeout {settings.timeout} seconds") + + def get_parent_status(): + try: + parent = psutil.Process(parent_pid) + except: + return "already dead" + return parent.status() + + def run_timer(settings): + dead_status = ["zombie", "dead", "already dead"] + timeout = settings.timeout + while get_parent_status() not in dead_status and timeout > 0: + time.sleep(1) + timeout -= 1 + if get_parent_status() != "already dead": + logger.info(f"Timing out test case") + os.kill(parent_pid, signal.SIGKILL) + logger.info(f"Killing timer") + os._exit(1) + + def cancel(): + logger.info(f"Cancelling timer") + metal_timer.terminate() + + metal_timer = multiprocess.Process(target=run_timer, args=(settings,), daemon=True) + item.cancel_timeout = cancel + metal_timer.start() + # logger.info(f"parent and metal timer pid: {parent_pid} {metal_timer.pid}") + return True + + +# This is a hook used in pytest-xdist to handle when a worker crashes out +# In our case, combined with pytest-timeout thread method, the worker will crash out for a hang and +# then it should get cleaned up by the controller through this fixture :fingers_crossed: +@pytest.hookimpl(tryfirst=True) +def pytest_handlecrashitem(crashitem, report, sched): + reset_tensix() + + +def reset_tensix(tt_open_devices=None): + metal_env = copy.deepcopy(os.environ) + arch = metal_env.get("ARCH_NAME") + if arch != "grayskull" and arch != "wormhole_b0": + raise Exception(f"Unrecognized arch for tensix-reset: {arch}") + + if tt_open_devices is None: + logger.info(f"Running reset with reset script: /opt/tt_metal_infra/scripts/ci/{arch}/reset.sh") + smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh") + else: + tt_open_devices_str = ",".join([str(i) for i in tt_open_devices]) + check_smi = run_process_and_get_result("tt-smi-metal -h") + logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}") + logger.info(f"Running reset for pci devices: {tt_open_devices_str}") + if check_smi.returncode > 0: + logger.info(f"Test failed - resetting {arch} with tt-smi") + smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}") + else: + smi_reset_result = run_process_and_get_result(f"tt-smi-metal -r {tt_open_devices_str}") + logger.info(f"tt-smi reset status: {smi_reset_result.returncode}") + + +@pytest.hookimpl(tryfirst=True) +def pytest_xdist_auto_num_workers(config): + logger.info("getting num of 
xdist workers") + return 1 diff --git a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp index 915276dfa393..d03205be995b 100644 --- a/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp +++ b/tt_eager/tt_lib/csrc/tt_lib_bindings.cpp @@ -136,6 +136,10 @@ void DeviceModule(py::module &m_device) { Returns number of Tenstorrent devices that are connected to host via PCIe and can be targeted. )doc"); + m_device.def("GetPCIeDeviceID", &GetPCIeDeviceID, R"doc( + Returns associated mmio device of give device id. + )doc"); + m_device.def("SetDefaultDevice", &AutoFormat::SetDefaultDevice, R"doc( Sets the default device to use for ops when inputs aren't on device. diff --git a/tt_metal/host_api.hpp b/tt_metal/host_api.hpp index a2a490c345cc..5572bad808e6 100644 --- a/tt_metal/host_api.hpp +++ b/tt_metal/host_api.hpp @@ -55,6 +55,8 @@ size_t GetNumAvailableDevices(); */ size_t GetNumPCIeDevices(); +chip_id_t GetPCIeDeviceID(chip_id_t device_id); + /** * Instantiates a device object. * diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index a1ec68fb1bb3..dc529c1256eb 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -730,6 +730,10 @@ size_t GetNumPCIeDevices() { #endif } +chip_id_t GetPCIeDeviceID(chip_id_t device_id){ + return tt::Cluster::instance().get_associated_mmio_device(device_id); +} + Device *CreateDevice( chip_id_t device_id, const uint8_t num_hw_cqs, From d525b1710a5201c0329ffba5686c40217be0b414 Mon Sep 17 00:00:00 2001 From: Vincent Tang Date: Thu, 13 Jun 2024 22:02:21 +0000 Subject: [PATCH 6/6] #8729: xdist + reset mechanism on fd nightly, model perf, all t3k (except profiler) - enable timeout mechanism by default if using xdist, use 'metal-timeout' flag to enable if not using xdist - increase GH actions timeout for xdist (review) - get timings of each test and set global timeout to 5 mins (review) - add custom timeouts to nightly + t3k pipelines + post-commit (review) --- ...-dispatch-full-regressions-and-models.yaml | 2 +- .github/workflows/perf-models.yaml | 2 +- .github/workflows/t3000-demo-tests.yaml | 3 +- .github/workflows/t3000-frequent-tests.yaml | 1 + .github/workflows/t3000-model-perf-tests.yaml | 9 +-- .github/workflows/t3000-unit-tests.yaml | 1 + conftest.py | 59 ++++++++-------- pytest.ini | 2 +- tests/scripts/run_performance.sh | 25 +++---- tests/scripts/run_tests.sh | 2 +- .../single_card/nightly/run_common_models.sh | 9 ++- .../single_card/nightly/run_gs_only.sh | 11 ++- tests/scripts/single_card/nightly/run_ttnn.sh | 9 ++- .../single_card/nightly/run_wh_b0_only.sh | 11 ++- .../single_card/nightly/run_wh_b0_unstable.sh | 9 ++- tests/scripts/t3000/run_t3000_demo_tests.sh | 37 +++++++--- .../scripts/t3000/run_t3000_frequent_tests.sh | 68 +++++++++++++------ .../t3000/run_t3000_model_perf_tests.sh | 31 +++++++-- tests/scripts/t3000/run_t3000_unit_tests.sh | 66 ++++++++++++------ tt_metal/python_env/requirements-dev.txt | 1 + 20 files changed, 238 insertions(+), 120 deletions(-) diff --git a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml index 115b94154522..b6dc4f619c55 100644 --- a/.github/workflows/fast-dispatch-full-regressions-and-models.yaml +++ b/.github/workflows/fast-dispatch-full-regressions-and-models.yaml @@ -26,7 +26,7 @@ jobs: { name: "N300 WH-only models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_only.sh, timeout: 30 }, { name: "API tests GS", arch: grayskull, cmd: ./tests/scripts/run_tests.sh --tt-arch 
grayskull --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 }, { name: "API tests N300 WH B0", arch: wormhole_b0, cmd: ./tests/scripts/run_tests.sh --tt-arch wormhole_b0 --pipeline-type frequent_api --dispatch-mode fast, timeout: 40 }, - { name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 35 }, + { name: "[Unstable] N300 models", arch: wormhole_b0, cmd: tests/scripts/single_card/nightly/run_wh_b0_unstable.sh, timeout: 45 }, ] name: FD ${{ matrix.test-group.name }} ${{ matrix.test-group.arch }} env: diff --git a/.github/workflows/perf-models.yaml b/.github/workflows/perf-models.yaml index 8c423e865c1f..f5905175e7e1 100644 --- a/.github/workflows/perf-models.yaml +++ b/.github/workflows/perf-models.yaml @@ -52,7 +52,7 @@ jobs: - uses: ./.github/actions/install-python-deps - name: Run performance regressions id: performance_tests - timeout-minutes: 30 + timeout-minutes: 40 run: | source ${{ github.workspace }}/python_env/bin/activate ./tests/scripts/run_tests.sh --tt-arch $ARCH_NAME --pipeline-type ${{ matrix.model-type }}_models_performance_${{ matrix.test-info.machine-type }} diff --git a/.github/workflows/t3000-demo-tests.yaml b/.github/workflows/t3000-demo-tests.yaml index a05a651f0c5b..ca524dd3a8ae 100644 --- a/.github/workflows/t3000-demo-tests.yaml +++ b/.github/workflows/t3000-demo-tests.yaml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: test-group: [ - { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 40, owner_id: U044T8U8DEF}, #Johanna Rock + { name: "t3k falcon40b tests", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 50, owner_id: U044T8U8DEF}, #Johanna Rock { name: "t3k llama3_70b tests", arch: wormhole_b0, cmd: run_t3000_llama3_70b_tests, timeout: 30, owner_id: U03FJB5TM5Y}, #Colman Glagovich { name: "t3k falcon7b tests", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 90, owner_id: U05RWH3QUPM}, #Salar Hosseini { name: "t3k mixtral tests", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 30, owner_id: U03PUAKE719}, # Miguel Tairum @@ -46,6 +46,7 @@ jobs: run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run demo regression tests + shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/.github/workflows/t3000-frequent-tests.yaml b/.github/workflows/t3000-frequent-tests.yaml index d6feebce9dfa..70a13c371f58 100644 --- a/.github/workflows/t3000-frequent-tests.yaml +++ b/.github/workflows/t3000-frequent-tests.yaml @@ -42,6 +42,7 @@ jobs: run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run frequent regression tests + shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/.github/workflows/t3000-model-perf-tests.yaml b/.github/workflows/t3000-model-perf-tests.yaml index 3edeb3884699..4995b036238d 100644 --- a/.github/workflows/t3000-model-perf-tests.yaml +++ b/.github/workflows/t3000-model-perf-tests.yaml @@ -17,10 +17,10 @@ jobs: fail-fast: false matrix: test-group: [ - { name: "t3k LLM falcon7b model perf tests", model: "falcob7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 60, owner_id: S07AJBTLX2L}, #Model Falcon - { name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: 
wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 60, owner_id: U03PUAKE719}, # Miguel Tairum - { name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 60, owner_id: U03FJB5TM5Y}, #Colman Glagovich - { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 60, owner_id: S07AJBTLX2L}, # Model Falcon + { name: "t3k LLM falcon7b model perf tests", model: "falcob7b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon7b_tests, timeout: 75, owner_id: S07AJBTLX2L}, #Model Falcon + { name: "t3k LLM mixtral model perf tests", model: "mixtral", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_mixtral_tests, timeout: 75, owner_id: U03PUAKE719}, # Miguel Tairum + { name: "t3k LLM llama2 model perf tests", model: "llama2", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_llama2_70b_tests, timeout: 75, owner_id: U03FJB5TM5Y}, #Colman Glagovich + { name: "t3k LLM falcon40b model perf tests", model: "falcon40b", model-type: "LLM", arch: wormhole_b0, cmd: run_t3000_falcon40b_tests, timeout: 75, owner_id: S07AJBTLX2L}, # Model Falcon #{ name: "t3k CNN model perf tests ", model-type: "CNN", arch: wormhole_b0, cmd: run_t3000_cnn_tests, timeout: 120, owner_id: }, #No tests are being run? ] name: ${{ matrix.test-group.name }} @@ -52,6 +52,7 @@ jobs: run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run model perf regression tests + shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/.github/workflows/t3000-unit-tests.yaml b/.github/workflows/t3000-unit-tests.yaml index 5b9e99baaa21..297b863399f0 100644 --- a/.github/workflows/t3000-unit-tests.yaml +++ b/.github/workflows/t3000-unit-tests.yaml @@ -43,6 +43,7 @@ jobs: run: tar -xvf ttm_${{ matrix.test-group.arch }}.tar - uses: ./.github/actions/install-python-deps - name: Run unit regression tests + shell: bash {0} timeout-minutes: ${{ matrix.test-group.timeout }} run: | source ${{ github.workspace }}/python_env/bin/activate diff --git a/conftest.py b/conftest.py index c6339ee3ae1b..cbbda1b9e72b 100644 --- a/conftest.py +++ b/conftest.py @@ -85,8 +85,6 @@ def device(request, device_params): import tt_lib as ttl device_id = request.config.getoption("device_id") - - request.node.device_ids = [device_id] request.node.pci_ids = [ttl.device.GetPCIeDeviceID(device_id)] num_devices = ttl.device.GetNumPCIeDevices() @@ -108,9 +106,7 @@ def pcie_devices(request, device_params): num_devices = ttl.device.GetNumPCIeDevices() device_ids = [i for i in range(num_devices)] - - request.node.device_ids = device_ids - request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] + request.node.pci_ids = device_ids # Get only physical devices devices = ttl.device.CreateDevices(device_ids, **device_params) @@ -129,8 +125,6 @@ def all_devices(request, device_params): num_devices = ttl.device.GetNumAvailableDevices() device_ids = [i for i in range(num_devices)] - - request.node.device_ids = device_ids request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids] # Get only physical devices @@ -155,7 +149,6 @@ def device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device_par except (ValueError, AttributeError): num_devices_requested = len(device_ids) - request.node.device_ids = device_ids[:num_devices_requested] 
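# Illustrative cross-reference, not part of the diff: the pci_ids stashed on request.node in
# these fixtures are what the pytest_runtest_teardown hook defined later in this file reads
# back when a test fails, so only the boards the test actually opened are reset:
#   def pytest_runtest_teardown(item, nextitem):
#       ...
#       if test_failed:
#           reset_tensix(set(item.pci_ids))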
request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] device_mesh = ttnn.open_device_mesh( @@ -183,8 +176,7 @@ def pcie_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, devic except (ValueError, AttributeError): num_pcie_devices_requested = len(device_ids) - request.node.device_ids = device_ids[:num_pcie_devices_requested] - request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_pcie_devices_requested]] + request.node.pci_ids = device_ids[:num_pcie_devices_requested] device_mesh = ttnn.open_device_mesh( ttnn.DeviceGrid(1, num_pcie_devices_requested), device_ids[:num_pcie_devices_requested], **device_params @@ -213,7 +205,6 @@ def t3k_device_mesh(request, silicon_arch_name, silicon_arch_wormhole_b0, device except (ValueError, AttributeError): num_devices_requested = len(device_ids) - request.node.device_ids = device_ids[:num_devices_requested] request.node.pci_ids = [ttl.device.GetPCIeDeviceID(i) for i in device_ids[:num_devices_requested]] device_mesh = ttnn.open_device_mesh( @@ -334,13 +325,18 @@ def pytest_addoption(parser): ) parser.addoption("--cli-input", action="store", default=None, help="Enter prompt if --input-method=cli") parser.addoption( - "--metal-cleanup", + "--metal-timeout", action="store", default=None, help="Enable process timeout", ) +@pytest.fixture +def input_path(request): + return request.config.getoption("--input-path") + + def pytest_generate_tests(metafunc): """ This is not a standard docstring. @@ -473,14 +469,15 @@ def pytest_runtest_makereport(item, call): @pytest.hookimpl(hookwrapper=True) def pytest_runtest_teardown(item, nextitem): yield - metal_cleanup_enabled = item.config.getoption("--metal-cleanup") - if metal_cleanup_enabled is not None: + metal_timeout_enabled = item.config.getoption("--metal-timeout") + using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0")) + + if metal_timeout_enabled is not None or using_xdist: report = item.stash[phase_report_key] test_failed = report.get("call", None) and report["call"].failed if test_failed: - logger.info(f"In custom teardown, open device ids: {item.device_ids} {set(item.pci_ids)}") - # reset_tensix(set(item.pci_ids)) - reset_tensix() + logger.info(f"In custom teardown, open device ids: {set(item.pci_ids)}") + reset_tensix(set(item.pci_ids)) # This is overriding the timer setup hook from pytest-timeout @@ -488,10 +485,12 @@ def pytest_runtest_teardown(item, nextitem): # At timeout, the process kills it's parent (the test process) and then itself @pytest.hookimpl(tryfirst=True) def pytest_timeout_set_timer(item, settings): - metal_timeout_enabled = item.config.getoption("--metal-cleanup") - if metal_timeout_enabled is not None: + metal_timeout_enabled = item.config.getoption("--metal-timeout") + using_xdist = int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0")) + + if metal_timeout_enabled is not None or using_xdist: parent_pid = os.getpid() - logger.info(f"Metal timeout {settings.timeout} seconds") + logger.info(f"Metal timeout {settings.timeout} seconds {parent_pid} for {item.nodeid}") def get_parent_status(): try: @@ -501,12 +500,15 @@ def get_parent_status(): return parent.status() def run_timer(settings): + logger.info(f"Timer started for {item.nodeid}") dead_status = ["zombie", "dead", "already dead"] timeout = settings.timeout - while get_parent_status() not in dead_status and timeout > 0: - time.sleep(1) - timeout -= 1 - if get_parent_status() != "already dead": + parent_status = "running" + while parent_status not in 
dead_status and timeout > 0: + time.sleep(5) + timeout -= 5 + parent_status = get_parent_status() + if parent_status != "already dead": logger.info(f"Timing out test case") os.kill(parent_pid, signal.SIGKILL) logger.info(f"Killing timer") @@ -519,13 +521,12 @@ def cancel(): metal_timer = multiprocess.Process(target=run_timer, args=(settings,), daemon=True) item.cancel_timeout = cancel metal_timer.start() - # logger.info(f"parent and metal timer pid: {parent_pid} {metal_timer.pid}") return True # This is a hook used in pytest-xdist to handle when a worker crashes out # In our case, combined with pytest-timeout thread method, the worker will crash out for a hang and -# then it should get cleaned up by the controller through this fixture :fingers_crossed: +# then it should get cleaned up by the controller through this fixture @pytest.hookimpl(tryfirst=True) def pytest_handlecrashitem(crashitem, report, sched): reset_tensix() @@ -542,10 +543,9 @@ def reset_tensix(tt_open_devices=None): smi_reset_result = run_process_and_get_result(f"/opt/tt_metal_infra/scripts/ci/{arch}/reset.sh") else: tt_open_devices_str = ",".join([str(i) for i in tt_open_devices]) - check_smi = run_process_and_get_result("tt-smi-metal -h") - logger.info(f"Check tt-smi-metal exists: {check_smi.returncode}") + check_smi_metal = run_process_and_get_result("tt-smi-metal -h") logger.info(f"Running reset for pci devices: {tt_open_devices_str}") - if check_smi.returncode > 0: + if check_smi_metal.returncode > 0: logger.info(f"Test failed - resetting {arch} with tt-smi") smi_reset_result = run_process_and_get_result(f"tt-smi -r {tt_open_devices_str}") else: @@ -555,5 +555,4 @@ def reset_tensix(tt_open_devices=None): @pytest.hookimpl(tryfirst=True) def pytest_xdist_auto_num_workers(config): - logger.info("getting num of xdist workers") return 1 diff --git a/pytest.ini b/pytest.ini index c8f8a206f754..699ef215218e 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,7 +1,7 @@ [pytest] timeout = 300 minversion = 7.2 -addopts = --import-mode=importlib -vs -rA +addopts = --import-mode=importlib -vvs -rA --durations=0 empty_parameter_set_mark = skip markers = post_commit: mark tests to run on post-commit diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index 754bcbc9ab1e..91567864538d 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -1,6 +1,6 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 @@ -11,19 +11,19 @@ run_perf_models_other() { local tt_arch=$1 local test_marker=$2 - env pytest tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker + env pytest -n auto tests/ttnn/integration_tests/resnet/test_performance.py -m $test_marker - env pytest tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker + env pytest -n auto tests/ttnn/integration_tests/bert/test_performance.py -m $test_marker - env pytest models/demos/ttnn_falcon7b/tests -m $test_marker + env pytest -n auto models/demos/ttnn_falcon7b/tests -m $test_marker # Separate calls since we can't mix switching between number of cqs - env pytest models/demos/resnet/tests/test_perf_resnet.py -m $test_marker - env pytest models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker + env pytest -n auto models/demos/resnet/tests/test_perf_resnet.py -m $test_marker + env pytest -n auto models/demos/resnet/tests/test_perf_resnet_2cqs.py -m $test_marker - env pytest 
tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker + env pytest -n auto tests/ttnn/integration_tests/whisper/test_performance.py -m $test_marker - env pytest models/demos/metal_BERT_large_11/tests -m $test_marker + env pytest -n auto models/demos/metal_BERT_large_11/tests -m $test_marker ## Merge all the generated reports env python models/perf/merge_perf_results.py @@ -33,13 +33,13 @@ run_perf_models_llm_javelin() { local tt_arch=$1 local test_marker=$2 - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m $test_marker + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m $test_marker if [ "$tt_arch" == "wormhole_b0" ]; then - env pytest models/demos/mamba/tests -m $test_marker --timeout=360 + env pytest -n auto models/demos/mamba/tests -m $test_marker --timeout=360 fi - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/wormhole/mistral7b/tests -m $test_marker --timeout=360 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/mistral7b/tests -m $test_marker --timeout=360 ## Merge all the generated reports env python models/perf/merge_perf_results.py @@ -50,7 +50,7 @@ run_perf_models_cnn_javelin() { local test_marker=$2 # Run tests - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=480 #env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests -m $test_marker ## Merge all the generated reports @@ -58,6 +58,7 @@ run_perf_models_cnn_javelin() { } run_device_perf_models() { + set -eo pipefail local test_marker=$1 env pytest tests/device_perf_tests/stable_diffusion -m $test_marker --timeout=600 diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 334b68b71fd0..ebd25264b9ce 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -81,7 +81,7 @@ run_frequent_api_pipeline_tests() { ./tests/scripts/run_python_api_unit_tests.sh else if [[ $tt_arch == "wormhole_b0" ]]; then - pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly + pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k nightly else echo "API tests are not available for fast dispatch because they're already covered in post-commit" fi diff --git a/tests/scripts/single_card/nightly/run_common_models.sh b/tests/scripts/single_card/nightly/run_common_models.sh index 17ca8c4d3cf3..19e090065f3f 100755 --- a/tests/scripts/single_card/nightly/run_common_models.sh +++ b/tests/scripts/single_card/nightly/run_common_models.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running common models for archs" -env pytest tests/nightly/common_models/ +env pytest -n auto tests/nightly/common_models/ ; fail+=$? 
+ +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_gs_only.sh b/tests/scripts/single_card/nightly/run_gs_only.sh index c5bcc9f97452..bad5b98ea404 100755 --- a/tests/scripts/single_card/nightly/run_gs_only.sh +++ b/tests/scripts/single_card/nightly/run_gs_only.sh @@ -1,14 +1,19 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running model nightly tests for GS only" -env pytest models/demos/resnet/tests/test_metal_resnet50_performant.py +env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_performant.py ; fail+=$? -env pytest models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py +env pytest -n auto models/demos/resnet/tests/test_metal_resnet50_2cqs_performant.py ; fail+=$? + +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_ttnn.sh b/tests/scripts/single_card/nightly/run_ttnn.sh index f0bb3f9cadc3..a41836173deb 100755 --- a/tests/scripts/single_card/nightly/run_ttnn.sh +++ b/tests/scripts/single_card/nightly/run_ttnn.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running ttnn nightly tests for GS only" -env pytest tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal" +env pytest -n auto tests/ttnn/integration_tests -m "not models_performance_bare_metal and not models_device_performance_bare_metal" ; fail+=$? + +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_wh_b0_only.sh b/tests/scripts/single_card/nightly/run_wh_b0_only.sh index d30894713c13..5ae9f0657cb1 100755 --- a/tests/scripts/single_card/nightly/run_wh_b0_only.sh +++ b/tests/scripts/single_card/nightly/run_wh_b0_only.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running nightly tests for WH B0 only" -env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/nightly/wh_b0_only_eth -env pytest tests/nightly/wh_b0_only \ No newline at end of file +env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/nightly/wh_b0_only_eth ; fail+=$? +env pytest -n auto tests/nightly/wh_b0_only ; fail+=$? + +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh b/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh index 079087d6e690..35895a64208b 100755 --- a/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh +++ b/tests/scripts/single_card/nightly/run_wh_b0_unstable.sh @@ -1,12 +1,17 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail if [[ -z "$TT_METAL_HOME" ]]; then echo "Must provide TT_METAL_HOME in environment" 1>&2 exit 1 fi +fail=0 echo "Running unstable nightly tests for WH B0 only" -SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest tests/ttnn/integration_tests/stable_diffusion +SLOW_MATMULS=1 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml env pytest -n auto tests/ttnn/integration_tests/stable_diffusion ; fail+=$? 
+ +if [[ $fail -ne 0 ]]; then + exit 1 +fi diff --git a/tests/scripts/t3000/run_t3000_demo_tests.sh b/tests/scripts/t3000/run_t3000_demo_tests.sh index 96a05371beb1..fa050429ddb2 100755 --- a/tests/scripts/t3000/run_t3000_demo_tests.sh +++ b/tests/scripts/t3000/run_t3000_demo_tests.sh @@ -1,23 +1,27 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_falcon40b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" # Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py --timeout=720 + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_60_layer_t3000_prefill_10_loops.py --timeout=720 ; fail+=$? # Falcon40B end to end demo (prefill + decode) - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_loops.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_t3000_demo_loops.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llama3_70b_tests() { @@ -38,39 +42,47 @@ run_t3000_llama3_70b_tests() { run_t3000_falcon7b_tests(){ # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon7b_tests" # Falcon7B demo (perf verification for 128/1024/2048 seq lens and output token verification) - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_128_stochastic_verify] - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_1024_stochastic_verify] - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_2048_stochastic_verify] - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-default_mode_1024_greedy_verify] + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_128_stochastic_verify] ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_1024_stochastic_verify] ; fail+=$? 
+ WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-perf_mode_2048_stochastic_verify] ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto --disable-warnings -q -s --input-method=json --input-path='models/demos/t3000/falcon7b/input_data_t3000.json' models/demos/t3000/falcon7b/demo_t3000.py::test_demo_multichip[user_input0-8-True-default_mode_1024_greedy_verify] ; fail+=$? # Falcon7B perplexity test (prefill and decode) - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-prefill_seq1024_dram] --timeout=720 - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-decode_1024_l1_sharded] --timeout=720 + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-prefill_seq1024_dram] --timeout=720 ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests/test_perplexity_falcon.py::test_perplexity[True-decode_1024_l1_sharded] --timeout=720 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral8x7b_tests" # mixtral8x7b 8 chip demo test - 100 token generation with general weights (env flags set inside the test) - pytest models/demos/t3000/mixtral8x7b/demo/demo.py::test_mixtral8x7b_demo[wormhole_b0-True-general_weights] --timeout=720 + pytest -n auto models/demos/t3000/mixtral8x7b/demo/demo.py::test_mixtral8x7b_demo[wormhole_b0-True-general_weights] --timeout=720 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tests() { @@ -87,6 +99,7 @@ run_t3000_tests() { run_t3000_mixtral_tests } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -109,6 +122,10 @@ main() { export PYTHONPATH=$TT_METAL_HOME run_t3000_tests + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tests/scripts/t3000/run_t3000_frequent_tests.sh b/tests/scripts/t3000/run_t3000_frequent_tests.sh index 37abf05e64d3..cab852813ef9 100755 --- a/tests/scripts/t3000/run_t3000_frequent_tests.sh +++ b/tests/scripts/t3000/run_t3000_frequent_tests.sh @@ -1,99 +1,122 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_ethernet_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_ethernet_tests" - pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py - pytest tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py + pytest -n auto tests/tt_metal/microbenchmarks/ethernet/test_ethernet_bidirectional_bandwidth_microbenchmark.py ; fail+=$? 
+ pytest -n auto tests/tt_metal/microbenchmarks/ethernet/test_ethernet_ring_latency_microbenchmark.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_ethernet_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llama2_70b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_llama2_70b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py --timeout=900 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_mlp_t3000.py ; fail+=$? + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_attention_t3000.py ; fail+=$? + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_decoder_t3000.py ; fail+=$? + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_model_t3000.py --timeout=900 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_llama2_70b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral_tests" # mixtral8x7b 8 chip decode model test (env flags set inside the test) - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-10-1-pcc] + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_model.py::test_mixtral_model_inference[wormhole_b0-True-10-1-pcc] ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tteager_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_tteager_tests" - pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit - pytest tests/tt_eager/python_api_testing/unit_testing/misc/test_reduce_scatter_post_commit.py + pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_all_gather.py -k post_commit ; fail+=$? + pytest -n auto tests/tt_eager/python_api_testing/unit_testing/misc/test_reduce_scatter_post_commit.py ; fail+=$? 
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_tteager_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_trace_stress_tests() { + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_trace_stress_tests" - - NUM_TRACE_LOOPS=15 pytest tests/ttnn/unit_tests/test_multi_device_trace.py - NUM_TRACE_LOOPS=15 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py + NUM_TRACE_LOOPS=15 pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? + NUM_TRACE_LOOPS=15 WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_trace_stress_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon40b_tests() { + fail=0 # Record the start time start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_mlp.py - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_attention.py --timeout=480 - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_decoder.py --timeout=480 - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_falcon_causallm.py --timeout=600 + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_mlp.py ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_attention.py --timeout=480 ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_decoder.py --timeout=480 ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_falcon_causallm.py --timeout=600 ; fail+=$? 
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tests() { @@ -103,9 +126,6 @@ run_t3000_tests() { # Run tteager tests run_t3000_tteager_tests - # Run trace tests - run_t3000_trace_stress_tests - # Run falcon40b tests run_t3000_falcon40b_tests @@ -115,8 +135,12 @@ run_t3000_tests() { # Run mixtral tests run_t3000_mixtral_tests + # Run trace tests + run_t3000_trace_stress_tests + } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -139,6 +163,10 @@ main() { export PYTHONPATH=$TT_METAL_HOME run_t3000_tests + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tests/scripts/t3000/run_t3000_model_perf_tests.sh b/tests/scripts/t3000/run_t3000_model_perf_tests.sh index 6140b9efeafd..6f97b8e76368 100755 --- a/tests/scripts/t3000/run_t3000_model_perf_tests.sh +++ b/tests/scripts/t3000/run_t3000_model_perf_tests.sh @@ -1,61 +1,77 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_falcon7b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon7b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/falcon7b/tests -m "model_perf_t3000" + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/falcon7b/tests -m "model_perf_t3000" ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral_tests" - env pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py -m "model_perf_t3000" + env pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_perf.py -m "model_perf_t3000" ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llama2_70b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_llama2_70b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" --timeout=600 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/llama2_70b/tests/test_llama_perf_decode.py -m "model_perf_t3000" --timeout=600 ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_llama2_70b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon40b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" - env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/test_perf_falcon.py -m "model_perf_t3000" --timeout=600 + env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/test_perf_falcon.py -m "model_perf_t3000" --timeout=600 ; fail+=$? 
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon40b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_llm_tests() { @@ -80,6 +96,7 @@ run_t3000_cnn_tests() { env python models/perf/merge_perf_results.py } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -129,6 +146,10 @@ main() { echo "$pipeline_type is invalid (supported: [cnn_model_perf_t3000_device, cnn_model_perf_t3000_device])" 2>&1 exit 1 fi + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index a8019137642b..64c23ed2b488 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -1,66 +1,79 @@ #/bin/bash -set -eo pipefail +# set -eo pipefail run_t3000_ttmetal_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_ttmetal_tests" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" - TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" - TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" - ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectSendAllConnectedChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsSendInterleavedBufferAllConnectedChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsDirectRingGatherAllChips" ; fail+=$? + TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/unit_tests --gtest_filter="DeviceFixture.EthKernelsInterleavedRingGatherAllChips" ; fail+=$? + TT_METAL_ENABLE_REMOTE_CHIP=1 ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueSingleCardFixture.*" ; fail+=$? + ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="CommandQueueMultiDeviceFixture.*" ; fail+=$? + ./build/test/tt_metal/unit_tests_fast_dispatch --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$? 
# Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_ttmetal_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_ttnn_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_ttnn_tests" - pytest tests/ttnn/unit_tests/test_multi_device_trace.py - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py - pytest tests/ttnn/unit_tests/test_multi_device.py - pytest tests/ttnn/unit_tests/test_multi_device_async.py + pytest -n auto tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest tests/ttnn/unit_tests/test_multi_device_trace.py ; fail+=$? + pytest -n auto tests/ttnn/unit_tests/test_multi_device.py ; fail+=$? + pytest -n auto tests/ttnn/unit_tests/test_multi_device_async.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_ttnn_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon7b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon7b_tests" - pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py - pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py - pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py + pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_mlp.py ; fail+=$? + pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_attention.py ; fail+=$? + pytest -n auto models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_decoder.py ; fail+=$? #pytest models/demos/ttnn_falcon7b/tests/multi_chip/test_falcon_causallm.py # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_falcon7b_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_falcon40b_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_falcon40b_tests" - WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py + WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/t3000/falcon40b/tests/ci/test_falcon_end_to_end_1_layer_t3000.py ; fail+=$? # Record the end time end_time=$(date +%s) @@ -70,21 +83,25 @@ run_t3000_falcon40b_tests() { run_t3000_mixtral_tests() { # Record the start time + fail=0 start_time=$(date +%s) echo "LOG_METAL: Running run_t3000_mixtral_tests" - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py - pytest models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_attention.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_mlp.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_rms_norm.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_embedding.py ; fail+=$? 
+ pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_moe.py ; fail+=$? + pytest -n auto models/demos/t3000/mixtral8x7b/tests/test_mixtral_decoder.py ; fail+=$? # Record the end time end_time=$(date +%s) duration=$((end_time - start_time)) echo "LOG_METAL: run_t3000_mixtral_tests $duration seconds to complete" + if [[ $fail -ne 0 ]]; then + exit 1 + fi } run_t3000_tests() { @@ -104,6 +121,7 @@ run_t3000_tests() { run_t3000_mixtral_tests } +fail=0 main() { # For CI pipeline - source func commands but don't execute tests if not invoked directly if [[ "${BASH_SOURCE[0]}" != "${0}" ]]; then @@ -126,6 +144,10 @@ main() { export PYTHONPATH=$TT_METAL_HOME run_t3000_tests + + if [[ $fail -ne 0 ]]; then + exit 1 + fi } main "$@" diff --git a/tt_metal/python_env/requirements-dev.txt b/tt_metal/python_env/requirements-dev.txt index f7f902029195..5a6cf7ebb885 100644 --- a/tt_metal/python_env/requirements-dev.txt +++ b/tt_metal/python_env/requirements-dev.txt @@ -21,6 +21,7 @@ mypy==1.9.0 pytest==7.2.2 pytest-timeout==2.2.0 pytest-split==0.8.2 +pytest-xdist==3.6.1 jsbeautifier==1.14.7 datasets==2.9.0 torch==2.2.1.0+cpu
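For readers following the conftest.py hunks above: the pytest_timeout_set_timer override replaces pytest-timeout's in-process timer with a separate watchdog process that polls the test process and SIGKILLs it once the timeout elapses, so a hung device test cannot stall a pytest-xdist worker indefinitely. The sketch below only illustrates that pattern and is not code from the patch; it assumes the psutil and multiprocess packages the conftest already uses, and the helper names (should_arm_watchdog, watch_parent, start_watchdog) are invented for illustration.

    import os
    import signal
    import time

    import psutil
    from multiprocess import Process


    def should_arm_watchdog(config) -> bool:
        # Armed when --metal-timeout is passed explicitly or when pytest-xdist workers are active.
        return (
            config.getoption("--metal-timeout") is not None
            or int(os.getenv("PYTEST_XDIST_WORKER_COUNT", "0")) > 0
        )


    def watch_parent(parent_pid: int, timeout: float, poll_interval: float = 5.0) -> None:
        # Poll the test process; if it is still alive when the timeout runs out, SIGKILL it so the
        # xdist controller sees a crashed worker and can trigger a device reset.
        dead_status = {"zombie", "dead", "already dead"}
        status = "running"
        remaining = timeout
        while status not in dead_status and remaining > 0:
            time.sleep(poll_interval)
            remaining -= poll_interval
            try:
                status = psutil.Process(parent_pid).status()
            except psutil.NoSuchProcess:
                status = "already dead"
        if status not in dead_status:
            os.kill(parent_pid, signal.SIGKILL)


    def start_watchdog(timeout: float) -> Process:
        # Daemonized so the watchdog never outlives the pytest session after a clean exit.
        timer = Process(target=watch_parent, args=(os.getpid(), timeout), daemon=True)
        timer.start()
        return timer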
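The custom teardown reads item.stash[phase_report_key] to decide whether the call phase failed before resetting devices. phase_report_key itself is not shown in these hunks; presumably the existing pytest_runtest_makereport hookwrapper populates it along the lines of the standard pattern from the pytest documentation, sketched here with no claim to match the repo's exact code:

    import pytest

    phase_report_key = pytest.StashKey[dict]()


    @pytest.hookimpl(tryfirst=True, hookwrapper=True)
    def pytest_runtest_makereport(item, call):
        outcome = yield
        report = outcome.get_result()
        # Record the report for each phase ("setup", "call", "teardown") so teardown hooks can
        # ask whether the call phase failed.
        item.stash.setdefault(phase_report_key, {})[report.when] = report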