Don't use tensix_types.h in ttnn (#15806)
blozano-tt authored Dec 9, 2024
1 parent 4393945 commit 41ce2b2
Showing 5 changed files with 14 additions and 19 deletions.
@@ -8,8 +8,9 @@
 
 #define DATUMS_PER_ROW 16
 
-// FIXME: ARCH_NAME specific include
-#include "tensix_types.h" // DEST_REGISTER_FULL_SIZE
+// This parameter is the same for all supported architectures
+// Check this invariant when adding new architectures
+#define DEST_REGISTER_FULL_SIZE 64 * 16
 
 namespace ttnn {
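This first change trades an architecture-specific header for a constant defined at the point of use. As a hedged sketch of how the "check this invariant when adding new architectures" note might be honored, assuming a hypothetical per-architecture test translation unit in which tensix_types.h is still includable (no such check is part of this commit):

    // Hypothetical per-arch check, compiled once for each architecture,
    // in a context where the arch header is still available.
    #include "tensix_types.h" // provides this arch's DEST_REGISTER_FULL_SIZE

    // The value ttnn now hard-codes, repeated here for comparison.
    constexpr unsigned int kTtnnDestRegisterFullSize = 64 * 16;

    static_assert(
        kTtnnDestRegisterFullSize == (DEST_REGISTER_FULL_SIZE),
        "ttnn's hard-coded DEST_REGISTER_FULL_SIZE diverged from this architecture");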

@@ -7,9 +7,6 @@
 #include "tt_metal/common/constants.hpp"
 #include "tt_metal/detail/util.hpp"
 
-// FIXME: ARCH_NAME specific include
-#include "tensix_types.h" // L1_SIZE
-
 using namespace tt::constants;
 using namespace tt;

@@ -92,14 +89,15 @@ static inline operation::ProgramWithCallbacks create_heads_combined_qkv_sharded(
         block_ht * TILE_HEIGHT);
     uint32_t per_core_tiles = block_ht * block_wt;
 
+    const uint32_t l1_size = input_tensor.device()->l1_size_per_core();
     auto data_format = tt_metal::datatype_to_dataformat_converter(input_tensor.get_dtype());
     uint32_t single_tile_size = tile_size(data_format);
     TT_FATAL(
-        L1_SIZE >= 2 * per_core_tiles * single_tile_size,
+        l1_size >= 2 * per_core_tiles * single_tile_size,
         "Workload of Tiles {} at Tile Size {} (times 2 for output) exceeds L1 capacity {}",
         per_core_tiles,
         single_tile_size,
-        L1_SIZE);
+        l1_size);
 
     std::vector<uint32_t> num_tiles_per_group;
     num_tiles_per_group.reserve(output.size());
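In this hunk the L1 capacity stops being a compile-time macro and becomes a runtime property of the device. A minimal, self-contained sketch of the same guard, assuming a simplified stand-in Device type (l1_size_per_core() is the query the diff actually uses; the capacity value and every other name here are illustrative):

    #include <cstdint>
    #include <stdexcept>

    // Stand-in for the real device handle; only the query used here.
    struct Device {
        uint32_t l1_size_per_core() const { return 1464 * 1024; } // illustrative value
    };

    // Fail fast when the per-core working set cannot fit in L1.
    void check_l1_fit(const Device& device, uint32_t per_core_tiles, uint32_t single_tile_size) {
        const uint32_t l1_size = device.l1_size_per_core();
        const uint32_t needed = 2 * per_core_tiles * single_tile_size; // x2: input + output
        if (l1_size < needed) {
            throw std::runtime_error("Workload exceeds L1 capacity");
        }
    }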
@@ -7,9 +7,6 @@
 
 #include "tt_metal/host_api.hpp"
 
-// FIXME: ARCH_NAME specific include
-#include "tensix_types.h" // L1_SIZE
-
 namespace ttnn::operations::experimental::transformer {
 
 void CreateQKVHeadsSeparateTensorsDeviceOperation::validate(const std::vector<Tensor>& input_tensors) const {
@@ -122,10 +119,11 @@ void CreateQKVHeadsSeparateTensorsDeviceOperation::validate(const std::vector<Te
     uint32_t per_core_q_tiles = q_shard_ht * q_shard_wt;
     uint32_t per_core_k_tiles = k_shard_ht * k_shard_wt;
 
+    const uint32_t l1_size = q_input_tensor.device()->l1_size_per_core();
     const uint32_t single_tile_size =
         tt::tile_size(tt::tt_metal::datatype_to_dataformat_converter(q_input_tensor.get_dtype()));
     TT_FATAL(
-        L1_SIZE >= 2 * (per_core_q_tiles + 2 * per_core_k_tiles) * single_tile_size, "Workload exceeds L1 capacity");
+        l1_size >= 2 * (per_core_q_tiles + 2 * per_core_k_tiles) * single_tile_size, "Workload exceeds L1 capacity");
 
     // TODO: Add this back when output is HEIGHT sharded only!
     // TT_FATAL(this->output_mem_config.memory_layout == TensorMemoryLayout::HEIGHT_SHARDED, "Error");
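To make the bound concrete with illustrative numbers (not values from the commit): for per_core_q_tiles = 8, per_core_k_tiles = 4, and single_tile_size = 2048 bytes, the check requires 2 * (8 + 2 * 4) * 2048 = 65,536 bytes of per-core L1. The leading factor of 2 leaves room for output alongside input, and per_core_k_tiles appears twice, presumably covering both the K and V heads.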
7 changes: 3 additions & 4 deletions ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp

@@ -5,9 +5,6 @@
 #include "topk_op.hpp"
 #include "topk_program_factory.hpp"
 
-// FIXME: ARCH_NAME specific include
-#include "tensix_types.h" // L1_SIZE
-
 namespace topk_utils {
 
 static inline bool verify_available_cores(
@@ -16,6 +13,7 @@ static inline bool verify_available_cores(
     uint16_t max_dim,
     CoreCoord grid,
     uint16_t k,
+    const uint32_t l1_size,
     const uint32_t value_tile_size,
     const uint32_t index_tile_size) {
     const auto max_cores = grid.y - 1; // reserve one core for the gather - switch to grid.x as it allows for more
@@ -30,7 +28,7 @@
         (split_size / tt::constants::TILE_WIDTH) *
             (value_tile_size + index_tile_size); // we divide the width into split_size chunks and each chunk, as well
                                                  // as a matching set of indices, is processed by a core
-        if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < L1_SIZE && num_cores > 1) {
+        if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < l1_size && num_cores > 1) {
             return true;
         }
     }
@@ -79,6 +77,7 @@ void TopK::validate_with_output_tensors(
             input_shape[this->dim] / 2,
             device->compute_with_storage_grid_size(),
             this->k,
+            device->l1_size_per_core(),
             value_tile_size,
             index_tile_size),
         "Not enough cores available to run topk operation");
@@ -8,9 +8,6 @@
 #include "tt_metal/host_api.hpp"
 #include "tt_log.h"
 
-// FIXME: ARCH_NAME specific include
-#include "tensix_types.h" // L1_SIZE
-
 namespace ttnn::operations::reduction::detail {
 
 operation::ProgramWithCallbacks topk_single_core_interleaved(
@@ -179,6 +176,7 @@ static inline std::tuple<uint16_t, uint16_t, uint16_t, uint16_t> cores_utilized(
     uint16_t max_dim,
     CoreCoord grid,
     uint16_t k,
+    const uint32_t l1_size,
     const uint32_t value_tile_size,
     const uint32_t index_tile_size) {
     const auto max_cores = grid.y - 1; // reserve one core for the gather - switch to grid.x as it allows for more
@@ -193,7 +191,7 @@
         (split_size / tt::constants::TILE_WIDTH) *
             (value_tile_size + index_tile_size); // we divide the width into split_size chunks and each chunk, as well
                                                  // as a matching set of indices, is processed by a core
-        if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < L1_SIZE && num_cores > 1) {
+        if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < l1_size && num_cores > 1) {
             return {num_cores + 1, split_size, rem, num_cores * k};
         }
     }
@@ -237,6 +235,7 @@ operation::ProgramWithCallbacks topk_multicore_interleaved(
         input_shape[dim] / 2,
         device->compute_with_storage_grid_size(),
         k,
+        device->l1_size_per_core(),
         value_tile_size,
         index_tile_size);
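Across all five files the shape of the refactor is the same: delete the L1_SIZE macro, query the device once at the boundary that owns the handle, and thread the value down as an explicit parameter. One payoff is testability: a helper written this way can be exercised against any architecture's capacity without recompiling per ARCH_NAME. A small runnable sketch of that idea (verify_fits and the capacity values are invented for illustration):

    #include <cassert>
    #include <cstdint>

    // Mirrors the committed signature change: capacity arrives as a parameter.
    static bool verify_fits(uint32_t memory_cost, uint32_t l1_size) {
        return memory_cost < l1_size;
    }

    int main() {
        // Same helper, different architectures' capacities, no rebuild.
        assert(verify_fits(900 * 1024, 1024 * 1024));   // fits a 1 MiB L1
        assert(!verify_fits(1300 * 1024, 1024 * 1024)); // exceeds it
        return 0;
    }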
