Don't use tensix_types.h in ttnn (#15806)
blozano-tt authored Dec 9, 2024
1 parent 4393945 commit 41ce2b2
Showing 5 changed files with 14 additions and 19 deletions.
@@ -8,8 +8,9 @@
 
 #define DATUMS_PER_ROW 16
 
-// FIXME: ARCH_NAME specific include
-#include "tensix_types.h" // DEST_REGISTER_FULL_SIZE
+// This parameter is the same for all supported architectures
+// Check this invariant when adding new architectures
+#define DEST_REGISTER_FULL_SIZE 64 * 16
 
 namespace ttnn {
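This first change trades an architecture-specific header for a constant defined at the point of use. As a hedged sketch of how the "check this invariant when adding new architectures" note might be honored, assuming a hypothetical per-architecture test translation unit in which tensix_types.h is still includable (no such check is part of this commit):

    // Hypothetical per-arch check, compiled once for each architecture,
    // in a context where the arch header is still available.
    #include "tensix_types.h" // provides this arch's DEST_REGISTER_FULL_SIZE

    // The value ttnn now hard-codes, repeated here for comparison.
    constexpr unsigned int kTtnnDestRegisterFullSize = 64 * 16;

    static_assert(
        kTtnnDestRegisterFullSize == (DEST_REGISTER_FULL_SIZE),
        "ttnn's hard-coded DEST_REGISTER_FULL_SIZE diverged from this architecture");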

@@ -7,9 +7,6 @@
 #include "tt_metal/common/constants.hpp"
 #include "tt_metal/detail/util.hpp"
 
-// FIXME: ARCH_NAME specific include
-#include "tensix_types.h" // L1_SIZE
-
 using namespace tt::constants;
 using namespace tt;

@@ -92,14 +89,15 @@ static inline operation::ProgramWithCallbacks create_heads_combined_qkv_sharded(
         block_ht * TILE_HEIGHT);
     uint32_t per_core_tiles = block_ht * block_wt;
 
+    const uint32_t l1_size = input_tensor.device()->l1_size_per_core();
     auto data_format = tt_metal::datatype_to_dataformat_converter(input_tensor.get_dtype());
     uint32_t single_tile_size = tile_size(data_format);
     TT_FATAL(
-        L1_SIZE >= 2 * per_core_tiles * single_tile_size,
+        l1_size >= 2 * per_core_tiles * single_tile_size,
         "Workload of Tiles {} at Tile Size {} (times 2 for output) exceeds L1 capacity {}",
         per_core_tiles,
         single_tile_size,
-        L1_SIZE);
+        l1_size);
 
     std::vector<uint32_t> num_tiles_per_group;
     num_tiles_per_group.reserve(output.size());
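In this hunk the L1 capacity stops being a compile-time macro and becomes a runtime property of the device. A minimal, self-contained sketch of the same guard, assuming a simplified stand-in Device type (l1_size_per_core() is the query the diff actually uses; the capacity value and every other name here are illustrative):

    #include <cstdint>
    #include <stdexcept>

    // Stand-in for the real device handle; only the query used here.
    struct Device {
        uint32_t l1_size_per_core() const { return 1464 * 1024; } // illustrative value
    };

    // Fail fast when the per-core working set cannot fit in L1.
    void check_l1_fit(const Device& device, uint32_t per_core_tiles, uint32_t single_tile_size) {
        const uint32_t l1_size = device.l1_size_per_core();
        const uint32_t needed = 2 * per_core_tiles * single_tile_size; // x2: input + output
        if (l1_size < needed) {
            throw std::runtime_error("Workload exceeds L1 capacity");
        }
    }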
@@ -7,9 +7,6 @@
 
 #include "tt_metal/host_api.hpp"
 
-// FIXME: ARCH_NAME specific include
-#include "tensix_types.h" // L1_SIZE
-
 namespace ttnn::operations::experimental::transformer {
 
 void CreateQKVHeadsSeparateTensorsDeviceOperation::validate(const std::vector<Tensor>& input_tensors) const {
@@ -122,10 +119,11 @@ void CreateQKVHeadsSeparateTensorsDeviceOperation::validate(const std::vector<Te
     uint32_t per_core_q_tiles = q_shard_ht * q_shard_wt;
     uint32_t per_core_k_tiles = k_shard_ht * k_shard_wt;
 
+    const uint32_t l1_size = q_input_tensor.device()->l1_size_per_core();
     const uint32_t single_tile_size =
         tt::tile_size(tt::tt_metal::datatype_to_dataformat_converter(q_input_tensor.get_dtype()));
     TT_FATAL(
-        L1_SIZE >= 2 * (per_core_q_tiles + 2 * per_core_k_tiles) * single_tile_size, "Workload exceeds L1 capacity");
+        l1_size >= 2 * (per_core_q_tiles + 2 * per_core_k_tiles) * single_tile_size, "Workload exceeds L1 capacity");
 
     // TODO: Add this back when output is HEIGHT sharded only!
     // TT_FATAL(this->output_mem_config.memory_layout == TensorMemoryLayout::HEIGHT_SHARDED, "Error");
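To make the bound concrete with illustrative numbers (not values from the commit): for per_core_q_tiles = 8, per_core_k_tiles = 4, and single_tile_size = 2048 bytes, the check requires 2 * (8 + 2 * 4) * 2048 = 65,536 bytes of per-core L1. The leading factor of 2 leaves room for output alongside input, and per_core_k_tiles appears twice, presumably covering both the K and V heads.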
7 changes: 3 additions & 4 deletions ttnn/cpp/ttnn/operations/reduction/topk/device/topk_op.cpp

@@ -5,9 +5,6 @@
 #include "topk_op.hpp"
 #include "topk_program_factory.hpp"
 
-// FIXME: ARCH_NAME specific include
-#include "tensix_types.h" // L1_SIZE
-
 namespace topk_utils {
 
 static inline bool verify_available_cores(
@@ -16,6 +13,7 @@ static inline bool verify_available_cores(
     uint16_t max_dim,
     CoreCoord grid,
     uint16_t k,
+    const uint32_t l1_size,
     const uint32_t value_tile_size,
     const uint32_t index_tile_size) {
     const auto max_cores = grid.y - 1; // reserve one core for the gather - switch to grid.x as it allows for more
@@ -30,7 +28,7 @@
         (split_size / tt::constants::TILE_WIDTH) *
             (value_tile_size + index_tile_size); // we divide the width into split_size chunks and each chunk, as well
                                                  // as a matching set of indices, is processed by a core
-        if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < L1_SIZE && num_cores > 1) {
+        if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < l1_size && num_cores > 1) {
             return true;
         }
     }
@@ -79,6 +77,7 @@ void TopK::validate_with_output_tensors(
             input_shape[this->dim] / 2,
             device->compute_with_storage_grid_size(),
             this->k,
+            device->l1_size_per_core(),
             value_tile_size,
             index_tile_size),
         "Not enough cores available to run topk operation");
@@ -8,9 +8,6 @@
 #include "tt_metal/host_api.hpp"
 #include "tt_log.h"
 
-// FIXME: ARCH_NAME specific include
-#include "tensix_types.h" // L1_SIZE
-
 namespace ttnn::operations::reduction::detail {
 
 operation::ProgramWithCallbacks topk_single_core_interleaved(
@@ -179,6 +176,7 @@ static inline std::tuple<uint16_t, uint16_t, uint16_t, uint16_t> cores_utilized(
     uint16_t max_dim,
     CoreCoord grid,
     uint16_t k,
+    const uint32_t l1_size,
     const uint32_t value_tile_size,
     const uint32_t index_tile_size) {
     const auto max_cores = grid.y - 1; // reserve one core for the gather - switch to grid.x as it allows for more
@@ -193,7 +191,7 @@
         (split_size / tt::constants::TILE_WIDTH) *
             (value_tile_size + index_tile_size); // we divide the width into split_size chunks and each chunk, as well
                                                  // as a matching set of indices, is processed by a core
-        if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < L1_SIZE && num_cores > 1) {
+        if (num_cores <= max_cores && (memory_cost_gather + memory_cost_local) < l1_size && num_cores > 1) {
             return {num_cores + 1, split_size, rem, num_cores * k};
         }
     }
@@ -237,6 +235,7 @@ operation::ProgramWithCallbacks topk_multicore_interleaved(
         input_shape[dim] / 2,
         device->compute_with_storage_grid_size(),
         k,
+        device->l1_size_per_core(),
         value_tile_size,
         index_tile_size);
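Across all five files the shape of the refactor is the same: delete the L1_SIZE macro, query the device once at the boundary that owns the handle, and thread the value down as an explicit parameter. One payoff is testability: a helper written this way can be exercised against any architecture's capacity without recompiling per ARCH_NAME. A small runnable sketch of that idea (verify_fits and the capacity values are invented for illustration):

    #include <cassert>
    #include <cstdint>

    // Mirrors the committed signature change: capacity arrives as a parameter.
    static bool verify_fits(uint32_t memory_cost, uint32_t l1_size) {
        return memory_cost < l1_size;
    }

    int main() {
        // Same helper, different architectures' capacities, no rebuild.
        assert(verify_fits(900 * 1024, 1024 * 1024));   // fits a 1 MiB L1
        assert(!verify_fits(1300 * 1024, 1024 * 1024)); // exceeds it
        return 0;
    }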
