Commit

#0: Removed callback
sankarmanoj-tt committed Dec 16, 2024
1 parent b08c579 commit d7e027d
Showing 4 changed files with 40 additions and 61 deletions.
1 change: 0 additions & 1 deletion tt_metal/impl/dispatch/command_queue.cpp
@@ -3269,7 +3269,6 @@ void EnqueueProgramImpl(
     detail::CompileProgram(device, program);
     program.allocate_circular_buffers(device);
     detail::ValidateCircularBufferRegion(program, device);
-    program.call_pre_exec_callback();
     cq.hw_command_queue().enqueue_program(program, blocking);
     // Program relinquishes ownership of all global buffers its using, once its been enqueued. Avoid mem
     // leaks on device.
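For context, the deleted line above was the only call site of the pre-exec hook: before this commit a caller could register a callback on the Program, and it fired here, just before the program was handed to the hardware command queue. A minimal sketch of that now-removed pattern, using the names from this diff; it is a fragment that assumes a built Program `program` and a CommandQueue `cq`, and the lambda body is illustrative only:

// Sketch of the pre-commit usage pattern (removed by this commit).
// The callback fired inside EnqueueProgramImpl, right before
// hw_command_queue().enqueue_program() ran.
program.set_pre_exec_callback([](const Program& p) {
    // e.g. inspect allocation state or log CB usage for program p
});
EnqueueProgram(cq, program, /*blocking=*/false);  // hook ran just before the enqueue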
34 changes: 12 additions & 22 deletions tt_metal/impl/program/program.cpp
@@ -128,7 +128,7 @@ class Program_ {

     void invalidate_circular_buffer_allocation();

-    uint32_t get_max_cb_memory_usage(const Device *device) const;
+    uint32_t get_cb_memory_size() const;
     void allocate_circular_buffers(const Device *device);

     bool is_finalized() const;
@@ -754,18 +754,21 @@ void detail::Program_::invalidate_circular_buffer_allocation() {
 }

 void Program::invalidate_circular_buffer_allocation() { pimpl_->invalidate_circular_buffer_allocation(); }
-uint32_t Program::get_max_cb_memory_usage(const Device *device) const { return pimpl_->get_max_cb_memory_usage(device); }
+uint32_t Program::get_cb_memory_size() const { return pimpl_->get_cb_memory_size(); }


-uint32_t detail::Program_::get_max_cb_memory_usage(const Device *device) const{
-    uint64_t base_cb_address = device->get_base_allocator_addr(HalMemType::L1);
-    uint64_t end_cb_address = base_cb_address;
+uint32_t detail::Program_::get_cb_memory_size() const{

+    uint32_t total_cb_size = 0;
+    for (const auto& circular_buffer : this->circular_buffers_) {
+        if (circular_buffer->globally_allocated()) {
+            continue;
+        }
+        total_cb_size += circular_buffer->size();

-    for (const CircularBufferAllocator &cb_allocator : this->cb_allocators_) {
-        end_cb_address = std::max(end_cb_address, cb_allocator.get_cb_region_end());
     }
-    log_info("Base CB address: {}, End CB Address: {}", base_cb_address,end_cb_address);
-    return end_cb_address - base_cb_address;
+    log_info("Total CB Size : {}", total_cb_size);
+    return total_cb_size;
 }

 void detail::Program_::allocate_circular_buffers(const Device *device) {
@@ -1467,19 +1470,6 @@ const std::vector<SubDeviceId> &detail::Program_::determine_sub_device_ids(const
     return sub_device_ids->second;
 }

-void Program::set_pre_exec_callback(std::function<void(const Program&)> callback){
-    this->pre_exec_callback_ = std::move(callback);
-}

-void Program::call_pre_exec_callback(){
-    if (this->pre_exec_callback_) {
-        this->pre_exec_callback_(*this);
-    } else {
-        log_debug("No pre-exec callback set for program {}", this->get_id());
-    }
-}


 void detail::Program_::finalize(Device *device) {
     // Store the number of tensix "go signals" for use by CQ
     // CQ iterates over these to update runtime addresses, needs to know when eth begins (after tensix)
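The new accounting is a straight sum over the program's locally allocated circular buffers, replacing the old address-range computation (end of the CB region minus the L1 base address). A minimal standalone sketch of the same logic, with the CircularBuffer type simplified to a plain struct (illustrative only; the real class lives in tt_metal):

#include <cstdint>
#include <memory>
#include <vector>

// Simplified stand-in for tt_metal's CircularBuffer (not the real class).
struct CircularBuffer {
    uint32_t size_bytes = 0;
    bool globally_allocated = false;  // backed by a pre-existing global buffer
};

// Mirrors detail::Program_::get_cb_memory_size(): globally allocated CBs are
// skipped because they do not consume program-local L1; everything else is summed.
uint32_t cb_memory_size(const std::vector<std::shared_ptr<CircularBuffer>>& cbs) {
    uint32_t total = 0;
    for (const auto& cb : cbs) {
        if (cb->globally_allocated) {
            continue;
        }
        total += cb->size_bytes;
    }
    return total;
}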
7 changes: 1 addition & 6 deletions tt_metal/impl/program/program.hpp
@@ -166,15 +166,10 @@ class Program {

     const std::vector<SubDeviceId> &determine_sub_device_ids(const Device *device);

-    void set_pre_exec_callback(std::function<void(const Program&)> callback);
-    void call_pre_exec_callback();
-    uint32_t get_max_cb_memory_usage(const Device *device) const;
+    uint32_t get_cb_memory_size() const;


    private:

-    std::function<void(const Program&)> pre_exec_callback_;

     std::unique_ptr<detail::Program_> pimpl_;

     friend CBHandle CreateCircularBuffer(Program &program, const std::variant<CoreCoord, CoreRange, CoreRangeSet> &core_spec, const CircularBufferConfig &config);
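With the callback setters and the stored std::function gone from the public header, a caller that wants the same bookkeeping around a launch now queries the program and device directly, as the conv2d change below does. A small illustrative fragment (not from the repository; assumes a built Program `program` and a Device* `device`, with names taken from this diff):

// Illustrative fragment: query the program and L1 stats directly.
uint32_t expected_cb_bytes = program.get_cb_memory_size();
auto l1_stats = device->get_memory_allocation_statistics(tt::tt_metal::BufferType::L1);
tt::log_info(tt::LogOp, "CB bytes: {}, L1 allocated: {}", expected_cb_bytes, l1_stats.total_allocated_bytes);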
59 changes: 27 additions & 32 deletions ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp
@@ -265,38 +265,33 @@ operation::ProgramWithCallbacks OptimizedConvNew::create_program(const std::vect
         enable_subblock_padding,
         use_non_tile_height);

-    if(std::getenv("TT_DISABLE_CONV_L1_CHECK")==nullptr) {
-        tt::log_info(tt::LogOp, "Allocation Stats before Op: {}", this->pre_op_l1_allocation_size_bytes);
-        program_with_cbs.program.set_pre_exec_callback([this, arch, input_dtype, weights_dtype, output_dtype, weights_shape, has_bias, device](const Program& program) {
-            const uint32_t post_op_l1_stats = device->get_memory_allocation_statistics(tt::tt_metal::BufferType::L1).total_allocated_bytes;
-            auto actual_cb_size = program.get_max_cb_memory_usage(device);

-            auto [calc_output_size, calc_CB_size] = estimate_L1_usage(
-                arch, this->memory_config.memory_layout,
-                input_dtype, weights_dtype, output_dtype,
-                compute_kernel_config,
-                block_config, parallelization_config,
-                input_tensor_shape, weights_shape, sliding_window_config.get_output_shape(),
-                output_channels, groups, std::array<uint32_t,2>({sliding_window_config.window_hw.first, sliding_window_config.window_hw.second}),
-                Conv2dConfig{
-                    .enable_act_double_buffer=enable_act_double_buffer,
-                    .enable_weights_double_buffer=enable_weights_double_buffer,
-                    .enable_split_reader=enable_split_reader,
-                    .enable_subblock_padding=enable_subblock_padding
-                },
-                has_bias, use_non_tile_height);
-            if(calc_CB_size > 0) {
-                if(calc_CB_size != actual_cb_size) {
-                    tt::log_error("Calculated CB size {} does not match with the actual CB size {}",calc_CB_size,actual_cb_size);
-                    TT_ASSERT(actual_cb_size==calc_CB_size);
-                }
-            }
-            if(calc_output_size > 0) {
-                if(post_op_l1_stats != this->pre_op_l1_allocation_size_bytes + calc_output_size) {
-                    tt::log_error(tt::LogOp, "Mismatch!! L1 Allocation Pre Op = {}, Post Op = {} Calculated Size = {}", this->pre_op_l1_allocation_size_bytes, post_op_l1_stats,calc_output_size);
-                }
-            }
-        });
+    const uint32_t post_op_l1_stats = device->get_memory_allocation_statistics(tt::tt_metal::BufferType::L1).total_allocated_bytes;
+    auto actual_cb_size = program_with_cbs.program.get_cb_memory_size();

+    auto [calc_output_size, calc_CB_size] = estimate_L1_usage(
+        arch, this->memory_config.memory_layout,
+        input_dtype, weights_dtype, output_dtype,
+        compute_kernel_config,
+        block_config, parallelization_config,
+        input_tensor_shape, weights_shape, sliding_window_config.get_output_shape(),
+        output_channels, groups, std::array<uint32_t,2>({sliding_window_config.window_hw.first, sliding_window_config.window_hw.second}),
+        Conv2dConfig{
+            .enable_act_double_buffer=enable_act_double_buffer,
+            .enable_weights_double_buffer=enable_weights_double_buffer,
+            .enable_split_reader=enable_split_reader,
+            .enable_subblock_padding=enable_subblock_padding
+        },
+        has_bias, use_non_tile_height);
+    if(calc_CB_size > 0) {
+        if(calc_CB_size != actual_cb_size) {
+            tt::log_error("Calculated CB size {} does not match with the actual CB size {}",calc_CB_size,actual_cb_size);
+            TT_ASSERT(actual_cb_size==calc_CB_size);
+        }
+    }
+    if(calc_output_size > 0) {
+        if(post_op_l1_stats != this->pre_op_l1_allocation_size_bytes + calc_output_size) {
+            tt::log_error(tt::LogOp, "Mismatch!! L1 Allocation Pre Op = {}, Post Op = {} Calculated Size = {}", this->pre_op_l1_allocation_size_bytes, post_op_l1_stats,calc_output_size);
+        }
+    }
     return program_with_cbs;
 }
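The net effect in conv2d is that the L1 sanity check now runs inline, immediately after the program and its circular buffers have been created, instead of from a callback at enqueue time. A self-contained sketch of just the comparison logic, with the sizes passed in as plain integers (hypothetical helper, not part of the repository):

#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the inline checks added in conv2d_op.cpp:
// compare the estimated CB footprint against the program's actual CB size,
// and the estimated output-tensor footprint against the measured L1 growth.
void check_conv_l1_estimates(uint32_t pre_op_l1_bytes, uint32_t post_op_l1_bytes,
                             uint32_t actual_cb_size,
                             uint32_t calc_cb_size, uint32_t calc_output_size) {
    if (calc_cb_size > 0 && calc_cb_size != actual_cb_size) {
        std::printf("Calculated CB size %u does not match actual CB size %u\n",
                    calc_cb_size, actual_cb_size);
    }
    if (calc_output_size > 0 && post_op_l1_bytes != pre_op_l1_bytes + calc_output_size) {
        std::printf("L1 mismatch: pre-op = %u, post-op = %u, calculated output size = %u\n",
                    pre_op_l1_bytes, post_op_l1_bytes, calc_output_size);
    }
}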
