diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index e825c3761a8e..afc6b344740b 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -3269,7 +3269,6 @@ void EnqueueProgramImpl( detail::CompileProgram(device, program); program.allocate_circular_buffers(device); detail::ValidateCircularBufferRegion(program, device); - program.call_pre_exec_callback(); cq.hw_command_queue().enqueue_program(program, blocking); // Program relinquishes ownership of all global buffers its using, once its been enqueued. Avoid mem // leaks on device. diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index bd375cebad3a..cea2b98e4111 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -128,7 +128,7 @@ class Program_ { void invalidate_circular_buffer_allocation(); - uint32_t get_max_cb_memory_usage(const Device *device) const; + uint32_t get_cb_memory_size() const; void allocate_circular_buffers(const Device *device); bool is_finalized() const; @@ -754,18 +754,21 @@ void detail::Program_::invalidate_circular_buffer_allocation() { } void Program::invalidate_circular_buffer_allocation() { pimpl_->invalidate_circular_buffer_allocation(); } -uint32_t Program::get_max_cb_memory_usage(const Device *device) const { return pimpl_->get_max_cb_memory_usage(device); } +uint32_t Program::get_cb_memory_size() const { return pimpl_->get_cb_memory_size(); } -uint32_t detail::Program_::get_max_cb_memory_usage(const Device *device) const{ - uint64_t base_cb_address = device->get_base_allocator_addr(HalMemType::L1); - uint64_t end_cb_address = base_cb_address; +uint32_t detail::Program_::get_cb_memory_size() const{ + + uint32_t total_cb_size = 0; + for (const auto& circular_buffer : this->circular_buffers_) { + if (circular_buffer->globally_allocated()) { + continue; + } + total_cb_size += circular_buffer->size(); - for (const CircularBufferAllocator &cb_allocator : this->cb_allocators_) { - end_cb_address = std::max(end_cb_address, cb_allocator.get_cb_region_end()); } - log_info("Base CB address: {}, End CB Address: {}", base_cb_address,end_cb_address); - return end_cb_address - base_cb_address; + log_info("Total CB Size : {}", total_cb_size); + return total_cb_size; } void detail::Program_::allocate_circular_buffers(const Device *device) { @@ -1467,19 +1470,6 @@ const std::vector &detail::Program_::determine_sub_device_ids(const return sub_device_ids->second; } -void Program::set_pre_exec_callback(std::function callback){ - this->pre_exec_callback_ = std::move(callback); -} - -void Program::call_pre_exec_callback(){ - if (this->pre_exec_callback_) { - this->pre_exec_callback_(*this); - } else { - log_debug("No pre-exec callback set for program {}", this->get_id()); - } -} - - void detail::Program_::finalize(Device *device) { // Store the number of tensix "go signals" for use by CQ // CQ iterates over these to update runtime addresses, needs to know when eth begins (after tensix) diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index d7475329f334..1ea32a54cbe2 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -166,15 +166,10 @@ class Program { const std::vector &determine_sub_device_ids(const Device *device); - void set_pre_exec_callback(std::function callback); - void call_pre_exec_callback(); - uint32_t get_max_cb_memory_usage(const Device *device) const; + uint32_t get_cb_memory_size() const; private: - - std::function pre_exec_callback_; - std::unique_ptr pimpl_; friend CBHandle CreateCircularBuffer(Program &program, const std::variant &core_spec, const CircularBufferConfig &config); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp index 7f5b9391e8fe..82b8baebf130 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp @@ -265,38 +265,33 @@ operation::ProgramWithCallbacks OptimizedConvNew::create_program(const std::vect enable_subblock_padding, use_non_tile_height); - if(std::getenv("TT_DISABLE_CONV_L1_CHECK")==nullptr) { - tt::log_info(tt::LogOp, "Allocation Stats before Op: {}", this->pre_op_l1_allocation_size_bytes); - program_with_cbs.program.set_pre_exec_callback([this, arch, input_dtype, weights_dtype, output_dtype, weights_shape, has_bias, device](const Program& program) { - const uint32_t post_op_l1_stats = device->get_memory_allocation_statistics(tt::tt_metal::BufferType::L1).total_allocated_bytes; - auto actual_cb_size = program.get_max_cb_memory_usage(device); - - auto [calc_output_size, calc_CB_size] = estimate_L1_usage( - arch, this->memory_config.memory_layout, - input_dtype, weights_dtype, output_dtype, - compute_kernel_config, - block_config, parallelization_config, - input_tensor_shape, weights_shape, sliding_window_config.get_output_shape(), - output_channels, groups, std::array({sliding_window_config.window_hw.first, sliding_window_config.window_hw.second}), - Conv2dConfig{ - .enable_act_double_buffer=enable_act_double_buffer, - .enable_weights_double_buffer=enable_weights_double_buffer, - .enable_split_reader=enable_split_reader, - .enable_subblock_padding=enable_subblock_padding - }, - has_bias, use_non_tile_height); - if(calc_CB_size > 0) { - if(calc_CB_size != actual_cb_size) { - tt::log_error("Calculated CB size {} does not match with the actual CB size {}",calc_CB_size,actual_cb_size); - TT_ASSERT(actual_cb_size==calc_CB_size); - } - } - if(calc_output_size > 0) { - if(post_op_l1_stats != this->pre_op_l1_allocation_size_bytes + calc_output_size) { - tt::log_error(tt::LogOp, "Mismatch!! L1 Allocation Pre Op = {}, Post Op = {} Calculated Size = {}", this->pre_op_l1_allocation_size_bytes, post_op_l1_stats,calc_output_size); - } - } - }); + const uint32_t post_op_l1_stats = device->get_memory_allocation_statistics(tt::tt_metal::BufferType::L1).total_allocated_bytes; + auto actual_cb_size = program_with_cbs.program.get_cb_memory_size(); + + auto [calc_output_size, calc_CB_size] = estimate_L1_usage( + arch, this->memory_config.memory_layout, + input_dtype, weights_dtype, output_dtype, + compute_kernel_config, + block_config, parallelization_config, + input_tensor_shape, weights_shape, sliding_window_config.get_output_shape(), + output_channels, groups, std::array({sliding_window_config.window_hw.first, sliding_window_config.window_hw.second}), + Conv2dConfig{ + .enable_act_double_buffer=enable_act_double_buffer, + .enable_weights_double_buffer=enable_weights_double_buffer, + .enable_split_reader=enable_split_reader, + .enable_subblock_padding=enable_subblock_padding + }, + has_bias, use_non_tile_height); + if(calc_CB_size > 0) { + if(calc_CB_size != actual_cb_size) { + tt::log_error("Calculated CB size {} does not match with the actual CB size {}",calc_CB_size,actual_cb_size); + TT_ASSERT(actual_cb_size==calc_CB_size); + } + } + if(calc_output_size > 0) { + if(post_op_l1_stats != this->pre_op_l1_allocation_size_bytes + calc_output_size) { + tt::log_error(tt::LogOp, "Mismatch!! L1 Allocation Pre Op = {}, Post Op = {} Calculated Size = {}", this->pre_op_l1_allocation_size_bytes, post_op_l1_stats,calc_output_size); + } } return program_with_cbs; }