Skip to content

Commit

Permalink
#5725: adding a few validations for upsample.
Browse files Browse the repository at this point in the history
  • Loading branch information
shwetankTT committed Sep 19, 2024
1 parent 1673fd9 commit 261c81e
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 19 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -9,12 +9,18 @@

// Fill given four values into the memory starting at the given address.
// WARNING: Use with caution as there's no memory protection. Make sure size is within limits
// Pack four 16-bit values into two consecutive 32-bit words starting at the
// given L1 address (val/val1 into word 0, val2/val3 into word 1, low half first).
// WARNING: Use with caution as there's no memory protection. Make sure size is within limits.
// NOTE(review): the diff showed a stale `bool` signature and `return true;` from the
// pre-change side; the function performs no fallible work, so `void` is correct.
ALWI void fill_four_val(uint32_t begin_addr, uint16_t val, uint16_t val1, uint16_t val2, uint16_t val3) {
    // volatile: the destination is device L1 memory read by other engines,
    // so the stores must not be elided or reordered by the compiler.
    volatile tt_l1_ptr uint32_t* ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(begin_addr);

    ptr[0] = (val | (val1 << 16));
    ptr[1] = (val2 | (val3 << 16));
}

// Reinterpret the raw bit pattern of a 32-bit unsigned integer as a float.
// Uses std::memcpy for the type pun, which is well-defined (unlike a
// reinterpret_cast through pointers, which would violate strict aliasing).
ALWI float uint32_to_float(uint32_t bits)
{
    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}


Expand All @@ -34,24 +40,23 @@ void kernel_main() {

constexpr uint32_t in_cb_id = get_compile_time_arg_val(0);
constexpr uint32_t out_cb_id = tt::CB::c_in1;
constexpr uint32_t is_reader = get_compile_time_arg_val(2);

uint32_t in_image_row_nbytes = in_w * stick_nbytes;
uint32_t out_image_row_nbytes = out_w * stick_nbytes;
uint32_t reader_image_rows_per_core = (in_image_rows_per_core + is_reader) / 2;
uint32_t writer_image_rows_per_core = in_image_rows_per_core / 2;
uint32_t image_row_begin = is_reader ? 0 : reader_image_rows_per_core;
uint32_t image_row_end = is_reader ? reader_image_rows_per_core : in_image_rows_per_core;
uint32_t l1_read_addr = get_read_ptr(in_cb_id); //+ image_row_begin * in_image_row_nbytes;
//constexpr uint32_t is_reader = get_compile_time_arg_val(2);
constexpr uint32_t scale_h_inv_comp = get_compile_time_arg_val(3);
constexpr uint32_t scale_w_inv_comp = get_compile_time_arg_val(4);
constexpr uint32_t y_index_comp = get_compile_time_arg_val(5);
constexpr uint32_t x_index_compute_comp = get_compile_time_arg_val(6);

uint32_t l1_read_addr = get_read_ptr(in_cb_id);
constexpr uint32_t in_scalar_cb_id = tt::CB::c_in4;

// assuming shard begins with a new row. TODO: generalize?
float scale_h_inv = 1.0f / scale_h;
float scale_w_inv = 1.0f / scale_w;
float scale_h_inv = uint32_to_float(scale_h_inv_comp);
float scale_w_inv = uint32_to_float(scale_w_inv_comp);
float x, y, x_index, y_index, dx, dy;
y_index = (float)(0.5f) * (float)scale_h_inv + 0.5f;
y_index = uint32_to_float(y_index_comp);
float x_index_compute = uint32_to_float(x_index_compute_comp);
for (uint32_t image_row = 0 ; image_row < in_image_rows_per_core * scale_h; ++image_row){
x_index = (float)(0.5f) * (float)scale_w_inv -0.5f;
x_index = x_index_compute;
for(uint32_t j=0; j < in_w * scale_w; j++){
cb_reserve_back(out_cb_id, 4);
cb_reserve_back(in_scalar_cb_id, 1);
Expand All @@ -64,7 +69,7 @@ void kernel_main() {
uint32_t x1 = int(x);
uint32_t y1 = int(y);
uint32_t x2 = min(x1 + 1, in_w-1);
uint32_t y2 = y1 + 1; //, in_image_rows_per_core - 1);
uint32_t y2 = y1 + 1;
if(is_last_row){
y2 = min(y2, in_image_rows_per_core); //if last row, y2 should be in_image_rows_per_core
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

Expand Down Expand Up @@ -175,10 +175,25 @@ operation::ProgramWithCallbacks bilinear_multi_core(const Tensor &input, Tensor&
log_debug(LogOp, "input_nsticks_per_core: {}, output_nsticks_per_core: {}", input_nsticks_per_core, output_nsticks_per_core);

// Kernels
//computation needed for the bilinear kernel. Passing them as an argument.
float scale_h_inv = 1.0f / (float)scale_factor_h;
float scale_w_inv = 1.0f / (float)scale_factor_w;
float y_index = (float)(0.5f) * (float)scale_h_inv + 0.5f;
float x_index_compute = (float)(0.5f) * (float)scale_w_inv - 0.5f;

uint32_t scale_h_inv_u32 = *reinterpret_cast<uint32_t*>(&scale_h_inv);
uint32_t scale_w_inv_u32 = *reinterpret_cast<uint32_t*>(&scale_w_inv);
uint32_t y_index_u32 = *reinterpret_cast<uint32_t*>(&y_index);
uint32_t x_index_compute_u32 = *reinterpret_cast<uint32_t*>(&x_index_compute);

std::vector<uint32_t> reader_compile_time_args = {
in_cb_id,
out_cb_id,
false,
scale_h_inv_u32,
scale_w_inv_u32,
y_index_u32,
x_index_compute_u32,
};

string writer_kernel_fname, reader_kernel_fname, compute_kernel_fname;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,11 @@ void UpSample::validate(const std::vector<Tensor> &input_tensors) const {
TT_FATAL(input_tensor_a.get_dtype() == DataType::BFLOAT16, "Input tensor data type should be BFLOAT16");
if (input_tensor_a.memory_config().is_sharded()) {
TT_FATAL(input_tensor_a.memory_config().memory_layout == output_mem_config_.memory_layout, "Input tensor memory layout should be same as output tensor memory layout");
TT_FATAL(input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED || input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED, "Input tensor memory layout should be HEIGHT or BLOCK sharded");
if(mode_ == "nearest")
TT_FATAL(input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED || input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::BLOCK_SHARDED, "Input tensor memory layout should be HEIGHT or BLOCK sharded");
else if(mode_ == "bilinear")
TT_FATAL(input_tensor_a.memory_config().memory_layout == TensorMemoryLayout::HEIGHT_SHARDED, "Input tensor memory layout should be HEIGHT sharded");
TT_FATAL(mode_ == "bilinear" || mode_ == "nearest", "Upsample only supports bilinear or nearest mode");
TT_FATAL(input_tensor_a.buffer()->buffer_type() == tt::tt_metal::BufferType::L1, "Input buffer should be sharded in L1");
}
}
Expand Down
3 changes: 3 additions & 0 deletions ttnn/cpp/ttnn/operations/pool/upsample/upsample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ ttnn::Tensor ExecuteUpSample::invoke(const ttnn::Tensor& input_tensor,
MemoryConfig mem_config = output_mem_config.value_or(input_tensor.memory_config());
ttnn::DeviceComputeKernelConfig config = compute_kernel_config.value_or(
ttnn::init_device_compute_kernel_config(input_tensor.device()->arch(), std::nullopt, MathFidelity::HiFi4));
if(mode.empty()) {
mode = "nearest";
}
int scale_h = 1;
int scale_w = 1;
std::visit(
Expand Down

0 comments on commit 261c81e

Please sign in to comment.