diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
index 8341540c5ef3..6944b86f4a2d 100644
--- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py
+++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -2626,17 +2626,33 @@ def test_non_tile_multiple_height_conv_wh(
 @pytest.mark.parametrize(
     "batch_size, output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, use_1d_systolic_array, config_override",
     (
+        (1, 64, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 64, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 64, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 128, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 192, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 256, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 384, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 448, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 576, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
+        (1, 128, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
         (1, 320, 320, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
-        (1, 320, 320, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
-        (1, 512, 512, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
-        (1, 512, 512, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
         (1, 640, 640, 16, 16, 3, 3, 1, 1, 1, 1, False, None),
-        (1, 640, 640, 32, 32, 3, 3, 1, 1, 1, 1, False, None),
     ),
 )
 @pytest.mark.parametrize(
     "weights_dtype",
-    [ttnn.bfloat16],
+    [ttnn.bfloat16, ttnn.bfloat8_b],
 )
 @pytest.mark.parametrize(
     "activations_dtype",
@@ -2665,16 +2681,6 @@ def test_non_tile_multiple_width_conv_wh(
     config_override,
     enable_auto_formatting,
 ):
-    # Skip test cases raising OOM, but do not affect the SD e2e test
-    if (input_channels == 960 and config_override == None and fp32_accum == True) or (
-        output_channels == 1280
-        and input_height == 32
-        and activations_dtype == ttnn.bfloat16
-        and weights_dtype == ttnn.bfloat16
-        and enable_auto_formatting == False
-    ):
-        pytest.skip("Skip the test cases raising OOM but not affecting e2e test")
-
     run_conv(
         device,
         math_fidelity,
@@ -2694,7 +2700,7 @@ def test_non_tile_multiple_width_conv_wh(
         use_1d_systolic_array,
         config_override,
         use_shallow_conv_variant=(input_channels == 16),
-        transpose_mcast=use_1d_systolic_array,  ## use RM (transpose_mcast=False) with 2D on WH
+        transpose_mcast=use_1d_systolic_array,
         enable_auto_formatting=enable_auto_formatting,
         padded_input_channels=16 if input_channels == 16 else None,
         output_layout=ttnn.ROW_MAJOR_LAYOUT,
diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
index 4af7cb3d4ec9..27a0a5e9366e 100644
--- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
+++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -106,8 +106,7 @@ ParallelConfig determine_parallel_config(
             block_shard_orientation == ShardOrientation::COL_MAJOR ? compute_grid_size.x : compute_grid_size.y;
         auto channels_per_core = std::ceil((float)input_channels / effective_tile_width);
         if(is_non_tile_mul_width) {
-            out_nhw_ntiles = tt::round_up(batch_size * output_height * output_width, tt::constants::TILE_HEIGHT) / tt::constants::TILE_HEIGHT;
-            out_c_ntiles = tt::round_up(output_channels, effective_tile_width) / 1;
+            out_c_ntiles = tt::round_up(output_channels, effective_tile_width);
             channels_per_core = input_channels;
         }
         num_cores_nhw = find_closest_largest_divisor_with_num_padding(out_nhw_ntiles, start_divisor);
@@ -455,8 +454,9 @@ std::tuple get_conv_padded_input_sh
         auto block_shard_orientation = conv_config.transpose_shards ? ShardOrientation::COL_MAJOR : ShardOrientation::ROW_MAJOR;
         auto num_cores_c = block_shard_orientation == ShardOrientation::COL_MAJOR ?
             device->compute_with_storage_grid_size().y : device->compute_with_storage_grid_size().x;
+        auto elem_size = conv_config.weights_dtype == DataType::BFLOAT8_B ? 1 : 2;
         bool is_non_tile_mul_width = (shard_layout == TensorMemoryLayout::BLOCK_SHARDED) && conv_config.act_block_h_override == 0 &&
-            (conv_config.dtype == DataType::BFLOAT16 || conv_config.dtype == DataType::FLOAT32) && conv_config.output_layout == Layout::ROW_MAJOR && ((2*in_channels) % (16 * num_cores_c)) == 0;
+            conv_config.output_layout == Layout::ROW_MAJOR && ((elem_size*in_channels) % (16 * num_cores_c)) == 0;
         ParallelConfig optimal_parallel_config = determine_parallel_config(
             shard_layout, batch_size, in_channels, height, width, out_channels, device->compute_with_storage_grid_size(),
             block_shard_orientation, !use_non_tile_height, is_non_tile_mul_width);
diff --git a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/kernels/dataflow/halo_gather.cpp b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/kernels/dataflow/halo_gather.cpp
index 27de632f87d3..a9b7697ded09 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/kernels/dataflow/halo_gather.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/untilize_with_halo_v2/device/kernels/dataflow/halo_gather.cpp
@@ -75,18 +75,26 @@ void copy_sticks_async(
         if constexpr (is_read) {
             uint32_t dst_addr = out_base_l1_addr + dst_offset;
             uint64_t src_addr = base_addr + src_offset;
-            for(uint16_t k = 0; k < nsticks; k++) {
-                noc_async_read(src_addr, dst_addr, stick_nbytes);
-                dst_addr += stick_nbytes;
-                src_addr += input_aligned_page_size;
+            if (stick_nbytes == input_aligned_page_size) {
+                noc_async_read(src_addr, dst_addr, size);
+            } else {
+                for (uint16_t k = 0; k < nsticks; k++) {
+                    noc_async_read(src_addr, dst_addr, stick_nbytes);
+                    dst_addr += stick_nbytes;
+                    src_addr += input_aligned_page_size;
+                }
             }
         } else {
             uint64_t dst_addr = base_addr + dst_offset;
             uint32_t src_addr = in_base_l1_addr + src_offset;
-            for(uint16_t k = 0; k < nsticks; k++) {
-                noc_async_write(src_addr, dst_addr, stick_nbytes);
-                dst_addr += stick_nbytes;
-                src_addr += input_aligned_page_size;
+            if (stick_nbytes == input_aligned_page_size) {
+                noc_async_write(src_addr, dst_addr, size);
+            } else {
+                for (uint16_t k = 0; k < nsticks; k++) {
+                    noc_async_write(src_addr, dst_addr, stick_nbytes);
+                    dst_addr += stick_nbytes;
+                    src_addr += input_aligned_page_size;
+                }
             }
         }
     }
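
Note on the conv2d.cpp hunk above: the is_non_tile_mul_width gate no longer depends on the activation dtype; the divisibility check is instead scaled by the weights element size (1 byte for BFLOAT8_B, 2 bytes otherwise). A minimal stand-alone sketch of that arithmetic follows; num_cores_c = 8 is an assumed value (one dimension of a Wormhole 8x8 compute grid), not something fixed by this diff.

// Hypothetical stand-alone check of the relaxed divisibility gate from conv2d.cpp.
#include <cstdint>
#include <iostream>

static bool passes_gate(uint32_t in_channels, bool weights_are_bfp8b) {
    const uint32_t elem_size = weights_are_bfp8b ? 1 : 2;  // bytes per weight element
    const uint32_t num_cores_c = 8;                        // assumed grid dimension
    return (elem_size * in_channels) % (16 * num_cores_c) == 0;
}

int main() {
    std::cout << passes_gate(64, false) << "\n";   // bfloat16:  2 * 64  = 128, divisible by 128 -> 1
    std::cout << passes_gate(64, true) << "\n";    // bfloat8_b: 1 * 64  =  64, not divisible    -> 0
    std::cout << passes_gate(128, true) << "\n";   // bfloat8_b: 1 * 128 = 128, divisible        -> 1
    return 0;
}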
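
The halo_gather.cpp hunk adds a fast path to copy_sticks_async: when the stick payload (stick_nbytes) equals the source stride (input_aligned_page_size), the sticks are already packed back to back, so a single NOC transaction of size bytes can replace nsticks per-stick transactions. The host-side sketch below models the same coalescing decision with memcpy standing in for noc_async_read/noc_async_write; it assumes size == nsticks * stick_nbytes, which is what makes the single transfer equivalent to the loop.

// Host-side model of the coalescing fast path; memcpy stands in for NOC transactions.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

void copy_sticks(uint8_t* dst, const uint8_t* src, uint16_t nsticks,
                 uint32_t stick_nbytes, uint32_t input_aligned_page_size) {
    const uint32_t size = nsticks * stick_nbytes;  // assumed meaning of `size` in the kernel
    if (stick_nbytes == input_aligned_page_size) {
        // Source sticks are contiguous: one transfer covers all of them.
        std::memcpy(dst, src, size);
    } else {
        // Source sticks are padded out to input_aligned_page_size: copy one stick at a time,
        // packing them densely at the destination.
        for (uint16_t k = 0; k < nsticks; ++k) {
            std::memcpy(dst + k * stick_nbytes, src + k * input_aligned_page_size, stick_nbytes);
        }
    }
}

int main() {
    constexpr uint16_t nsticks = 4;
    constexpr uint32_t stick_nbytes = 32;

    // Fast path: stride == payload.
    std::vector<uint8_t> src_dense(nsticks * stick_nbytes, 0xAB), dst(nsticks * stick_nbytes, 0);
    copy_sticks(dst.data(), src_dense.data(), nsticks, stick_nbytes, stick_nbytes);
    assert(dst.front() == 0xAB && dst.back() == 0xAB);

    // Slow path: each stick sits in a 48-byte aligned page at the source.
    std::vector<uint8_t> src_padded(nsticks * 48, 0xCD);
    copy_sticks(dst.data(), src_padded.data(), nsticks, stick_nbytes, 48);
    assert(dst.front() == 0xCD && dst.back() == 0xCD);
    return 0;
}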