diff --git a/models/demos/convnet_mnist/tt/convnet_mnist.py b/models/demos/convnet_mnist/tt/convnet_mnist.py index a38aa60a770..f0443e938c4 100644 --- a/models/demos/convnet_mnist/tt/convnet_mnist.py +++ b/models/demos/convnet_mnist/tt/convnet_mnist.py @@ -19,19 +19,21 @@ def convnet_mnist( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat16, weights_dtype=ttnn.bfloat16, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=False, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=True, deallocate_activation=True, reallocate_halo_output=True, ) - + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=False, + packer_l1_acc=False, + ) x = ttnn.to_layout(input_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) [x, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( input_tensor=x, @@ -47,6 +49,7 @@ def convnet_mnist( input_height=input_tensor.shape[1], input_width=input_tensor.shape[2], conv_config=conv_config, + compute_config=compute_config, conv_op_cache={}, debug=True, groups=1, diff --git a/models/demos/llama3/tt/multimodal/llama_conv2d_patch.py b/models/demos/llama3/tt/multimodal/llama_conv2d_patch.py index a4d1bb59885..f5ff04f7e3e 100644 --- a/models/demos/llama3/tt/multimodal/llama_conv2d_patch.py +++ b/models/demos/llama3/tt/multimodal/llama_conv2d_patch.py @@ -79,7 +79,8 @@ def __init__( mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), ) - self.compute_kernel_config = ttnn.WormholeComputeKernelConfig( + self.compute_kernel_config = ttnn.init_device_compute_kernel_config( + mesh_device.arch(), math_fidelity=ttnn.MathFidelity.HiFi2, math_approx_mode=True, fp32_dest_acc_en=True, diff --git a/models/demos/segformer/tt/common.py b/models/demos/segformer/tt/common.py index 5f52fe0e507..10a4509bc4e 100644 --- a/models/demos/segformer/tt/common.py +++ b/models/demos/segformer/tt/common.py @@ -40,12 +40,8 @@ def __call__(self, device, input_tensor): conv_config = ttnn.Conv2dConfig( dtype=self.dtype, weights_dtype=ttnn.bfloat16, - math_fidelity=ttnn.MathFidelity.LoFi, activation=self.activation, shard_layout=self.shard_layout, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=False, - packer_l1_accum_enabled=False, input_channels_alignment=16 if input_tensor.shape[3] < 16 else 32, transpose_shards=False, reshard_if_not_optimal=self.reshard, @@ -54,6 +50,13 @@ def __call__(self, device, input_tensor): enable_act_double_buffer=True, enable_split_reader=False, ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=False, + packer_l1_acc=False, + ) if self.act_block_h is not None: conv_config.act_block_h_override = self.act_block_h @@ -71,6 +74,7 @@ def __call__(self, device, input_tensor): input_height=input_tensor.shape[1], input_width=input_tensor.shape[2], conv_config=conv_config, + compute_config=compute_config, groups=self.groups, ) diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py index cfe555d0367..123e6a1cef4 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py @@ 
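Editorial sketch of the migration pattern the hunks above apply, assuming `device` is an already-open ttnn device and the activation/weight/bias tensors are prepared as in the original model code; the fidelity and accumulation flags move out of ttnn.Conv2dConfig and into a compute kernel config that is passed separately to ttnn.conv2d:

import ttnn

# Old style (removed by this change): math settings lived on Conv2dConfig, e.g.
#   math_fidelity=..., math_approx_mode_enabled=..., fp32_dest_acc_enabled=..., packer_l1_accum_enabled=...
# New style: Conv2dConfig keeps only dtype/layout/sharding options ...
conv_config = ttnn.Conv2dConfig(dtype=ttnn.bfloat16, weights_dtype=ttnn.bfloat16)
# ... and the compute settings are built per-architecture and passed as compute_config.
compute_config = ttnn.init_device_compute_kernel_config(
    device.arch(),
    math_fidelity=ttnn.MathFidelity.LoFi,
    math_approx_mode=True,
    fp32_dest_acc_en=False,
    packer_l1_acc=False,
)
# Shapes and channel counts below are hypothetical placeholders.
out, out_h, out_w, weights_dev, bias_dev = ttnn.conv2d(
    input_tensor=x,
    weight_tensor=weights,
    bias_tensor=bias,
    device=device,
    in_channels=1,
    out_channels=32,
    batch_size=1,
    input_height=32,
    input_width=32,
    kernel_size=(3, 3),
    stride=(1, 1),
    padding=(0, 0),
    dilation=(1, 1),
    groups=1,
    conv_config=conv_config,
    compute_config=compute_config,
)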
-183,12 +183,14 @@ def run_downsample_if_req( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=shard_layout, deallocate_activation=True, reallocate_halo_output=True, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) ttnn.deallocate(x) @@ -230,13 +232,15 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -293,7 +297,6 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=reallocate_halo_output, @@ -303,6 +306,9 @@ def __call__( else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -324,12 +330,14 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -562,12 +570,14 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 @@ -873,12 +883,14 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py 
b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py index 44d90cb0f34..107e562a73d 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py @@ -176,7 +176,6 @@ def run_downsample_if_req( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, @@ -184,7 +183,6 @@ def run_downsample_if_req( reallocate_halo_output=not (is_wormhole_b0() and batch_size == 16), reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=transpose_shards, - packer_l1_accum_enabled=packer_l1_accum_enabled, enable_act_double_buffer=enable_act_double_buffer if height_sharding else True @@ -194,6 +192,11 @@ def run_downsample_if_req( enable_split_reader=enable_split_reader, enable_subblock_padding=enable_subblock_padding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + packer_l1_acc=packer_l1_accum_enabled, + ), conv_op_cache=conv_op_cache, ) ttnn.deallocate(x) @@ -242,14 +245,17 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=transpose_shards, - packer_l1_accum_enabled=packer_l1_acc, + ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + packer_l1_acc=packer_l1_acc, ), conv_op_cache=conv_op_cache, ) @@ -323,7 +329,6 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=reallocate_halo_output, @@ -333,12 +338,16 @@ def __call__( else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=transpose_shards, - packer_l1_accum_enabled=packer_l1_acc, enable_act_double_buffer=enable_act_double_buffer, enable_weights_double_buffer=True, enable_split_reader=enable_split_reader, enable_subblock_padding=enable_subblock_padding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + packer_l1_acc=packer_l1_acc, + ), conv_op_cache=conv_op_cache, ) @@ -374,13 +383,16 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=transpose_shards, - packer_l1_accum_enabled=packer_l1_acc, + ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + packer_l1_acc=packer_l1_acc, ), conv_op_cache=conv_op_cache, ) @@ -569,19 +581,22 @@ def __init__( self.conv1_config = 
ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=dealloc_input, input_channels_alignment=input_channels_alignment, act_block_h_override=act_block_h_override, transpose_shards=self.transpose_shards, - packer_l1_accum_enabled=True if whb0_and_b16 else False, enable_act_double_buffer=True if whb0_and_b16 else False, enable_split_reader=True if whb0_and_b16 or not is_wormhole_b0() else False, enable_subblock_padding=False, shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, reshard_if_not_optimal=False, ) + self.conv1_compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + packer_l1_acc=True if whb0_and_b16 else False, + ) if whb0_and_b16: # Issue #13145: Temp workaround for Galaxy to avoid hangs if type(device) == ttnn.MeshDevice and device.get_num_devices() > 8: @@ -733,6 +748,7 @@ def run(self, input_tensor, device, ops_parallel_config, conv_op_cache={}) -> tt input_height=self.conv1_input_height, input_width=self.conv1_input_width, conv_config=self.conv1_config, + compute_config=self.conv1_compute_config, conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py index 5c0750003c1..90853eb06bc 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py @@ -178,7 +178,6 @@ def run_downsample_if_req( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, @@ -186,6 +185,9 @@ def run_downsample_if_req( reallocate_halo_output=True, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) ttnn.deallocate(x) @@ -225,13 +227,15 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -286,7 +290,6 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=reallocate_halo_output, @@ -296,6 +299,9 @@ def __call__( else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -317,12 +323,14 @@ def __call__( conv_config=ttnn.Conv2dConfig( 
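The ResNet hunks follow a slightly different shape: the compute kernel config is built once (in __init__) and reused on every conv2d call, with packer_l1_acc taken from an existing flag. A condensed sketch, where model_config and whb0_and_b16 stand in for the values the model already holds:

import ttnn

# Hypothetical stand-ins for values the ResNet code already has.
model_config = {"MATH_FIDELITY": ttnn.MathFidelity.LoFi}
whb0_and_b16 = False

# device is assumed to be an open ttnn device or mesh device.
conv1_compute_config = ttnn.init_device_compute_kernel_config(
    device.arch(),
    math_fidelity=model_config["MATH_FIDELITY"],
    packer_l1_acc=whb0_and_b16,
)
# conv1_compute_config is then passed as compute_config= on each ttnn.conv2d call,
# next to the (now math-free) ttnn.Conv2dConfig.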
dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -532,12 +540,14 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 @@ -835,12 +845,14 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py index f2e266e1d8b..77894c78318 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py @@ -180,7 +180,6 @@ def run_downsample_if_req( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, @@ -188,6 +187,9 @@ def run_downsample_if_req( reallocate_halo_output=True, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) ttnn.deallocate(x) @@ -227,13 +229,15 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -289,7 +293,6 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", 
deallocate_activation=True, reallocate_halo_output=reallocate_halo_output, @@ -299,6 +302,9 @@ def __call__( else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -320,12 +326,14 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -557,12 +565,14 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 @@ -888,12 +898,14 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py index 45d93ebf685..e1cba745a8c 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py @@ -179,7 +179,6 @@ def run_downsample_if_req( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, @@ -188,6 +187,9 @@ def run_downsample_if_req( reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=height_sharding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) ttnn.deallocate(x) @@ -232,7 +234,6 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding @@ -240,6 +241,9 @@ def __call__( reshard_if_not_optimal=reshard_if_not_optimal, 
transpose_shards=height_sharding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -337,7 +341,6 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=reallocate_halo_output, @@ -348,6 +351,9 @@ def __call__( reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=height_sharding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -369,13 +375,15 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=height_sharding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -597,13 +605,15 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 @@ -931,12 +941,14 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 diff --git a/models/demos/vgg/tests/test_perf_vgg.py b/models/demos/vgg/tests/test_perf_vgg.py index 4d5bdf30e06..9cc0397bb07 100644 --- a/models/demos/vgg/tests/test_perf_vgg.py +++ b/models/demos/vgg/tests/test_perf_vgg.py @@ -79,17 +79,6 @@ def test_vgg( "ACTIVATIONS_DTYPE": act_dtype, } - conv_config = ttnn.Conv2dConfig( - dtype=model_config["ACTIVATIONS_DTYPE"], - weights_dtype=model_config["WEIGHTS_DTYPE"], - math_fidelity=model_config["MATH_FIDELITY"], - activation="relu", - deallocate_activation=True, - input_channels_alignment=16, - act_block_h_override=0, - transpose_shards=True, - ) - torch_batched_tensor = torch_input_tensor_nchw.repeat(batch_size, 1, 1, 1) torch_input_tensor = torch.permute(torch_batched_tensor, (0, 2, 3, 1)) tt_batched_input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) diff --git a/models/demos/vgg/tt/ttnn_vgg.py b/models/demos/vgg/tt/ttnn_vgg.py index 
0748c745d16..ace558cfa11 100644 --- a/models/demos/vgg/tt/ttnn_vgg.py +++ b/models/demos/vgg/tt/ttnn_vgg.py @@ -90,10 +90,6 @@ def ttnn_vgg16( conv_config = ttnn.Conv2dConfig( dtype=model_config["ACTIVATIONS_DTYPE"], weights_dtype=model_config["WEIGHTS_DTYPE"], - math_fidelity=model_config["MATH_FIDELITY"], - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=False, - packer_l1_accum_enabled=False, activation="relu", deallocate_activation=False, input_channels_alignment=32, @@ -107,6 +103,13 @@ def ttnn_vgg16( ) if h_override[iter_conv_id] is not None: conv_config.act_block_h_override = h_override[iter_conv_id] + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=model_config["MATH_FIDELITY"], + math_approx_mode=True, + fp32_dest_acc_en=False, + packer_l1_acc=False, + ) tt_weight = parameters.features[conv_feature_ids[iter_conv_id]].weight tt_weight = ttnn.to_layout(ttnn.from_device(tt_weight), layout=ttnn.ROW_MAJOR_LAYOUT) @@ -127,6 +130,7 @@ def ttnn_vgg16( input_height=conv_ttnn_params[iter_conv_id][2], input_width=conv_ttnn_params[iter_conv_id][3], conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_op_cache, ) tt_x = ttnn.from_device(tt_output_tensor_on_device) @@ -214,9 +218,6 @@ def ttnn_vgg11( conv_config = ttnn.Conv2dConfig( dtype=model_config["ACTIVATIONS_DTYPE"], weights_dtype=model_config["WEIGHTS_DTYPE"], - math_fidelity=model_config["MATH_FIDELITY"], - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, activation="relu", deallocate_activation=False, input_channels_alignment=32, @@ -230,6 +231,12 @@ def ttnn_vgg11( if height_override_11[iter_conv_id] is not None: conv_config.act_block_h_override = height_override_11[iter_conv_id] + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=model_config["MATH_FIDELITY"], + math_approx_mode=True, + fp32_dest_acc_en=True, + ) tt_weight = parameters.features[conv_feature_ids_2[iter_conv_id]].weight tt_weight = ttnn.to_layout(ttnn.from_device(tt_weight), layout=ttnn.ROW_MAJOR_LAYOUT) tt_bias = parameters.features[conv_feature_ids_2[iter_conv_id]].bias @@ -250,6 +257,7 @@ def ttnn_vgg11( input_height=conv_ttnn_params_2[iter_conv_id][2], input_width=conv_ttnn_params_2[iter_conv_id][3], conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_op_cache, ) tt_x = ttnn.from_device(tt_output_tensor_on_device) diff --git a/models/demos/wormhole/mamba/tt/mamba_conv.py b/models/demos/wormhole/mamba/tt/mamba_conv.py index a2700198f83..799ea950392 100644 --- a/models/demos/wormhole/mamba/tt/mamba_conv.py +++ b/models/demos/wormhole/mamba/tt/mamba_conv.py @@ -54,11 +54,14 @@ def prepare_conv_config(self): self.conv1d_config = ttnn.Conv1dConfig( dtype=self.config.output_dtype, weights_dtype=self.config.weights_dtype, - math_fidelity=self.config.math_fidelity, shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, input_channels_alignment=32, deallocate_activation=True, ) + self.conv1d_compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=self.config.math_fidelity, + ) def prepare_input(self, input_tensor): # input_tensor (1, 1, B, 2E) @@ -100,6 +103,7 @@ def __call__(self, input_tensor): batch_size=1, input_length=self.config.input_length, conv_config=self.conv1d_config, + compute_config=self.conv1d_compute_config, conv_op_cache={}, debug=False, groups=self.config.groups // self.config.channels_split_factor, diff --git 
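The Mamba hunk above shows the same split for 1D convolutions: Conv1dConfig drops math_fidelity and ttnn.conv1d gains a compute_config argument. A hedged sketch with placeholder dtypes and fidelity (the real values come from the model's config object):

import ttnn

conv1d_config = ttnn.Conv1dConfig(
    dtype=ttnn.bfloat16,                  # hypothetical output dtype
    weights_dtype=ttnn.bfloat8_b,         # hypothetical weights dtype
    shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
    input_channels_alignment=32,
    deallocate_activation=True,
)
conv1d_compute_config = ttnn.init_device_compute_kernel_config(
    device.arch(),                        # device assumed to be an open ttnn device
    math_fidelity=ttnn.MathFidelity.HiFi2,  # hypothetical fidelity
)
# Both configs are then forwarded to ttnn.conv1d(...,
#     conv_config=conv1d_config, compute_config=conv1d_compute_config, ...).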
a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py index 2ad02078d71..1d1478f94d4 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py @@ -126,11 +126,7 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, shard_layout=self.shard_layout, input_channels_alignment=32, transpose_shards=False, @@ -140,6 +136,13 @@ def __call__( if hidden_states.memory_config() != self.input_memory_config: hidden_states = ttnn.to_memory_config(hidden_states, self.input_memory_config) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) if self.conv_config_override and "act_block_h" in self.conv_config_override: conv_config.act_block_h_override = self.conv_config_override["act_block_h"] @@ -157,6 +160,7 @@ def __call__( weight_tensor=self.conv_weights, bias_tensor=self.conv_bias, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) # hidden_states = run_ttnn_conv_with_pre_and_post_tensor_formatting( diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py index 4cedbdea78c..6d94f60975e 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py @@ -459,16 +459,19 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=self.conv1_shard_layout, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=False, ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) if self.conv1_config_override and "act_block_h" in self.conv2_config_override: conv_config.act_block_h_override = self.conv1_config_override["act_block_h"] [hidden_states, _out_height, _out_width, self.conv1s_weights[0], self.conv1s_bias[0]] = ttnn.conv2d( @@ -485,6 +488,7 @@ def __call__( input_height=self.conv1_input_height, input_width=self.conv1_input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) @@ -529,17 +533,19 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=False, ) - + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + 
packer_l1_acc=False, + ) if self.conv1_config_override and "act_block_h" in self.conv2_config_override: conv_config.act_block_h_override = self.conv1_config_override["act_block_h"] @@ -563,6 +569,7 @@ def __call__( input_height=self.conv1_input_height, input_width=self.conv1_input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) if i != 0: @@ -658,16 +665,19 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=False, ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) if self.conv2_config_override and "act_block_h" in self.conv2_config_override: conv_config.act_block_h_override = self.conv2_config_override["act_block_h"] [hidden_states, _out_height, _out_width, self.conv2_weights, self.conv2_bias] = ttnn.conv2d( @@ -684,6 +694,7 @@ def __call__( input_height=self.conv2_input_height, input_width=self.conv2_input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) use_in_shortcut = in_channels != out_channels if use_in_shortcut is None else use_in_shortcut @@ -702,16 +713,19 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=False, ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) [input_tensor, _out_height, _out_width, self.conv_shortcut_weights, self.conv_shortcut_bias] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv_shortcut_weights, @@ -726,6 +740,7 @@ def __call__( input_height=self.conv_shortcut_input_height, input_width=self.conv_shortcut_input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py index 12e4d543207..3a856dce04e 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py @@ -242,13 +242,16 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, input_channels_alignment=32, - fp32_dest_acc_enabled=self.compute_kernel_config.fp32_dest_acc_en, transpose_shards=False, ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + fp32_dest_acc_en=self.compute_kernel_config.fp32_dest_acc_en, + ) [hidden_states, _out_height, _out_width, self.proj_in_conv_weights, 
self.proj_in_conv_bias] = ttnn.conv2d( input_tensor=hidden_states, in_channels=self.proj_in_in_channels, @@ -263,6 +266,7 @@ def __call__( weight_tensor=self.proj_in_conv_weights, bias_tensor=self.proj_in_conv_bias, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py index 9cbdfff2f48..a3525c38598 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py @@ -383,17 +383,19 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, shard_layout=shard_layout, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=True, ) - + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) [sample, _out_height, _out_width, self.conv_in_weights, self.conv_in_bias] = ttnn.conv2d( input_tensor=sample, weight_tensor=self.conv_in_weights, @@ -408,6 +410,7 @@ def __call__( input_height=self.input_height, input_width=self.input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) sample = ttnn.reallocate(sample) # TODO: Test remove @@ -646,17 +649,20 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, act_block_h_override=64, transpose_shards=False, reshard_if_not_optimal=True, ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) [sample, _out_height, _out_width, self.conv_out_weights, self.conv_out_bias] = ttnn.conv2d( input_tensor=sample, in_channels=self.conv_out_in_channels, @@ -671,6 +677,7 @@ def __call__( weight_tensor=self.conv_out_weights, bias_tensor=self.conv_out_bias, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) sample = ttnn.to_memory_config(sample, ttnn.L1_MEMORY_CONFIG) diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py index 622a63065db..54056a71526 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py @@ -91,16 +91,19 @@ def __call__(self, input, in_channels, out_channels): conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, 
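In the Stable Diffusion transformer hunk above, fp32 accumulation is no longer read from Conv2dConfig; it is copied from the block's existing compute kernel config into the new per-conv compute_config. A sketch of that pattern as it would appear inside the block's __call__, assuming self.compute_kernel_config is the block's existing ttnn compute kernel config:

# Reuse the fp32 accumulation choice the block already made, instead of
# duplicating it on Conv2dConfig.
compute_config = ttnn.init_device_compute_kernel_config(
    self.device.arch(),
    math_fidelity=ttnn.MathFidelity.LoFi,
    fp32_dest_acc_en=self.compute_kernel_config.fp32_dest_acc_en,
)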
reshard_if_not_optimal=False, # Reshard has error : 1616 Bytes unique+common runtime args targeting kernel reshard_reader on (x=0,y=0) are too large. Cannot be written as they will run into memory region reserved for result. Max allowable size is 1024 Bytes ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) if self.conv_config_override and "act_block_h" in self.conv_config_override: conv_config.act_block_h_override = self.conv_config_override["act_block_h"] [tt_out, _out_height, _out_width, self.conv_weight_tensor, self.conv_bias_tensor] = ttnn.conv2d( @@ -117,6 +120,7 @@ def __call__(self, input, in_channels, out_channels): weight_tensor=self.conv_weight_tensor, bias_tensor=self.conv_bias_tensor, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) return tt_out diff --git a/models/demos/yolov4/ttnn/common.py b/models/demos/yolov4/ttnn/common.py index b293a6db751..9d3b154aaf4 100644 --- a/models/demos/yolov4/ttnn/common.py +++ b/models/demos/yolov4/ttnn/common.py @@ -80,13 +80,9 @@ def __call__(self, device, input_tensor): conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat16, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation=self.activation, shard_layout=self.shard_layout, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=False, act_block_w_div=1, - packer_l1_accum_enabled=False, input_channels_alignment=16 if self.input_params[3] < 16 else 32, transpose_shards=False, reshard_if_not_optimal=self.reshard, @@ -96,6 +92,13 @@ def __call__(self, device, input_tensor): enable_act_double_buffer=self.enable_act_double_buffer, output_layout=self.output_layout, ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=False, + fp32_dest_acc_en=False, + packer_l1_acc=False, + ) if self.act_block_h is not None: conv_config.act_block_h_override = self.act_block_h @@ -113,5 +116,6 @@ def __call__(self, device, input_tensor): input_height=self.input_params[1], input_width=self.input_params[2], conv_config=conv_config, + compute_config=compute_config, ) return output_tensor diff --git a/models/experimental/functional_unet/tt/unet_shallow_ttnn.py b/models/experimental/functional_unet/tt/unet_shallow_ttnn.py index 215399ea23b..fb087ad9279 100644 --- a/models/experimental/functional_unet/tt/unet_shallow_ttnn.py +++ b/models/experimental/functional_unet/tt/unet_shallow_ttnn.py @@ -114,10 +114,8 @@ def __init__( self.conv_config = ttnn.Conv2dConfig( dtype=activation_dtype, weights_dtype=weights_dtype, - math_fidelity=ttnn.MathFidelity.LoFi, shard_layout=shard_layout, deallocate_activation=self.deallocate_activation, - packer_l1_accum_enabled=False, enable_act_double_buffer=( conv.use_activation_double_buffer if "use_activation_double_buffer" in conv else False ), @@ -128,6 +126,12 @@ def __init__( input_channels_alignment=conv.input_channels_alignment if "input_channels_alignment" in conv else 32, reshard_if_not_optimal=reshard_if_not_optimal, ) + self.compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) config_override = conv.conv_blocking_and_parallelization_config_override if config_override and "act_block_h" in config_override: self.conv_config.act_block_h_override = config_override["act_block_h"] @@ 
-157,6 +161,7 @@ def __call__(self, x): stride=self.stride, padding=self.padding, conv_config=self.conv_config, + compute_config=self.compute_config, conv_op_cache=self.cache, groups=2, ) diff --git a/tests/sweep_framework/sweep_utils/conv2d_common.py b/tests/sweep_framework/sweep_utils/conv2d_common.py index 55769adb984..2dc1709bdbd 100644 --- a/tests/sweep_framework/sweep_utils/conv2d_common.py +++ b/tests/sweep_framework/sweep_utils/conv2d_common.py @@ -117,18 +117,20 @@ def run_full( conv_config = ttnn.Conv2dConfig( dtype=activations_dtype, weights_dtype=weights_dtype, - math_fidelity=math_fidelity, shard_layout=None, deallocate_activation=deallocate_activation, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, override_sharding_config=override_sharding_config, output_layout=output_layout, enable_act_double_buffer=enable_act_double_buffer, enable_split_reader=enable_split_reader, enable_subblock_padding=enable_subblock_padding, ) - + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=math_fidelity, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, + ) if override_sharding_config: if len(core_grid) == 2: conv_config.core_grid = ttnn.CoreRangeSet({ttnn.CoreRange(core_grid[0], core_grid[1])}) @@ -152,6 +154,7 @@ def run_full( input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, groups=groups, ) diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_demo.py b/tests/ttnn/integration_tests/stable_diffusion/test_demo.py index 5c8dc03b967..36b73e70e3c 100644 --- a/tests/ttnn/integration_tests/stable_diffusion/test_demo.py +++ b/tests/ttnn/integration_tests/stable_diffusion/test_demo.py @@ -29,6 +29,8 @@ ((512, 512),), ) def test_demo_sd(device, reset_seeds, input_path, num_prompts, num_inference_steps, image_size): + if device.core_grid.y != 8: + pytest.skip("Needs 8x8 Grid") demo(device, reset_seeds, input_path, num_prompts, num_inference_steps, image_size) @@ -48,4 +50,6 @@ def test_demo_sd(device, reset_seeds, input_path, num_prompts, num_inference_ste ((512, 512),), ) def test_demo_sd_db(device, reset_seeds, input_path, num_prompts, num_inference_steps, image_size): + if device.core_grid.y != 8: + pytest.skip("Needs 8x8 Grid") demo_db(device, reset_seeds, input_path, num_prompts, num_inference_steps, image_size) diff --git a/tests/ttnn/unit_tests/operations/test_conv1d.py b/tests/ttnn/unit_tests/operations/test_conv1d.py index 3e7a1496c63..a7ca4c9c30c 100644 --- a/tests/ttnn/unit_tests/operations/test_conv1d.py +++ b/tests/ttnn/unit_tests/operations/test_conv1d.py @@ -88,12 +88,15 @@ def run_conv( conv_config = ttnn.Conv1dConfig( dtype=output_dtype, weights_dtype=weights_dtype, - math_fidelity=math_fidelity, shard_layout=shard_layout, input_channels_alignment=(16 if use_shallow_conv_variant else 32), deallocate_activation=deallocate_activation, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, + ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=math_fidelity, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, ) if config_override and "act_block_h" in config_override: conv_config.act_block_h_override = config_override["act_block_h"] @@ -117,6 +120,7 @@ def run_conv( batch_size=batch_size, input_length=input_length, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=reader_patterns_cache, debug=debug, groups=groups, diff --git 
a/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py b/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py index 699caa49e54..63942ef0f8f 100644 --- a/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py +++ b/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py @@ -104,19 +104,22 @@ def run_conv_transpose2d( conv_config = ttnn.Conv2dConfig( dtype=activations_dtype, weights_dtype=weights_dtype, - math_fidelity=math_fidelity, shard_layout=shard_layout, input_channels_alignment=( 16 if use_shallow_conv_variant or (input_channels == 16 and input_height == 115) else 32 ), deallocate_activation=deallocate_activation, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, enable_act_double_buffer=False, enable_split_reader=False, enable_subblock_padding=False, output_layout=ttnn.ROW_MAJOR_LAYOUT, ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=math_fidelity, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, + ) if config_override and "act_block_h" in config_override: conv_config.act_block_h_override = config_override["act_block_h"] @@ -139,6 +142,7 @@ def run_conv_transpose2d( input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, groups=groups, ) logger.info(f"Conv2d Transpose Input = {(input_height, input_width)} Output = {out_height, out_width}") diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 3e5f5f857f9..25d4b0bc00f 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -137,19 +137,22 @@ def run_conv( conv_config = ttnn.Conv2dConfig( dtype=activations_dtype, weights_dtype=weights_dtype, - math_fidelity=math_fidelity, shard_layout=shard_layout, input_channels_alignment=( 16 if use_shallow_conv_variant or (input_channels == 16 and input_height == 115) else 32 ), deallocate_activation=deallocate_activation, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, enable_act_double_buffer=False, enable_split_reader=False, enable_subblock_padding=False, output_layout=output_layout, ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=math_fidelity, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, + ) if config_override and "act_block_h" in config_override and not auto_shard: conv_config.act_block_h_override = config_override["act_block_h"] @@ -177,6 +180,7 @@ def run_conv( input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=reader_patterns_cache, debug=debug, groups=groups, @@ -280,12 +284,15 @@ def run_conv_with_split( conv_config = ttnn.Conv2dConfig( dtype=activations_dtype, weights_dtype=weights_dtype, - math_fidelity=math_fidelity, shard_layout=shard_layout if use_1d_systolic_array else ttnn.TensorMemoryLayout.BLOCK_SHARDED, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, # input_channels_alignment=(16 if use_shallow_conv_variant else 32), ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=math_fidelity, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, + ) if config_override and "act_block_h" in config_override: conv_config.act_block_h_override = config_override["act_block_h"] print("Setting Act Block H to ", conv_config.act_block_h_override) @@ -320,6 +327,7 @@ 
def run_conv_with_split( input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=reader_patterns_cache, ) tt_conv_output_tensor = ttnn.from_device(tt_output_tensor_on_device) @@ -625,12 +633,9 @@ def test_conv_ws( conv_config = ttnn.Conv2dConfig( dtype=activations_dtype, weights_dtype=weights_dtype, - math_fidelity=ttnn.MathFidelity.HiFi4, shard_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED if not auto_shard else None, input_channels_alignment=32, deallocate_activation=deallocate_activation, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, enable_act_double_buffer=False, enable_split_reader=False, enable_subblock_padding=False, @@ -638,6 +643,12 @@ def test_conv_ws( act_block_w_div=act_block_w_div if not auto_shard else 1, act_block_h_override=32, ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.HiFi4, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, + ) [tt_output_tensor_on_device, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor, @@ -652,6 +663,7 @@ def test_conv_ws( input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=reader_patterns_cache, debug=debug, groups=groups, @@ -2745,6 +2757,9 @@ def test_shallow_conv_with_tiled_input(device): input_height=img_h, input_width=img_w, groups=1, + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + ), memory_config=ttnn.DRAM_MEMORY_CONFIG, ) diff --git a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py index 23f28658fc3..5a59200a178 100644 --- a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py +++ b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py @@ -127,12 +127,11 @@ def test_prepare_conv_weights( dtype=ttnn.bfloat16, weights_dtype=ttnn.bfloat16, input_channels_alignment=(16 if input_channels == 16 and input_height == 115 else 32), - packer_l1_accum_enabled=packer_l1_acc, enable_act_double_buffer=False, enable_split_reader=False, enable_subblock_padding=False, ) - + compute_config = ttnn.init_device_compute_kernel_config(device.arch(), packer_l1_acc=packer_l1_acc) if config_override and "act_block_h" in config_override: conv_config.act_block_h_override = config_override["act_block_h"] @@ -184,6 +183,7 @@ def test_prepare_conv_weights( weight_tensor=tt_weight_tensor_formatted, bias_tensor=tt_bias_tensor_formatted, **conv_kwargs, + compute_config=compute_config, ) tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) diff --git a/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py b/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py index 84ee4d5d972..bf233351d1f 100644 --- a/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py +++ b/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py @@ -118,6 +118,9 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], + ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, @@ -139,9 +142,12 @@ def __call__(self, x, device, batch_size, input_height, input_width, 
conv_op_cac conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + ), conv_op_cache=conv_op_cache, ) @@ -162,6 +168,9 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], + ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, @@ -187,9 +196,12 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + ), conv_op_cache=conv_op_cache, ) @@ -211,6 +223,9 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], + ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, diff --git a/ttnn/cpp/pybind11/operations/core.hpp b/ttnn/cpp/pybind11/operations/core.hpp index eaf0014cf52..74da55f61da 100644 --- a/ttnn/cpp/pybind11/operations/core.hpp +++ b/ttnn/cpp/pybind11/operations/core.hpp @@ -6,7 +6,9 @@ #include #include +#include +#include "pybind11/cast.h" #include "ttnn/cpp/pybind11/decorators.hpp" #include "ttnn/operations/core/core.hpp" #include "tt_metal/common/work_split.hpp" @@ -22,12 +24,14 @@ void py_module_types(py::module& module) { py::class_(module, "GrayskullComputeKernelConfig") .def( - py::init(), + py::init(), py::kw_only(), py::arg("math_fidelity") = MathFidelity::Invalid, - py::arg("math_approx_mode") = true) + py::arg("math_approx_mode") = true, + py::arg("dst_full_sync_en") = false) .def_readwrite("math_fidelity", &GrayskullComputeKernelConfig::math_fidelity) - .def_readwrite("math_approx_mode", &GrayskullComputeKernelConfig::math_approx_mode); + .def_readwrite("math_approx_mode", &GrayskullComputeKernelConfig::math_approx_mode) + .def_readwrite("dst_full_sync_en", &GrayskullComputeKernelConfig::dst_full_sync_en); py::class_(module, "WormholeComputeKernelConfig") .def( @@ -46,6 +50,17 @@ void py_module_types(py::module& module) { } void py_module(py::module& module) { + + module.def("init_device_compute_kernel_config", &ttnn::init_device_compute_kernel_config, + py::arg("arch"), + py::arg("device_kernel_config") = std::nullopt, + py::kw_only(), + py::arg("math_fidelity") = MathFidelity::LoFi, + py::arg("math_approx_mode") = true, + py::arg("fp32_dest_acc_en") = false, + py::arg("packer_l1_acc") = false, + py::arg("dst_full_sync_en") = false + ); module.def("unsqueeze_to_4D", &ttnn::unsqueeze_to_4D, py::arg("tensor")); module.def( diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp index 5f9ba6f0ea9..d6d06ec490f 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp +++ 
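The pybind hunk above defines the Python-visible signature of init_device_compute_kernel_config (and exposes dst_full_sync_en on GrayskullComputeKernelConfig). Based on the defaults registered there, the two calls below should be equivalent; this is a sketch of the binding as shown in the diff, not additional API:

import ttnn

# device assumed to be an open ttnn device. All keyword arguments are optional;
# device_kernel_config (second positional arg, default None) lets an existing
# compute kernel config be used as the starting point.
cfg_default = ttnn.init_device_compute_kernel_config(device.arch())
cfg_explicit = ttnn.init_device_compute_kernel_config(
    device.arch(),
    math_fidelity=ttnn.MathFidelity.LoFi,
    math_approx_mode=True,
    fp32_dest_acc_en=False,
    packer_l1_acc=False,
    dst_full_sync_en=False,
)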
b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp @@ -17,10 +17,6 @@ #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/operations/pool/downsample/device/downsample_op.hpp" -#include "tt_metal/detail/reports/memory_reporter.hpp" -#include "tt_metal/common/work_split.hpp" -#include "ttnn/operations/eltwise/unary/common/unary_op_utils.hpp" -#include "ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp" #include "ttnn/operations/sliding_window/sliding_window.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/types.hpp" @@ -54,6 +50,7 @@ Result conv2d( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, + const std::optional& compute_config_, const std::optional& memory_config) { const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, stride, padding, dilation, groups); const uint32_t output_height = ((input_height - kernel_size[0] - ((kernel_size[0] - 1 ) * (dilation[0] - 1)) + 2 * padding[0]) / stride[0]) + 1; @@ -89,6 +86,14 @@ Result conv2d( (conv_config.weights_dtype == DataType::BFLOAT8_B || conv_config.weights_dtype == DataType::BFLOAT16) && conv_config.output_layout == Layout::ROW_MAJOR && ((elem_size * in_channels) % (16 * num_cores_c)) == 0; + DeviceComputeKernelConfig compute_config = compute_config_.value_or( init_device_compute_kernel_config( + device->arch(), + std::nullopt, + MathFidelity::HiFi4, + true, + false, + false + )); auto [input_tensor_post_tm, parallel_config, output_parallel_config, tensor_manipulated, use_non_tile_height] = shard_or_reshard_tensor_if_required( device, input_tensor, conv_config, batch_size, output_height, output_width, in_channels, out_channels, mm_conv, is_non_tile_mul_width); if (tensor_manipulated) { @@ -138,7 +143,7 @@ Result conv2d( conv_config.act_block_w_div, kernel_size[0], kernel_size[1], - conv_config.fp32_dest_acc_enabled, + get_fp32_dest_acc_en(compute_config), conv_config.enable_split_reader); bool weight_is_on_device = ttnn::is_tensor_on_device_or_multidevice(weight_tensor); ttnn::Tensor weight_tensor_on_device = weight_tensor; @@ -173,13 +178,6 @@ Result conv2d( // call optimized conv op or matmul micro op bool input_is_on_device = ttnn::is_tensor_on_device_or_multidevice(input_tensor_post_tm); TT_ASSERT(input_is_on_device); - DeviceComputeKernelConfig compute_kernel_config = ttnn::init_device_compute_kernel_config( - device->arch(), - std::nullopt, - conv_config.math_fidelity, - conv_config.math_approx_mode_enabled, - conv_config.fp32_dest_acc_enabled, - conv_config.packer_l1_accum_enabled); if (!mm_conv) { // call halo op @@ -238,14 +236,13 @@ Result conv2d( groups, conv_config.output_layout == Layout::ROW_MAJOR, conv_config.activation == "relu", - conv_config.math_fidelity, opt_conv_op_parallel_config, opt_conv_op_block_config, conv_out_memory_config, conv_config.dtype, {batch_size, input_height, input_width, in_channels}, conv_config.input_channels_alignment == 16, - compute_kernel_config, + compute_config, conv_config.enable_act_double_buffer, conv_config.enable_weights_double_buffer, conv_config.enable_split_reader, @@ -284,7 +281,7 @@ Result conv2d( /*bcast_batch=*/std::nullopt, conv_out_memory_config, conv_config.dtype, - compute_kernel_config}); + compute_config}); if (conv_config.deallocate_activation) { ttnn::operations::core::deallocate(matmul_input); } @@ -314,8 +311,9 @@ Result Conv2dOperation::invoke( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, + const 
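Note the fallback in conv2d.cpp above: when no compute_config is supplied, the op builds one with HiFi4 fidelity, approx mode on, and fp32/packer-L1 accumulation off, matching the defaults the removed Conv2dConfig fields had. In Python terms, omitting compute_config should behave roughly like passing the config below (a sketch of the current fallback, not a guarantee for later versions):

# device assumed to be an open ttnn device.
fallback_compute_config = ttnn.init_device_compute_kernel_config(
    device.arch(),
    math_fidelity=ttnn.MathFidelity.HiFi4,
    math_approx_mode=True,
    fp32_dest_acc_en=False,
    packer_l1_acc=False,
)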
std::optional& compute_config_, const std::optional& memory_config){ - return conv2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), memory_config); + return conv2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(compute_config_), memory_config); } Result Conv2dOperation::invoke( @@ -335,10 +333,12 @@ Result Conv2dOperation::invoke( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, + const std::optional& compute_config_, const std::optional& memory_config){ - return conv2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), memory_config); + return conv2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(compute_config_), memory_config); } + } // namespace conv2d } // namespace operations } // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp index d15023abb86..e8310c0dbdc 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp @@ -47,6 +47,7 @@ Result conv2d( uint32_t groups, std::optional bias_tensor = std::nullopt, const std::optional& conv_config_ = std::nullopt, + const std::optional& compute_config_ = std::nullopt, const std::optional& memory_config = std::nullopt); @@ -68,6 +69,7 @@ struct Conv2dOperation{ uint32_t groups, std::optional bias_tensor = std::nullopt, const std::optional& conv_config_ = std::nullopt, + const std::optional& compute_config_ = std::nullopt, const std::optional& memory_config = std::nullopt); static Result invoke( @@ -87,6 +89,7 @@ struct Conv2dOperation{ uint32_t groups, std::optional bias_tensor = std::nullopt, const std::optional& conv_config_ = std::nullopt, + const std::optional& compute_config_ = std::nullopt, const std::optional& memory_config = std::nullopt); }; } // namespace conv2d diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp index 6ac28cf56ca..c3356447cab 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp @@ -60,10 +60,11 @@ void py_bind_conv2d(py::module& module) { std::array dilation, uint32_t groups, std::optional bias_tensor, - std::optional conv_config, + const std::optional& conv_config, + const std::optional& compute_config, const std::optional& memory_config, const uint8_t& queue_id) -> Result { - return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, bias_tensor, conv_config, memory_config); + return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, bias_tensor, conv_config, compute_config, memory_config); }, py::kw_only(), py::arg("input_tensor"), @@ -81,6 +82,7 @@ void 
py_bind_conv2d(py::module& module) { py::arg("groups"), py::arg("bias_tensor") = std::nullopt, py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("queue_id") = 0}, @@ -99,10 +101,11 @@ void py_bind_conv2d(py::module& module) { std::array dilation, uint32_t groups, std::optional bias_tensor, - std::optional conv_config, + const std::optional& conv_config, + const std::optional& compute_config, const std::optional& memory_config, const uint8_t& queue_id) -> Result { - return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, bias_tensor, conv_config, memory_config); + return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, bias_tensor, conv_config, compute_config, memory_config); }, py::kw_only(), py::arg("input_tensor"), @@ -120,6 +123,7 @@ void py_bind_conv2d(py::module& module) { py::arg("groups"), py::arg("bias_tensor") = std::nullopt, py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("queue_id") = 0} ); @@ -143,7 +147,8 @@ void py_bind_conv2d(py::module& module) { py::arg("dilation"), py::arg("groups"), py::arg("device"), - py::arg("conv_config") = std::nullopt); + py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt); module.def( @@ -165,7 +170,8 @@ void py_bind_conv2d(py::module& module) { py::arg("dilation"), py::arg("groups"), py::arg("device"), - py::arg("conv_config") = std::nullopt); + py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt); module.def( "prepare_conv_bias", @@ -185,7 +191,8 @@ void py_bind_conv2d(py::module& module) { py::arg("dilation"), py::arg("groups"), py::arg("device"), - py::arg("conv_config") = std::nullopt); + py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt); module.def( "prepare_conv_bias", @@ -205,7 +212,8 @@ void py_bind_conv2d(py::module& module) { py::arg("dilation"), py::arg("groups"), py::arg("device"), - py::arg("conv_config") = std::nullopt); + py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt); module.def( "convert_conv_weight_tensor_to_tiled_layout", @@ -266,14 +274,10 @@ void py_bind_conv2d(py::module& module) { auto py_conv_config = py::class_(module, "Conv2dConfig"); py_conv_config.def( - py::init, std::optional, bool, Layout, bool, bool, bool, bool>(), + py::init, std::optional, bool, Layout, bool, bool, bool, bool>(), py::kw_only(), - py::arg("math_fidelity") = MathFidelity::HiFi4, py::arg("dtype") = DataType::BFLOAT16, py::arg("weights_dtype") = DataType::BFLOAT16, - py::arg("math_approx_mode_enabled") = true, - py::arg("fp32_dest_acc_enabled") = false, - py::arg("packer_l1_accum_enabled") = false, py::arg("activation") = "", py::arg("input_channels_alignment") = 32, py::arg("deallocate_activation") = false, @@ -291,12 +295,8 @@ void py_bind_conv2d(py::module& module) { py::arg("enable_split_reader") = false, py::arg("enable_subblock_padding") = false ); - py_conv_config.def_readwrite("math_fidelity", &Conv2dConfig::math_fidelity); py_conv_config.def_readwrite("dtype", &Conv2dConfig::dtype); py_conv_config.def_readwrite("weights_dtype", &Conv2dConfig::weights_dtype); - py_conv_config.def_readwrite("math_approx_mode_enabled", 
&Conv2dConfig::math_approx_mode_enabled); - py_conv_config.def_readwrite("fp32_dest_acc_enabled", &Conv2dConfig::fp32_dest_acc_enabled); - py_conv_config.def_readwrite("packer_l1_accum_enabled", &Conv2dConfig::packer_l1_accum_enabled); py_conv_config.def_readwrite("activation", &Conv2dConfig::activation); py_conv_config.def_readwrite("input_channels_alignment", &Conv2dConfig::input_channels_alignment); py_conv_config.def_readwrite("deallocate_activation", &Conv2dConfig::deallocate_activation); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp index 9b9645f821f..349e3837329 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp @@ -30,12 +30,8 @@ using OutputWidth = uint32_t; using Result = std::tuple>; struct Conv2dConfig { - MathFidelity math_fidelity = MathFidelity::HiFi4; DataType dtype = DataType::BFLOAT16; DataType weights_dtype = DataType::BFLOAT16; - bool math_approx_mode_enabled = true; - bool fp32_dest_acc_enabled = false; - bool packer_l1_accum_enabled = false; string activation = ""; uint32_t input_channels_alignment = 32; bool deallocate_activation = false; @@ -54,12 +50,8 @@ struct Conv2dConfig { bool enable_split_reader = false; bool enable_subblock_padding = false; static constexpr auto attribute_names = std::make_tuple( - "math_fidelity", "dtype", "weights_dtype", - "math_approx_mode_enabled", - "fp32_dest_acc_enabled", - "packer_l1_accum_enabled", "activation", "input_channels_alignment", "deallocate_activation", @@ -78,12 +70,8 @@ struct Conv2dConfig { "enable_subblock_padding"); const auto attribute_values() const { return std::make_tuple( - std::cref(this->math_fidelity), std::cref(this->dtype), std::cref(this->weights_dtype), - std::cref(this->math_approx_mode_enabled), - std::cref(this->fp32_dest_acc_enabled), - std::cref(this->packer_l1_accum_enabled), std::cref(this->activation), std::cref(this->input_channels_alignment), std::cref(this->deallocate_activation), diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp index 9d57c98db84..e09aa621dd5 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp @@ -17,6 +17,7 @@ #include "tt_metal/tt_stl/reflection.hpp" #include "tt_metal/common/work_split.hpp" +#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" #include "ttnn/operations/sharding_utilities.hpp" #include "ttnn/operations/experimental/auto_format/auto_format.hpp" @@ -57,14 +58,14 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional input_tensor_shape, bool use_shallow_conv_variant, - std::optional compute_kernel_config, + const DeviceComputeKernelConfig& compute_kernel_config, bool enable_act_double_buffer, bool enable_weights_double_buffer, bool enable_split_reader, @@ -73,7 +74,7 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional output_tensors = {Tensor(operation::get_workers_for_op_output({a, b}))}; operation::launch_op( - [sliding_window_config, output_channels, groups, untilize_out, fuse_relu, math_fidelity, parallelization_config, block_config, memory_config, dtype, input_tensor_shape, use_shallow_conv_variant, compute_kernel_config, enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, enable_subblock_padding, use_non_tile_height] + [sliding_window_config, 
output_channels, groups, untilize_out, fuse_relu, parallelization_config, block_config, memory_config, dtype, input_tensor_shape, use_shallow_conv_variant, compute_kernel_config, enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, enable_subblock_padding, use_non_tile_height] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { using ttnn::operations::experimental::auto_format::FormatParams; auto& a = input_tensors.at(0); @@ -91,9 +92,8 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optionalarch() : ttnn::operations::experimental::auto_format::AutoFormat::GetDefaultDevice()->arch(); bool fp32_accum = a.device()->arch() == tt::ARCH::WORMHOLE_B0; // && compute_kernel_config.has_value()) ? compute_kernel_config.value().fp32_dest_acc_en : false; - auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, MathFidelity::LoFi, true, fp32_accum, false); return operation::run_without_autoformat( - OptimizedConvNew(sliding_window_config, output_channels, groups, untilize_out, bias.has_value(), fuse_relu, math_fidelity, parallelization_config, block_config, memory_config, dtype, input_tensor_shape, use_shallow_conv_variant, kernel_config_val, enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, enable_subblock_padding, use_non_tile_height + OptimizedConvNew(sliding_window_config, output_channels, groups, untilize_out, bias.has_value(), fuse_relu, parallelization_config, block_config, memory_config, dtype, input_tensor_shape, use_shallow_conv_variant, compute_kernel_config, enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, enable_subblock_padding, use_non_tile_height ), input_tensors, optional_input_tensors); @@ -219,7 +219,7 @@ operation::ProgramWithCallbacks OptimizedConvNew::create_program(const std::vect sliding_window_config, output_channels, groups, - untilize_out, fuse_relu, math_fidelity, + untilize_out, fuse_relu, parallelization_config, block_config, dtype, @@ -265,7 +265,7 @@ operation::OpPerformanceModel OptimizedConvNew::create_op_performance_model(cons int64_t num_mul_adds_per_elem = conv_activation_c * filter_h * filter_w * 2; // 1 multiply and 1 add per element int64_t num_mul_adds = num_mul_adds_per_elem * output_height * output_width * this->output_channels * batch_size; - int ideal_dev_clock_cycles = std::ceil(((float)num_mul_adds / (float)(num_cores * tensix_mul_adds_per_cycle_lofi)) * (float)operation::OpPerformanceModel::fidelity_multiplier(this->math_fidelity)); + int ideal_dev_clock_cycles = std::ceil(((float)num_mul_adds / (float)(num_cores * tensix_mul_adds_per_cycle_lofi)) * (float)operation::OpPerformanceModel::fidelity_multiplier(get_math_fidelity(this->compute_kernel_config))); operation::OpPerformanceModel result(input_tensors, output_tensors, ideal_dev_clock_cycles); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp index 830ca917e33..a39e97f4fac 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp @@ -47,7 +47,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_new(const T const sliding_window::SlidingWindowConfig& sliding_window_config, uint32_t output_channels, uint32_t groups, - bool untilize_out, bool fuse_relu, MathFidelity math_fidelity, + bool untilize_out, bool 
fuse_relu, const OptimizedConvParallelizationConfig& parallelization_config, const OptimizedConvBlockConfig& block_config, DataType dtype, @@ -69,7 +69,6 @@ struct OptimizedConvNew { const uint32_t output_channels; const uint32_t groups; bool untilize_out, has_bias, fuse_relu; - MathFidelity math_fidelity; MemoryConfig memory_config; const DataType dtype; std::array input_tensor_shape; // For sharded input, input tensor shape is nonsense @@ -84,7 +83,7 @@ struct OptimizedConvNew { uint32_t output_channels, uint32_t groups, bool untile_out, bool has_bias, bool fuse_relu, - MathFidelity mfidelity, const OptimizedConvParallelizationConfig& p_config, + const OptimizedConvParallelizationConfig& p_config, const OptimizedConvBlockConfig& b_config, MemoryConfig memory_config, DataType dtype, @@ -96,7 +95,6 @@ struct OptimizedConvNew { untilize_out(untile_out), has_bias(has_bias), fuse_relu(fuse_relu), - math_fidelity(mfidelity), parallelization_config(p_config), block_config(b_config), memory_config(memory_config), @@ -124,7 +122,6 @@ struct OptimizedConvNew { "untilize_out", "has_bias", "fuse_relu", - "math_fidelity", "dtype", "input_tensor_shape", "use_shallow_conv_variant", @@ -141,7 +138,6 @@ struct OptimizedConvNew { std::cref(this->untilize_out), std::cref(this->has_bias), std::cref(this->fuse_relu), - std::cref(this->math_fidelity), std::cref(this->dtype), std::cref(this->input_tensor_shape), std::cref(this->use_shallow_conv_variant), @@ -156,14 +152,14 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional input_tensor_shape, bool use_shallow_conv_variant, - std::optional compute_kernel_config = std::nullopt, + const DeviceComputeKernelConfig& compute_kernel_config, bool enable_act_double_buffer = false, bool enable_weights_double_buffer = false, bool enable_split_reader = false, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index 0b452a583df..7c0544a8c69 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -1793,7 +1793,6 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_new( uint32_t groups, bool untilize_out, bool fuse_relu, - MathFidelity math_fidelity, const OptimizedConvParallelizationConfig& parallelization_config, const OptimizedConvBlockConfig& block_config, DataType output_dtype, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp index 668372c49a4..1009ed7a87b 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp @@ -4,6 +4,7 @@ #include "prepare_conv2d_weights.hpp" #include "conv2d_utils.hpp" +#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" #include #include @@ -67,6 +68,7 @@ OptimizedConvBlockConfig get_opt_block_config( T *device, Conv2dConfig& conv_config, Layout input_tensor_layout, + const DeviceComputeKernelConfig& compute_config, const MemoryConfig& input_memory_config) { auto compute_grid_size = device->compute_with_storage_grid_size(); @@ -138,7 +140,7 @@ OptimizedConvBlockConfig get_opt_block_config( conv_config.act_block_w_div, kernel_size[0], kernel_size[1], - conv_config.fp32_dest_acc_enabled, + get_fp32_dest_acc_en(compute_config), 
conv_config.enable_split_reader); } @@ -289,9 +291,11 @@ ttnn::Tensor prepare_conv_weights( std::array dilation, uint32_t groups, T *device, - const std::optional& conv_config_) { + const std::optional& conv_config_, + const std::optional& compute_config_) { TT_FATAL(!ttnn::is_tensor_on_device_or_multidevice(weight_tensor), "Error: weight tensor must be on host for preparation."); Conv2dConfig conv_config = conv_config_.value_or(Conv2dConfig()); + DeviceComputeKernelConfig compute_config = compute_config_.value_or(DeviceComputeKernelConfig()); const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, stride, padding, dilation, groups); const uint32_t output_height = ((input_height - kernel_size[0] - ((kernel_size[0] - 1 ) * (dilation[0] - 1)) + 2 * padding[0]) / stride[0]) + 1; const uint32_t output_width = @@ -309,6 +313,7 @@ ttnn::Tensor prepare_conv_weights( device, conv_config, input_tensor_layout, + compute_config, input_memory_config ); @@ -366,7 +371,8 @@ ttnn::Tensor prepare_conv_bias( std::array dilation, uint32_t groups, T *device, - const std::optional& conv_config_) { + const std::optional& conv_config_, + const std::optional& compute_config_) { TT_FATAL(!ttnn::is_tensor_on_device_or_multidevice(bias_tensor), "Error: bias tensor must be on host for preparation."); @@ -376,6 +382,7 @@ ttnn::Tensor prepare_conv_bias( ((input_width - kernel_size[1] - ((kernel_size[0] - 1) * (dilation[0] - 1)) + 2 * padding[1]) / stride[1]) + 1; Conv2dConfig conv_config = conv_config_.value_or(Conv2dConfig()); + DeviceComputeKernelConfig compute_config = compute_config_.value_or(DeviceComputeKernelConfig()); auto opt_conv_op_block_config = get_opt_block_config( mm_conv, in_channels, @@ -389,6 +396,7 @@ ttnn::Tensor prepare_conv_bias( device, conv_config, input_tensor_layout, + compute_config, input_memory_config ); @@ -423,6 +431,7 @@ template OptimizedConvBlockConfig get_opt_block_config( Device *device, Conv2dConfig& conv_config, Layout input_tensor_layout, + const DeviceComputeKernelConfig& compute_config, const ttnn::MemoryConfig& input_memory_config); template OptimizedConvBlockConfig get_opt_block_config( @@ -438,6 +447,7 @@ template OptimizedConvBlockConfig get_opt_block_config( MeshDevice *device, Conv2dConfig& conv_config, Layout input_tensor_layout, + const DeviceComputeKernelConfig& compute_config, const ttnn::MemoryConfig& input_memory_config); template ttnn::Tensor prepare_conv_weights( @@ -456,7 +466,8 @@ template ttnn::Tensor prepare_conv_weights( std::array dilation, uint32_t groups, Device *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); template ttnn::Tensor prepare_conv_weights( const ttnn::Tensor& weight_tensor, @@ -474,7 +485,8 @@ template ttnn::Tensor prepare_conv_weights( std::array dilation, uint32_t groups, MeshDevice *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); template std::pair> prepare_conv_weights_biases_and_move_to_device( const ttnn::Tensor& weight_tensor, @@ -521,7 +533,8 @@ template ttnn::Tensor prepare_conv_bias( std::array dilation, uint32_t groups, Device *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); template ttnn::Tensor prepare_conv_bias( const ttnn::Tensor& bias_tensor, @@ -538,7 +551,8 @@ template ttnn::Tensor prepare_conv_bias( std::array dilation, uint32_t groups, MeshDevice *device, - const std::optional& conv_config_); + 
const std::optional& conv_config_, + const std::optional& compute_config_); } // namespace conv2d } // namespace operations diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp index 18e654ad37c..35b80dac824 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp @@ -43,7 +43,8 @@ ttnn::Tensor prepare_conv_weights( std::array dilation, uint32_t groups, T *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); template ttnn::Tensor prepare_conv_bias( @@ -61,7 +62,8 @@ ttnn::Tensor prepare_conv_bias( std::array dilation, uint32_t groups, T *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); template std::pair> prepare_conv_weights_biases_and_move_to_device( diff --git a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp index e2c54193bb0..21af1f921fb 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp @@ -107,8 +107,11 @@ Result conv_transpose2d( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, + const std::optional& compute_config_, const std::optional& memory_config ) { Conv2dConfig conv_config = conv_config_.value_or(Conv2dConfig()); + DeviceComputeKernelConfig compute_config = compute_config_.value_or(DeviceComputeKernelConfig()); + //Inverse of sliding_window.get_output_shape() SlidingWindowConfig sliding_window_config = SlidingWindowConfig{ @@ -174,32 +177,6 @@ Result conv_transpose2d( ttnn::is_tensor_on_device_or_multidevice(input_tensor) ? 
std::make_optional(input_tensor.memory_config()) : std::nullopt); } - DeviceComputeKernelConfig compute_kernel_config; - switch (device->arch()) { - case tt::ARCH::WORMHOLE_B0: - compute_kernel_config = WormholeComputeKernelConfig( - {.math_fidelity = conv_config.math_fidelity, - .math_approx_mode = conv_config.math_approx_mode_enabled, - .fp32_dest_acc_en = conv_config.fp32_dest_acc_enabled, - .packer_l1_acc = conv_config.packer_l1_accum_enabled}); - break; - - case tt::ARCH::GRAYSKULL: - compute_kernel_config = GrayskullComputeKernelConfig( - {.math_fidelity = conv_config.math_fidelity, .math_approx_mode = conv_config.math_approx_mode_enabled}); - break; - - case tt::ARCH::BLACKHOLE: - compute_kernel_config = BlackholeComputeKernelConfig( - {.math_fidelity = conv_config.math_fidelity, - .math_approx_mode = conv_config.math_approx_mode_enabled, - .fp32_dest_acc_en = conv_config.fp32_dest_acc_enabled, - .packer_l1_acc = conv_config.packer_l1_accum_enabled}); - break; - - default: - TT_THROW("Invalid Device Arch, Got {}",device->arch()); - } //Call Halo Transpose auto [input_tensor_post_tm, parallel_config, output_parallel_config, tensor_manipulated, use_non_tile_height] = shard_or_reshard_tensor_if_required( @@ -239,6 +216,16 @@ Result conv_transpose2d( 0, input_tensor_post_tm.memory_config()); + if(conv_config.deallocate_activation) { + input_tensor_post_tm.deallocate(); + log_debug(tt::LogOp, "Deallocate Input Tensor"); + } + if (conv_config.reallocate_halo_output) { + auto move_output = ttnn::operations::core::reallocate(halo_output, halo_output.memory_config()); + halo_output = move_output; + log_debug(tt::LogOp, "Reallocate Halo Output"); + } + //Call Conv2d u_op with Stride = 1, Padding = 0. auto conv_out_memory_config = create_sharded_memory_config_from_parallel_config( ttnn::Shape(std::array{1, 1, batch_size * output_height * output_width, tt::round_up(out_channels, 32)}), @@ -266,7 +253,7 @@ Result conv_transpose2d( conv_config.act_block_w_div, kernel_size[0], kernel_size[1], - conv_config.fp32_dest_acc_enabled, + get_fp32_dest_acc_en(compute_config), conv_config.enable_split_reader); //TODO: Flip the Weights @@ -300,7 +287,7 @@ Result conv_transpose2d( parallel_config.shard_orientation == ShardOrientation::COL_MAJOR, num_cores_c); Tensor matmul_input = ttnn::to_layout( - input_tensor_post_tm, Layout::TILE, conv_config.dtype, input_tensor_post_tm.memory_config(), device + halo_output, Layout::TILE, conv_config.dtype, input_tensor_post_tm.memory_config(), device ); auto matmul_output = ttnn::operations::matmul::matmul( matmul_input, @@ -311,7 +298,7 @@ Result conv_transpose2d( /*bcast_batch=*/std::nullopt, conv_out_memory_config, conv_config.dtype, - compute_kernel_config}); + compute_config}); if (conv_config.deallocate_activation) { ttnn::operations::core::deallocate(matmul_input); } @@ -332,14 +319,13 @@ Result conv_transpose2d( groups, conv_config.output_layout == Layout::ROW_MAJOR, conv_config.activation == "relu", - conv_config.math_fidelity, opt_conv_op_parallel_config, opt_conv_op_block_config, conv_out_memory_config, conv_config.dtype, {batch_size, input_height, input_width, in_channels}, conv_config.input_channels_alignment == 16, - compute_kernel_config, + compute_config, conv_config.enable_act_double_buffer, conv_config.enable_split_reader, conv_config.enable_subblock_padding); @@ -367,8 +353,9 @@ Result ConvTranpose2dOperation::invoke( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, - const std::optional& memory_config ) { - return 
conv_transpose2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(memory_config)); + const std::optional& compute_config_, + const std::optional& memory_config){ + return conv_transpose2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(compute_config_), std::move(memory_config)); } Result ConvTranpose2dOperation::invoke( @@ -389,8 +376,9 @@ Result ConvTranpose2dOperation::invoke( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, - const std::optional& memory_config ) { - return conv_transpose2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(memory_config)); + const std::optional& compute_config_, + const std::optional& memory_config){ + return conv_transpose2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(compute_config_), std::move(memory_config)); } } diff --git a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp index 119db2cf842..fc23a6f52d6 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp @@ -34,6 +34,7 @@ struct ConvTranpose2dOperation{ uint32_t groups, std::optional bias_tensor = std::nullopt, const std::optional& conv_config_ = std::nullopt, + const std::optional& compute_config_ = std::nullopt, const std::optional& memory_config = std::nullopt); static Result invoke( @@ -54,6 +55,7 @@ struct ConvTranpose2dOperation{ uint32_t groups, std::optional bias_tensor = std::nullopt, const std::optional& conv_config_ = std::nullopt, + const std::optional& compute_config_ = std::nullopt, const std::optional& memory_config = std::nullopt); }; diff --git a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d_pybind.cpp index 3cea2a187f9..1e07c21eb42 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d_pybind.cpp @@ -62,6 +62,7 @@ void py_bind_conv_transpose2d(py::module& module) { groups (int): the number of groups for grouped convolution. bias_tensor (ttnn.Tensor, optional): the bias tensor. Defaults to `None`. conv_config (ttnn.Conv2dConfig, optional): the configuration for the convolution operation. Defaults to `None`. + compute_config (ttnn.DeviceComputeKernelConfig, optional): the configuration for the compute kernel. Defaults to `None`. queue_id (int): the queue id to use for the operation. Defaults to `0`. 
Returns: @@ -84,6 +85,7 @@ void py_bind_conv_transpose2d(py::module& module) { input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, groups=groups, ) )doc", @@ -103,10 +105,12 @@ void py_bind_conv_transpose2d(py::module& module) { std::array dilation, uint32_t groups, std::optional bias_tensor, - std::optional conv_config, + const std::optional& conv_config, + const std::optional& compute_config, const std::optional& memory_config, const uint8_t& queue_id) -> Result { - return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, bias_tensor, conv_config, memory_config); + return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, bias_tensor, conv_config, compute_config, memory_config); + }, py::kw_only(), py::arg("input_tensor"), @@ -125,6 +129,7 @@ void py_bind_conv_transpose2d(py::module& module) { py::arg("groups"), py::arg("bias_tensor") = std::nullopt, py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("queue_id") = 0}, @@ -144,10 +149,12 @@ void py_bind_conv_transpose2d(py::module& module) { std::array dilation, uint32_t groups, std::optional bias_tensor, - std::optional conv_config, + const std::optional& conv_config, + const std::optional& compute_config, const std::optional& memory_config, const uint8_t& queue_id) -> Result { - return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, bias_tensor, conv_config, memory_config); + return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, bias_tensor, conv_config, compute_config, memory_config); + }, py::kw_only(), py::arg("input_tensor"), @@ -166,6 +173,7 @@ void py_bind_conv_transpose2d(py::module& module) { py::arg("groups"), py::arg("bias_tensor") = std::nullopt, py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("queue_id") = 0} ); diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index 71f4f748660..b699ea556af 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -187,6 +187,7 @@ def manage_config(name, value): pad_to_tile_shape, SubDevice, SubDeviceManagerId, + init_device_compute_kernel_config, ) from ttnn.profiler import start_tracy_zone, stop_tracy_zone, tracy_message, tracy_frame diff --git a/ttnn/ttnn/device.py b/ttnn/ttnn/device.py index e620c800a6c..b8de80cd87a 100644 --- a/ttnn/ttnn/device.py +++ b/ttnn/ttnn/device.py @@ -6,6 +6,7 @@ from typing import Optional, List import ttnn +import os def get_device_core_grid(device): @@ -27,6 +28,7 @@ def get_device_core_grid(device): DEFAULT_TRACE_REGION_SIZE = ttnn._ttnn.device.DEFAULT_TRACE_REGION_SIZE open_device = ttnn._ttnn.device.open_device +init_device_compute_kernel_config = ttnn._ttnn.operations.core.init_device_compute_kernel_config def close_device(device: "ttnn.device.Device"): @@ -132,12 +134,25 @@ def dump_device_memory_state(device, prefix=""): ttnn._ttnn.device.DumpDeviceMemoryState(device, prefix) -def 
is_wormhole_b0(device): - return device.arch() == ttnn._ttnn.device.Arch.WORMHOLE_B0 +def is_wormhole_b0(device=None): + if device is not None: + return device.arch() == ttnn._ttnn.device.Arch.WORMHOLE_B0 + ARCH_NAME = os.environ.get("ARCH_NAME", os.environ.get("TT_ARCH_NAME", "")).lower() + return "wormhole_b0" in ARCH_NAME -def is_grayskull(device): - return device.arch() == ttnn._ttnn.device.Arch.GRAYSKULL +def is_grayskull(device=None): + if device is not None: + return device.arch() == ttnn._ttnn.device.Arch.GRAYSKULL + ARCH_NAME = os.environ.get("ARCH_NAME", os.environ.get("TT_ARCH_NAME", "")).lower() + return "grayskull" in ARCH_NAME + + +def is_blackhole(device=None): + if device is not None: + return device.arch() == ttnn._ttnn.device.Arch.BLACKHOLE + ARCH_NAME = os.environ.get("ARCH_NAME", os.environ.get("TT_ARCH_NAME", "")).lower() + return "blackhole" in ARCH_NAME SetDefaultDevice = ttnn._ttnn.device.SetDefaultDevice diff --git a/ttnn/ttnn/operations/conv1d.py b/ttnn/ttnn/operations/conv1d.py index e979a12b21d..b899f01e3b3 100644 --- a/ttnn/ttnn/operations/conv1d.py +++ b/ttnn/ttnn/operations/conv1d.py @@ -28,6 +28,7 @@ def Conv1d( groups: int = 1, bias_tensor: ttnn.Tensor = None, conv_config: Conv1dConfig = None, # config overrides by user + compute_config: ttnn.DeviceComputeKernelConfig = None, conv_op_cache={}, # basic conv object caching in python needed for intermediate refactoring. Not needed after full op refactoring in C++. debug=False, ) -> Tuple[ttnn.Tensor, int, int, ttnn.Tensor, ttnn.Tensor]: @@ -60,6 +61,7 @@ def Conv1d( groups=groups, bias_tensor=bias_tensor, conv_config=conv_config, + compute_config=compute_config, ) return ( diff --git a/ttnn/ttnn/operations/conv2d.py b/ttnn/ttnn/operations/conv2d.py index ef2859c43a2..2f0fa3ee736 100644 --- a/ttnn/ttnn/operations/conv2d.py +++ b/ttnn/ttnn/operations/conv2d.py @@ -176,6 +176,7 @@ def conv2d( groups: int = 1, bias_tensor: ttnn.Tensor = None, conv_config: Conv2dConfig = None, # config overrides by user + compute_config=None, # compute config overrides by user memory_config: ttnn.MemoryConfig = None, # memory config overrides by user conv_op_cache={}, # basic conv object caching in python needed for intermediate refactoring. Not needed after full op refactoring in C++. debug=False, # ignored @@ -196,6 +197,7 @@ def conv2d( groups=groups, bias_tensor=bias_tensor, conv_config=conv_config, + compute_config=compute_config, memory_config=memory_config, )
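
For reference, the user-facing call pattern after this refactor looks roughly like the sketch below: the fidelity and accumulation knobs move off ttnn.Conv2dConfig and into a DeviceComputeKernelConfig built with ttnn.init_device_compute_kernel_config, which is then passed to ttnn.conv2d through the new compute_config argument. This is a minimal illustration based only on the signatures in this diff; the device setup, tensor shapes, and the torch-based host tensors are assumptions for the example, not part of the change.

# Minimal sketch of the post-refactor API (setup details are assumed, not from this diff).
import torch
import ttnn

device = ttnn.open_device(device_id=0)  # assumed single-device setup

# Host-side tensors; shapes and values are illustrative only.
torch_input = torch.randn(1, 32, 32, 16, dtype=torch.bfloat16)   # NHWC activation
torch_weight = torch.randn(32, 16, 3, 3, dtype=torch.bfloat16)   # OIHW weights
input_tensor = ttnn.from_torch(torch_input, dtype=ttnn.bfloat16)
weight_tensor = ttnn.from_torch(torch_weight, dtype=ttnn.bfloat16)

# Conv2dConfig no longer carries math_fidelity / math_approx_mode_enabled /
# fp32_dest_acc_enabled / packer_l1_accum_enabled after this change.
conv_config = ttnn.Conv2dConfig(
    dtype=ttnn.bfloat16,
    weights_dtype=ttnn.bfloat16,
    activation="relu",
)

# The compute knobs now live in a DeviceComputeKernelConfig, created per architecture.
compute_config = ttnn.init_device_compute_kernel_config(
    device.arch(),
    math_fidelity=ttnn.MathFidelity.LoFi,
    math_approx_mode=True,
    fp32_dest_acc_en=False,
    packer_l1_acc=False,
)

output, out_height, out_width, weights_device, bias_device = ttnn.conv2d(
    input_tensor=input_tensor,
    weight_tensor=weight_tensor,
    device=device,
    in_channels=16,
    out_channels=32,
    batch_size=1,
    input_height=32,
    input_width=32,
    kernel_size=(3, 3),
    stride=(1, 1),
    padding=(1, 1),
    dilation=(1, 1),
    groups=1,
    conv_config=conv_config,
    compute_config=compute_config,  # new argument introduced by this diff
)

ttnn.close_device(device)

If compute_config is omitted, conv2d now falls back to the default built in conv2d.cpp above (init_device_compute_kernel_config with MathFidelity::HiFi4), so callers that previously relied on Conv2dConfig's math_fidelity default should see equivalent behavior. The same compute_config keyword is threaded through Conv1d, conv_transpose2d, prepare_conv_weights, and prepare_conv_bias in this diff.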