From 612744d7e9d8810f20c1458710c7f5eab361bb7e Mon Sep 17 00:00:00 2001 From: Sankar Manoj Date: Mon, 9 Dec 2024 13:07:15 +0530 Subject: [PATCH] #10860: Split Conv2dConfig into Compute config (#15164) ### Ticket #10860 ### Problem description Conv2dConfig contains compute-kernel-specific arguments, which should instead be passed using DeviceComputeKernelConfig. ### What's changed Removed the compute kernel arguments from Conv2dConfig. `conv2d` now takes an additional argument called `compute_config`. Used pybind to expose helper functions that create a DeviceComputeKernelConfig from the model code. ### Checklist - [x] Post commit CI [passes](https://github.com/tenstorrent/tt-metal/actions/runs/11893281772) - [x] Model regression CI testing [passes](https://github.com/tenstorrent/tt-metal/actions/runs/11948195753) - [x] Demo tests [passes](https://github.com/tenstorrent/tt-metal/actions/runs/11935742809) - [x] Device performance regression CI testing [passes](https://github.com/tenstorrent/tt-metal/actions/runs/11936504399) - [x] Nightly CI run. [No new regressions. 
](https://github.com/tenstorrent/tt-metal/actions/runs/12005424271) --- .../demos/convnet_mnist/tt/convnet_mnist.py | 13 +++-- .../tt/multimodal/llama_conv2d_patch.py | 3 +- models/demos/segformer/tt/common.py | 12 ++-- ..._functional_resnet50_large_new_conv_api.py | 24 ++++++-- .../ttnn_functional_resnet50_new_conv_api.py | 36 ++++++++---- ...functional_resnet50_xlarge_new_conv_api.py | 24 ++++++-- ...ctional_resnet50_xlarge_new_conv_api_24.py | 24 ++++++-- ...unctional_resnet50_xxlarge_new_conv_api.py | 24 ++++++-- models/demos/vgg/tests/test_perf_vgg.py | 11 ---- models/demos/vgg/tt/ttnn_vgg.py | 22 ++++--- models/demos/wormhole/mamba/tt/mamba_conv.py | 6 +- .../ttnn_functional_downsample_2d_new_conv.py | 12 ++-- .../ttnn_functional_resnetblock2d_new_conv.py | 49 ++++++++++------ ...ttnn_functional_transformer_2d_new_conv.py | 8 ++- ...tional_unet_2d_condition_model_new_conv.py | 25 +++++--- .../ttnn_functional_upsample_2d_new_conv.py | 12 ++-- models/demos/yolov4/ttnn/common.py | 12 ++-- .../functional_unet/tt/unet_shallow_ttnn.py | 9 ++- .../sweep_utils/conv2d_common.py | 11 ++-- .../stable_diffusion/test_demo.py | 4 ++ .../ttnn/unit_tests/operations/test_conv1d.py | 10 +++- .../operations/test_conv_transpose2d.py | 10 +++- .../unit_tests/operations/test_new_conv2d.py | 33 ++++++++--- .../operations/test_prepare_conv_weights.py | 4 +- .../operations/test_small_resnet50_block.py | 19 +++++- ttnn/cpp/pybind11/operations/core.hpp | 21 ++++++- .../ttnn/operations/conv/conv2d/conv2d.cpp | 34 +++++------ .../ttnn/operations/conv/conv2d/conv2d.hpp | 3 + .../operations/conv/conv2d/conv2d_pybind.cpp | 34 +++++------ .../operations/conv/conv2d/conv2d_utils.hpp | 12 ---- .../conv/conv2d/device/conv2d_op.cpp | 14 ++--- .../conv/conv2d/device/conv2d_op.hpp | 12 ++-- .../conv2d_op_sharded_program_factory.cpp | 1 - .../conv/conv2d/prepare_conv2d_weights.cpp | 28 ++++++--- .../conv/conv2d/prepare_conv2d_weights.hpp | 6 +- .../conv_transpose2d/conv_transpose2d.cpp | 58 
++++++++----------- .../conv_transpose2d/conv_transpose2d.hpp | 2 + .../conv_transpose2d_pybind.cpp | 16 +++-- ttnn/ttnn/__init__.py | 1 + ttnn/ttnn/device.py | 23 ++++++-- ttnn/ttnn/operations/conv1d.py | 2 + ttnn/ttnn/operations/conv2d.py | 2 + 42 files changed, 441 insertions(+), 245 deletions(-) diff --git a/models/demos/convnet_mnist/tt/convnet_mnist.py b/models/demos/convnet_mnist/tt/convnet_mnist.py index a38aa60a770..f0443e938c4 100644 --- a/models/demos/convnet_mnist/tt/convnet_mnist.py +++ b/models/demos/convnet_mnist/tt/convnet_mnist.py @@ -19,19 +19,21 @@ def convnet_mnist( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat16, weights_dtype=ttnn.bfloat16, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=False, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=True, deallocate_activation=True, reallocate_halo_output=True, ) - + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=False, + packer_l1_acc=False, + ) x = ttnn.to_layout(input_tensor, layout=ttnn.ROW_MAJOR_LAYOUT) [x, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( input_tensor=x, @@ -47,6 +49,7 @@ def convnet_mnist( input_height=input_tensor.shape[1], input_width=input_tensor.shape[2], conv_config=conv_config, + compute_config=compute_config, conv_op_cache={}, debug=True, groups=1, diff --git a/models/demos/llama3/tt/multimodal/llama_conv2d_patch.py b/models/demos/llama3/tt/multimodal/llama_conv2d_patch.py index a4d1bb59885..f5ff04f7e3e 100644 --- a/models/demos/llama3/tt/multimodal/llama_conv2d_patch.py +++ b/models/demos/llama3/tt/multimodal/llama_conv2d_patch.py @@ -79,7 +79,8 @@ def __init__( mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device), ) - self.compute_kernel_config = 
ttnn.WormholeComputeKernelConfig( + self.compute_kernel_config = ttnn.init_device_compute_kernel_config( + mesh_device.arch(), math_fidelity=ttnn.MathFidelity.HiFi2, math_approx_mode=True, fp32_dest_acc_en=True, diff --git a/models/demos/segformer/tt/common.py b/models/demos/segformer/tt/common.py index 5f52fe0e507..10a4509bc4e 100644 --- a/models/demos/segformer/tt/common.py +++ b/models/demos/segformer/tt/common.py @@ -40,12 +40,8 @@ def __call__(self, device, input_tensor): conv_config = ttnn.Conv2dConfig( dtype=self.dtype, weights_dtype=ttnn.bfloat16, - math_fidelity=ttnn.MathFidelity.LoFi, activation=self.activation, shard_layout=self.shard_layout, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=False, - packer_l1_accum_enabled=False, input_channels_alignment=16 if input_tensor.shape[3] < 16 else 32, transpose_shards=False, reshard_if_not_optimal=self.reshard, @@ -54,6 +50,13 @@ def __call__(self, device, input_tensor): enable_act_double_buffer=True, enable_split_reader=False, ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=False, + packer_l1_acc=False, + ) if self.act_block_h is not None: conv_config.act_block_h_override = self.act_block_h @@ -71,6 +74,7 @@ def __call__(self, device, input_tensor): input_height=input_tensor.shape[1], input_width=input_tensor.shape[2], conv_config=conv_config, + compute_config=compute_config, groups=self.groups, ) diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py index cfe555d0367..123e6a1cef4 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_large_new_conv_api.py @@ -183,12 +183,14 @@ def run_downsample_if_req( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], 
weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=shard_layout, deallocate_activation=True, reallocate_halo_output=True, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) ttnn.deallocate(x) @@ -230,13 +232,15 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -293,7 +297,6 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=reallocate_halo_output, @@ -303,6 +306,9 @@ def __call__( else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -324,12 +330,14 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + 
compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -562,12 +570,14 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 @@ -873,12 +883,14 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py index 44d90cb0f34..107e562a73d 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_new_conv_api.py @@ -176,7 +176,6 @@ def run_downsample_if_req( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], 
shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, @@ -184,7 +183,6 @@ def run_downsample_if_req( reallocate_halo_output=not (is_wormhole_b0() and batch_size == 16), reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=transpose_shards, - packer_l1_accum_enabled=packer_l1_accum_enabled, enable_act_double_buffer=enable_act_double_buffer if height_sharding else True @@ -194,6 +192,11 @@ def run_downsample_if_req( enable_split_reader=enable_split_reader, enable_subblock_padding=enable_subblock_padding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + packer_l1_acc=packer_l1_accum_enabled, + ), conv_op_cache=conv_op_cache, ) ttnn.deallocate(x) @@ -242,14 +245,17 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=transpose_shards, - packer_l1_accum_enabled=packer_l1_acc, + ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + packer_l1_acc=packer_l1_acc, ), conv_op_cache=conv_op_cache, ) @@ -323,7 +329,6 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=reallocate_halo_output, @@ -333,12 +338,16 @@ def __call__( else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=transpose_shards, - packer_l1_accum_enabled=packer_l1_acc, 
enable_act_double_buffer=enable_act_double_buffer, enable_weights_double_buffer=True, enable_split_reader=enable_split_reader, enable_subblock_padding=enable_subblock_padding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + packer_l1_acc=packer_l1_acc, + ), conv_op_cache=conv_op_cache, ) @@ -374,13 +383,16 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=transpose_shards, - packer_l1_accum_enabled=packer_l1_acc, + ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + packer_l1_acc=packer_l1_acc, ), conv_op_cache=conv_op_cache, ) @@ -569,19 +581,22 @@ def __init__( self.conv1_config = ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=dealloc_input, input_channels_alignment=input_channels_alignment, act_block_h_override=act_block_h_override, transpose_shards=self.transpose_shards, - packer_l1_accum_enabled=True if whb0_and_b16 else False, enable_act_double_buffer=True if whb0_and_b16 else False, enable_split_reader=True if whb0_and_b16 or not is_wormhole_b0() else False, enable_subblock_padding=False, shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, reshard_if_not_optimal=False, ) + self.conv1_compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + packer_l1_acc=True if whb0_and_b16 else False, + ) if whb0_and_b16: # Issue #13145: Temp workaround for Galaxy to 
avoid hangs if type(device) == ttnn.MeshDevice and device.get_num_devices() > 8: @@ -733,6 +748,7 @@ def run(self, input_tensor, device, ops_parallel_config, conv_op_cache={}) -> tt input_height=self.conv1_input_height, input_width=self.conv1_input_width, conv_config=self.conv1_config, + compute_config=self.conv1_compute_config, conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py index 5c0750003c1..90853eb06bc 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api.py @@ -178,7 +178,6 @@ def run_downsample_if_req( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, @@ -186,6 +185,9 @@ def run_downsample_if_req( reallocate_halo_output=True, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) ttnn.deallocate(x) @@ -225,13 +227,15 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -286,7 +290,6 @@ def __call__( 
conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=reallocate_halo_output, @@ -296,6 +299,9 @@ def __call__( else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -317,12 +323,14 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -532,12 +540,14 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 @@ -835,12 +845,14 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - 
math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py index f2e266e1d8b..77894c78318 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xlarge_new_conv_api_24.py @@ -180,7 +180,6 @@ def run_downsample_if_req( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, @@ -188,6 +187,9 @@ def run_downsample_if_req( reallocate_halo_output=True, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) ttnn.deallocate(x) @@ -227,13 +229,15 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -289,7 +293,6 @@ def 
__call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=reallocate_halo_output, @@ -299,6 +302,9 @@ def __call__( else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -320,12 +326,14 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -557,12 +565,14 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 @@ -888,12 +898,14 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - 
math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 diff --git a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py index 45d93ebf685..e1cba745a8c 100644 --- a/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py +++ b/models/demos/ttnn_resnet/tt/ttnn_functional_resnet50_xxlarge_new_conv_api.py @@ -179,7 +179,6 @@ def run_downsample_if_req( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, @@ -188,6 +187,9 @@ def run_downsample_if_req( reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=height_sharding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) ttnn.deallocate(x) @@ -232,7 +234,6 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding @@ -240,6 +241,9 @@ def __call__( reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=height_sharding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ 
-337,7 +341,6 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=reallocate_halo_output, @@ -348,6 +351,9 @@ def __call__( reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=height_sharding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -369,13 +375,15 @@ def __call__( conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED if height_sharding else ttnn.TensorMemoryLayout.BLOCK_SHARDED, reshard_if_not_optimal=reshard_if_not_optimal, transpose_shards=height_sharding, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) @@ -597,13 +605,15 @@ def first_run(self, input_tensor, device, batch_size, ops_parallel_config) -> tt conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, reallocate_halo_output=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 @@ -931,12 +941,14 @@ def optimized_run(self, input_tensor, device, batch_size, ops_parallel_config, c conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], 
weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", deallocate_activation=True, input_channels_alignment=16 if not is_wormhole_b0() else 32, act_block_h_override=act_block_h_override, ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"] + ), conv_op_cache=conv_op_cache, ) # Relu is fused with conv1 diff --git a/models/demos/vgg/tests/test_perf_vgg.py b/models/demos/vgg/tests/test_perf_vgg.py index 4d5bdf30e06..9cc0397bb07 100644 --- a/models/demos/vgg/tests/test_perf_vgg.py +++ b/models/demos/vgg/tests/test_perf_vgg.py @@ -79,17 +79,6 @@ def test_vgg( "ACTIVATIONS_DTYPE": act_dtype, } - conv_config = ttnn.Conv2dConfig( - dtype=model_config["ACTIVATIONS_DTYPE"], - weights_dtype=model_config["WEIGHTS_DTYPE"], - math_fidelity=model_config["MATH_FIDELITY"], - activation="relu", - deallocate_activation=True, - input_channels_alignment=16, - act_block_h_override=0, - transpose_shards=True, - ) - torch_batched_tensor = torch_input_tensor_nchw.repeat(batch_size, 1, 1, 1) torch_input_tensor = torch.permute(torch_batched_tensor, (0, 2, 3, 1)) tt_batched_input_tensor = ttnn.from_torch(torch_input_tensor, ttnn.bfloat16) diff --git a/models/demos/vgg/tt/ttnn_vgg.py b/models/demos/vgg/tt/ttnn_vgg.py index 0748c745d16..ace558cfa11 100644 --- a/models/demos/vgg/tt/ttnn_vgg.py +++ b/models/demos/vgg/tt/ttnn_vgg.py @@ -90,10 +90,6 @@ def ttnn_vgg16( conv_config = ttnn.Conv2dConfig( dtype=model_config["ACTIVATIONS_DTYPE"], weights_dtype=model_config["WEIGHTS_DTYPE"], - math_fidelity=model_config["MATH_FIDELITY"], - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=False, - packer_l1_accum_enabled=False, activation="relu", deallocate_activation=False, input_channels_alignment=32, @@ -107,6 +103,13 @@ def ttnn_vgg16( ) if h_override[iter_conv_id] is not None: conv_config.act_block_h_override = h_override[iter_conv_id] + compute_config = 
ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=model_config["MATH_FIDELITY"], + math_approx_mode=True, + fp32_dest_acc_en=False, + packer_l1_acc=False, + ) tt_weight = parameters.features[conv_feature_ids[iter_conv_id]].weight tt_weight = ttnn.to_layout(ttnn.from_device(tt_weight), layout=ttnn.ROW_MAJOR_LAYOUT) @@ -127,6 +130,7 @@ def ttnn_vgg16( input_height=conv_ttnn_params[iter_conv_id][2], input_width=conv_ttnn_params[iter_conv_id][3], conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_op_cache, ) tt_x = ttnn.from_device(tt_output_tensor_on_device) @@ -214,9 +218,6 @@ def ttnn_vgg11( conv_config = ttnn.Conv2dConfig( dtype=model_config["ACTIVATIONS_DTYPE"], weights_dtype=model_config["WEIGHTS_DTYPE"], - math_fidelity=model_config["MATH_FIDELITY"], - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, activation="relu", deallocate_activation=False, input_channels_alignment=32, @@ -230,6 +231,12 @@ def ttnn_vgg11( if height_override_11[iter_conv_id] is not None: conv_config.act_block_h_override = height_override_11[iter_conv_id] + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=model_config["MATH_FIDELITY"], + math_approx_mode=True, + fp32_dest_acc_en=True, + ) tt_weight = parameters.features[conv_feature_ids_2[iter_conv_id]].weight tt_weight = ttnn.to_layout(ttnn.from_device(tt_weight), layout=ttnn.ROW_MAJOR_LAYOUT) tt_bias = parameters.features[conv_feature_ids_2[iter_conv_id]].bias @@ -250,6 +257,7 @@ def ttnn_vgg11( input_height=conv_ttnn_params_2[iter_conv_id][2], input_width=conv_ttnn_params_2[iter_conv_id][3], conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_op_cache, ) tt_x = ttnn.from_device(tt_output_tensor_on_device) diff --git a/models/demos/wormhole/mamba/tt/mamba_conv.py b/models/demos/wormhole/mamba/tt/mamba_conv.py index a2700198f83..799ea950392 100644 --- a/models/demos/wormhole/mamba/tt/mamba_conv.py +++ 
b/models/demos/wormhole/mamba/tt/mamba_conv.py @@ -54,11 +54,14 @@ def prepare_conv_config(self): self.conv1d_config = ttnn.Conv1dConfig( dtype=self.config.output_dtype, weights_dtype=self.config.weights_dtype, - math_fidelity=self.config.math_fidelity, shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, input_channels_alignment=32, deallocate_activation=True, ) + self.conv1d_compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=self.config.math_fidelity, + ) def prepare_input(self, input_tensor): # input_tensor (1, 1, B, 2E) @@ -100,6 +103,7 @@ def __call__(self, input_tensor): batch_size=1, input_length=self.config.input_length, conv_config=self.conv1d_config, + compute_config=self.conv1d_compute_config, conv_op_cache={}, debug=False, groups=self.config.groups // self.config.channels_split_factor, diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py index 2ad02078d71..1d1478f94d4 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_downsample_2d_new_conv.py @@ -126,11 +126,7 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, shard_layout=self.shard_layout, input_channels_alignment=32, transpose_shards=False, @@ -140,6 +136,13 @@ def __call__( if hidden_states.memory_config() != self.input_memory_config: hidden_states = ttnn.to_memory_config(hidden_states, self.input_memory_config) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) if 
self.conv_config_override and "act_block_h" in self.conv_config_override: conv_config.act_block_h_override = self.conv_config_override["act_block_h"] @@ -157,6 +160,7 @@ def __call__( weight_tensor=self.conv_weights, bias_tensor=self.conv_bias, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) # hidden_states = run_ttnn_conv_with_pre_and_post_tensor_formatting( diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py index 4cedbdea78c..6d94f60975e 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_resnetblock2d_new_conv.py @@ -459,16 +459,19 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=self.conv1_shard_layout, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=False, ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) if self.conv1_config_override and "act_block_h" in self.conv2_config_override: conv_config.act_block_h_override = self.conv1_config_override["act_block_h"] [hidden_states, _out_height, _out_width, self.conv1s_weights[0], self.conv1s_bias[0]] = ttnn.conv2d( @@ -485,6 +488,7 @@ def __call__( input_height=self.conv1_input_height, input_width=self.conv1_input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) @@ -529,17 +533,19 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - 
math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=False, ) - + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) if self.conv1_config_override and "act_block_h" in self.conv2_config_override: conv_config.act_block_h_override = self.conv1_config_override["act_block_h"] @@ -563,6 +569,7 @@ def __call__( input_height=self.conv1_input_height, input_width=self.conv1_input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) if i != 0: @@ -658,16 +665,19 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=False, ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) if self.conv2_config_override and "act_block_h" in self.conv2_config_override: conv_config.act_block_h_override = self.conv2_config_override["act_block_h"] [hidden_states, _out_height, _out_width, self.conv2_weights, self.conv2_bias] = ttnn.conv2d( @@ -684,6 +694,7 @@ def __call__( input_height=self.conv2_input_height, input_width=self.conv2_input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) use_in_shortcut = in_channels != out_channels if use_in_shortcut is None else use_in_shortcut @@ 
-702,16 +713,19 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=False, ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) [input_tensor, _out_height, _out_width, self.conv_shortcut_weights, self.conv_shortcut_bias] = ttnn.conv2d( input_tensor=input_tensor, weight_tensor=self.conv_shortcut_weights, @@ -726,6 +740,7 @@ def __call__( input_height=self.conv_shortcut_input_height, input_width=self.conv_shortcut_input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py index 12e4d543207..3a856dce04e 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_transformer_2d_new_conv.py @@ -242,13 +242,16 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, input_channels_alignment=32, - fp32_dest_acc_enabled=self.compute_kernel_config.fp32_dest_acc_en, transpose_shards=False, ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + fp32_dest_acc_en=self.compute_kernel_config.fp32_dest_acc_en, + ) [hidden_states, _out_height, _out_width, 
self.proj_in_conv_weights, self.proj_in_conv_bias] = ttnn.conv2d( input_tensor=hidden_states, in_channels=self.proj_in_in_channels, @@ -263,6 +266,7 @@ def __call__( weight_tensor=self.proj_in_conv_weights, bias_tensor=self.proj_in_conv_bias, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py index 9cbdfff2f48..a3525c38598 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_unet_2d_condition_model_new_conv.py @@ -383,17 +383,19 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, shard_layout=shard_layout, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=True, ) - + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) [sample, _out_height, _out_width, self.conv_in_weights, self.conv_in_bias] = ttnn.conv2d( input_tensor=sample, weight_tensor=self.conv_in_weights, @@ -408,6 +410,7 @@ def __call__( input_height=self.input_height, input_width=self.input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) sample = ttnn.reallocate(sample) # TODO: Test remove @@ -646,17 +649,20 @@ def __call__( conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.HEIGHT_SHARDED, - math_approx_mode_enabled=True, - 
fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, act_block_h_override=64, transpose_shards=False, reshard_if_not_optimal=True, ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) [sample, _out_height, _out_width, self.conv_out_weights, self.conv_out_bias] = ttnn.conv2d( input_tensor=sample, in_channels=self.conv_out_in_channels, @@ -671,6 +677,7 @@ def __call__( weight_tensor=self.conv_out_weights, bias_tensor=self.conv_out_bias, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) sample = ttnn.to_memory_config(sample, ttnn.L1_MEMORY_CONFIG) diff --git a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py index 622a63065db..54056a71526 100644 --- a/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py +++ b/models/demos/wormhole/stable_diffusion/tt/ttnn_functional_upsample_2d_new_conv.py @@ -91,16 +91,19 @@ def __call__(self, input, in_channels, out_channels): conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat8_b, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation="", shard_layout=ttnn.TensorMemoryLayout.BLOCK_SHARDED, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=True, - packer_l1_accum_enabled=False, input_channels_alignment=32, transpose_shards=False, reshard_if_not_optimal=False, # Reshard has error : 1616 Bytes unique+common runtime args targeting kernel reshard_reader on (x=0,y=0) are too large. Cannot be written as they will run into memory region reserved for result. 
Max allowable size is 1024 Bytes ) + compute_config = ttnn.init_device_compute_kernel_config( + self.device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=True, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) if self.conv_config_override and "act_block_h" in self.conv_config_override: conv_config.act_block_h_override = self.conv_config_override["act_block_h"] [tt_out, _out_height, _out_width, self.conv_weight_tensor, self.conv_bias_tensor] = ttnn.conv2d( @@ -117,6 +120,7 @@ def __call__(self, input, in_channels, out_channels): weight_tensor=self.conv_weight_tensor, bias_tensor=self.conv_bias_tensor, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=conv_cache, ) return tt_out diff --git a/models/demos/yolov4/ttnn/common.py b/models/demos/yolov4/ttnn/common.py index b293a6db751..9d3b154aaf4 100644 --- a/models/demos/yolov4/ttnn/common.py +++ b/models/demos/yolov4/ttnn/common.py @@ -80,13 +80,9 @@ def __call__(self, device, input_tensor): conv_config = ttnn.Conv2dConfig( dtype=ttnn.bfloat16, weights_dtype=ttnn.bfloat8_b, - math_fidelity=ttnn.MathFidelity.LoFi, activation=self.activation, shard_layout=self.shard_layout, - math_approx_mode_enabled=True, - fp32_dest_acc_enabled=False, act_block_w_div=1, - packer_l1_accum_enabled=False, input_channels_alignment=16 if self.input_params[3] < 16 else 32, transpose_shards=False, reshard_if_not_optimal=self.reshard, @@ -96,6 +92,13 @@ def __call__(self, device, input_tensor): enable_act_double_buffer=self.enable_act_double_buffer, output_layout=self.output_layout, ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + math_approx_mode=False, + fp32_dest_acc_en=False, + packer_l1_acc=False, + ) if self.act_block_h is not None: conv_config.act_block_h_override = self.act_block_h @@ -113,5 +116,6 @@ def __call__(self, device, input_tensor): input_height=self.input_params[1], input_width=self.input_params[2], 
conv_config=conv_config, + compute_config=compute_config, ) return output_tensor diff --git a/models/experimental/functional_unet/tt/unet_shallow_ttnn.py b/models/experimental/functional_unet/tt/unet_shallow_ttnn.py index 215399ea23b..fb087ad9279 100644 --- a/models/experimental/functional_unet/tt/unet_shallow_ttnn.py +++ b/models/experimental/functional_unet/tt/unet_shallow_ttnn.py @@ -114,10 +114,8 @@ def __init__( self.conv_config = ttnn.Conv2dConfig( dtype=activation_dtype, weights_dtype=weights_dtype, - math_fidelity=ttnn.MathFidelity.LoFi, shard_layout=shard_layout, deallocate_activation=self.deallocate_activation, - packer_l1_accum_enabled=False, enable_act_double_buffer=( conv.use_activation_double_buffer if "use_activation_double_buffer" in conv else False ), @@ -128,6 +126,12 @@ def __init__( input_channels_alignment=conv.input_channels_alignment if "input_channels_alignment" in conv else 32, reshard_if_not_optimal=reshard_if_not_optimal, ) + self.compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.LoFi, + fp32_dest_acc_en=True, + packer_l1_acc=False, + ) config_override = conv.conv_blocking_and_parallelization_config_override if config_override and "act_block_h" in config_override: self.conv_config.act_block_h_override = config_override["act_block_h"] @@ -157,6 +161,7 @@ def __call__(self, x): stride=self.stride, padding=self.padding, conv_config=self.conv_config, + compute_config=self.compute_config, conv_op_cache=self.cache, groups=2, ) diff --git a/tests/sweep_framework/sweep_utils/conv2d_common.py b/tests/sweep_framework/sweep_utils/conv2d_common.py index 55769adb984..2dc1709bdbd 100644 --- a/tests/sweep_framework/sweep_utils/conv2d_common.py +++ b/tests/sweep_framework/sweep_utils/conv2d_common.py @@ -117,18 +117,20 @@ def run_full( conv_config = ttnn.Conv2dConfig( dtype=activations_dtype, weights_dtype=weights_dtype, - math_fidelity=math_fidelity, shard_layout=None, 
deallocate_activation=deallocate_activation, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, override_sharding_config=override_sharding_config, output_layout=output_layout, enable_act_double_buffer=enable_act_double_buffer, enable_split_reader=enable_split_reader, enable_subblock_padding=enable_subblock_padding, ) - + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=math_fidelity, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, + ) if override_sharding_config: if len(core_grid) == 2: conv_config.core_grid = ttnn.CoreRangeSet({ttnn.CoreRange(core_grid[0], core_grid[1])}) @@ -152,6 +154,7 @@ def run_full( input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, groups=groups, ) diff --git a/tests/ttnn/integration_tests/stable_diffusion/test_demo.py b/tests/ttnn/integration_tests/stable_diffusion/test_demo.py index 5c8dc03b967..36b73e70e3c 100644 --- a/tests/ttnn/integration_tests/stable_diffusion/test_demo.py +++ b/tests/ttnn/integration_tests/stable_diffusion/test_demo.py @@ -29,6 +29,8 @@ ((512, 512),), ) def test_demo_sd(device, reset_seeds, input_path, num_prompts, num_inference_steps, image_size): + if device.core_grid.y != 8: + pytest.skip("Needs 8x8 Grid") demo(device, reset_seeds, input_path, num_prompts, num_inference_steps, image_size) @@ -48,4 +50,6 @@ def test_demo_sd(device, reset_seeds, input_path, num_prompts, num_inference_ste ((512, 512),), ) def test_demo_sd_db(device, reset_seeds, input_path, num_prompts, num_inference_steps, image_size): + if device.core_grid.y != 8: + pytest.skip("Needs 8x8 Grid") demo_db(device, reset_seeds, input_path, num_prompts, num_inference_steps, image_size) diff --git a/tests/ttnn/unit_tests/operations/test_conv1d.py b/tests/ttnn/unit_tests/operations/test_conv1d.py index 3e7a1496c63..a7ca4c9c30c 100644 --- a/tests/ttnn/unit_tests/operations/test_conv1d.py +++ 
b/tests/ttnn/unit_tests/operations/test_conv1d.py @@ -88,12 +88,15 @@ def run_conv( conv_config = ttnn.Conv1dConfig( dtype=output_dtype, weights_dtype=weights_dtype, - math_fidelity=math_fidelity, shard_layout=shard_layout, input_channels_alignment=(16 if use_shallow_conv_variant else 32), deallocate_activation=deallocate_activation, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, + ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=math_fidelity, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, ) if config_override and "act_block_h" in config_override: conv_config.act_block_h_override = config_override["act_block_h"] @@ -117,6 +120,7 @@ def run_conv( batch_size=batch_size, input_length=input_length, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=reader_patterns_cache, debug=debug, groups=groups, diff --git a/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py b/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py index 699caa49e54..63942ef0f8f 100644 --- a/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py +++ b/tests/ttnn/unit_tests/operations/test_conv_transpose2d.py @@ -104,19 +104,22 @@ def run_conv_transpose2d( conv_config = ttnn.Conv2dConfig( dtype=activations_dtype, weights_dtype=weights_dtype, - math_fidelity=math_fidelity, shard_layout=shard_layout, input_channels_alignment=( 16 if use_shallow_conv_variant or (input_channels == 16 and input_height == 115) else 32 ), deallocate_activation=deallocate_activation, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, enable_act_double_buffer=False, enable_split_reader=False, enable_subblock_padding=False, output_layout=ttnn.ROW_MAJOR_LAYOUT, ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=math_fidelity, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, + ) if config_override and "act_block_h" in 
config_override: conv_config.act_block_h_override = config_override["act_block_h"] @@ -139,6 +142,7 @@ def run_conv_transpose2d( input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, groups=groups, ) logger.info(f"Conv2d Transpose Input = {(input_height, input_width)} Output = {out_height, out_width}") diff --git a/tests/ttnn/unit_tests/operations/test_new_conv2d.py b/tests/ttnn/unit_tests/operations/test_new_conv2d.py index 3e5f5f857f9..25d4b0bc00f 100644 --- a/tests/ttnn/unit_tests/operations/test_new_conv2d.py +++ b/tests/ttnn/unit_tests/operations/test_new_conv2d.py @@ -137,19 +137,22 @@ def run_conv( conv_config = ttnn.Conv2dConfig( dtype=activations_dtype, weights_dtype=weights_dtype, - math_fidelity=math_fidelity, shard_layout=shard_layout, input_channels_alignment=( 16 if use_shallow_conv_variant or (input_channels == 16 and input_height == 115) else 32 ), deallocate_activation=deallocate_activation, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, enable_act_double_buffer=False, enable_split_reader=False, enable_subblock_padding=False, output_layout=output_layout, ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=math_fidelity, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, + ) if config_override and "act_block_h" in config_override and not auto_shard: conv_config.act_block_h_override = config_override["act_block_h"] @@ -177,6 +180,7 @@ def run_conv( input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=reader_patterns_cache, debug=debug, groups=groups, @@ -280,12 +284,15 @@ def run_conv_with_split( conv_config = ttnn.Conv2dConfig( dtype=activations_dtype, weights_dtype=weights_dtype, - math_fidelity=math_fidelity, shard_layout=shard_layout if use_1d_systolic_array else ttnn.TensorMemoryLayout.BLOCK_SHARDED, - fp32_dest_acc_enabled=fp32_accum, - 
packer_l1_accum_enabled=packer_l1_acc, # input_channels_alignment=(16 if use_shallow_conv_variant else 32), ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=math_fidelity, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, + ) if config_override and "act_block_h" in config_override: conv_config.act_block_h_override = config_override["act_block_h"] print("Setting Act Block H to ", conv_config.act_block_h_override) @@ -320,6 +327,7 @@ def run_conv_with_split( input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=reader_patterns_cache, ) tt_conv_output_tensor = ttnn.from_device(tt_output_tensor_on_device) @@ -625,12 +633,9 @@ def test_conv_ws( conv_config = ttnn.Conv2dConfig( dtype=activations_dtype, weights_dtype=weights_dtype, - math_fidelity=ttnn.MathFidelity.HiFi4, shard_layout=ttnn.TensorMemoryLayout.WIDTH_SHARDED if not auto_shard else None, input_channels_alignment=32, deallocate_activation=deallocate_activation, - fp32_dest_acc_enabled=fp32_accum, - packer_l1_accum_enabled=packer_l1_acc, enable_act_double_buffer=False, enable_split_reader=False, enable_subblock_padding=False, @@ -638,6 +643,12 @@ def test_conv_ws( act_block_w_div=act_block_w_div if not auto_shard else 1, act_block_h_override=32, ) + compute_config = ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=ttnn.MathFidelity.HiFi4, + fp32_dest_acc_en=fp32_accum, + packer_l1_acc=packer_l1_acc, + ) [tt_output_tensor_on_device, out_height, out_width, weights_device, bias_device] = ttnn.conv2d( input_tensor=tt_input_tensor, weight_tensor=tt_weight_tensor, @@ -652,6 +663,7 @@ def test_conv_ws( input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, conv_op_cache=reader_patterns_cache, debug=debug, groups=groups, @@ -2745,6 +2757,9 @@ def test_shallow_conv_with_tiled_input(device): input_height=img_h, 
input_width=img_w, groups=1, + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + ), memory_config=ttnn.DRAM_MEMORY_CONFIG, ) diff --git a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py index 23f28658fc3..5a59200a178 100644 --- a/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py +++ b/tests/ttnn/unit_tests/operations/test_prepare_conv_weights.py @@ -127,12 +127,11 @@ def test_prepare_conv_weights( dtype=ttnn.bfloat16, weights_dtype=ttnn.bfloat16, input_channels_alignment=(16 if input_channels == 16 and input_height == 115 else 32), - packer_l1_accum_enabled=packer_l1_acc, enable_act_double_buffer=False, enable_split_reader=False, enable_subblock_padding=False, ) - + compute_config = ttnn.init_device_compute_kernel_config(device.arch(), packer_l1_acc=packer_l1_acc) if config_override and "act_block_h" in config_override: conv_config.act_block_h_override = config_override["act_block_h"] @@ -184,6 +183,7 @@ def test_prepare_conv_weights( weight_tensor=tt_weight_tensor_formatted, bias_tensor=tt_bias_tensor_formatted, **conv_kwargs, + compute_config=compute_config, ) tt_output_tensor = ttnn.from_device(tt_output_tensor_on_device) diff --git a/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py b/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py index 84ee4d5d972..bf233351d1f 100644 --- a/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py +++ b/tests/ttnn/unit_tests/operations/test_small_resnet50_block.py @@ -118,6 +118,9 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], + ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, @@ -139,9 +142,12 @@ def __call__(self, 
x, device, batch_size, input_height, input_width, conv_op_cac conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + ), conv_op_cache=conv_op_cache, ) @@ -162,6 +168,9 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], + ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, @@ -187,9 +196,12 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], - math_fidelity=self.model_config["MATH_FIDELITY"], activation="relu", ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), + math_fidelity=self.model_config["MATH_FIDELITY"], + ), conv_op_cache=conv_op_cache, ) @@ -211,6 +223,9 @@ def __call__(self, x, device, batch_size, input_height, input_width, conv_op_cac conv_config=ttnn.Conv2dConfig( dtype=self.model_config["ACTIVATIONS_DTYPE"], weights_dtype=self.model_config["WEIGHTS_DTYPE"], + ), + compute_config=ttnn.init_device_compute_kernel_config( + device.arch(), math_fidelity=self.model_config["MATH_FIDELITY"], ), conv_op_cache=conv_op_cache, diff --git a/ttnn/cpp/pybind11/operations/core.hpp b/ttnn/cpp/pybind11/operations/core.hpp index eaf0014cf52..74da55f61da 100644 --- a/ttnn/cpp/pybind11/operations/core.hpp +++ b/ttnn/cpp/pybind11/operations/core.hpp @@ -6,7 +6,9 @@ #include #include +#include +#include "pybind11/cast.h" #include 
"ttnn/cpp/pybind11/decorators.hpp" #include "ttnn/operations/core/core.hpp" #include "tt_metal/common/work_split.hpp" @@ -22,12 +24,14 @@ void py_module_types(py::module& module) { py::class_(module, "GrayskullComputeKernelConfig") .def( - py::init(), + py::init(), py::kw_only(), py::arg("math_fidelity") = MathFidelity::Invalid, - py::arg("math_approx_mode") = true) + py::arg("math_approx_mode") = true, + py::arg("dst_full_sync_en") = false) .def_readwrite("math_fidelity", &GrayskullComputeKernelConfig::math_fidelity) - .def_readwrite("math_approx_mode", &GrayskullComputeKernelConfig::math_approx_mode); + .def_readwrite("math_approx_mode", &GrayskullComputeKernelConfig::math_approx_mode) + .def_readwrite("dst_full_sync_en", &GrayskullComputeKernelConfig::dst_full_sync_en); py::class_(module, "WormholeComputeKernelConfig") .def( @@ -46,6 +50,17 @@ void py_module_types(py::module& module) { } void py_module(py::module& module) { + + module.def("init_device_compute_kernel_config", &ttnn::init_device_compute_kernel_config, + py::arg("arch"), + py::arg("device_kernel_config") = std::nullopt, + py::kw_only(), + py::arg("math_fidelity") = MathFidelity::LoFi, + py::arg("math_approx_mode") = true, + py::arg("fp32_dest_acc_en") = false, + py::arg("packer_l1_acc") = false, + py::arg("dst_full_sync_en") = false + ); module.def("unsqueeze_to_4D", &ttnn::unsqueeze_to_4D, py::arg("tensor")); module.def( diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp index 5f9ba6f0ea9..d6d06ec490f 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp @@ -17,10 +17,6 @@ #include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" #include "ttnn/operations/core/core.hpp" #include "ttnn/operations/pool/downsample/device/downsample_op.hpp" -#include "tt_metal/detail/reports/memory_reporter.hpp" -#include "tt_metal/common/work_split.hpp" -#include 
"ttnn/operations/eltwise/unary/common/unary_op_utils.hpp" -#include "ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape.hpp" #include "ttnn/operations/sliding_window/sliding_window.hpp" #include "ttnn/tensor/tensor.hpp" #include "ttnn/tensor/types.hpp" @@ -54,6 +50,7 @@ Result conv2d( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, + const std::optional& compute_config_, const std::optional& memory_config) { const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, stride, padding, dilation, groups); const uint32_t output_height = ((input_height - kernel_size[0] - ((kernel_size[0] - 1 ) * (dilation[0] - 1)) + 2 * padding[0]) / stride[0]) + 1; @@ -89,6 +86,14 @@ Result conv2d( (conv_config.weights_dtype == DataType::BFLOAT8_B || conv_config.weights_dtype == DataType::BFLOAT16) && conv_config.output_layout == Layout::ROW_MAJOR && ((elem_size * in_channels) % (16 * num_cores_c)) == 0; + DeviceComputeKernelConfig compute_config = compute_config_.value_or( init_device_compute_kernel_config( + device->arch(), + std::nullopt, + MathFidelity::HiFi4, + true, + false, + false + )); auto [input_tensor_post_tm, parallel_config, output_parallel_config, tensor_manipulated, use_non_tile_height] = shard_or_reshard_tensor_if_required( device, input_tensor, conv_config, batch_size, output_height, output_width, in_channels, out_channels, mm_conv, is_non_tile_mul_width); if (tensor_manipulated) { @@ -138,7 +143,7 @@ Result conv2d( conv_config.act_block_w_div, kernel_size[0], kernel_size[1], - conv_config.fp32_dest_acc_enabled, + get_fp32_dest_acc_en(compute_config), conv_config.enable_split_reader); bool weight_is_on_device = ttnn::is_tensor_on_device_or_multidevice(weight_tensor); ttnn::Tensor weight_tensor_on_device = weight_tensor; @@ -173,13 +178,6 @@ Result conv2d( // call optimized conv op or matmul micro op bool input_is_on_device = ttnn::is_tensor_on_device_or_multidevice(input_tensor_post_tm); TT_ASSERT(input_is_on_device); - 
DeviceComputeKernelConfig compute_kernel_config = ttnn::init_device_compute_kernel_config( - device->arch(), - std::nullopt, - conv_config.math_fidelity, - conv_config.math_approx_mode_enabled, - conv_config.fp32_dest_acc_enabled, - conv_config.packer_l1_accum_enabled); if (!mm_conv) { // call halo op @@ -238,14 +236,13 @@ Result conv2d( groups, conv_config.output_layout == Layout::ROW_MAJOR, conv_config.activation == "relu", - conv_config.math_fidelity, opt_conv_op_parallel_config, opt_conv_op_block_config, conv_out_memory_config, conv_config.dtype, {batch_size, input_height, input_width, in_channels}, conv_config.input_channels_alignment == 16, - compute_kernel_config, + compute_config, conv_config.enable_act_double_buffer, conv_config.enable_weights_double_buffer, conv_config.enable_split_reader, @@ -284,7 +281,7 @@ Result conv2d( /*bcast_batch=*/std::nullopt, conv_out_memory_config, conv_config.dtype, - compute_kernel_config}); + compute_config}); if (conv_config.deallocate_activation) { ttnn::operations::core::deallocate(matmul_input); } @@ -314,8 +311,9 @@ Result Conv2dOperation::invoke( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, + const std::optional& compute_config_, const std::optional& memory_config){ - return conv2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), memory_config); + return conv2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(compute_config_), memory_config); } Result Conv2dOperation::invoke( @@ -335,10 +333,12 @@ Result Conv2dOperation::invoke( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, + const std::optional& compute_config_, const std::optional& 
memory_config){ - return conv2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), memory_config); + return conv2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(compute_config_), memory_config); } + } // namespace conv2d } // namespace operations } // namespace ttnn diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp index d15023abb86..e8310c0dbdc 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.hpp @@ -47,6 +47,7 @@ Result conv2d( uint32_t groups, std::optional bias_tensor = std::nullopt, const std::optional& conv_config_ = std::nullopt, + const std::optional& compute_config_ = std::nullopt, const std::optional& memory_config = std::nullopt); @@ -68,6 +69,7 @@ struct Conv2dOperation{ uint32_t groups, std::optional bias_tensor = std::nullopt, const std::optional& conv_config_ = std::nullopt, + const std::optional& compute_config_ = std::nullopt, const std::optional& memory_config = std::nullopt); static Result invoke( @@ -87,6 +89,7 @@ struct Conv2dOperation{ uint32_t groups, std::optional bias_tensor = std::nullopt, const std::optional& conv_config_ = std::nullopt, + const std::optional& compute_config_ = std::nullopt, const std::optional& memory_config = std::nullopt); }; } // namespace conv2d diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp index 6ac28cf56ca..c3356447cab 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp @@ -60,10 +60,11 @@ void py_bind_conv2d(py::module& module) { 
std::array dilation, uint32_t groups, std::optional bias_tensor, - std::optional conv_config, + const std::optional& conv_config, + const std::optional& compute_config, const std::optional& memory_config, const uint8_t& queue_id) -> Result { - return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, bias_tensor, conv_config, memory_config); + return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, bias_tensor, conv_config, compute_config, memory_config); }, py::kw_only(), py::arg("input_tensor"), @@ -81,6 +82,7 @@ void py_bind_conv2d(py::module& module) { py::arg("groups"), py::arg("bias_tensor") = std::nullopt, py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("queue_id") = 0}, @@ -99,10 +101,11 @@ void py_bind_conv2d(py::module& module) { std::array dilation, uint32_t groups, std::optional bias_tensor, - std::optional conv_config, + const std::optional& conv_config, + const std::optional& compute_config, const std::optional& memory_config, const uint8_t& queue_id) -> Result { - return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, bias_tensor, conv_config, memory_config); + return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, dilation, groups, bias_tensor, conv_config, compute_config, memory_config); }, py::kw_only(), py::arg("input_tensor"), @@ -120,6 +123,7 @@ void py_bind_conv2d(py::module& module) { py::arg("groups"), py::arg("bias_tensor") = std::nullopt, py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = 
std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("queue_id") = 0} ); @@ -143,7 +147,8 @@ void py_bind_conv2d(py::module& module) { py::arg("dilation"), py::arg("groups"), py::arg("device"), - py::arg("conv_config") = std::nullopt); + py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt); module.def( @@ -165,7 +170,8 @@ void py_bind_conv2d(py::module& module) { py::arg("dilation"), py::arg("groups"), py::arg("device"), - py::arg("conv_config") = std::nullopt); + py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt); module.def( "prepare_conv_bias", @@ -185,7 +191,8 @@ void py_bind_conv2d(py::module& module) { py::arg("dilation"), py::arg("groups"), py::arg("device"), - py::arg("conv_config") = std::nullopt); + py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt); module.def( "prepare_conv_bias", @@ -205,7 +212,8 @@ void py_bind_conv2d(py::module& module) { py::arg("dilation"), py::arg("groups"), py::arg("device"), - py::arg("conv_config") = std::nullopt); + py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt); module.def( "convert_conv_weight_tensor_to_tiled_layout", @@ -266,14 +274,10 @@ void py_bind_conv2d(py::module& module) { auto py_conv_config = py::class_(module, "Conv2dConfig"); py_conv_config.def( - py::init, std::optional, bool, Layout, bool, bool, bool, bool>(), + py::init, std::optional, bool, Layout, bool, bool, bool, bool>(), py::kw_only(), - py::arg("math_fidelity") = MathFidelity::HiFi4, py::arg("dtype") = DataType::BFLOAT16, py::arg("weights_dtype") = DataType::BFLOAT16, - py::arg("math_approx_mode_enabled") = true, - py::arg("fp32_dest_acc_enabled") = false, - py::arg("packer_l1_accum_enabled") = false, py::arg("activation") = "", py::arg("input_channels_alignment") = 32, py::arg("deallocate_activation") = false, @@ -291,12 +295,8 @@ void py_bind_conv2d(py::module& module) { py::arg("enable_split_reader") = false, 
py::arg("enable_subblock_padding") = false ); - py_conv_config.def_readwrite("math_fidelity", &Conv2dConfig::math_fidelity); py_conv_config.def_readwrite("dtype", &Conv2dConfig::dtype); py_conv_config.def_readwrite("weights_dtype", &Conv2dConfig::weights_dtype); - py_conv_config.def_readwrite("math_approx_mode_enabled", &Conv2dConfig::math_approx_mode_enabled); - py_conv_config.def_readwrite("fp32_dest_acc_enabled", &Conv2dConfig::fp32_dest_acc_enabled); - py_conv_config.def_readwrite("packer_l1_accum_enabled", &Conv2dConfig::packer_l1_accum_enabled); py_conv_config.def_readwrite("activation", &Conv2dConfig::activation); py_conv_config.def_readwrite("input_channels_alignment", &Conv2dConfig::input_channels_alignment); py_conv_config.def_readwrite("deallocate_activation", &Conv2dConfig::deallocate_activation); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp index 9b9645f821f..349e3837329 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_utils.hpp @@ -30,12 +30,8 @@ using OutputWidth = uint32_t; using Result = std::tuple>; struct Conv2dConfig { - MathFidelity math_fidelity = MathFidelity::HiFi4; DataType dtype = DataType::BFLOAT16; DataType weights_dtype = DataType::BFLOAT16; - bool math_approx_mode_enabled = true; - bool fp32_dest_acc_enabled = false; - bool packer_l1_accum_enabled = false; string activation = ""; uint32_t input_channels_alignment = 32; bool deallocate_activation = false; @@ -54,12 +50,8 @@ struct Conv2dConfig { bool enable_split_reader = false; bool enable_subblock_padding = false; static constexpr auto attribute_names = std::make_tuple( - "math_fidelity", "dtype", "weights_dtype", - "math_approx_mode_enabled", - "fp32_dest_acc_enabled", - "packer_l1_accum_enabled", "activation", "input_channels_alignment", "deallocate_activation", @@ -78,12 +70,8 @@ struct Conv2dConfig { "enable_subblock_padding"); const 
auto attribute_values() const { return std::make_tuple( - std::cref(this->math_fidelity), std::cref(this->dtype), std::cref(this->weights_dtype), - std::cref(this->math_approx_mode_enabled), - std::cref(this->fp32_dest_acc_enabled), - std::cref(this->packer_l1_accum_enabled), std::cref(this->activation), std::cref(this->input_channels_alignment), std::cref(this->deallocate_activation), diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp index 9d57c98db84..e09aa621dd5 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.cpp @@ -17,6 +17,7 @@ #include "tt_metal/tt_stl/reflection.hpp" #include "tt_metal/common/work_split.hpp" +#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" #include "ttnn/operations/sharding_utilities.hpp" #include "ttnn/operations/experimental/auto_format/auto_format.hpp" @@ -57,14 +58,14 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional input_tensor_shape, bool use_shallow_conv_variant, - std::optional compute_kernel_config, + const DeviceComputeKernelConfig& compute_kernel_config, bool enable_act_double_buffer, bool enable_weights_double_buffer, bool enable_split_reader, @@ -73,7 +74,7 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional output_tensors = {Tensor(operation::get_workers_for_op_output({a, b}))}; operation::launch_op( - [sliding_window_config, output_channels, groups, untilize_out, fuse_relu, math_fidelity, parallelization_config, block_config, memory_config, dtype, input_tensor_shape, use_shallow_conv_variant, compute_kernel_config, enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, enable_subblock_padding, use_non_tile_height] + [sliding_window_config, output_channels, groups, untilize_out, fuse_relu, parallelization_config, block_config, memory_config, dtype, input_tensor_shape, 
use_shallow_conv_variant, compute_kernel_config, enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, enable_subblock_padding, use_non_tile_height] (const std::vector& input_tensors, const std::vector>& optional_input_tensors, const std::vector>& optional_output_tensors) mutable -> std::vector { using ttnn::operations::experimental::auto_format::FormatParams; auto& a = input_tensors.at(0); @@ -91,9 +92,8 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optionalarch() : ttnn::operations::experimental::auto_format::AutoFormat::GetDefaultDevice()->arch(); bool fp32_accum = a.device()->arch() == tt::ARCH::WORMHOLE_B0; // && compute_kernel_config.has_value()) ? compute_kernel_config.value().fp32_dest_acc_en : false; - auto kernel_config_val = init_device_compute_kernel_config(arch, compute_kernel_config, MathFidelity::LoFi, true, fp32_accum, false); return operation::run_without_autoformat( - OptimizedConvNew(sliding_window_config, output_channels, groups, untilize_out, bias.has_value(), fuse_relu, math_fidelity, parallelization_config, block_config, memory_config, dtype, input_tensor_shape, use_shallow_conv_variant, kernel_config_val, enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, enable_subblock_padding, use_non_tile_height + OptimizedConvNew(sliding_window_config, output_channels, groups, untilize_out, bias.has_value(), fuse_relu, parallelization_config, block_config, memory_config, dtype, input_tensor_shape, use_shallow_conv_variant, compute_kernel_config, enable_act_double_buffer, enable_weights_double_buffer, enable_split_reader, enable_subblock_padding, use_non_tile_height ), input_tensors, optional_input_tensors); @@ -219,7 +219,7 @@ operation::ProgramWithCallbacks OptimizedConvNew::create_program(const std::vect sliding_window_config, output_channels, groups, - untilize_out, fuse_relu, math_fidelity, + untilize_out, fuse_relu, parallelization_config, block_config, dtype, @@ -265,7 +265,7 
@@ operation::OpPerformanceModel OptimizedConvNew::create_op_performance_model(cons int64_t num_mul_adds_per_elem = conv_activation_c * filter_h * filter_w * 2; // 1 multiply and 1 add per element int64_t num_mul_adds = num_mul_adds_per_elem * output_height * output_width * this->output_channels * batch_size; - int ideal_dev_clock_cycles = std::ceil(((float)num_mul_adds / (float)(num_cores * tensix_mul_adds_per_cycle_lofi)) * (float)operation::OpPerformanceModel::fidelity_multiplier(this->math_fidelity)); + int ideal_dev_clock_cycles = std::ceil(((float)num_mul_adds / (float)(num_cores * tensix_mul_adds_per_cycle_lofi)) * (float)operation::OpPerformanceModel::fidelity_multiplier(get_math_fidelity(this->compute_kernel_config))); operation::OpPerformanceModel result(input_tensors, output_tensors, ideal_dev_clock_cycles); diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp index 830ca917e33..a39e97f4fac 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp @@ -47,7 +47,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_new(const T const sliding_window::SlidingWindowConfig& sliding_window_config, uint32_t output_channels, uint32_t groups, - bool untilize_out, bool fuse_relu, MathFidelity math_fidelity, + bool untilize_out, bool fuse_relu, const OptimizedConvParallelizationConfig& parallelization_config, const OptimizedConvBlockConfig& block_config, DataType dtype, @@ -69,7 +69,6 @@ struct OptimizedConvNew { const uint32_t output_channels; const uint32_t groups; bool untilize_out, has_bias, fuse_relu; - MathFidelity math_fidelity; MemoryConfig memory_config; const DataType dtype; std::array input_tensor_shape; // For sharded input, input tensor shape is nonsense @@ -84,7 +83,7 @@ struct OptimizedConvNew { uint32_t output_channels, uint32_t groups, bool untile_out, bool has_bias, bool 
fuse_relu, - MathFidelity mfidelity, const OptimizedConvParallelizationConfig& p_config, + const OptimizedConvParallelizationConfig& p_config, const OptimizedConvBlockConfig& b_config, MemoryConfig memory_config, DataType dtype, @@ -96,7 +95,6 @@ struct OptimizedConvNew { untilize_out(untile_out), has_bias(has_bias), fuse_relu(fuse_relu), - math_fidelity(mfidelity), parallelization_config(p_config), block_config(b_config), memory_config(memory_config), @@ -124,7 +122,6 @@ struct OptimizedConvNew { "untilize_out", "has_bias", "fuse_relu", - "math_fidelity", "dtype", "input_tensor_shape", "use_shallow_conv_variant", @@ -141,7 +138,6 @@ struct OptimizedConvNew { std::cref(this->untilize_out), std::cref(this->has_bias), std::cref(this->fuse_relu), - std::cref(this->math_fidelity), std::cref(this->dtype), std::cref(this->input_tensor_shape), std::cref(this->use_shallow_conv_variant), @@ -156,14 +152,14 @@ Tensor optimized_conv_new(const Tensor& a, const Tensor &b, std::optional input_tensor_shape, bool use_shallow_conv_variant, - std::optional compute_kernel_config = std::nullopt, + const DeviceComputeKernelConfig& compute_kernel_config, bool enable_act_double_buffer = false, bool enable_weights_double_buffer = false, bool enable_split_reader = false, diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp index 0b452a583df..7c0544a8c69 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op_sharded_program_factory.cpp @@ -1793,7 +1793,6 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_new( uint32_t groups, bool untilize_out, bool fuse_relu, - MathFidelity math_fidelity, const OptimizedConvParallelizationConfig& parallelization_config, const OptimizedConvBlockConfig& block_config, DataType output_dtype, diff --git 
a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp index 668372c49a4..1009ed7a87b 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.cpp @@ -4,6 +4,7 @@ #include "prepare_conv2d_weights.hpp" #include "conv2d_utils.hpp" +#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp" #include #include @@ -67,6 +68,7 @@ OptimizedConvBlockConfig get_opt_block_config( T *device, Conv2dConfig& conv_config, Layout input_tensor_layout, + const DeviceComputeKernelConfig& compute_config, const MemoryConfig& input_memory_config) { auto compute_grid_size = device->compute_with_storage_grid_size(); @@ -138,7 +140,7 @@ OptimizedConvBlockConfig get_opt_block_config( conv_config.act_block_w_div, kernel_size[0], kernel_size[1], - conv_config.fp32_dest_acc_enabled, + get_fp32_dest_acc_en(compute_config), conv_config.enable_split_reader); } @@ -289,9 +291,11 @@ ttnn::Tensor prepare_conv_weights( std::array dilation, uint32_t groups, T *device, - const std::optional& conv_config_) { + const std::optional& conv_config_, + const std::optional& compute_config_) { TT_FATAL(!ttnn::is_tensor_on_device_or_multidevice(weight_tensor), "Error: weight tensor must be on host for preparation."); Conv2dConfig conv_config = conv_config_.value_or(Conv2dConfig()); + DeviceComputeKernelConfig compute_config = compute_config_.value_or(DeviceComputeKernelConfig()); const bool mm_conv = use_matmul_for_1x1_conv(kernel_size, stride, padding, dilation, groups); const uint32_t output_height = ((input_height - kernel_size[0] - ((kernel_size[0] - 1 ) * (dilation[0] - 1)) + 2 * padding[0]) / stride[0]) + 1; const uint32_t output_width = @@ -309,6 +313,7 @@ ttnn::Tensor prepare_conv_weights( device, conv_config, input_tensor_layout, + compute_config, input_memory_config ); @@ -366,7 +371,8 @@ ttnn::Tensor prepare_conv_bias( 
std::array dilation, uint32_t groups, T *device, - const std::optional& conv_config_) { + const std::optional& conv_config_, + const std::optional& compute_config_) { TT_FATAL(!ttnn::is_tensor_on_device_or_multidevice(bias_tensor), "Error: bias tensor must be on host for preparation."); @@ -376,6 +382,7 @@ ttnn::Tensor prepare_conv_bias( ((input_width - kernel_size[1] - ((kernel_size[0] - 1) * (dilation[0] - 1)) + 2 * padding[1]) / stride[1]) + 1; Conv2dConfig conv_config = conv_config_.value_or(Conv2dConfig()); + DeviceComputeKernelConfig compute_config = compute_config_.value_or(DeviceComputeKernelConfig()); auto opt_conv_op_block_config = get_opt_block_config( mm_conv, in_channels, @@ -389,6 +396,7 @@ ttnn::Tensor prepare_conv_bias( device, conv_config, input_tensor_layout, + compute_config, input_memory_config ); @@ -423,6 +431,7 @@ template OptimizedConvBlockConfig get_opt_block_config( Device *device, Conv2dConfig& conv_config, Layout input_tensor_layout, + const DeviceComputeKernelConfig& compute_config, const ttnn::MemoryConfig& input_memory_config); template OptimizedConvBlockConfig get_opt_block_config( @@ -438,6 +447,7 @@ template OptimizedConvBlockConfig get_opt_block_config( MeshDevice *device, Conv2dConfig& conv_config, Layout input_tensor_layout, + const DeviceComputeKernelConfig& compute_config, const ttnn::MemoryConfig& input_memory_config); template ttnn::Tensor prepare_conv_weights( @@ -456,7 +466,8 @@ template ttnn::Tensor prepare_conv_weights( std::array dilation, uint32_t groups, Device *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); template ttnn::Tensor prepare_conv_weights( const ttnn::Tensor& weight_tensor, @@ -474,7 +485,8 @@ template ttnn::Tensor prepare_conv_weights( std::array dilation, uint32_t groups, MeshDevice *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); template std::pair> 
prepare_conv_weights_biases_and_move_to_device( const ttnn::Tensor& weight_tensor, @@ -521,7 +533,8 @@ template ttnn::Tensor prepare_conv_bias( std::array dilation, uint32_t groups, Device *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); template ttnn::Tensor prepare_conv_bias( const ttnn::Tensor& bias_tensor, @@ -538,7 +551,8 @@ template ttnn::Tensor prepare_conv_bias( std::array dilation, uint32_t groups, MeshDevice *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); } // namespace conv2d } // namespace operations diff --git a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp index 18e654ad37c..35b80dac824 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv2d/prepare_conv2d_weights.hpp @@ -43,7 +43,8 @@ ttnn::Tensor prepare_conv_weights( std::array dilation, uint32_t groups, T *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); template ttnn::Tensor prepare_conv_bias( @@ -61,7 +62,8 @@ ttnn::Tensor prepare_conv_bias( std::array dilation, uint32_t groups, T *device, - const std::optional& conv_config_); + const std::optional& conv_config_, + const std::optional& compute_config_); template std::pair> prepare_conv_weights_biases_and_move_to_device( diff --git a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp index e2c54193bb0..21af1f921fb 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.cpp @@ -107,8 +107,11 @@ Result conv_transpose2d( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, + 
const std::optional& compute_config_, const std::optional& memory_config ) { Conv2dConfig conv_config = conv_config_.value_or(Conv2dConfig()); + DeviceComputeKernelConfig compute_config = compute_config_.value_or(DeviceComputeKernelConfig()); + //Inverse of sliding_window.get_output_shape() SlidingWindowConfig sliding_window_config = SlidingWindowConfig{ @@ -174,32 +177,6 @@ Result conv_transpose2d( ttnn::is_tensor_on_device_or_multidevice(input_tensor) ? std::make_optional(input_tensor.memory_config()) : std::nullopt); } - DeviceComputeKernelConfig compute_kernel_config; - switch (device->arch()) { - case tt::ARCH::WORMHOLE_B0: - compute_kernel_config = WormholeComputeKernelConfig( - {.math_fidelity = conv_config.math_fidelity, - .math_approx_mode = conv_config.math_approx_mode_enabled, - .fp32_dest_acc_en = conv_config.fp32_dest_acc_enabled, - .packer_l1_acc = conv_config.packer_l1_accum_enabled}); - break; - - case tt::ARCH::GRAYSKULL: - compute_kernel_config = GrayskullComputeKernelConfig( - {.math_fidelity = conv_config.math_fidelity, .math_approx_mode = conv_config.math_approx_mode_enabled}); - break; - - case tt::ARCH::BLACKHOLE: - compute_kernel_config = BlackholeComputeKernelConfig( - {.math_fidelity = conv_config.math_fidelity, - .math_approx_mode = conv_config.math_approx_mode_enabled, - .fp32_dest_acc_en = conv_config.fp32_dest_acc_enabled, - .packer_l1_acc = conv_config.packer_l1_accum_enabled}); - break; - - default: - TT_THROW("Invalid Device Arch, Got {}",device->arch()); - } //Call Halo Transpose auto [input_tensor_post_tm, parallel_config, output_parallel_config, tensor_manipulated, use_non_tile_height] = shard_or_reshard_tensor_if_required( @@ -239,6 +216,16 @@ Result conv_transpose2d( 0, input_tensor_post_tm.memory_config()); + if(conv_config.deallocate_activation) { + input_tensor_post_tm.deallocate(); + log_debug(tt::LogOp, "Deallocate Input Tensor"); + } + if (conv_config.reallocate_halo_output) { + auto move_output = 
ttnn::operations::core::reallocate(halo_output, halo_output.memory_config()); + halo_output = move_output; + log_debug(tt::LogOp, "Reallocate Halo Output"); + } + //Call Conv2d u_op with Stride = 1, Padding = 0. auto conv_out_memory_config = create_sharded_memory_config_from_parallel_config( ttnn::Shape(std::array{1, 1, batch_size * output_height * output_width, tt::round_up(out_channels, 32)}), @@ -266,7 +253,7 @@ Result conv_transpose2d( conv_config.act_block_w_div, kernel_size[0], kernel_size[1], - conv_config.fp32_dest_acc_enabled, + get_fp32_dest_acc_en(compute_config), conv_config.enable_split_reader); //TODO: Flip the Weights @@ -300,7 +287,7 @@ Result conv_transpose2d( parallel_config.shard_orientation == ShardOrientation::COL_MAJOR, num_cores_c); Tensor matmul_input = ttnn::to_layout( - input_tensor_post_tm, Layout::TILE, conv_config.dtype, input_tensor_post_tm.memory_config(), device + halo_output, Layout::TILE, conv_config.dtype, input_tensor_post_tm.memory_config(), device ); auto matmul_output = ttnn::operations::matmul::matmul( matmul_input, @@ -311,7 +298,7 @@ Result conv_transpose2d( /*bcast_batch=*/std::nullopt, conv_out_memory_config, conv_config.dtype, - compute_kernel_config}); + compute_config}); if (conv_config.deallocate_activation) { ttnn::operations::core::deallocate(matmul_input); } @@ -332,14 +319,13 @@ Result conv_transpose2d( groups, conv_config.output_layout == Layout::ROW_MAJOR, conv_config.activation == "relu", - conv_config.math_fidelity, opt_conv_op_parallel_config, opt_conv_op_block_config, conv_out_memory_config, conv_config.dtype, {batch_size, input_height, input_width, in_channels}, conv_config.input_channels_alignment == 16, - compute_kernel_config, + compute_config, conv_config.enable_act_double_buffer, conv_config.enable_split_reader, conv_config.enable_subblock_padding); @@ -367,8 +353,9 @@ Result ConvTranpose2dOperation::invoke( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, - const 
std::optional& memory_config ) { - return conv_transpose2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(memory_config)); + const std::optional& compute_config_, + const std::optional& memory_config){ + return conv_transpose2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(compute_config_), std::move(memory_config)); } Result ConvTranpose2dOperation::invoke( @@ -389,8 +376,9 @@ Result ConvTranpose2dOperation::invoke( uint32_t groups, std::optional bias_tensor, const std::optional& conv_config_, - const std::optional& memory_config ) { - return conv_transpose2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(memory_config)); + const std::optional& compute_config_, + const std::optional& memory_config){ + return conv_transpose2d(input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, std::move(bias_tensor), std::move(conv_config_), std::move(compute_config_), std::move(memory_config)); } } diff --git a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp index 119db2cf842..fc23a6f52d6 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp +++ b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d.hpp @@ -34,6 +34,7 @@ struct ConvTranpose2dOperation{ uint32_t groups, std::optional bias_tensor = std::nullopt, 
const std::optional& conv_config_ = std::nullopt, + const std::optional& compute_config_ = std::nullopt, const std::optional& memory_config = std::nullopt); static Result invoke( @@ -54,6 +55,7 @@ struct ConvTranpose2dOperation{ uint32_t groups, std::optional bias_tensor = std::nullopt, const std::optional& conv_config_ = std::nullopt, + const std::optional& compute_config_ = std::nullopt, const std::optional& memory_config = std::nullopt); }; diff --git a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d_pybind.cpp b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d_pybind.cpp index 3cea2a187f9..1e07c21eb42 100644 --- a/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d_pybind.cpp +++ b/ttnn/cpp/ttnn/operations/conv/conv_transpose2d/conv_transpose2d_pybind.cpp @@ -62,6 +62,7 @@ void py_bind_conv_transpose2d(py::module& module) { groups (int): the number of groups for grouped convolution. bias_tensor (ttnn.Tensor, optional): the bias tensor. Defaults to `None`. conv_config (ttnn.Conv2dConfig, optional): the configuration for the convolution operation. Defaults to `None`. + compute_config (ttnn.DeviceComputeKernelConfig, optional): the configuration for the compute kernel. Defaults to `None`. queue_id (int): the queue id to use for the operation. Defaults to `0`. 
Returns: @@ -84,6 +85,7 @@ void py_bind_conv_transpose2d(py::module& module) { input_height=input_height, input_width=input_width, conv_config=conv_config, + compute_config=compute_config, groups=groups, ) )doc", @@ -103,10 +105,12 @@ void py_bind_conv_transpose2d(py::module& module) { std::array dilation, uint32_t groups, std::optional bias_tensor, - std::optional conv_config, + const std::optional& conv_config, + const std::optional& compute_config, const std::optional& memory_config, const uint8_t& queue_id) -> Result { - return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, bias_tensor, conv_config, memory_config); + return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, bias_tensor, conv_config, compute_config, memory_config); + }, py::kw_only(), py::arg("input_tensor"), @@ -125,6 +129,7 @@ void py_bind_conv_transpose2d(py::module& module) { py::arg("groups"), py::arg("bias_tensor") = std::nullopt, py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("queue_id") = 0}, @@ -144,10 +149,12 @@ void py_bind_conv_transpose2d(py::module& module) { std::array dilation, uint32_t groups, std::optional bias_tensor, - std::optional conv_config, + const std::optional& conv_config, + const std::optional& compute_config, const std::optional& memory_config, const uint8_t& queue_id) -> Result { - return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, input_width, kernel_size, stride, padding, output_padding, dilation, groups, bias_tensor, conv_config, memory_config); + return self(queue_id, input_tensor, weight_tensor, device, in_channels, out_channels, batch_size, input_height, 
input_width, kernel_size, stride, padding, output_padding, dilation, groups, bias_tensor, conv_config, compute_config, memory_config); + }, py::kw_only(), py::arg("input_tensor"), @@ -166,6 +173,7 @@ void py_bind_conv_transpose2d(py::module& module) { py::arg("groups"), py::arg("bias_tensor") = std::nullopt, py::arg("conv_config") = std::nullopt, + py::arg("compute_config") = std::nullopt, py::arg("memory_config") = std::nullopt, py::arg("queue_id") = 0} ); diff --git a/ttnn/ttnn/__init__.py b/ttnn/ttnn/__init__.py index 71f4f748660..b699ea556af 100644 --- a/ttnn/ttnn/__init__.py +++ b/ttnn/ttnn/__init__.py @@ -187,6 +187,7 @@ def manage_config(name, value): pad_to_tile_shape, SubDevice, SubDeviceManagerId, + init_device_compute_kernel_config, ) from ttnn.profiler import start_tracy_zone, stop_tracy_zone, tracy_message, tracy_frame diff --git a/ttnn/ttnn/device.py b/ttnn/ttnn/device.py index e620c800a6c..b8de80cd87a 100644 --- a/ttnn/ttnn/device.py +++ b/ttnn/ttnn/device.py @@ -6,6 +6,7 @@ from typing import Optional, List import ttnn +import os def get_device_core_grid(device): @@ -27,6 +28,7 @@ def get_device_core_grid(device): DEFAULT_TRACE_REGION_SIZE = ttnn._ttnn.device.DEFAULT_TRACE_REGION_SIZE open_device = ttnn._ttnn.device.open_device +init_device_compute_kernel_config = ttnn._ttnn.operations.core.init_device_compute_kernel_config def close_device(device: "ttnn.device.Device"): @@ -132,12 +134,25 @@ def dump_device_memory_state(device, prefix=""): ttnn._ttnn.device.DumpDeviceMemoryState(device, prefix) -def is_wormhole_b0(device): - return device.arch() == ttnn._ttnn.device.Arch.WORMHOLE_B0 +def is_wormhole_b0(device=None): + if device is not None: + return device.arch() == ttnn._ttnn.device.Arch.WORMHOLE_B0 + ARCH_NAME = os.environ.get("ARCH_NAME", os.environ.get("TT_ARCH_NAME", "")).lower() + return "wormhole_b0" in ARCH_NAME -def is_grayskull(device): - return device.arch() == ttnn._ttnn.device.Arch.GRAYSKULL +def is_grayskull(device=None): + if device 
is not None: + return device.arch() == ttnn._ttnn.device.Arch.GRAYSKULL + ARCH_NAME = os.environ.get("ARCH_NAME", os.environ.get("TT_ARCH_NAME", "")).lower() + return "grayskull" in ARCH_NAME + + +def is_blackhole(device=None): + if device is not None: + return device.arch() == ttnn._ttnn.device.Arch.BLACKHOLE + ARCH_NAME = os.environ.get("ARCH_NAME", os.environ.get("TT_ARCH_NAME", "")).lower() + return "blackhole" in ARCH_NAME SetDefaultDevice = ttnn._ttnn.device.SetDefaultDevice diff --git a/ttnn/ttnn/operations/conv1d.py b/ttnn/ttnn/operations/conv1d.py index e979a12b21d..b899f01e3b3 100644 --- a/ttnn/ttnn/operations/conv1d.py +++ b/ttnn/ttnn/operations/conv1d.py @@ -28,6 +28,7 @@ def Conv1d( groups: int = 1, bias_tensor: ttnn.Tensor = None, conv_config: Conv1dConfig = None, # config overrides by user + compute_config: ttnn.DeviceComputeKernelConfig = None, conv_op_cache={}, # basic conv object caching in python needed for intermediate refactoring. Not needed after full op refactoring in C++. debug=False, ) -> Tuple[ttnn.Tensor, int, int, ttnn.Tensor, ttnn.Tensor]: @@ -60,6 +61,7 @@ def Conv1d( groups=groups, bias_tensor=bias_tensor, conv_config=conv_config, + compute_config=compute_config, ) return ( diff --git a/ttnn/ttnn/operations/conv2d.py b/ttnn/ttnn/operations/conv2d.py index ef2859c43a2..2f0fa3ee736 100644 --- a/ttnn/ttnn/operations/conv2d.py +++ b/ttnn/ttnn/operations/conv2d.py @@ -176,6 +176,7 @@ def conv2d( groups: int = 1, bias_tensor: ttnn.Tensor = None, conv_config: Conv2dConfig = None, # config overrides by user + compute_config=None, # compute config overrides by user memory_config: ttnn.MemoryConfig = None, # memory config overrides by user conv_op_cache={}, # basic conv object caching in python needed for intermediate refactoring. Not needed after full op refactoring in C++. 
debug=False, # ignored @@ -196,6 +197,7 @@ def conv2d( groups=groups, bias_tensor=bias_tensor, conv_config=conv_config, + compute_config=compute_config, memory_config=memory_config, )