From 652490d827a15486387a12f69dc7206447a10697 Mon Sep 17 00:00:00 2001 From: Vasisht Suresh Date: Fri, 10 Jan 2025 12:39:31 -0800 Subject: [PATCH] #16366: Changed default kernel_config_val for 32bit matmul (#16567) ### Ticket Link to Github Issue https://github.com/tenstorrent/tt-metal/issues/16366 ### Problem description FP32 PCC not accurate enough for a matmul with default kernel config ### What's changed Enabled FP32 and disabled L1ACC for FP32 ### Checklist - [x] Post commit CI passes - [x] Blackhole Post commit (if applicable) - [x] Model regression CI testing passes (if applicable) - [x] Device performance regression CI testing passes (if applicable) - [x] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests pass - [ ] New/Existing tests provide coverage for changes --- tests/ttnn/unit_tests/operations/test_matmul.py | 15 +++++++++++++++ .../ttnn/operations/matmul/device/matmul_op.cpp | 16 ++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_matmul.py b/tests/ttnn/unit_tests/operations/test_matmul.py index 1f69c390513..62321b95f8b 100644 --- a/tests/ttnn/unit_tests/operations/test_matmul.py +++ b/tests/ttnn/unit_tests/operations/test_matmul.py @@ -2111,3 +2111,18 @@ def test_optional_output_argument(device, n_size, c, m, k, n): assert_with_pcc(torch_output_tensor, output, 0.999) assert_with_pcc(torch_output_tensor, optional_output_tensor, 0.999) assert_with_pcc(output, optional_output_tensor, 0.999) + + +def test_small_matmul_pcc(device): + torch.manual_seed(0) + pcc = 0.99 + torch_input_tensor_a = torch.rand([1, 2048]) + torch_input_tensor_b = torch.rand([2048, 1000]) + torch_output_tensor = torch.matmul(torch_input_tensor_a, torch_input_tensor_b) + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, 
layout=ttnn.TILE_LAYOUT, device=device) + output1 = ttnn.matmul(input_tensor_a, input_tensor_b) + output_tensor = ttnn.to_torch(output1) + + assert_with_pcc(torch_output_tensor, output_tensor, pcc=pcc) diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp index c63d1416308..7a312aeabf9 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp @@ -1225,13 +1225,6 @@ Matmul create_matmul_struct( (input_tensor_b.get_dtype() == DataType::BFLOAT8_B || input_tensor_b.get_dtype() == DataType::BFLOAT4_B)); const auto increase_fidelity = !has_program_config && !has_user_grid && !are_inputs_low_precision_df; auto math_fidelity = increase_fidelity ? MathFidelity::HiFi2 : MathFidelity::LoFi; - auto kernel_config_val = init_device_compute_kernel_config( - arch, - parameters.compute_kernel_config, - math_fidelity, - /*default_approx_mode=*/false, - /*default_fp32_acc=*/false, - /*default_l1_acc=*/true); bool broadcast_batch = parameters.bcast_batch.value_or(get_broadcast_batch(input_tensor_a, input_tensor_b, parameters.program_config)); TT_FATAL(!(has_user_grid && has_program_config), "Cannot use both user core grid/coordinates and a program config"); @@ -1267,7 +1260,14 @@ Matmul create_matmul_struct( output_dtype = input_tensor_a.get_dtype(); } } - + bool is_float_32 = output_dtype==DataType::FLOAT32; + auto kernel_config_val = init_device_compute_kernel_config( + arch, + parameters.compute_kernel_config, + math_fidelity, + /*default_approx_mode=*/false, + /*default_fp32_acc=*/is_float_32, + /*default_l1_acc=*/!is_float_32); auto in0_tile = input_tensor_a.get_tensor_spec().tile(); auto in1_tile = input_tensor_b.get_tensor_spec().tile(); tt::tt_metal::Tile output_tile = get_output_tile(output_mem_config, in0_tile, in1_tile, parameters.output_tile);