From 652490d827a15486387a12f69dc7206447a10697 Mon Sep 17 00:00:00 2001 From: Vasisht Suresh Date: Fri, 10 Jan 2025 12:39:31 -0800 Subject: [PATCH] #16366: Changed default kernel_config_val for 32bit matmul (#16567) ### Ticket Link to Github Issue https://github.com/tenstorrent/tt-metal/issues/16366 ### Problem description FP32 PCC not accurate enough for a matmul with default kernel config ### What's changed Enabled FP32 and disabled L1ACC for FP32 ### Checklist - [x] Post commit CI passes - [x] Blackhole Post commit (if applicable) - [x] Model regression CI testing passes (if applicable) - [x] Device performance regression CI testing passes (if applicable) - [x] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests pass - [ ] New/Existing tests provide coverage for changes --- tests/ttnn/unit_tests/operations/test_matmul.py | 15 +++++++++++++++ .../ttnn/operations/matmul/device/matmul_op.cpp | 16 ++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/test_matmul.py b/tests/ttnn/unit_tests/operations/test_matmul.py index 1f69c390513..62321b95f8b 100644 --- a/tests/ttnn/unit_tests/operations/test_matmul.py +++ b/tests/ttnn/unit_tests/operations/test_matmul.py @@ -2111,3 +2111,18 @@ def test_optional_output_argument(device, n_size, c, m, k, n): assert_with_pcc(torch_output_tensor, output, 0.999) assert_with_pcc(torch_output_tensor, optional_output_tensor, 0.999) assert_with_pcc(output, optional_output_tensor, 0.999) + + +def test_small_matmul_pcc(device): + torch.manual_seed(0) + pcc = 0.99 + torch_input_tensor_a = torch.rand([1, 2048]) + torch_input_tensor_b = torch.rand([2048, 1000]) + torch_output_tensor = torch.matmul(torch_input_tensor_a, torch_input_tensor_b) + + input_tensor_a = ttnn.from_torch(torch_input_tensor_a, layout=ttnn.TILE_LAYOUT, device=device) + input_tensor_b = ttnn.from_torch(torch_input_tensor_b, 
layout=ttnn.TILE_LAYOUT, device=device) + output1 = ttnn.matmul(input_tensor_a, input_tensor_b) + output_tensor = ttnn.to_torch(output1) + + assert_with_pcc(torch_output_tensor, output_tensor, pcc=pcc) diff --git a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp index c63d1416308..7a312aeabf9 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/matmul_op.cpp @@ -1225,13 +1225,6 @@ Matmul create_matmul_struct( (input_tensor_b.get_dtype() == DataType::BFLOAT8_B || input_tensor_b.get_dtype() == DataType::BFLOAT4_B)); const auto increase_fidelity = !has_program_config && !has_user_grid && !are_inputs_low_precision_df; auto math_fidelity = increase_fidelity ? MathFidelity::HiFi2 : MathFidelity::LoFi; - auto kernel_config_val = init_device_compute_kernel_config( - arch, - parameters.compute_kernel_config, - math_fidelity, - /*default_approx_mode=*/false, - /*default_fp32_acc=*/false, - /*default_l1_acc=*/true); bool broadcast_batch = parameters.bcast_batch.value_or(get_broadcast_batch(input_tensor_a, input_tensor_b, parameters.program_config)); TT_FATAL(!(has_user_grid && has_program_config), "Cannot use both user core grid/coordinates and a program config"); @@ -1267,7 +1260,14 @@ Matmul create_matmul_struct( output_dtype = input_tensor_a.get_dtype(); } } - + bool is_float_32 = output_dtype==DataType::FLOAT32; + auto kernel_config_val = init_device_compute_kernel_config( + arch, + parameters.compute_kernel_config, + math_fidelity, + /*default_approx_mode=*/false, + /*default_fp32_acc=*/is_float_32, + /*default_l1_acc=*/!is_float_32); auto in0_tile = input_tensor_a.get_tensor_spec().tile(); auto in1_tile = input_tensor_b.get_tensor_spec().tile(); tt::tt_metal::Tile output_tile = get_output_tile(output_mem_config, in0_tile, in1_tile, parameters.output_tile);