From 24eabeafb8cbc1ed4486c22a0fc3121fc0998e20 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Tue, 26 Nov 2024 06:53:47 -0800 Subject: [PATCH] Fix optimal `nc` computation in `batch-matrix-multiply`, `convolution-nhwc`, and `dynamic-fully-connected` ops. This is the same fix that had already been applied to the `fully-connected` op. This speeds up the GEMMs for two reasons: * Larger tile sizes lead to fewer tiles and thus fewer overheads, e.g. in `pthreadpool`, * Getting the minimum number of tiles (e.g. when running with 8 threads) right provides better load balancing. PiperOrigin-RevId: 700327559 --- src/microkernel-utils.c | 27 ++++++++++++++++++++++ src/operators/batch-matrix-multiply-nc.c | 13 +++-------- src/operators/convolution-nhwc.c | 11 ++------- src/operators/dynamic-fully-connected-nc.c | 19 +++++---------- src/operators/fully-connected-nc.c | 18 ++++----------- src/xnnpack/microkernel-utils.h | 5 ++++ 6 files changed, 48 insertions(+), 45 deletions(-) diff --git a/src/microkernel-utils.c b/src/microkernel-utils.c index bf81a8db233..e40bd61b8a8 100644 --- a/src/microkernel-utils.c +++ b/src/microkernel-utils.c @@ -9,6 +9,33 @@ #include "xnnpack/math.h" #include "xnnpack/microkernel-utils.h" +size_t xnn_gemm_best_nc(size_t num_groups, size_t m, size_t n, size_t mr, + size_t nr, size_t num_threads) { + const size_t target_tiles_per_thread = 5; + size_t nc = n; + if (num_threads > 1) { + const size_t num_tile_rows = divide_round_up(m, mr) * num_groups; + const size_t max_nc = divide_round_up( + n * num_tile_rows, num_threads * target_tiles_per_thread); + if (max_nc < nc) { + nc = min(nc, divide_round_up(nc, divide_round_up(nc, max_nc) * nr) * nr); + } + } + +#ifndef NDEBUG + // Verify that we indeed have at least `target_tiles_per_thread` tiles per + // thread. + if (num_threads > 1 && nr < nc) { + const size_t num_tiles_m = divide_round_up(m, mr); + const size_t num_tiles_n = divide_round_up(n, nc); + const size_t num_tiles = num_groups * num_tiles_m * num_tiles_n; + assert(target_tiles_per_thread * num_threads <= num_tiles); + } +#endif // NDEBUG + + return nc; +} + static size_t dwconv_num_middle_pass( size_t kernel_size, size_t first_pass_tile, diff --git a/src/operators/batch-matrix-multiply-nc.c b/src/operators/batch-matrix-multiply-nc.c index c828d6a4ae5..ccf49c4f8c5 100644 --- a/src/operators/batch-matrix-multiply-nc.c +++ b/src/operators/batch-matrix-multiply-nc.c @@ -19,6 +19,7 @@ #include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microkernel-type.h" +#include "xnnpack/microkernel-utils.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" #include "xnnpack/operator-type.h" @@ -627,17 +628,9 @@ static enum xnn_status reshape_batch_matrix_multiply_nc( memcpy(&batch_matrix_multiply_op->context.gemm.gemm.gemm.params, params, params_size); batch_matrix_multiply_op->context.gemm.gemm.gemm.fused_params = &batch_matrix_multiply_op->context.gemm.gemm.gemm.params; - size_t nc = n; - if (num_threads > 1) { - const size_t num_other_tiles = divide_round_up(m, mr); - const size_t target_tiles_per_thread = 5; - const size_t max_nc = divide_round_up(n * num_other_tiles, num_threads * target_tiles_per_thread); - if (max_nc < nc) { - nc = min(nc, divide_round_up(nc, max_nc * nr) * nr); - } - } + size_t nc = xnn_gemm_best_nc(batch_size_c, m, n, mr, nr, num_threads); - #if XNN_MAX_UARCH_TYPES > 1 +#if XNN_MAX_UARCH_TYPES > 1 if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) { gemm_compute->type = xnn_parallelization_type_3d_tile_2d_with_uarch; gemm_compute->task_3d_tile_2d_with_id = diff --git a/src/operators/convolution-nhwc.c b/src/operators/convolution-nhwc.c index 0692799b06b..5269796d896 100644 --- a/src/operators/convolution-nhwc.c +++ b/src/operators/convolution-nhwc.c @@ -1946,15 +1946,8 @@ static enum xnn_status reshape_gemm( memcpy(&convolution_op->context.gemm.gemm.gemm.params, &convolution_op->params, sizeof(convolution_op->context.gemm.gemm.gemm.params)); convolution_op->context.gemm.gemm.gemm.fused_params = &convolution_op->context.gemm.gemm.gemm.params; - size_t nc = group_output_channels; - if (num_threads > 1) { - const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr); - const size_t target_tiles_per_thread = 5; - const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread); - if (max_nc < nc) { - nc = min(nc, divide_round_up(nc, max_nc * nr) * nr); - } - } + size_t nc = xnn_gemm_best_nc(groups, batch_output_size, group_output_channels, + mr, nr, num_threads); if (groups == 1) { #if XNN_MAX_UARCH_TYPES > 1 diff --git a/src/operators/dynamic-fully-connected-nc.c b/src/operators/dynamic-fully-connected-nc.c index c3eb10fab03..cf5be914310 100644 --- a/src/operators/dynamic-fully-connected-nc.c +++ b/src/operators/dynamic-fully-connected-nc.c @@ -17,7 +17,9 @@ #include "xnnpack/config.h" #include "xnnpack/log.h" #include "xnnpack/math.h" +#include "xnnpack/microfnptr.h" #include "xnnpack/microkernel-type.h" +#include "xnnpack/microkernel-utils.h" #include "xnnpack/microparams.h" #include "xnnpack/operator-type.h" #include "xnnpack/operator.h" @@ -389,20 +391,11 @@ static enum xnn_status reshape_dynamic_fully_connected_nc( } dynamic_fully_connected_op->context.gemm.gemm.gemm.fused_params = &dynamic_fully_connected_op->context.gemm.gemm.gemm.params; - size_t nc = output_channels; - const size_t num_threads = pthreadpool_get_threads_count(threadpool); - if (num_threads > 1) { - const size_t num_other_tiles = divide_round_up(batch_size, mr); - const size_t target_tiles_per_thread = 5; - const size_t max_nc = divide_round_up(output_channels * num_other_tiles, num_threads * target_tiles_per_thread); - if (max_nc < nc) { - nc = min(nc, divide_round_up(output_channels, - divide_round_up(nc, max_nc) * nr) * - nr); - } - } + size_t nc = + xnn_gemm_best_nc(/*num_groups=*/1, batch_size, output_channels, mr, nr, + pthreadpool_get_threads_count(threadpool)); - #if XNN_MAX_UARCH_TYPES > 1 +#if XNN_MAX_UARCH_TYPES > 1 if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) { dynamic_fully_connected_op->compute[1].type = xnn_parallelization_type_2d_tile_2d_with_uarch; dynamic_fully_connected_op->compute[1].task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_gemm; diff --git a/src/operators/fully-connected-nc.c b/src/operators/fully-connected-nc.c index 5a81fc86e83..c74784e279f 100644 --- a/src/operators/fully-connected-nc.c +++ b/src/operators/fully-connected-nc.c @@ -24,6 +24,7 @@ #include "xnnpack/math.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microkernel-type.h" +#include "xnnpack/microkernel-utils.h" #include "xnnpack/microparams-init.h" #include "xnnpack/microparams.h" #include "xnnpack/operator-type.h" @@ -1981,20 +1982,11 @@ static enum xnn_status reshape_fully_connected_nc( memcpy(&fully_connected_op->context.gemm.gemm.gemm.params, params, params_size); fully_connected_op->context.gemm.gemm.gemm.fused_params = &fully_connected_op->context.gemm.gemm.gemm.params; - size_t nc = output_channels; - const size_t num_threads = pthreadpool_get_threads_count(threadpool); - if (num_threads > 1) { - const size_t num_other_tiles = divide_round_up(batch_size, mr); - const size_t target_tiles_per_thread = 5; - const size_t max_nc = divide_round_up(output_channels * num_other_tiles, num_threads * target_tiles_per_thread); - if (max_nc < nc) { - nc = min(nc, divide_round_up(output_channels, - divide_round_up(nc, max_nc) * nr) * - nr); - } - } + size_t nc = + xnn_gemm_best_nc(/*num_groups=*/1, batch_size, output_channels, mr, nr, + pthreadpool_get_threads_count(threadpool)); - #if XNN_MAX_UARCH_TYPES > 1 +#if XNN_MAX_UARCH_TYPES > 1 if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) { fully_connected_op->compute[0].type = xnn_parallelization_type_2d_tile_2d_with_uarch; if (dynamic_quantization) { diff --git a/src/xnnpack/microkernel-utils.h b/src/xnnpack/microkernel-utils.h index dda4f47fa57..17965325b19 100644 --- a/src/xnnpack/microkernel-utils.h +++ b/src/xnnpack/microkernel-utils.h @@ -13,6 +13,11 @@ extern "C" { #endif +// Computes the largest `nc`, the largest multiple of `nr` such that there are +// at least five tiles per thread (if `num_threads > 1`). +size_t xnn_gemm_best_nc(size_t num_groups, size_t m, size_t n, size_t mr, + size_t nr, size_t num_threads); + // The total tile size needed to cover kernel_size. XNN_INTERNAL size_t xnn_dwconv_multipass_tile_size( size_t kernel_size,