Skip to content

Commit

Permalink
Add wrappers for the qp8_f32_qc8w GEMM kernels linked from KleidiAI.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 704703418
  • Loading branch information
gonnet authored and xnnpack-bot committed Dec 10, 2024
1 parent f9758af commit acb51c6
Show file tree
Hide file tree
Showing 47 changed files with 1,869 additions and 558 deletions.
1 change: 1 addition & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -990,6 +990,7 @@ xnnpack_cc_library(
":datatype",
":fp16",
":indirection",
":internal",
":logging",
":math",
":microkernel_configs",
Expand Down
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1565,6 +1565,7 @@ IF(XNNPACK_BUILD_TESTS)
qd8-f32-qc4w-gemm-minmax
qd8-f32-qc8w-igemm-minmax
qp8-f32-qc4w-gemm-minmax
qp8-f32-qc8w-gemm-minmax
qp8-f32-qb4w-gemm-minmax
qs8-qc8w-gemm-minmax-fp32
qs8-qc8w-igemm-minmax-fp32
Expand Down Expand Up @@ -1944,6 +1945,7 @@ IF(XNNPACK_BUILD_BENCHMARKS)
qd8-f32-qc4w-gemm
qd8-f32-qc8w-gemm
qp8-f32-qc4w-gemm
qp8-f32-qc8w-gemm
qp8-f32-qb4w-gemm
qs8-dwconv
qs8-gemm
Expand Down
6 changes: 3 additions & 3 deletions MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,10 @@ http_archive(
# KleidiAI library, used for ARM microkernels.
http_archive(
name = "KleidiAI",
sha256 = "ad37707084a6d4ff41be10cbe8540c75bea057ba79d0de6c367c1bfac6ba0852",
strip_prefix = "kleidiai-40a926833857fb64786e02f97703e42b1537cb57",
sha256 = "8ba8cdb9f945941174d34d10eb4ad158ad1cbc1aef259de5ad992b0bbe85861f",
strip_prefix = "kleidiai-7e8c4baf953227fa447a2f345e5d6491a504aa56",
urls = [
"https://gitlab.arm.com/kleidi/kleidiai/-/archive/40a926833857fb64786e02f97703e42b1537cb57/kleidiai-40a926833857fb64786e02f97703e42b1537cb57.zip",
"https://gitlab.arm.com/kleidi/kleidiai/-/archive/7e8c4baf953227fa447a2f345e5d6491a504aa56/kleidiai-7e8c4baf953227fa447a2f345e5d6491a504aa56.zip",
],
)
# LINT.ThenChange(cmake/DownloadKleidiAI.cmake)
Expand Down
17 changes: 16 additions & 1 deletion bench/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,22 @@ xnnpack_benchmark(
)

xnnpack_benchmark(
name = "qp8_f32_qb4w_gemm",
name = "qp8_f32_qc8w_gemm_bench",
srcs = [
"qp8-f32-qc8w-gemm.cc",
],
defines = xnnpack_kleidiai_defines(),
tags = xnnpack_slow_benchmark_tags(),
deps = MICROKERNEL_BENCHMARK_DEPS + [
":gemm_benchmark",
"//:isa_checks",
] + xnnpack_if_kleidiai_enabled([
"@KleidiAI//kai/ukernels/matmul",
]),
)

xnnpack_benchmark(
name = "qp8_f32_qb4w_gemm_bench",
srcs = ["qp8-f32-qb4w-gemm.cc"],
defines = xnnpack_kleidiai_defines(),
tags = xnnpack_slow_benchmark_tags(),
Expand Down
86 changes: 86 additions & 0 deletions bench/qp8-f32-qc8w-gemm.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright 2023 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
// Specification: test/qp8-f32-qc8w-gemm-minmax.yaml
// Generator: tools/generate-gemm-test.py

#include <benchmark/benchmark.h>
#include "gemm-benchmark.h"
#include "utils.h"
#include "xnnpack/common.h"
#include "xnnpack/gemm.h"
#include "xnnpack/isa-checks.h"
#include "xnnpack/microfnptr.h"
#include "xnnpack/microparams-init.h"
#include "xnnpack/pack.h"
#include "xnnpack/packw.h"


#if XNN_ENABLE_ARM_I8MM && XNN_ARCH_ARM64
#if XNN_ENABLE_KLEIDIAI
// Benchmark entry point for the
// xnn_qp8_f32_qc8w_gemm_minmax_ukernel_16x4c8__neoni8mm_mstep4 microkernel.
//
// Forwards the microkernel, the f32 min/max scalar params initializer, the
// KleidiAI qs8 weights-and-biases packing and packed-stride functions, the
// tile configuration, and the NEON I8MM ISA check to GEMMBenchmark.
// `net` is part of the signature but is not referenced in this body.
static void qp8_f32_qc8w_gemm_minmax_ukernel_16x4c8__neoni8mm_mstep4(benchmark::State& state, const char* net) {
  // Tile configuration for this kernel variant (16x4c8, mstep4).
  constexpr int kMr = 16;
  constexpr int kNr = 4;
  constexpr int kKr = 8;
  constexpr int kSr = 1;
  constexpr int kMrPacked = 4;

  GEMMBenchmark(
      state,
      xnn_qp8_f32_qc8w_gemm_minmax_ukernel_16x4c8__neoni8mm_mstep4,
      xnn_init_f32_minmax_scalar_params,
      xnn_pack_kai_qs8_weights_and_biases,
      xnn_packed_stride_kai_qs8_weights_and_biases,
      kMr, kNr, kKr, kSr,
      kMrPacked,
      benchmark::utils::CheckNEONI8MM);
}

BENCHMARK_GEMM(qp8_f32_qc8w_gemm_minmax_ukernel_16x4c8__neoni8mm_mstep4)
#endif // XNN_ENABLE_KLEIDIAI
#endif // XNN_ENABLE_ARM_I8MM && XNN_ARCH_ARM64


#if XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64
#if XNN_ENABLE_KLEIDIAI
// Benchmark entry point for the
// xnn_qp8_f32_qc8w_gemm_minmax_ukernel_1x4c4__aarch64_neondot microkernel.
//
// Forwards the microkernel, the f32 min/max scalar params initializer, the
// KleidiAI qs8 weights-and-biases packing and packed-stride functions, the
// tile configuration, and the NEON dot-product ISA check to GEMMBenchmark.
// `net` is part of the signature but is not referenced in this body.
static void qp8_f32_qc8w_gemm_minmax_ukernel_1x4c4__aarch64_neondot(benchmark::State& state, const char* net) {
  // Tile configuration for this kernel variant (1x4c4).
  constexpr int kMr = 1;
  constexpr int kNr = 4;
  constexpr int kKr = 4;
  constexpr int kSr = 1;
  constexpr int kMrPacked = 1;

  GEMMBenchmark(
      state,
      xnn_qp8_f32_qc8w_gemm_minmax_ukernel_1x4c4__aarch64_neondot,
      xnn_init_f32_minmax_scalar_params,
      xnn_pack_kai_qs8_weights_and_biases,
      xnn_packed_stride_kai_qs8_weights_and_biases,
      kMr, kNr, kKr, kSr,
      kMrPacked,
      benchmark::utils::CheckNEONDOT);
}

BENCHMARK_GEMM(qp8_f32_qc8w_gemm_minmax_ukernel_1x4c4__aarch64_neondot)

// Benchmark entry point for the
// xnn_qp8_f32_qc8w_gemm_minmax_ukernel_1x4c8__aarch64_neondot microkernel.
//
// Forwards the microkernel, the f32 min/max scalar params initializer, the
// KleidiAI qs8 weights-and-biases packing and packed-stride functions, the
// tile configuration, and the NEON dot-product ISA check to GEMMBenchmark.
// `net` is part of the signature but is not referenced in this body.
static void qp8_f32_qc8w_gemm_minmax_ukernel_1x4c8__aarch64_neondot(benchmark::State& state, const char* net) {
  // Tile configuration for this kernel variant (1x4c8).
  constexpr int kMr = 1;
  constexpr int kNr = 4;
  constexpr int kKr = 8;
  constexpr int kSr = 1;
  constexpr int kMrPacked = 1;

  GEMMBenchmark(
      state,
      xnn_qp8_f32_qc8w_gemm_minmax_ukernel_1x4c8__aarch64_neondot,
      xnn_init_f32_minmax_scalar_params,
      xnn_pack_kai_qs8_weights_and_biases,
      xnn_packed_stride_kai_qs8_weights_and_biases,
      kMr, kNr, kKr, kSr,
      kMrPacked,
      benchmark::utils::CheckNEONDOT);
}

BENCHMARK_GEMM(qp8_f32_qc8w_gemm_minmax_ukernel_1x4c8__aarch64_neondot)

// Benchmark entry point for the
// xnn_qp8_f32_qc8w_gemm_minmax_ukernel_16x4c4__aarch64_neondot_mstep4
// microkernel.
//
// Forwards the microkernel, the f32 min/max scalar params initializer, the
// KleidiAI qs8 weights-and-biases packing and packed-stride functions, the
// tile configuration, and the NEON dot-product ISA check to GEMMBenchmark.
// `net` is part of the signature but is not referenced in this body.
static void qp8_f32_qc8w_gemm_minmax_ukernel_16x4c4__aarch64_neondot_mstep4(benchmark::State& state, const char* net) {
  // Tile configuration for this kernel variant (16x4c4, mstep4).
  constexpr int kMr = 16;
  constexpr int kNr = 4;
  constexpr int kKr = 4;
  constexpr int kSr = 1;
  constexpr int kMrPacked = 4;

  GEMMBenchmark(
      state,
      xnn_qp8_f32_qc8w_gemm_minmax_ukernel_16x4c4__aarch64_neondot_mstep4,
      xnn_init_f32_minmax_scalar_params,
      xnn_pack_kai_qs8_weights_and_biases,
      xnn_packed_stride_kai_qs8_weights_and_biases,
      kMr, kNr, kKr, kSr,
      kMrPacked,
      benchmark::utils::CheckNEONDOT);
}

BENCHMARK_GEMM(qp8_f32_qc8w_gemm_minmax_ukernel_16x4c4__aarch64_neondot_mstep4)
#endif // XNN_ENABLE_KLEIDIAI
#endif // XNN_ENABLE_ARM_DOTPROD && XNN_ARCH_ARM64


// Expand Google Benchmark's BENCHMARK_MAIN() here unless the build defines
// XNNPACK_BENCHMARK_NO_MAIN (presumably because main() is provided by another
// translation unit in that configuration -- confirm against the build rules).
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
4 changes: 2 additions & 2 deletions cmake/DownloadGoogleTest.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ ENDIF()
# LINT.IfChange
INCLUDE(ExternalProject)
ExternalProject_Add(googletest
URL https://github.com/google/googletest/archive/d144031940543e15423a25ae5a8a74141044862f.zip
URL_HASH SHA256=648b9430fca63acc68c59ee98f624dcbcd9c24ea6b278c306ab6b7f49f62034a
URL https://github.com/google/googletest/archive/35d0c365609296fa4730d62057c487e3cfa030ff.zip
URL_HASH SHA256=307ccaebc77e0acd19d1d09fe856278a66d1936269a999d40accdb46ec3ab6a4
SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-source"
BINARY_DIR "${CMAKE_BINARY_DIR}/googletest"
CONFIGURE_COMMAND ""
Expand Down
4 changes: 2 additions & 2 deletions cmake/DownloadKleidiAI.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ ENDIF()

INCLUDE(ExternalProject)
ExternalProject_Add(kleidiai
URL https://gitlab.arm.com/kleidi/kleidiai/-/archive/40a926833857fb64786e02f97703e42b1537cb57/kleidiai-40a926833857fb64786e02f97703e42b1537cb57.zip
URL_HASH SHA256=ad37707084a6d4ff41be10cbe8540c75bea057ba79d0de6c367c1bfac6ba0852
URL https://gitlab.arm.com/kleidi/kleidiai/-/archive/7e8c4baf953227fa447a2f345e5d6491a504aa56/kleidiai-7e8c4baf953227fa447a2f345e5d6491a504aa56.zip
URL_HASH SHA256=8ba8cdb9f945941174d34d10eb4ad158ad1cbc1aef259de5ad992b0bbe85861f
SOURCE_DIR "${CMAKE_BINARY_DIR}/kleidiai-source"
BINARY_DIR "${CMAKE_BINARY_DIR}/kleidiai"
CONFIGURE_COMMAND ""
Expand Down
5 changes: 4 additions & 1 deletion cmake/gen/neondot_aarch64_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
SET(PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS
src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-1x4c16s2-aarch64-neondot.c
src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-1x8c16s2-aarch64-neondot.c
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x8c16s2-aarch64-neondot.c)
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x8c16s2-aarch64-neondot.c
src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x4c4-aarch64-neondot.c
src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x4c8-aarch64-neondot.c
src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x4c4-mstep4-aarch64-neondot.c)

SET(NON_PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS
src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8c8-minmax-aarch64-neondot-ld128.c
Expand Down
1 change: 1 addition & 0 deletions cmake/gen/neoni8mm_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ SET(PROD_NEONI8MM_MICROKERNEL_SRCS
src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c8-minmax-neoni8mm.c
src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-16x4c16s2-mstep4-neoni8mm.c
src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-8x8c16s2-mstep2-neoni8mm.c
src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x4c8-mstep4-neoni8mm.c
src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-neoni8mm.c
src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c8-minmax-fp32-neoni8mm.c
src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c8-minmax-fp32-neoni8mm.c
Expand Down
3 changes: 3 additions & 0 deletions gen/neondot_aarch64_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS = [
"src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-1x4c16s2-aarch64-neondot.c",
"src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-1x8c16s2-aarch64-neondot.c",
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-1x8c16s2-aarch64-neondot.c",
"src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x4c4-aarch64-neondot.c",
"src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-1x4c8-aarch64-neondot.c",
"src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x4c4-mstep4-aarch64-neondot.c",
]

NON_PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS = [
Expand Down
1 change: 1 addition & 0 deletions gen/neoni8mm_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ PROD_NEONI8MM_MICROKERNEL_SRCS = [
"src/qd8-f32-qc8w-igemm/gen/qd8-f32-qc8w-igemm-4x16c8-minmax-neoni8mm.c",
"src/qp8-f32-qb4w-gemm/qp8-f32-qb4w-gemm-minmax-16x4c16s2-mstep4-neoni8mm.c",
"src/qp8-f32-qc4w-gemm/qp8-f32-qc4w-gemm-minmax-8x8c16s2-mstep2-neoni8mm.c",
"src/qp8-f32-qc8w-gemm/qp8-f32-qc8w-gemm-minmax-16x4c8-mstep4-neoni8mm.c",
"src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-1x16c8-minmax-fp32-neoni8mm.c",
"src/qs8-qc8w-gemm/gen/qs8-qc8w-gemm-4x16c8-minmax-fp32-neoni8mm.c",
"src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-1x16c8-minmax-fp32-neoni8mm.c",
Expand Down
1 change: 1 addition & 0 deletions scripts/generate-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ tools/generate-gemm-test.py --spec test/qd8-f32-qc4w-gemm-minmax.yaml --output-t
tools/generate-gemm-test.py --spec test/qd8-f32-qb4w-gemm-minmax.yaml --output-test test/qd8-f32-qb4w-gemm-minmax.cc --output-bench bench/qd8-f32-qb4w-gemm.cc &

tools/generate-gemm-test.py --spec test/qp8-f32-qc4w-gemm-minmax.yaml --output-test test/qp8-f32-qc4w-gemm-minmax.cc --output-bench bench/qp8-f32-qc4w-gemm.cc &
tools/generate-gemm-test.py --spec test/qp8-f32-qc8w-gemm-minmax.yaml --output-test test/qp8-f32-qc8w-gemm-minmax.cc --output-bench bench/qp8-f32-qc8w-gemm.cc &
tools/generate-gemm-test.py --spec test/qp8-f32-qb4w-gemm-minmax.yaml --output-test test/qp8-f32-qb4w-gemm-minmax.cc --output-bench bench/qp8-f32-qb4w-gemm.cc &

tools/generate-gemm-test.py --spec test/qs8-qc8w-gemm-minmax-fp32.yaml --output-test test/qs8-qc8w-gemm-minmax-fp32.cc --output-test test/qs8-qc8w-gemm-minmax-fp32-2.cc --output-test test/qs8-qc8w-gemm-minmax-fp32-3.cc --output-bench bench/qs8-qc8w-gemm-fp32.cc &
Expand Down
Loading

0 comments on commit acb51c6

Please sign in to comment.