From d56494495d0e15066bd91de3549504db2fb19610 Mon Sep 17 00:00:00 2001 From: Gary Yi-Hung Chen Date: Wed, 30 Oct 2024 16:56:44 +0800 Subject: [PATCH] Add RVV f32-dwconv-unipass --- bench/f32-dwconv.cc | 32 +++ cmake/gen/rvv_microkernels.cmake | 8 + gen/rvv_microkernels.bzl | 8 + scripts/generate-f32-dwconv.sh | 11 + src/configs/dwconv-config.c | 35 ++++ src/f32-dwconv/f32-dwconv-minmax-unipass.h | 7 + src/f32-dwconv/f32-dwconv-unipass.h | 7 + .../gen/f32-dwconv-25p8vc-minmax-rvv.c | 190 ++++++++++++++++++ src/f32-dwconv/gen/f32-dwconv-25p8vc-rvv.c | 186 +++++++++++++++++ .../gen/f32-dwconv-3p8vc-minmax-rvv.c | 80 ++++++++ src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c | 76 +++++++ .../gen/f32-dwconv-4p8vc-minmax-rvv.c | 85 ++++++++ src/f32-dwconv/gen/f32-dwconv-4p8vc-rvv.c | 81 ++++++++ .../gen/f32-dwconv-9p8vc-minmax-rvv.c | 110 ++++++++++ src/f32-dwconv/gen/f32-dwconv-9p8vc-rvv.c | 106 ++++++++++ src/f32-dwconv/unipass-rvv.c.in | 76 +++++++ 16 files changed, 1098 insertions(+) create mode 100644 src/f32-dwconv/gen/f32-dwconv-25p8vc-minmax-rvv.c create mode 100644 src/f32-dwconv/gen/f32-dwconv-25p8vc-rvv.c create mode 100644 src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c create mode 100644 src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c create mode 100644 src/f32-dwconv/gen/f32-dwconv-4p8vc-minmax-rvv.c create mode 100644 src/f32-dwconv/gen/f32-dwconv-4p8vc-rvv.c create mode 100644 src/f32-dwconv/gen/f32-dwconv-9p8vc-minmax-rvv.c create mode 100644 src/f32-dwconv/gen/f32-dwconv-9p8vc-rvv.c create mode 100644 src/f32-dwconv/unipass-rvv.c.in diff --git a/bench/f32-dwconv.cc b/bench/f32-dwconv.cc index fa21809f074..82e890f9d05 100644 --- a/bench/f32-dwconv.cc +++ b/bench/f32-dwconv.cc @@ -16,6 +16,7 @@ #include "xnnpack.h" #include "xnnpack/common.h" #include "xnnpack/dwconv.h" +#include "xnnpack/hardware-config.h" #include "xnnpack/indirection.h" #include "xnnpack/microfnptr.h" #include "xnnpack/microkernel-utils.h" @@ -1592,6 +1593,37 @@ static void f32_dwconv( BENCHMARK_DWCONV(f32_dwconv_5f5m5l4c4s4r__wasmrelaxedsimd_fma_acc2) #endif +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV +static void f32_dwconv_3p8vc__rvv(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_3p8vc__rvv, + xnn_init_f32_minmax_scalar_params, + 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)) /* channel tile */, 3 /* primary tile */); +} +static void f32_dwconv_4p8vc__rvv(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_4p8vc__rvv, + xnn_init_f32_minmax_scalar_params, + 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)) /* channel tile */, 4 /* primary tile */); +} +static void f32_dwconv_9p8vc__rvv(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_9p8vc__rvv, + xnn_init_f32_minmax_scalar_params, + 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)) /* channel tile */, 9 /* primary tile */); +} +static void f32_dwconv_25p8vc__rvv(benchmark::State& state, const char* net) { + f32_dwconv(state, + xnn_f32_dwconv_minmax_ukernel_25p8vc__rvv, + xnn_init_f32_minmax_scalar_params, + 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)) /* channel tile */, 25 /* primary tile */); +} + +BENCHMARK_DWCONV(f32_dwconv_3p8vc__rvv) +BENCHMARK_DWCONV(f32_dwconv_4p8vc__rvv) +BENCHMARK_DWCONV(f32_dwconv_9p8vc__rvv) +BENCHMARK_DWCONV(f32_dwconv_25p8vc__rvv) +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV static void f32_dwconv_4p1c__scalar(benchmark::State& state, const char* net) { f32_dwconv(state, diff --git a/cmake/gen/rvv_microkernels.cmake b/cmake/gen/rvv_microkernels.cmake index 602adea680e..9c371722ebd 100644 --- a/cmake/gen/rvv_microkernels.cmake +++ b/cmake/gen/rvv_microkernels.cmake @@ -15,6 +15,14 @@ SET(PROD_RVV_MICROKERNEL_SRCS src/f32-argmaxpool/f32-argmaxpool-9x-rvv-u1v.c src/f32-avgpool/gen/f32-avgpool-9p8x-minmax-rvv-c2v.c src/f32-avgpool/gen/f32-avgpool-9x-minmax-rvv-c2v.c + src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c + src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c + src/f32-dwconv/gen/f32-dwconv-4p8vc-minmax-rvv.c + src/f32-dwconv/gen/f32-dwconv-4p8vc-rvv.c + src/f32-dwconv/gen/f32-dwconv-9p8vc-minmax-rvv.c + src/f32-dwconv/gen/f32-dwconv-9p8vc-rvv.c + src/f32-dwconv/gen/f32-dwconv-25p8vc-minmax-rvv.c + src/f32-dwconv/gen/f32-dwconv-25p8vc-rvv.c src/f32-gemm/gen/f32-gemm-1x4v-minmax-rvv.c src/f32-gemm/gen/f32-gemm-7x4v-minmax-rvv.c src/f32-igemm/gen/f32-igemm-1x4v-minmax-rvv.c diff --git a/gen/rvv_microkernels.bzl b/gen/rvv_microkernels.bzl index 8d86489dbb2..2158e7523be 100644 --- a/gen/rvv_microkernels.bzl +++ b/gen/rvv_microkernels.bzl @@ -11,6 +11,14 @@ PROD_RVV_MICROKERNEL_SRCS = [ "src/f32-argmaxpool/f32-argmaxpool-9x-rvv-u1v.c", "src/f32-avgpool/gen/f32-avgpool-9p8x-minmax-rvv-c2v.c", "src/f32-avgpool/gen/f32-avgpool-9x-minmax-rvv-c2v.c", + "src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c", + "src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c", + "src/f32-dwconv/gen/f32-dwconv-4p8vc-minmax-rvv.c", + "src/f32-dwconv/gen/f32-dwconv-4p8vc-rvv.c", + "src/f32-dwconv/gen/f32-dwconv-9p8vc-minmax-rvv.c", + "src/f32-dwconv/gen/f32-dwconv-9p8vc-rvv.c", + "src/f32-dwconv/gen/f32-dwconv-25p8vc-minmax-rvv.c", + "src/f32-dwconv/gen/f32-dwconv-25p8vc-rvv.c", "src/f32-gemm/gen/f32-gemm-1x4v-minmax-rvv.c", "src/f32-gemm/gen/f32-gemm-7x4v-minmax-rvv.c", "src/f32-igemm/gen/f32-igemm-1x4v-minmax-rvv.c", diff --git a/scripts/generate-f32-dwconv.sh b/scripts/generate-f32-dwconv.sh index 236adb70b35..f5ad98e7a62 100755 --- a/scripts/generate-f32-dwconv.sh +++ b/scripts/generate-f32-dwconv.sh @@ -459,4 +459,15 @@ tools/xngen src/f32-dwconv/multipass-avx512.c.in -D CHANNEL_TILE=16 -D FIRST_PAS tools/xngen src/f32-dwconv/multipass-avx512.c.in -D CHANNEL_TILE=32 -D FIRST_PASS_TILE=5 -D MIDDLE_PASS_TILE=5 -D LAST_PASS_TILE=5 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/f32-dwconv-5f5m5l32c16s1r-minmax-avx512f.c & tools/xngen src/f32-dwconv/multipass-avx512.c.in -D CHANNEL_TILE=32 -D FIRST_PASS_TILE=5 -D MIDDLE_PASS_TILE=5 -D LAST_PASS_TILE=5 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/f32-dwconv-5f5m5l32c16s1r-minmax-avx512f-acc2.c & +################################## RISC-V RVV ################################# +tools/xngen src/f32-dwconv/unipass-rvv.c.in -D CHANNEL_TILE=m8 -D KERNEL_TILE=3 -D ACTIVATION=LINEAR -o src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c & +tools/xngen src/f32-dwconv/unipass-rvv.c.in -D CHANNEL_TILE=m8 -D KERNEL_TILE=4 -D ACTIVATION=LINEAR -o src/f32-dwconv/gen/f32-dwconv-4p8vc-rvv.c & +tools/xngen src/f32-dwconv/unipass-rvv.c.in -D CHANNEL_TILE=m8 -D KERNEL_TILE=9 -D ACTIVATION=LINEAR -o src/f32-dwconv/gen/f32-dwconv-9p8vc-rvv.c & +tools/xngen src/f32-dwconv/unipass-rvv.c.in -D CHANNEL_TILE=m8 -D KERNEL_TILE=25 -D ACTIVATION=LINEAR -o src/f32-dwconv/gen/f32-dwconv-25p8vc-rvv.c & + +tools/xngen src/f32-dwconv/unipass-rvv.c.in -D CHANNEL_TILE=m8 -D KERNEL_TILE=3 -D ACTIVATION=MINMAX -o src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c & +tools/xngen src/f32-dwconv/unipass-rvv.c.in -D CHANNEL_TILE=m8 -D KERNEL_TILE=4 -D ACTIVATION=MINMAX -o src/f32-dwconv/gen/f32-dwconv-4p8vc-minmax-rvv.c & +tools/xngen src/f32-dwconv/unipass-rvv.c.in -D CHANNEL_TILE=m8 -D KERNEL_TILE=9 -D ACTIVATION=MINMAX -o src/f32-dwconv/gen/f32-dwconv-9p8vc-minmax-rvv.c & +tools/xngen src/f32-dwconv/unipass-rvv.c.in -D CHANNEL_TILE=m8 -D KERNEL_TILE=25 -D ACTIVATION=MINMAX -o src/f32-dwconv/gen/f32-dwconv-25p8vc-minmax-rvv.c & + wait diff --git a/src/configs/dwconv-config.c b/src/configs/dwconv-config.c index 804aa34cefe..04a3e9bf40e 100644 --- a/src/configs/dwconv-config.c +++ b/src/configs/dwconv-config.c @@ -638,6 +638,41 @@ static void init_f32_dwconv_config(void) { f32_dwconv_config[3].channel_round = 1; f32_dwconv_config[3].primary_tile = 25; #endif // XNN_ENABLE_DWCONV_MULTIPASS + #elif XNN_ARCH_RISCV && XNN_ENABLE_RISCV_VECTOR + const int lmul = 8; + const int element_size = 4; + const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); + f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p8vc__rvv; + f32_dwconv_config[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p8vc__rvv; + f32_dwconv_config[0].init.f32 = xnn_init_f32_minmax_scalar_params; + f32_dwconv_config[0].channel_tile = hardware_config->vlenb / element_size * lmul; + f32_dwconv_config[0].channel_subtile = hardware_config->vlenb / element_size * lmul; + f32_dwconv_config[0].channel_round = 1; + f32_dwconv_config[0].primary_tile = 3; + + f32_dwconv_config[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_4p8vc__rvv; + f32_dwconv_config[1].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_4p8vc__rvv; + f32_dwconv_config[1].init.f32 = xnn_init_f32_minmax_scalar_params; + f32_dwconv_config[1].channel_tile = hardware_config->vlenb / element_size * lmul; + f32_dwconv_config[1].channel_subtile = hardware_config->vlenb / element_size * lmul; + f32_dwconv_config[1].channel_round = 1; + f32_dwconv_config[1].primary_tile = 4; + + f32_dwconv_config[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_9p8vc__rvv; + f32_dwconv_config[2].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_9p8vc__rvv; + f32_dwconv_config[2].init.f32 = xnn_init_f32_minmax_scalar_params; + f32_dwconv_config[2].channel_tile = hardware_config->vlenb / element_size * lmul; + f32_dwconv_config[2].channel_subtile = hardware_config->vlenb / element_size * lmul; + f32_dwconv_config[2].channel_round = 1; + f32_dwconv_config[2].primary_tile = 9; + + f32_dwconv_config[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_25p8vc__rvv; + f32_dwconv_config[3].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_25p8vc__rvv; + f32_dwconv_config[3].init.f32 = xnn_init_f32_minmax_scalar_params; + f32_dwconv_config[3].channel_tile = hardware_config->vlenb / element_size * lmul; + f32_dwconv_config[3].channel_subtile = hardware_config->vlenb / element_size * lmul; + f32_dwconv_config[3].channel_round = 1; + f32_dwconv_config[3].primary_tile = 25; #else f32_dwconv_config[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2; f32_dwconv_config[0].linear.unipass = (xnn_dwconv_unipass_ukernel_fn) xnn_f32_dwconv_ukernel_3p1c__scalar_acc2; diff --git a/src/f32-dwconv/f32-dwconv-minmax-unipass.h b/src/f32-dwconv/f32-dwconv-minmax-unipass.h index 56b94f8b195..4d94199a21e 100644 --- a/src/f32-dwconv/f32-dwconv-minmax-unipass.h +++ b/src/f32-dwconv/f32-dwconv-minmax-unipass.h @@ -221,6 +221,13 @@ XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p2c__wasm, 2, false, 2, 25 XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_25p2c__wasm_acc2, 2, false, 2, 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) #endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV +XNN_DWCONV_UNIPASS(xnn_arch_riscv_vector, xnn_f32_dwconv_minmax_ukernel_3p8vc__rvv, 8, false, 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)), 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_riscv_vector, xnn_f32_dwconv_minmax_ukernel_4p8vc__rvv, 8, false, 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)), 4, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_riscv_vector, xnn_f32_dwconv_minmax_ukernel_9p8vc__rvv, 8, false, 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)), 9, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +XNN_DWCONV_UNIPASS(xnn_arch_riscv_vector, xnn_f32_dwconv_minmax_ukernel_25p8vc__rvv, 8, false, 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)), 25, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p1c__scalar, 1, false, 1, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p1c__scalar_acc2, 1, false, 1, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_minmax_ukernel_3p2c__scalar, 2, false, 2, 3, float, float, union xnn_f32_minmax_params, xnn_init_f32_minmax_scalar_params) diff --git a/src/f32-dwconv/f32-dwconv-unipass.h b/src/f32-dwconv/f32-dwconv-unipass.h index f52415fdb7d..92df50e0747 100644 --- a/src/f32-dwconv/f32-dwconv-unipass.h +++ b/src/f32-dwconv/f32-dwconv-unipass.h @@ -30,6 +30,13 @@ XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_25p4c__wasmrelaxedsimd_fma, 4, fals XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_25p8c__wasmrelaxedsimd_fma, 8, false, 8, 25, float, float, struct xnn_f32_default_params, NULL) #endif // XNN_ARCH_WASMRELAXEDSIMD +#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV +XNN_DWCONV_UNIPASS(xnn_arch_riscv_vector, xnn_f32_dwconv_ukernel_3p8vc__rvv, 8, false, 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)), 3, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(xnn_arch_riscv_vector, xnn_f32_dwconv_ukernel_4p8vc__rvv, 8, false, 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)), 4, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(xnn_arch_riscv_vector, xnn_f32_dwconv_ukernel_9p8vc__rvv, 8, false, 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)), 9, float, float, struct xnn_f32_default_params, NULL) +XNN_DWCONV_UNIPASS(xnn_arch_riscv_vector, xnn_f32_dwconv_ukernel_25p8vc__rvv, 8, false, 8 * (xnn_init_hardware_config()->vlenb / sizeof(float)), 25, float, float, struct xnn_f32_default_params, NULL) +#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV + XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p1c__scalar, 1, false, 1, 3, float, float, struct xnn_f32_default_params, NULL) XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p1c__scalar_acc2, 1, false, 1, 3, float, float, struct xnn_f32_default_params, NULL) XNN_DWCONV_UNIPASS(0, xnn_f32_dwconv_ukernel_3p2c__scalar, 2, false, 2, 3, float, float, struct xnn_f32_default_params, NULL) diff --git a/src/f32-dwconv/gen/f32-dwconv-25p8vc-minmax-rvv.c b/src/f32-dwconv/gen/f32-dwconv-25p8vc-minmax-rvv.c new file mode 100644 index 00000000000..7b270c2b151 --- /dev/null +++ b/src/f32-dwconv/gen/f32-dwconv-25p8vc-minmax-rvv.c @@ -0,0 +1,190 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-dwconv/unipass-rvv.c.in +// Generator: tools/xngen +// + +// Copyright 2024 Andes Technology Corporation +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.# + +#include +#include +#include "xnnpack/dwconv.h" + +void xnn_f32_dwconv_minmax_ukernel_25p8vc__rvv( + size_t channels, + size_t output_width, + const float** input, + const float* weights, + float* output, + intptr_t input_stride, + size_t output_increment, + size_t input_offset, + const float* zero, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(channels != 0); + assert(output_width != 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + do { + const float* i[25]; + i[0] = input[0]; + assert(i[0] != NULL); + if XNN_UNPREDICTABLE(i[0] != zero) { + i[0] = (const float*) ((uintptr_t) i[0] + input_offset); + } + i[1] = input[1]; + assert(i[1] != NULL); + if XNN_UNPREDICTABLE(i[1] != zero) { + i[1] = (const float*) ((uintptr_t) i[1] + input_offset); + } + i[2] = input[2]; + assert(i[2] != NULL); + if XNN_UNPREDICTABLE(i[2] != zero) { + i[2] = (const float*) ((uintptr_t) i[2] + input_offset); + } + i[3] = input[3]; + assert(i[3] != NULL); + if XNN_UNPREDICTABLE(i[3] != zero) { + i[3] = (const float*) ((uintptr_t) i[3] + input_offset); + } + i[4] = input[4]; + assert(i[4] != NULL); + if XNN_UNPREDICTABLE(i[4] != zero) { + i[4] = (const float*) ((uintptr_t) i[4] + input_offset); + } + i[5] = input[5]; + assert(i[5] != NULL); + if XNN_UNPREDICTABLE(i[5] != zero) { + i[5] = (const float*) ((uintptr_t) i[5] + input_offset); + } + i[6] = input[6]; + assert(i[6] != NULL); + if XNN_UNPREDICTABLE(i[6] != zero) { + i[6] = (const float*) ((uintptr_t) i[6] + input_offset); + } + i[7] = input[7]; + assert(i[7] != NULL); + if XNN_UNPREDICTABLE(i[7] != zero) { + i[7] = (const float*) ((uintptr_t) i[7] + input_offset); + } + i[8] = input[8]; + assert(i[8] != NULL); + if XNN_UNPREDICTABLE(i[8] != zero) { + i[8] = (const float*) ((uintptr_t) i[8] + input_offset); + } + i[9] = input[9]; + assert(i[9] != NULL); + if XNN_UNPREDICTABLE(i[9] != zero) { + i[9] = (const float*) ((uintptr_t) i[9] + input_offset); + } + i[10] = input[10]; + assert(i[10] != NULL); + if XNN_UNPREDICTABLE(i[10] != zero) { + i[10] = (const float*) ((uintptr_t) i[10] + input_offset); + } + i[11] = input[11]; + assert(i[11] != NULL); + if XNN_UNPREDICTABLE(i[11] != zero) { + i[11] = (const float*) ((uintptr_t) i[11] + input_offset); + } + i[12] = input[12]; + assert(i[12] != NULL); + if XNN_UNPREDICTABLE(i[12] != zero) { + i[12] = (const float*) ((uintptr_t) i[12] + input_offset); + } + i[13] = input[13]; + assert(i[13] != NULL); + if XNN_UNPREDICTABLE(i[13] != zero) { + i[13] = (const float*) ((uintptr_t) i[13] + input_offset); + } + i[14] = input[14]; + assert(i[14] != NULL); + if XNN_UNPREDICTABLE(i[14] != zero) { + i[14] = (const float*) ((uintptr_t) i[14] + input_offset); + } + i[15] = input[15]; + assert(i[15] != NULL); + if XNN_UNPREDICTABLE(i[15] != zero) { + i[15] = (const float*) ((uintptr_t) i[15] + input_offset); + } + i[16] = input[16]; + assert(i[16] != NULL); + if XNN_UNPREDICTABLE(i[16] != zero) { + i[16] = (const float*) ((uintptr_t) i[16] + input_offset); + } + i[17] = input[17]; + assert(i[17] != NULL); + if XNN_UNPREDICTABLE(i[17] != zero) { + i[17] = (const float*) ((uintptr_t) i[17] + input_offset); + } + i[18] = input[18]; + assert(i[18] != NULL); + if XNN_UNPREDICTABLE(i[18] != zero) { + i[18] = (const float*) ((uintptr_t) i[18] + input_offset); + } + i[19] = input[19]; + assert(i[19] != NULL); + if XNN_UNPREDICTABLE(i[19] != zero) { + i[19] = (const float*) ((uintptr_t) i[19] + input_offset); + } + i[20] = input[20]; + assert(i[20] != NULL); + if XNN_UNPREDICTABLE(i[20] != zero) { + i[20] = (const float*) ((uintptr_t) i[20] + input_offset); + } + i[21] = input[21]; + assert(i[21] != NULL); + if XNN_UNPREDICTABLE(i[21] != zero) { + i[21] = (const float*) ((uintptr_t) i[21] + input_offset); + } + i[22] = input[22]; + assert(i[22] != NULL); + if XNN_UNPREDICTABLE(i[22] != zero) { + i[22] = (const float*) ((uintptr_t) i[22] + input_offset); + } + i[23] = input[23]; + assert(i[23] != NULL); + if XNN_UNPREDICTABLE(i[23] != zero) { + i[23] = (const float*) ((uintptr_t) i[23] + input_offset); + } + i[24] = input[24]; + assert(i[24] != NULL); + if XNN_UNPREDICTABLE(i[24] != zero) { + i[24] = (const float*) ((uintptr_t) i[24] + input_offset); + } + input = (const float**) ((uintptr_t) input + input_stride); + + size_t c = channels; + const float* w = weights; + const size_t vlmax = __riscv_vsetvlmax_e32m8(); + size_t vl; + + do { + vl = __riscv_vsetvl_e32m8(c); + // load bias to vAcc + vfloat32m8_t vAcc = __riscv_vle32_v_f32m8_tu(vAcc, w, vl); + w += vlmax; + + vfloat32m8_t va; + vfloat32m8_t vb; + for (int k=0; k<25; k++) { + va = __riscv_vle32_v_f32m8_tu(va, i[k], vl); + vb = __riscv_vle32_v_f32m8_tu(vb, w, vl); + w += vlmax; + i[k] += vlmax; + vAcc = __riscv_vfmacc_vv_f32m8_tu(vAcc, va, vb, vl); + } + + vAcc = __riscv_vfmax_vf_f32m8_tu(vAcc, vAcc, vmin, vl); + vAcc = __riscv_vfmin_vf_f32m8_tu(vAcc, vAcc, vmax, vl); + __riscv_vse32_v_f32m8(output, vAcc, vl); + output += vl; + c -= vl; + } while(c != 0); + output = (float*) ((uintptr_t) output + output_increment); + } while (--output_width != 0); +} diff --git a/src/f32-dwconv/gen/f32-dwconv-25p8vc-rvv.c b/src/f32-dwconv/gen/f32-dwconv-25p8vc-rvv.c new file mode 100644 index 00000000000..37be909e00a --- /dev/null +++ b/src/f32-dwconv/gen/f32-dwconv-25p8vc-rvv.c @@ -0,0 +1,186 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-dwconv/unipass-rvv.c.in +// Generator: tools/xngen +// + +// Copyright 2024 Andes Technology Corporation +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.# + +#include +#include +#include "xnnpack/dwconv.h" + +void xnn_f32_dwconv_ukernel_25p8vc__rvv( + size_t channels, + size_t output_width, + const float** input, + const float* weights, + float* output, + intptr_t input_stride, + size_t output_increment, + size_t input_offset, + const float* zero, + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(channels != 0); + assert(output_width != 0); + + do { + const float* i[25]; + i[0] = input[0]; + assert(i[0] != NULL); + if XNN_UNPREDICTABLE(i[0] != zero) { + i[0] = (const float*) ((uintptr_t) i[0] + input_offset); + } + i[1] = input[1]; + assert(i[1] != NULL); + if XNN_UNPREDICTABLE(i[1] != zero) { + i[1] = (const float*) ((uintptr_t) i[1] + input_offset); + } + i[2] = input[2]; + assert(i[2] != NULL); + if XNN_UNPREDICTABLE(i[2] != zero) { + i[2] = (const float*) ((uintptr_t) i[2] + input_offset); + } + i[3] = input[3]; + assert(i[3] != NULL); + if XNN_UNPREDICTABLE(i[3] != zero) { + i[3] = (const float*) ((uintptr_t) i[3] + input_offset); + } + i[4] = input[4]; + assert(i[4] != NULL); + if XNN_UNPREDICTABLE(i[4] != zero) { + i[4] = (const float*) ((uintptr_t) i[4] + input_offset); + } + i[5] = input[5]; + assert(i[5] != NULL); + if XNN_UNPREDICTABLE(i[5] != zero) { + i[5] = (const float*) ((uintptr_t) i[5] + input_offset); + } + i[6] = input[6]; + assert(i[6] != NULL); + if XNN_UNPREDICTABLE(i[6] != zero) { + i[6] = (const float*) ((uintptr_t) i[6] + input_offset); + } + i[7] = input[7]; + assert(i[7] != NULL); + if XNN_UNPREDICTABLE(i[7] != zero) { + i[7] = (const float*) ((uintptr_t) i[7] + input_offset); + } + i[8] = input[8]; + assert(i[8] != NULL); + if XNN_UNPREDICTABLE(i[8] != zero) { + i[8] = (const float*) ((uintptr_t) i[8] + input_offset); + } + i[9] = input[9]; + assert(i[9] != NULL); + if XNN_UNPREDICTABLE(i[9] != zero) { + i[9] = (const float*) ((uintptr_t) i[9] + input_offset); + } + i[10] = input[10]; + assert(i[10] != NULL); + if XNN_UNPREDICTABLE(i[10] != zero) { + i[10] = (const float*) ((uintptr_t) i[10] + input_offset); + } + i[11] = input[11]; + assert(i[11] != NULL); + if XNN_UNPREDICTABLE(i[11] != zero) { + i[11] = (const float*) ((uintptr_t) i[11] + input_offset); + } + i[12] = input[12]; + assert(i[12] != NULL); + if XNN_UNPREDICTABLE(i[12] != zero) { + i[12] = (const float*) ((uintptr_t) i[12] + input_offset); + } + i[13] = input[13]; + assert(i[13] != NULL); + if XNN_UNPREDICTABLE(i[13] != zero) { + i[13] = (const float*) ((uintptr_t) i[13] + input_offset); + } + i[14] = input[14]; + assert(i[14] != NULL); + if XNN_UNPREDICTABLE(i[14] != zero) { + i[14] = (const float*) ((uintptr_t) i[14] + input_offset); + } + i[15] = input[15]; + assert(i[15] != NULL); + if XNN_UNPREDICTABLE(i[15] != zero) { + i[15] = (const float*) ((uintptr_t) i[15] + input_offset); + } + i[16] = input[16]; + assert(i[16] != NULL); + if XNN_UNPREDICTABLE(i[16] != zero) { + i[16] = (const float*) ((uintptr_t) i[16] + input_offset); + } + i[17] = input[17]; + assert(i[17] != NULL); + if XNN_UNPREDICTABLE(i[17] != zero) { + i[17] = (const float*) ((uintptr_t) i[17] + input_offset); + } + i[18] = input[18]; + assert(i[18] != NULL); + if XNN_UNPREDICTABLE(i[18] != zero) { + i[18] = (const float*) ((uintptr_t) i[18] + input_offset); + } + i[19] = input[19]; + assert(i[19] != NULL); + if XNN_UNPREDICTABLE(i[19] != zero) { + i[19] = (const float*) ((uintptr_t) i[19] + input_offset); + } + i[20] = input[20]; + assert(i[20] != NULL); + if XNN_UNPREDICTABLE(i[20] != zero) { + i[20] = (const float*) ((uintptr_t) i[20] + input_offset); + } + i[21] = input[21]; + assert(i[21] != NULL); + if XNN_UNPREDICTABLE(i[21] != zero) { + i[21] = (const float*) ((uintptr_t) i[21] + input_offset); + } + i[22] = input[22]; + assert(i[22] != NULL); + if XNN_UNPREDICTABLE(i[22] != zero) { + i[22] = (const float*) ((uintptr_t) i[22] + input_offset); + } + i[23] = input[23]; + assert(i[23] != NULL); + if XNN_UNPREDICTABLE(i[23] != zero) { + i[23] = (const float*) ((uintptr_t) i[23] + input_offset); + } + i[24] = input[24]; + assert(i[24] != NULL); + if XNN_UNPREDICTABLE(i[24] != zero) { + i[24] = (const float*) ((uintptr_t) i[24] + input_offset); + } + input = (const float**) ((uintptr_t) input + input_stride); + + size_t c = channels; + const float* w = weights; + const size_t vlmax = __riscv_vsetvlmax_e32m8(); + size_t vl; + + do { + vl = __riscv_vsetvl_e32m8(c); + // load bias to vAcc + vfloat32m8_t vAcc = __riscv_vle32_v_f32m8_tu(vAcc, w, vl); + w += vlmax; + + vfloat32m8_t va; + vfloat32m8_t vb; + for (int k=0; k<25; k++) { + va = __riscv_vle32_v_f32m8_tu(va, i[k], vl); + vb = __riscv_vle32_v_f32m8_tu(vb, w, vl); + w += vlmax; + i[k] += vlmax; + vAcc = __riscv_vfmacc_vv_f32m8_tu(vAcc, va, vb, vl); + } + + __riscv_vse32_v_f32m8(output, vAcc, vl); + output += vl; + c -= vl; + } while(c != 0); + output = (float*) ((uintptr_t) output + output_increment); + } while (--output_width != 0); +} diff --git a/src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c b/src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c new file mode 100644 index 00000000000..790a2b48751 --- /dev/null +++ b/src/f32-dwconv/gen/f32-dwconv-3p8vc-minmax-rvv.c @@ -0,0 +1,80 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-dwconv/unipass-rvv.c.in +// Generator: tools/xngen +// + +// Copyright 2024 Andes Technology Corporation +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.# + +#include +#include +#include "xnnpack/dwconv.h" + +void xnn_f32_dwconv_minmax_ukernel_3p8vc__rvv( + size_t channels, + size_t output_width, + const float** input, + const float* weights, + float* output, + intptr_t input_stride, + size_t output_increment, + size_t input_offset, + const float* zero, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(channels != 0); + assert(output_width != 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + do { + const float* i[3]; + i[0] = input[0]; + assert(i[0] != NULL); + if XNN_UNPREDICTABLE(i[0] != zero) { + i[0] = (const float*) ((uintptr_t) i[0] + input_offset); + } + i[1] = input[1]; + assert(i[1] != NULL); + if XNN_UNPREDICTABLE(i[1] != zero) { + i[1] = (const float*) ((uintptr_t) i[1] + input_offset); + } + i[2] = input[2]; + assert(i[2] != NULL); + if XNN_UNPREDICTABLE(i[2] != zero) { + i[2] = (const float*) ((uintptr_t) i[2] + input_offset); + } + input = (const float**) ((uintptr_t) input + input_stride); + + size_t c = channels; + const float* w = weights; + const size_t vlmax = __riscv_vsetvlmax_e32m8(); + size_t vl; + + do { + vl = __riscv_vsetvl_e32m8(c); + // load bias to vAcc + vfloat32m8_t vAcc = __riscv_vle32_v_f32m8_tu(vAcc, w, vl); + w += vlmax; + + vfloat32m8_t va; + vfloat32m8_t vb; + for (int k=0; k<3; k++) { + va = __riscv_vle32_v_f32m8_tu(va, i[k], vl); + vb = __riscv_vle32_v_f32m8_tu(vb, w, vl); + w += vlmax; + i[k] += vlmax; + vAcc = __riscv_vfmacc_vv_f32m8_tu(vAcc, va, vb, vl); + } + + vAcc = __riscv_vfmax_vf_f32m8_tu(vAcc, vAcc, vmin, vl); + vAcc = __riscv_vfmin_vf_f32m8_tu(vAcc, vAcc, vmax, vl); + __riscv_vse32_v_f32m8(output, vAcc, vl); + output += vl; + c -= vl; + } while(c != 0); + output = (float*) ((uintptr_t) output + output_increment); + } while (--output_width != 0); +} diff --git a/src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c b/src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c new file mode 100644 index 00000000000..015fa3c8882 --- /dev/null +++ b/src/f32-dwconv/gen/f32-dwconv-3p8vc-rvv.c @@ -0,0 +1,76 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-dwconv/unipass-rvv.c.in +// Generator: tools/xngen +// + +// Copyright 2024 Andes Technology Corporation +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.# + +#include +#include +#include "xnnpack/dwconv.h" + +void xnn_f32_dwconv_ukernel_3p8vc__rvv( + size_t channels, + size_t output_width, + const float** input, + const float* weights, + float* output, + intptr_t input_stride, + size_t output_increment, + size_t input_offset, + const float* zero, + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(channels != 0); + assert(output_width != 0); + + do { + const float* i[3]; + i[0] = input[0]; + assert(i[0] != NULL); + if XNN_UNPREDICTABLE(i[0] != zero) { + i[0] = (const float*) ((uintptr_t) i[0] + input_offset); + } + i[1] = input[1]; + assert(i[1] != NULL); + if XNN_UNPREDICTABLE(i[1] != zero) { + i[1] = (const float*) ((uintptr_t) i[1] + input_offset); + } + i[2] = input[2]; + assert(i[2] != NULL); + if XNN_UNPREDICTABLE(i[2] != zero) { + i[2] = (const float*) ((uintptr_t) i[2] + input_offset); + } + input = (const float**) ((uintptr_t) input + input_stride); + + size_t c = channels; + const float* w = weights; + const size_t vlmax = __riscv_vsetvlmax_e32m8(); + size_t vl; + + do { + vl = __riscv_vsetvl_e32m8(c); + // load bias to vAcc + vfloat32m8_t vAcc = __riscv_vle32_v_f32m8_tu(vAcc, w, vl); + w += vlmax; + + vfloat32m8_t va; + vfloat32m8_t vb; + for (int k=0; k<3; k++) { + va = __riscv_vle32_v_f32m8_tu(va, i[k], vl); + vb = __riscv_vle32_v_f32m8_tu(vb, w, vl); + w += vlmax; + i[k] += vlmax; + vAcc = __riscv_vfmacc_vv_f32m8_tu(vAcc, va, vb, vl); + } + + __riscv_vse32_v_f32m8(output, vAcc, vl); + output += vl; + c -= vl; + } while(c != 0); + output = (float*) ((uintptr_t) output + output_increment); + } while (--output_width != 0); +} diff --git a/src/f32-dwconv/gen/f32-dwconv-4p8vc-minmax-rvv.c b/src/f32-dwconv/gen/f32-dwconv-4p8vc-minmax-rvv.c new file mode 100644 index 00000000000..d77e9133f38 --- /dev/null +++ b/src/f32-dwconv/gen/f32-dwconv-4p8vc-minmax-rvv.c @@ -0,0 +1,85 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-dwconv/unipass-rvv.c.in +// Generator: tools/xngen +// + +// Copyright 2024 Andes Technology Corporation +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.# + +#include +#include +#include "xnnpack/dwconv.h" + +void xnn_f32_dwconv_minmax_ukernel_4p8vc__rvv( + size_t channels, + size_t output_width, + const float** input, + const float* weights, + float* output, + intptr_t input_stride, + size_t output_increment, + size_t input_offset, + const float* zero, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(channels != 0); + assert(output_width != 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + do { + const float* i[4]; + i[0] = input[0]; + assert(i[0] != NULL); + if XNN_UNPREDICTABLE(i[0] != zero) { + i[0] = (const float*) ((uintptr_t) i[0] + input_offset); + } + i[1] = input[1]; + assert(i[1] != NULL); + if XNN_UNPREDICTABLE(i[1] != zero) { + i[1] = (const float*) ((uintptr_t) i[1] + input_offset); + } + i[2] = input[2]; + assert(i[2] != NULL); + if XNN_UNPREDICTABLE(i[2] != zero) { + i[2] = (const float*) ((uintptr_t) i[2] + input_offset); + } + i[3] = input[3]; + assert(i[3] != NULL); + if XNN_UNPREDICTABLE(i[3] != zero) { + i[3] = (const float*) ((uintptr_t) i[3] + input_offset); + } + input = (const float**) ((uintptr_t) input + input_stride); + + size_t c = channels; + const float* w = weights; + const size_t vlmax = __riscv_vsetvlmax_e32m8(); + size_t vl; + + do { + vl = __riscv_vsetvl_e32m8(c); + // load bias to vAcc + vfloat32m8_t vAcc = __riscv_vle32_v_f32m8_tu(vAcc, w, vl); + w += vlmax; + + vfloat32m8_t va; + vfloat32m8_t vb; + for (int k=0; k<4; k++) { + va = __riscv_vle32_v_f32m8_tu(va, i[k], vl); + vb = __riscv_vle32_v_f32m8_tu(vb, w, vl); + w += vlmax; + i[k] += vlmax; + vAcc = __riscv_vfmacc_vv_f32m8_tu(vAcc, va, vb, vl); + } + + vAcc = __riscv_vfmax_vf_f32m8_tu(vAcc, vAcc, vmin, vl); + vAcc = __riscv_vfmin_vf_f32m8_tu(vAcc, vAcc, vmax, vl); + __riscv_vse32_v_f32m8(output, vAcc, vl); + output += vl; + c -= vl; + } while(c != 0); + output = (float*) ((uintptr_t) output + output_increment); + } while (--output_width != 0); +} diff --git a/src/f32-dwconv/gen/f32-dwconv-4p8vc-rvv.c b/src/f32-dwconv/gen/f32-dwconv-4p8vc-rvv.c new file mode 100644 index 00000000000..b875952799a --- /dev/null +++ b/src/f32-dwconv/gen/f32-dwconv-4p8vc-rvv.c @@ -0,0 +1,81 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-dwconv/unipass-rvv.c.in +// Generator: tools/xngen +// + +// Copyright 2024 Andes Technology Corporation +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.# + +#include +#include +#include "xnnpack/dwconv.h" + +void xnn_f32_dwconv_ukernel_4p8vc__rvv( + size_t channels, + size_t output_width, + const float** input, + const float* weights, + float* output, + intptr_t input_stride, + size_t output_increment, + size_t input_offset, + const float* zero, + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(channels != 0); + assert(output_width != 0); + + do { + const float* i[4]; + i[0] = input[0]; + assert(i[0] != NULL); + if XNN_UNPREDICTABLE(i[0] != zero) { + i[0] = (const float*) ((uintptr_t) i[0] + input_offset); + } + i[1] = input[1]; + assert(i[1] != NULL); + if XNN_UNPREDICTABLE(i[1] != zero) { + i[1] = (const float*) ((uintptr_t) i[1] + input_offset); + } + i[2] = input[2]; + assert(i[2] != NULL); + if XNN_UNPREDICTABLE(i[2] != zero) { + i[2] = (const float*) ((uintptr_t) i[2] + input_offset); + } + i[3] = input[3]; + assert(i[3] != NULL); + if XNN_UNPREDICTABLE(i[3] != zero) { + i[3] = (const float*) ((uintptr_t) i[3] + input_offset); + } + input = (const float**) ((uintptr_t) input + input_stride); + + size_t c = channels; + const float* w = weights; + const size_t vlmax = __riscv_vsetvlmax_e32m8(); + size_t vl; + + do { + vl = __riscv_vsetvl_e32m8(c); + // load bias to vAcc + vfloat32m8_t vAcc = __riscv_vle32_v_f32m8_tu(vAcc, w, vl); + w += vlmax; + + vfloat32m8_t va; + vfloat32m8_t vb; + for (int k=0; k<4; k++) { + va = __riscv_vle32_v_f32m8_tu(va, i[k], vl); + vb = __riscv_vle32_v_f32m8_tu(vb, w, vl); + w += vlmax; + i[k] += vlmax; + vAcc = __riscv_vfmacc_vv_f32m8_tu(vAcc, va, vb, vl); + } + + __riscv_vse32_v_f32m8(output, vAcc, vl); + output += vl; + c -= vl; + } while(c != 0); + output = (float*) ((uintptr_t) output + output_increment); + } while (--output_width != 0); +} diff --git a/src/f32-dwconv/gen/f32-dwconv-9p8vc-minmax-rvv.c b/src/f32-dwconv/gen/f32-dwconv-9p8vc-minmax-rvv.c new file mode 100644 index 00000000000..c80bb910d57 --- /dev/null +++ b/src/f32-dwconv/gen/f32-dwconv-9p8vc-minmax-rvv.c @@ -0,0 +1,110 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-dwconv/unipass-rvv.c.in +// Generator: tools/xngen +// + +// Copyright 2024 Andes Technology Corporation +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.# + +#include +#include +#include "xnnpack/dwconv.h" + +void xnn_f32_dwconv_minmax_ukernel_9p8vc__rvv( + size_t channels, + size_t output_width, + const float** input, + const float* weights, + float* output, + intptr_t input_stride, + size_t output_increment, + size_t input_offset, + const float* zero, + const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(channels != 0); + assert(output_width != 0); + + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + do { + const float* i[9]; + i[0] = input[0]; + assert(i[0] != NULL); + if XNN_UNPREDICTABLE(i[0] != zero) { + i[0] = (const float*) ((uintptr_t) i[0] + input_offset); + } + i[1] = input[1]; + assert(i[1] != NULL); + if XNN_UNPREDICTABLE(i[1] != zero) { + i[1] = (const float*) ((uintptr_t) i[1] + input_offset); + } + i[2] = input[2]; + assert(i[2] != NULL); + if XNN_UNPREDICTABLE(i[2] != zero) { + i[2] = (const float*) ((uintptr_t) i[2] + input_offset); + } + i[3] = input[3]; + assert(i[3] != NULL); + if XNN_UNPREDICTABLE(i[3] != zero) { + i[3] = (const float*) ((uintptr_t) i[3] + input_offset); + } + i[4] = input[4]; + assert(i[4] != NULL); + if XNN_UNPREDICTABLE(i[4] != zero) { + i[4] = (const float*) ((uintptr_t) i[4] + input_offset); + } + i[5] = input[5]; + assert(i[5] != NULL); + if XNN_UNPREDICTABLE(i[5] != zero) { + i[5] = (const float*) ((uintptr_t) i[5] + input_offset); + } + i[6] = input[6]; + assert(i[6] != NULL); + if XNN_UNPREDICTABLE(i[6] != zero) { + i[6] = (const float*) ((uintptr_t) i[6] + input_offset); + } + i[7] = input[7]; + assert(i[7] != NULL); + if XNN_UNPREDICTABLE(i[7] != zero) { + i[7] = (const float*) ((uintptr_t) i[7] + input_offset); + } + i[8] = input[8]; + assert(i[8] != NULL); + if XNN_UNPREDICTABLE(i[8] != zero) { + i[8] = (const float*) ((uintptr_t) i[8] + input_offset); + } + input = (const float**) ((uintptr_t) input + input_stride); + + size_t c = channels; + const float* w = weights; + const size_t vlmax = __riscv_vsetvlmax_e32m8(); + size_t vl; + + do { + vl = __riscv_vsetvl_e32m8(c); + // load bias to vAcc + vfloat32m8_t vAcc = __riscv_vle32_v_f32m8_tu(vAcc, w, vl); + w += vlmax; + + vfloat32m8_t va; + vfloat32m8_t vb; + for (int k=0; k<9; k++) { + va = __riscv_vle32_v_f32m8_tu(va, i[k], vl); + vb = __riscv_vle32_v_f32m8_tu(vb, w, vl); + w += vlmax; + i[k] += vlmax; + vAcc = __riscv_vfmacc_vv_f32m8_tu(vAcc, va, vb, vl); + } + + vAcc = __riscv_vfmax_vf_f32m8_tu(vAcc, vAcc, vmin, vl); + vAcc = __riscv_vfmin_vf_f32m8_tu(vAcc, vAcc, vmax, vl); + __riscv_vse32_v_f32m8(output, vAcc, vl); + output += vl; + c -= vl; + } while(c != 0); + output = (float*) ((uintptr_t) output + output_increment); + } while (--output_width != 0); +} diff --git a/src/f32-dwconv/gen/f32-dwconv-9p8vc-rvv.c b/src/f32-dwconv/gen/f32-dwconv-9p8vc-rvv.c new file mode 100644 index 00000000000..d578697fa2f --- /dev/null +++ b/src/f32-dwconv/gen/f32-dwconv-9p8vc-rvv.c @@ -0,0 +1,106 @@ +// Auto-generated file. Do not edit! +// Template: src/f32-dwconv/unipass-rvv.c.in +// Generator: tools/xngen +// + +// Copyright 2024 Andes Technology Corporation +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.# + +#include +#include +#include "xnnpack/dwconv.h" + +void xnn_f32_dwconv_ukernel_9p8vc__rvv( + size_t channels, + size_t output_width, + const float** input, + const float* weights, + float* output, + intptr_t input_stride, + size_t output_increment, + size_t input_offset, + const float* zero, + const struct xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(channels != 0); + assert(output_width != 0); + + do { + const float* i[9]; + i[0] = input[0]; + assert(i[0] != NULL); + if XNN_UNPREDICTABLE(i[0] != zero) { + i[0] = (const float*) ((uintptr_t) i[0] + input_offset); + } + i[1] = input[1]; + assert(i[1] != NULL); + if XNN_UNPREDICTABLE(i[1] != zero) { + i[1] = (const float*) ((uintptr_t) i[1] + input_offset); + } + i[2] = input[2]; + assert(i[2] != NULL); + if XNN_UNPREDICTABLE(i[2] != zero) { + i[2] = (const float*) ((uintptr_t) i[2] + input_offset); + } + i[3] = input[3]; + assert(i[3] != NULL); + if XNN_UNPREDICTABLE(i[3] != zero) { + i[3] = (const float*) ((uintptr_t) i[3] + input_offset); + } + i[4] = input[4]; + assert(i[4] != NULL); + if XNN_UNPREDICTABLE(i[4] != zero) { + i[4] = (const float*) ((uintptr_t) i[4] + input_offset); + } + i[5] = input[5]; + assert(i[5] != NULL); + if XNN_UNPREDICTABLE(i[5] != zero) { + i[5] = (const float*) ((uintptr_t) i[5] + input_offset); + } + i[6] = input[6]; + assert(i[6] != NULL); + if XNN_UNPREDICTABLE(i[6] != zero) { + i[6] = (const float*) ((uintptr_t) i[6] + input_offset); + } + i[7] = input[7]; + assert(i[7] != NULL); + if XNN_UNPREDICTABLE(i[7] != zero) { + i[7] = (const float*) ((uintptr_t) i[7] + input_offset); + } + i[8] = input[8]; + assert(i[8] != NULL); + if XNN_UNPREDICTABLE(i[8] != zero) { + i[8] = (const float*) ((uintptr_t) i[8] + input_offset); + } + input = (const float**) ((uintptr_t) input + input_stride); + + size_t c = channels; + const float* w = weights; + const size_t vlmax = __riscv_vsetvlmax_e32m8(); + size_t vl; + + do { + vl = __riscv_vsetvl_e32m8(c); + // load bias to vAcc + vfloat32m8_t vAcc = __riscv_vle32_v_f32m8_tu(vAcc, w, vl); + w += vlmax; + + vfloat32m8_t va; + vfloat32m8_t vb; + for (int k=0; k<9; k++) { + va = __riscv_vle32_v_f32m8_tu(va, i[k], vl); + vb = __riscv_vle32_v_f32m8_tu(vb, w, vl); + w += vlmax; + i[k] += vlmax; + vAcc = __riscv_vfmacc_vv_f32m8_tu(vAcc, va, vb, vl); + } + + __riscv_vse32_v_f32m8(output, vAcc, vl); + output += vl; + c -= vl; + } while(c != 0); + output = (float*) ((uintptr_t) output + output_increment); + } while (--output_width != 0); +} diff --git a/src/f32-dwconv/unipass-rvv.c.in b/src/f32-dwconv/unipass-rvv.c.in new file mode 100644 index 00000000000..992ed7cd7fc --- /dev/null +++ b/src/f32-dwconv/unipass-rvv.c.in @@ -0,0 +1,76 @@ + +// Copyright 2024 Andes Technology Corporation +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree.# + +$assert CHANNEL_TILE in ["m1", "m2", "m4", "m8"] +$LMUL = int(CHANNEL_TILE[1]) +$assert KERNEL_TILE >= 2 +$assert ACTIVATION in ["LINEAR", "MINMAX"] +$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" +#include +#include +#include "xnnpack/dwconv.h" + +$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION] +$PARAMS = {"LINEAR": "struct xnn_f32_default_params", "MINMAX": "union xnn_f32_minmax_params"}[ACTIVATION] +void xnn_f32_dwconv${SUFFIX}_ukernel_${KERNEL_TILE}p${LMUL}vc__rvv( + size_t channels, + size_t output_width, + const float** input, + const float* weights, + float* output, + intptr_t input_stride, + size_t output_increment, + size_t input_offset, + const float* zero, + const ${PARAMS} params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(channels != 0); + assert(output_width != 0); + + $if ACTIVATION == "MINMAX": + const float vmin = params->scalar.min; + const float vmax = params->scalar.max; + do { + const float* i[${KERNEL_TILE}]; + $for K in range(KERNEL_TILE): + i[${K}] = input[${K}]; + assert(i[${K}] != NULL); + if XNN_UNPREDICTABLE(i[${K}] != zero) { + i[${K}] = (const float*) ((uintptr_t) i[${K}] + input_offset); + } + input = (const float**) ((uintptr_t) input + input_stride); + + size_t c = channels; + const float* w = weights; + const size_t vlmax = __riscv_vsetvlmax_e32m${LMUL}(); + size_t vl; + + do { + vl = __riscv_vsetvl_e32m${LMUL}(c); + // load bias to vAcc + vfloat32m${LMUL}_t vAcc = __riscv_vle32_v_f32m${LMUL}_tu(vAcc, w, vl); + w += vlmax; + + vfloat32m${LMUL}_t va; + vfloat32m${LMUL}_t vb; + for (int k=0; k<${KERNEL_TILE}; k++) { + va = __riscv_vle32_v_f32m${LMUL}_tu(va, i[k], vl); + vb = __riscv_vle32_v_f32m${LMUL}_tu(vb, w, vl); + w += vlmax; + i[k] += vlmax; + vAcc = __riscv_vfmacc_vv_f32m${LMUL}_tu(vAcc, va, vb, vl); + } + + $if ACTIVATION == "MINMAX": + vAcc = __riscv_vfmax_vf_f32m${LMUL}_tu(vAcc, vAcc, vmin, vl); + vAcc = __riscv_vfmin_vf_f32m${LMUL}_tu(vAcc, vAcc, vmax, vl); + __riscv_vse32_v_f32m${LMUL}(output, vAcc, vl); + output += vl; + c -= vl; + } while(c != 0); + output = (float*) ((uintptr_t) output + output_increment); + } while (--output_width != 0); +}