diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b5d4d9451..e1cd18e8a 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -17,13 +17,13 @@ stages: _TRIPLET: "i686-w64-mingw32" _PLATFORMSUFFIX: ".exe" _WRAPPER: "wine" - _CONTRIB_URL: "https://artifacts.videolan.org/vlc/win32/" + _CONTRIB_URL: "https://artifacts.videolan.org/vlc/win32-llvm/" .variables-win64: &variables-win64 _TRIPLET: "x86_64-w64-mingw32" _PLATFORMSUFFIX: ".exe" - _WRAPPER: "wine64" - _CONTRIB_URL: "https://artifacts.videolan.org/vlc/win64/" + _WRAPPER: "wine" + _CONTRIB_URL: "https://artifacts.videolan.org/vlc/win64-llvm/" .variables-win-armv7: &variables-win-armv7 _TRIPLET: "armv7-w64-mingw32" @@ -79,11 +79,12 @@ stages: - x264${_PLATFORMSUFFIX} - checkasm8${_PLATFORMSUFFIX} - checkasm10${_PLATFORMSUFFIX} + - config.log expire_in: 1 week build-debian-amd64: extends: .build - image: registry.videolan.org/x264-debian-unstable:20211206140856 + image: registry.videolan.org/vlc-debian-unstable:20240212151604 tags: - docker - amd64 @@ -99,9 +100,11 @@ build-debian-aarch64: .build-win: extends: build-debian-amd64 + image: registry.videolan.org/vlc-debian-llvm-msvcrt:20240212151604 script: | set -x LOCAL_INSTALL_DIR=`pwd`/${_TRIPLET} + export PKGCONFIG=pkg-config export PKG_CONFIG_LIBDIR=${LOCAL_INSTALL_DIR}/lib/pkgconfig curl -f -o vlc-contrib.txt ${_CONTRIB_URL} CONTRIB_NAME=$(sed -n -e "s@.*href=\"\(vlc-contrib-${_TRIPLET}-[^\"]*\.tar\.bz2\)\".*@\1@p" vlc-contrib.txt | sed -n -e '1p') @@ -128,7 +131,7 @@ build-win64: .build-llvm-mingw: extends: .build - image: registry.videolan.org/vlc-debian-llvm-mingw:20211020094514 + image: registry.videolan.org/vlc-debian-llvm-ucrt:20240212151604 tags: - docker - amd64 @@ -225,6 +228,21 @@ test-macos-x86_64: - build-macos-x86_64 variables: *variables-macos-x86_64 +test-aarch64-qemu: + <<: *test + extends: build-debian-amd64 + image: registry.videolan.org/x264-debian-unstable:20231113190916 + dependencies: + - build-debian-aarch64 + variables: *variables-debian-amd64 + script: | + set -x + for size in 128 256 512 1024 2048; do + for tool in checkasm8 checkasm10; do + qemu-aarch64 -cpu max,sve-default-vector-length=256,sve$size=on -L /usr/aarch64-linux-gnu ./$tool + done + done + .release: &release stage: release script: | diff --git a/Makefile b/Makefile index cfb72dabb..3c65b3051 100644 --- a/Makefile +++ b/Makefile @@ -160,7 +160,7 @@ endif OBJCHK += tools/checkasm-arm.o endif -# AArch64 NEON optims +# AArch64 NEON and SVE/SVE2 optims ifeq ($(SYS_ARCH),AARCH64) SRCASM_X = common/aarch64/bitstream-a.S \ common/aarch64/cabac-a.S \ @@ -170,6 +170,15 @@ SRCASM_X = common/aarch64/bitstream-a.S \ common/aarch64/pixel-a.S \ common/aarch64/predict-a.S \ common/aarch64/quant-a.S +ifneq ($(findstring HAVE_SVE 1, $(CONFIG)),) +SRCASM_X += common/aarch64/dct-a-sve.S \ + common/aarch64/deblock-a-sve.S \ + common/aarch64/mc-a-sve.S \ + common/aarch64/pixel-a-sve.S +endif +ifneq ($(findstring HAVE_SVE2 1, $(CONFIG)),) +SRCASM_X += common/aarch64/dct-a-sve2.S +endif SRCS_X += common/aarch64/asm-offsets.c \ common/aarch64/mc-c.c \ common/aarch64/predict-c.c @@ -197,6 +206,33 @@ SRCS_X += common/mips/dct-c.c \ endif endif +# LOONGARCH optimization +ifeq ($(SYS_ARCH),LOONGARCH) +ifneq ($(findstring HAVE_LSX 1, $(CONFIG)),) +SRCASM_X += common/loongarch/deblock-a.S \ + common/loongarch/sad-a.S \ + common/loongarch/predict-a.S \ + common/loongarch/quant-a.S \ + common/loongarch/mc-a.S \ + common/loongarch/dct-a.S \ + common/loongarch/pixel-a.S + +SRCS_X += common/loongarch/predict-c.c \ + 
common/loongarch/mc-c.c \ + common/loongarch/pixel-c.c + +OBJASM += +ifneq ($(findstring HAVE_BITDEPTH8 1, $(CONFIG)),) +OBJASM += $(SRCASM_X:%.S=%-8.o) +endif +ifneq ($(findstring HAVE_BITDEPTH10 1, $(CONFIG)),) +OBJASM += $(SRCASM_X:%.S=%-10.o) +endif + +OBJCHK += tools/checkasm-loongarch.o +endif +endif + endif ifneq ($(HAVE_GETOPT_LONG),1) diff --git a/autocomplete.c b/autocomplete.c index 592e27863..1fc33b09e 100644 --- a/autocomplete.c +++ b/autocomplete.c @@ -1,7 +1,7 @@ /***************************************************************************** * autocomplete: x264cli shell autocomplete ***************************************************************************** - * Copyright (C) 2018-2023 x264 project + * Copyright (C) 2018-2024 x264 project * * Authors: Henrik Gramner * diff --git a/common/aarch64/asm-offsets.c b/common/aarch64/asm-offsets.c index d183b4c86..ec669a4b1 100644 --- a/common/aarch64/asm-offsets.c +++ b/common/aarch64/asm-offsets.c @@ -1,7 +1,7 @@ /***************************************************************************** * asm-offsets.c: check asm offsets for aarch64 ***************************************************************************** - * Copyright (C) 2014-2023 x264 project + * Copyright (C) 2014-2024 x264 project * * Authors: Janne Grunau * diff --git a/common/aarch64/asm-offsets.h b/common/aarch64/asm-offsets.h index 4f497545b..216546909 100644 --- a/common/aarch64/asm-offsets.h +++ b/common/aarch64/asm-offsets.h @@ -1,7 +1,7 @@ /***************************************************************************** * asm-offsets.h: asm offsets for aarch64 ***************************************************************************** - * Copyright (C) 2014-2023 x264 project + * Copyright (C) 2014-2024 x264 project * * Authors: Janne Grunau * diff --git a/common/aarch64/asm.S b/common/aarch64/asm.S index cce58d7e8..baec521d1 100644 --- a/common/aarch64/asm.S +++ b/common/aarch64/asm.S @@ -1,7 +1,7 @@ /***************************************************************************** * asm.S: AArch64 utility macros ***************************************************************************** - * Copyright (C) 2008-2023 x264 project + * Copyright (C) 2008-2024 x264 project * * Authors: Mans Rullgard * David Conrad @@ -133,8 +133,8 @@ MACH .const_data .macro SUMSUB_AB sum, sub, a, b - add \sum, \a, \b - sub \sub, \a, \b + add \sum, \a, \b + sub \sub, \a, \b .endm .macro unzip t1, t2, s1, s2 @@ -163,35 +163,35 @@ MACH .const_data .macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9 - trn1 \r8\().8H, \r0\().8H, \r1\().8H - trn2 \r9\().8H, \r0\().8H, \r1\().8H - trn1 \r1\().8H, \r2\().8H, \r3\().8H - trn2 \r3\().8H, \r2\().8H, \r3\().8H - trn1 \r0\().8H, \r4\().8H, \r5\().8H - trn2 \r5\().8H, \r4\().8H, \r5\().8H - trn1 \r2\().8H, \r6\().8H, \r7\().8H - trn2 \r7\().8H, \r6\().8H, \r7\().8H - - trn1 \r4\().4S, \r0\().4S, \r2\().4S - trn2 \r2\().4S, \r0\().4S, \r2\().4S - trn1 \r6\().4S, \r5\().4S, \r7\().4S - trn2 \r7\().4S, \r5\().4S, \r7\().4S - trn1 \r5\().4S, \r9\().4S, \r3\().4S - trn2 \r9\().4S, \r9\().4S, \r3\().4S - trn1 \r3\().4S, \r8\().4S, \r1\().4S - trn2 \r8\().4S, \r8\().4S, \r1\().4S - - trn1 \r0\().2D, \r3\().2D, \r4\().2D - trn2 \r4\().2D, \r3\().2D, \r4\().2D - - trn1 \r1\().2D, \r5\().2D, \r6\().2D - trn2 \r5\().2D, \r5\().2D, \r6\().2D - - trn2 \r6\().2D, \r8\().2D, \r2\().2D - trn1 \r2\().2D, \r8\().2D, \r2\().2D - - trn1 \r3\().2D, \r9\().2D, \r7\().2D - trn2 \r7\().2D, \r9\().2D, \r7\().2D + trn1 \r8\().8h, \r0\().8h, \r1\().8h + trn2 
\r9\().8h, \r0\().8h, \r1\().8h + trn1 \r1\().8h, \r2\().8h, \r3\().8h + trn2 \r3\().8h, \r2\().8h, \r3\().8h + trn1 \r0\().8h, \r4\().8h, \r5\().8h + trn2 \r5\().8h, \r4\().8h, \r5\().8h + trn1 \r2\().8h, \r6\().8h, \r7\().8h + trn2 \r7\().8h, \r6\().8h, \r7\().8h + + trn1 \r4\().4s, \r0\().4s, \r2\().4s + trn2 \r2\().4s, \r0\().4s, \r2\().4s + trn1 \r6\().4s, \r5\().4s, \r7\().4s + trn2 \r7\().4s, \r5\().4s, \r7\().4s + trn1 \r5\().4s, \r9\().4s, \r3\().4s + trn2 \r9\().4s, \r9\().4s, \r3\().4s + trn1 \r3\().4s, \r8\().4s, \r1\().4s + trn2 \r8\().4s, \r8\().4s, \r1\().4s + + trn1 \r0\().2d, \r3\().2d, \r4\().2d + trn2 \r4\().2d, \r3\().2d, \r4\().2d + + trn1 \r1\().2d, \r5\().2d, \r6\().2d + trn2 \r5\().2d, \r5\().2d, \r6\().2d + + trn2 \r6\().2d, \r8\().2d, \r2\().2d + trn1 \r2\().2d, \r8\().2d, \r2\().2d + + trn1 \r3\().2d, \r9\().2d, \r7\().2d + trn2 \r7\().2d, \r9\().2d, \r7\().2d .endm .macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1 diff --git a/common/aarch64/bitstream-a.S b/common/aarch64/bitstream-a.S index 81bbb29c6..c7e06b60d 100644 --- a/common/aarch64/bitstream-a.S +++ b/common/aarch64/bitstream-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * bitstream-a.S: aarch64 bitstream functions ***************************************************************************** - * Copyright (C) 2014-2023 x264 project + * Copyright (C) 2014-2024 x264 project * * Authors: Janne Grunau * diff --git a/common/aarch64/bitstream.h b/common/aarch64/bitstream.h index 21e702b13..c366f36fb 100644 --- a/common/aarch64/bitstream.h +++ b/common/aarch64/bitstream.h @@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.h: aarch64 bitstream functions ***************************************************************************** - * Copyright (C) 2017-2023 x264 project + * Copyright (C) 2017-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/common/aarch64/cabac-a.S b/common/aarch64/cabac-a.S index 816c52401..3e2f7fadf 100644 --- a/common/aarch64/cabac-a.S +++ b/common/aarch64/cabac-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * cabac-a.S: aarch64 cabac ***************************************************************************** - * Copyright (C) 2014-2023 x264 project + * Copyright (C) 2014-2024 x264 project * * Authors: Janne Grunau * diff --git a/common/aarch64/dct-a-common.S b/common/aarch64/dct-a-common.S new file mode 100644 index 000000000..60579339c --- /dev/null +++ b/common/aarch64/dct-a-common.S @@ -0,0 +1,40 @@ +/**************************************************************************** + * dct-a-common.S: aarch64 transform and zigzag + ***************************************************************************** + * Copyright (C) 2009-2024 x264 project + * + * Authors: David Conrad + * Janne Grunau + * David Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +// This file contains the NEON macros that are intended to be used by +// the SVE/SVE2 functions as well + +.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7 + SUMSUB_AB \v1, \v6, \v5, \v6 + SUMSUB_AB \v3, \v7, \v4, \v7 + add \v0, \v3, \v1 + add \v4, \v7, \v7 + add \v5, \v6, \v6 + sub \v2, \v3, \v1 + add \v1, \v4, \v6 + sub \v3, \v7, \v5 +.endm diff --git a/common/aarch64/dct-a-sve.S b/common/aarch64/dct-a-sve.S new file mode 100644 index 000000000..9b94b73ec --- /dev/null +++ b/common/aarch64/dct-a-sve.S @@ -0,0 +1,88 @@ +/**************************************************************************** + * dct-a-sve.S: aarch64 transform and zigzag + ***************************************************************************** + * Copyright (C) 2009-2024 x264 project + * + * Authors: David Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "asm.S" +#include "dct-a-common.S" + +.arch armv8-a+sve + +function sub4x4_dct_sve, export=1 + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE + ptrue p0.h, vl4 + ld1b {z0.h}, p0/z, [x1] + add x1, x1, x3 + ld1b {z1.h}, p0/z, [x2] + add x2, x2, x4 + ld1b {z2.h}, p0/z, [x1] + add x1, x1, x3 + sub v16.4h, v0.4h, v1.4h + ld1b {z3.h}, p0/z, [x2] + add x2, x2, x4 + ld1b {z4.h}, p0/z, [x1] + add x1, x1, x3 + sub v17.4h, v2.4h, v3.4h + ld1b {z5.h}, p0/z, [x2] + add x2, x2, x4 + ld1b {z6.h}, p0/z, [x1] + sub v18.4h, v4.4h, v5.4h + ld1b {z7.h}, p0/z, [x2] + sub v19.4h, v6.4h, v7.4h + + DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h + transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7 + DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h + st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0] + ret +endfunc + +function zigzag_interleave_8x8_cavlc_sve, export=1 + mov z31.s, #1 + ptrue p2.s, vl2 + ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64 + ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64 + umax v16.8h, v0.8h, v4.8h + umax v17.8h, v1.8h, v5.8h + umax v18.8h, v2.8h, v6.8h + umax v19.8h, v3.8h, v7.8h + st1 {v0.8h}, [x0], #16 + st1 {v4.8h}, [x0], #16 + umaxp v16.8h, v16.8h, v17.8h + umaxp v18.8h, v18.8h, v19.8h + st1 {v1.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + umaxp v16.8h, v16.8h, v18.8h + st1 {v2.8h}, [x0], #16 + st1 {v6.8h}, [x0], #16 + cmhs v16.4s, v16.4s, v31.4s + st1 {v3.8h}, [x0], #16 + and v16.16b, v16.16b, v31.16b + st1 {v7.8h}, [x0], #16 + st1b {z16.s}, p2, [x2] + add x2, x2, #8 + mov v16.d[0], v16.d[1] + st1b {z16.s}, p2, [x2] + ret +endfunc diff --git a/common/aarch64/dct-a-sve2.S b/common/aarch64/dct-a-sve2.S new file mode 100644 index 000000000..ebe3969e3 --- /dev/null +++ b/common/aarch64/dct-a-sve2.S @@ -0,0 +1,89 @@ +/**************************************************************************** + * dct-a-sve2.S: aarch64 transform and zigzag + ***************************************************************************** + * Copyright (C) 2009-2024 x264 project + * + * Authors: David Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "asm.S" +#include "dct-a-common.S" + +.arch armv8-a+sve+sve2 + +function add4x4_idct_sve2, export=1 + mov x2, #FDEC_STRIDE + mov x11, x0 + ptrue p0.h, vl8 + ptrue p1.h, vl4 + ld1 {v0.8h, v1.8h}, [x1] + + SUMSUB_AB v4.8h, v5.8h, v0.8h, v1.8h + + sshr v7.8h, v0.8h, #1 + sshr v6.8h, v1.8h, #1 + sub v7.8h, v7.8h, v1.8h + add v6.8h, v6.8h, v0.8h + mov v7.d[0], v7.d[1] + mov v6.d[0], v6.d[1] + ld1b {z28.h}, p0/z, [x11] + add x11, x11, x2 + SUMSUB_AB v0.8h, v2.8h, v4.8h, v6.8h + SUMSUB_AB v1.8h, v3.8h, v5.8h, v7.8h + + transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19 + + SUMSUB_AB v4.4h, v5.4h, v0.4h, v3.4h + + sshr v7.4h, v1.4h, #1 + sshr v6.4h, v2.4h, #1 + sub v7.4h, v7.4h, v2.4h + add v6.4h, v6.4h, v1.4h + ld1b {z29.h}, p0/z, [x11] + add x11, x11, x2 + SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h + SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h + + srshr z0.h, p1/m, z0.h, #6 + srshr z1.h, p1/m, z1.h, #6 + ld1b {z31.h}, p0/z, [x11] + add x11, x11, x2 + srshr z2.h, p1/m, z2.h, #6 + srshr z3.h, p1/m, z3.h, #6 + ld1b {z30.h}, p0/z, [x11] + + add v0.8h, v0.8h, v28.8h + add v1.8h, v1.8h, v29.8h + add v2.8h, v2.8h, v30.8h + add v3.8h, v3.8h, v31.8h + sqxtunb z0.b, z0.h + sqxtunb z1.b, z1.h + sqxtunb z2.b, z2.h + sqxtunb z3.b, z3.h + + st1b {z0.h}, p1, [x0] + add x0, x0, x2 + st1b {z1.h}, p1, [x0] + add x0, x0, x2 + st1b {z3.h}, p1, [x0] + add x0, x0, x2 + st1b {z2.h}, p1, [x0] + ret +endfunc diff --git a/common/aarch64/dct-a.S b/common/aarch64/dct-a.S index 8a704e7c2..8d7a09b2d 100644 --- a/common/aarch64/dct-a.S +++ b/common/aarch64/dct-a.S @@ -1,7 +1,7 @@ /**************************************************************************** * dct-a.S: aarch64 transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau @@ -25,6 +25,7 @@ *****************************************************************************/ #include "asm.S" +#include "dct-a-common.S" const scan4x4_frame, align=4 .byte 0,1, 8,9, 2,3, 4,5 @@ -80,7 +81,7 @@ endconst function dct4x4dc_neon, export=1 - ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] + ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] movi v31.4h, #1 SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h @@ -98,12 +99,12 @@ function dct4x4dc_neon, export=1 shsub v1.4h, v16.4h, v5.4h shsub v2.4h, v17.4h, v7.4h srhadd v3.4h, v6.4h, v7.4h - st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] + st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] ret endfunc function idct4x4dc_neon, export=1 - ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] + ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h @@ -116,60 +117,49 @@ function idct4x4dc_neon, export=1 transpose v6.2s, v7.2s, v2.2s, v3.2s SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h - st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] + st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0] ret endfunc -.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7 - SUMSUB_AB \v1, \v6, \v5, \v6 - SUMSUB_AB \v3, \v7, \v4, \v7 - add \v0, \v3, \v1 - add \v4, \v7, \v7 - add \v5, \v6, \v6 - sub \v2, \v3, \v1 - add \v1, \v4, \v6 - sub \v3, \v7, \v5 -.endm - function sub4x4_dct_neon, export=1 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE - ld1 {v0.s}[0], [x1], x3 - ld1 {v1.s}[0], [x2], x4 - ld1 {v2.s}[0], [x1], x3 + ld1 {v0.s}[0], [x1], x3 + ld1 {v1.s}[0], [x2], x4 + ld1 {v2.s}[0], [x1], x3 
usubl v16.8h, v0.8b, v1.8b - ld1 {v3.s}[0], [x2], x4 - ld1 {v4.s}[0], [x1], x3 + ld1 {v3.s}[0], [x2], x4 + ld1 {v4.s}[0], [x1], x3 usubl v17.8h, v2.8b, v3.8b - ld1 {v5.s}[0], [x2], x4 - ld1 {v6.s}[0], [x1], x3 + ld1 {v5.s}[0], [x2], x4 + ld1 {v6.s}[0], [x1], x3 usubl v18.8h, v4.8b, v5.8b - ld1 {v7.s}[0], [x2], x4 + ld1 {v7.s}[0], [x2], x4 usubl v19.8h, v6.8b, v7.8b DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7 DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h - st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0] + st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0] ret endfunc function sub8x4_dct_neon - ld1 {v0.8b}, [x1], x3 - ld1 {v1.8b}, [x2], x4 + ld1 {v0.8b}, [x1], x3 + ld1 {v1.8b}, [x2], x4 usubl v16.8h, v0.8b, v1.8b - ld1 {v2.8b}, [x1], x3 - ld1 {v3.8b}, [x2], x4 + ld1 {v2.8b}, [x1], x3 + ld1 {v3.8b}, [x2], x4 usubl v17.8h, v2.8b, v3.8b - ld1 {v4.8b}, [x1], x3 - ld1 {v5.8b}, [x2], x4 + ld1 {v4.8b}, [x1], x3 + ld1 {v5.8b}, [x2], x4 usubl v18.8h, v4.8b, v5.8b - ld1 {v6.8b}, [x1], x3 - ld1 {v7.8b}, [x2], x4 + ld1 {v6.8b}, [x1], x3 + ld1 {v7.8b}, [x2], x4 usubl v19.8h, v6.8b, v7.8b DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h - transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7 + transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7 SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h @@ -186,10 +176,10 @@ function sub8x4_dct_neon zip1 v5.2d, v1.2d, v3.2d zip2 v7.2d, v1.2d, v3.2d - st1 {v4.8h}, [x0], #16 - st1 {v5.8h}, [x0], #16 - st1 {v6.8h}, [x0], #16 - st1 {v7.8h}, [x0], #16 + st1 {v4.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + st1 {v6.8h}, [x0], #16 + st1 {v7.8h}, [x0], #16 ret endfunc @@ -249,7 +239,7 @@ endfunc add v22.8h, v28.8h, v22.8h add v31.8h, v31.8h, v19.8h - SUMSUB_AB v0.8h, v4.8h, v24.8h, v25.8h + SUMSUB_AB v0.8h, v4.8h, v24.8h, v25.8h SUMSUB_SHR 2, v1.8h, v7.8h, v22.8h, v31.8h, v16.8h, v17.8h SUMSUB_SHR 1, v2.8h, v6.8h, v26.8h, v27.8h, v18.8h, v19.8h SUMSUB_SHR2 2, v3.8h, v5.8h, v30.8h, v29.8h, v20.8h, v21.8h @@ -258,37 +248,37 @@ endfunc function sub8x8_dct8_neon, export=1 mov x3, #FENC_STRIDE mov x4, #FDEC_STRIDE - ld1 {v16.8b}, [x1], x3 - ld1 {v17.8b}, [x2], x4 - ld1 {v18.8b}, [x1], x3 - ld1 {v19.8b}, [x2], x4 + ld1 {v16.8b}, [x1], x3 + ld1 {v17.8b}, [x2], x4 + ld1 {v18.8b}, [x1], x3 + ld1 {v19.8b}, [x2], x4 usubl v0.8h, v16.8b, v17.8b - ld1 {v20.8b}, [x1], x3 - ld1 {v21.8b}, [x2], x4 + ld1 {v20.8b}, [x1], x3 + ld1 {v21.8b}, [x2], x4 usubl v1.8h, v18.8b, v19.8b - ld1 {v22.8b}, [x1], x3 - ld1 {v23.8b}, [x2], x4 + ld1 {v22.8b}, [x1], x3 + ld1 {v23.8b}, [x2], x4 usubl v2.8h, v20.8b, v21.8b - ld1 {v24.8b}, [x1], x3 - ld1 {v25.8b}, [x2], x4 + ld1 {v24.8b}, [x1], x3 + ld1 {v25.8b}, [x2], x4 usubl v3.8h, v22.8b, v23.8b - ld1 {v26.8b}, [x1], x3 - ld1 {v27.8b}, [x2], x4 + ld1 {v26.8b}, [x1], x3 + ld1 {v27.8b}, [x2], x4 usubl v4.8h, v24.8b, v25.8b - ld1 {v28.8b}, [x1], x3 - ld1 {v29.8b}, [x2], x4 + ld1 {v28.8b}, [x1], x3 + ld1 {v29.8b}, [x2], x4 usubl v5.8h, v26.8b, v27.8b - ld1 {v30.8b}, [x1], x3 - ld1 {v31.8b}, [x2], x4 + ld1 {v30.8b}, [x1], x3 + ld1 {v31.8b}, [x2], x4 usubl v6.8h, v28.8b, v29.8b usubl v7.8h, v30.8b, v31.8b - DCT8_1D row + DCT8_1D row transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 - DCT8_1D col + DCT8_1D col - st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64 - st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64 + st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64 + st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64 ret endfunc @@ -319,26 +309,26 @@ endfunc function add4x4_idct_neon, export=1 mov x2, #FDEC_STRIDE - 
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1] + ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1] IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h - ld1 {v28.s}[0], [x0], x2 + ld1 {v28.s}[0], [x0], x2 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19 IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h - ld1 {v29.s}[0], [x0], x2 + ld1 {v29.s}[0], [x0], x2 SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h srshr v0.4h, v0.4h, #6 srshr v1.4h, v1.4h, #6 - ld1 {v31.s}[0], [x0], x2 + ld1 {v31.s}[0], [x0], x2 srshr v2.4h, v2.4h, #6 srshr v3.4h, v3.4h, #6 - ld1 {v30.s}[0], [x0], x2 + ld1 {v30.s}[0], [x0], x2 sub x0, x0, x2, lsl #2 uaddw v0.8h, v0.8h, v28.8b @@ -350,16 +340,16 @@ function add4x4_idct_neon, export=1 sqxtun v2.8b, v2.8h sqxtun v3.8b, v3.8h - st1 {v0.s}[0], [x0], x2 - st1 {v1.s}[0], [x0], x2 - st1 {v3.s}[0], [x0], x2 - st1 {v2.s}[0], [x0], x2 + st1 {v0.s}[0], [x0], x2 + st1 {v1.s}[0], [x0], x2 + st1 {v3.s}[0], [x0], x2 + st1 {v2.s}[0], [x0], x2 ret endfunc function add8x4_idct_neon, export=1 - ld1 {v0.8h,v1.8h}, [x1], #32 - ld1 {v2.8h,v3.8h}, [x1], #32 + ld1 {v0.8h,v1.8h}, [x1], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 transpose v20.2d, v21.2d, v0.2d, v2.2d transpose v22.2d, v23.2d, v1.2d, v3.2d IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h @@ -373,13 +363,13 @@ function add8x4_idct_neon, export=1 SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h srshr v0.8h, v0.8h, #6 - ld1 {v28.8b}, [x0], x2 + ld1 {v28.8b}, [x0], x2 srshr v1.8h, v1.8h, #6 - ld1 {v29.8b}, [x0], x2 + ld1 {v29.8b}, [x0], x2 srshr v2.8h, v2.8h, #6 - ld1 {v30.8b}, [x0], x2 + ld1 {v30.8b}, [x0], x2 srshr v3.8h, v3.8h, #6 - ld1 {v31.8b}, [x0], x2 + ld1 {v31.8b}, [x0], x2 sub x0, x0, x2, lsl #2 uaddw v0.8h, v0.8h, v28.8b @@ -389,12 +379,12 @@ function add8x4_idct_neon, export=1 sqxtun v0.8b, v0.8h sqxtun v1.8b, v1.8h - st1 {v0.8b}, [x0], x2 + st1 {v0.8b}, [x0], x2 sqxtun v2.8b, v2.8h - st1 {v1.8b}, [x0], x2 + st1 {v1.8b}, [x0], x2 sqxtun v3.8b, v3.8h - st1 {v2.8b}, [x0], x2 - st1 {v3.8b}, [x0], x2 + st1 {v2.8b}, [x0], x2 + st1 {v3.8b}, [x0], x2 ret endfunc @@ -426,7 +416,7 @@ endfunc .macro IDCT8_1D type SUMSUB_AB v0.8h, v1.8h, v16.8h, v20.8h // a0/a2 .ifc \type, row - ld1 {v22.8h,v23.8h}, [x1], #32 + ld1 {v22.8h,v23.8h}, [x1], #32 .endif SUMSUB_SHR 1, v2.8h, v3.8h, v18.8h, v22.8h, v16.8h, v20.8h // a6/a4 SUMSUB_AB v16.8h, v18.8h, v21.8h, v19.8h @@ -448,9 +438,9 @@ endfunc function add8x8_idct8_neon, export=1 mov x2, #FDEC_STRIDE - ld1 {v16.8h,v17.8h}, [x1], #32 - ld1 {v18.8h,v19.8h}, [x1], #32 - ld1 {v20.8h,v21.8h}, [x1], #32 + ld1 {v16.8h,v17.8h}, [x1], #32 + ld1 {v18.8h,v19.8h}, [x1], #32 + ld1 {v20.8h,v21.8h}, [x1], #32 IDCT8_1D row @@ -458,21 +448,21 @@ function add8x8_idct8_neon, export=1 IDCT8_1D col - ld1 {v0.8b}, [x0], x2 + ld1 {v0.8b}, [x0], x2 srshr v16.8h, v16.8h, #6 - ld1 {v1.8b}, [x0], x2 + ld1 {v1.8b}, [x0], x2 srshr v17.8h, v17.8h, #6 - ld1 {v2.8b}, [x0], x2 + ld1 {v2.8b}, [x0], x2 srshr v18.8h, v18.8h, #6 - ld1 {v3.8b}, [x0], x2 + ld1 {v3.8b}, [x0], x2 srshr v19.8h, v19.8h, #6 - ld1 {v4.8b}, [x0], x2 + ld1 {v4.8b}, [x0], x2 srshr v20.8h, v20.8h, #6 - ld1 {v5.8b}, [x0], x2 + ld1 {v5.8b}, [x0], x2 srshr v21.8h, v21.8h, #6 - ld1 {v6.8b}, [x0], x2 + ld1 {v6.8b}, [x0], x2 srshr v22.8h, v22.8h, #6 - ld1 {v7.8b}, [x0], x2 + ld1 {v7.8b}, [x0], x2 srshr v23.8h, v23.8h, #6 sub x0, x0, x2, lsl #3 @@ -483,23 +473,23 @@ function add8x8_idct8_neon, export=1 sqxtun v1.8b, v17.8h sqxtun v2.8b, v18.8h uaddw v19.8h, 
v19.8h, v3.8b - st1 {v0.8b}, [x0], x2 + st1 {v0.8b}, [x0], x2 uaddw v20.8h, v20.8h, v4.8b - st1 {v1.8b}, [x0], x2 + st1 {v1.8b}, [x0], x2 uaddw v21.8h, v21.8h, v5.8b - st1 {v2.8b}, [x0], x2 + st1 {v2.8b}, [x0], x2 sqxtun v3.8b, v19.8h sqxtun v4.8b, v20.8h uaddw v22.8h, v22.8h, v6.8b uaddw v23.8h, v23.8h, v7.8b - st1 {v3.8b}, [x0], x2 + st1 {v3.8b}, [x0], x2 sqxtun v5.8b, v21.8h - st1 {v4.8b}, [x0], x2 + st1 {v4.8b}, [x0], x2 sqxtun v6.8b, v22.8h sqxtun v7.8b, v23.8h - st1 {v5.8b}, [x0], x2 - st1 {v6.8b}, [x0], x2 - st1 {v7.8b}, [x0], x2 + st1 {v5.8b}, [x0], x2 + st1 {v6.8b}, [x0], x2 + st1 {v7.8b}, [x0], x2 ret endfunc @@ -517,24 +507,24 @@ endfunc function add8x8_idct_dc_neon, export=1 mov x2, #FDEC_STRIDE - ld1 {v16.4h}, [x1] - ld1 {v0.8b}, [x0], x2 + ld1 {v16.4h}, [x1] + ld1 {v0.8b}, [x0], x2 srshr v16.4h, v16.4h, #6 - ld1 {v1.8b}, [x0], x2 + ld1 {v1.8b}, [x0], x2 dup v20.8h, v16.h[0] dup v21.8h, v16.h[1] - ld1 {v2.8b}, [x0], x2 + ld1 {v2.8b}, [x0], x2 dup v22.8h, v16.h[2] dup v23.8h, v16.h[3] - ld1 {v3.8b}, [x0], x2 + ld1 {v3.8b}, [x0], x2 trn1 v20.2d, v20.2d, v21.2d - ld1 {v4.8b}, [x0], x2 + ld1 {v4.8b}, [x0], x2 trn1 v21.2d, v22.2d, v23.2d - ld1 {v5.8b}, [x0], x2 + ld1 {v5.8b}, [x0], x2 neg v22.8h, v20.8h - ld1 {v6.8b}, [x0], x2 + ld1 {v6.8b}, [x0], x2 neg v23.8h, v21.8h - ld1 {v7.8b}, [x0], x2 + ld1 {v7.8b}, [x0], x2 sub x0, x0, #8*FDEC_STRIDE @@ -560,14 +550,14 @@ function add8x8_idct_dc_neon, export=1 uqsub v6.8b, v6.8b, v23.8b uqsub v7.8b, v7.8b, v23.8b - st1 {v0.8b}, [x0], x2 - st1 {v1.8b}, [x0], x2 - st1 {v2.8b}, [x0], x2 - st1 {v3.8b}, [x0], x2 - st1 {v4.8b}, [x0], x2 - st1 {v5.8b}, [x0], x2 - st1 {v6.8b}, [x0], x2 - st1 {v7.8b}, [x0], x2 + st1 {v0.8b}, [x0], x2 + st1 {v1.8b}, [x0], x2 + st1 {v2.8b}, [x0], x2 + st1 {v3.8b}, [x0], x2 + st1 {v4.8b}, [x0], x2 + st1 {v5.8b}, [x0], x2 + st1 {v6.8b}, [x0], x2 + st1 {v7.8b}, [x0], x2 ret endfunc @@ -590,16 +580,16 @@ endfunc sqxtun2 v20.16b, v25.8h sqxtun2 v21.16b, v27.8h - uqadd v4.16b, v4.16b, v20.16b - uqadd v5.16b, v5.16b, v20.16b - uqadd v6.16b, v6.16b, v20.16b - uqadd v7.16b, v7.16b, v20.16b + uqadd v4.16b, v4.16b, v20.16b + uqadd v5.16b, v5.16b, v20.16b + uqadd v6.16b, v6.16b, v20.16b + uqadd v7.16b, v7.16b, v20.16b - uqsub v4.16b, v4.16b, v21.16b - uqsub v5.16b, v5.16b, v21.16b - uqsub v6.16b, v6.16b, v21.16b + uqsub v4.16b, v4.16b, v21.16b + uqsub v5.16b, v5.16b, v21.16b + uqsub v6.16b, v6.16b, v21.16b st1 {v4.16b}, [x2], x3 - uqsub v7.16b, v7.16b, v21.16b + uqsub v7.16b, v7.16b, v21.16b st1 {v5.16b}, [x2], x3 st1 {v6.16b}, [x2], x3 st1 {v7.16b}, [x2], x3 @@ -609,7 +599,7 @@ function add16x16_idct_dc_neon, export=1 mov x2, x0 mov x3, #FDEC_STRIDE - ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1] + ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1] srshr v0.4h, v0.4h, #6 srshr v1.4h, v1.4h, #6 @@ -623,16 +613,16 @@ function add16x16_idct_dc_neon, export=1 endfunc .macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7 - ld1 {\t0\().8b}, [x1], x3 - ld1 {\t1\().8b}, [x2], x4 - ld1 {\t2\().8b}, [x1], x3 - ld1 {\t3\().8b}, [x2], x4 + ld1 {\t0\().8b}, [x1], x3 + ld1 {\t1\().8b}, [x2], x4 + ld1 {\t2\().8b}, [x1], x3 + ld1 {\t3\().8b}, [x2], x4 usubl \t0\().8h, \t0\().8b, \t1\().8b - ld1 {\t4\().8b}, [x1], x3 - ld1 {\t5\().8b}, [x2], x4 + ld1 {\t4\().8b}, [x1], x3 + ld1 {\t5\().8b}, [x2], x4 usubl \t1\().8h, \t2\().8b, \t3\().8b - ld1 {\t6\().8b}, [x1], x3 - ld1 {\t7\().8b}, [x2], x4 + ld1 {\t6\().8b}, [x1], x3 + ld1 {\t7\().8b}, [x2], x4 add \dst\().8h, \t0\().8h, \t1\().8h usubl \t2\().8h, \t4\().8b, \t5\().8b usubl \t3\().8h, \t6\().8b, \t7\().8b @@ 
-641,11 +631,11 @@ endfunc .endm function sub8x8_dct_dc_neon, export=1 - mov x3, #FENC_STRIDE - mov x4, #FDEC_STRIDE + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE - sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 - sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 + sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 transpose v2.2d, v3.2d, v0.2d, v1.2d SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h @@ -656,20 +646,20 @@ function sub8x8_dct_dc_neon, export=1 addp v0.8h, v2.8h, v3.8h addp v0.8h, v0.8h, v0.8h - st1 {v0.4h}, [x0] + st1 {v0.4h}, [x0] ret endfunc function sub8x16_dct_dc_neon, export=1 - mov x3, #FENC_STRIDE - mov x4, #FDEC_STRIDE - sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 - sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 - sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23 - sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31 + mov x3, #FENC_STRIDE + mov x4, #FDEC_STRIDE + sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 + sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23 + sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31 - addp v4.8h, v0.8h, v2.8h - addp v5.8h, v1.8h, v3.8h + addp v4.8h, v0.8h, v2.8h + addp v5.8h, v1.8h, v3.8h transpose v2.4s, v3.4s, v4.4s, v5.4s SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h @@ -685,46 +675,46 @@ function sub8x16_dct_dc_neon, export=1 addp v0.8h, v2.8h, v3.8h - st1 {v0.8h}, [x0] + st1 {v0.8h}, [x0] ret endfunc function zigzag_interleave_8x8_cavlc_neon, export=1 - mov x3, #7 - movi v31.4s, #1 - ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64 - ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64 - umax v16.8h, v0.8h, v4.8h - umax v17.8h, v1.8h, v5.8h - umax v18.8h, v2.8h, v6.8h - umax v19.8h, v3.8h, v7.8h - st1 {v0.8h}, [x0], #16 - st1 {v4.8h}, [x0], #16 - umaxp v16.8h, v16.8h, v17.8h - umaxp v18.8h, v18.8h, v19.8h - st1 {v1.8h}, [x0], #16 - st1 {v5.8h}, [x0], #16 - umaxp v16.8h, v16.8h, v18.8h - st1 {v2.8h}, [x0], #16 - st1 {v6.8h}, [x0], #16 - cmhs v16.4s, v16.4s, v31.4s - st1 {v3.8h}, [x0], #16 - and v16.16b, v16.16b, v31.16b - st1 {v7.8h}, [x0], #16 - st1 {v16.b}[0], [x2], #1 - st1 {v16.b}[4], [x2], x3 - st1 {v16.b}[8], [x2], #1 - st1 {v16.b}[12], [x2] + mov x3, #7 + movi v31.4s, #1 + ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64 + ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64 + umax v16.8h, v0.8h, v4.8h + umax v17.8h, v1.8h, v5.8h + umax v18.8h, v2.8h, v6.8h + umax v19.8h, v3.8h, v7.8h + st1 {v0.8h}, [x0], #16 + st1 {v4.8h}, [x0], #16 + umaxp v16.8h, v16.8h, v17.8h + umaxp v18.8h, v18.8h, v19.8h + st1 {v1.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + umaxp v16.8h, v16.8h, v18.8h + st1 {v2.8h}, [x0], #16 + st1 {v6.8h}, [x0], #16 + cmhs v16.4s, v16.4s, v31.4s + st1 {v3.8h}, [x0], #16 + and v16.16b, v16.16b, v31.16b + st1 {v7.8h}, [x0], #16 + st1 {v16.b}[0], [x2], #1 + st1 {v16.b}[4], [x2], x3 + st1 {v16.b}[8], [x2], #1 + st1 {v16.b}[12], [x2] ret endfunc function zigzag_scan_4x4_frame_neon, export=1 movrel x2, scan4x4_frame - ld1 {v0.16b,v1.16b}, [x1] - ld1 {v16.16b,v17.16b}, [x2] + ld1 {v0.16b,v1.16b}, [x1] + ld1 {v16.16b,v17.16b}, [x2] tbl v2.16b, {v0.16b,v1.16b}, v16.16b tbl v3.16b, {v0.16b,v1.16b}, v17.16b - st1 {v2.16b,v3.16b}, [x0] + st1 {v2.16b,v3.16b}, [x0] ret endfunc @@ -734,18 +724,18 @@ function zigzag_sub_4x4\ac\()_\f\()_neon, export=1 mov x4, #FDEC_STRIDE movrel x5, sub4x4_\f mov x6, x2 - ld1 {v0.s}[0], [x1], x9 - ld1 {v0.s}[1], [x1], x9 - ld1 {v0.s}[2], [x1], 
x9 - ld1 {v0.s}[3], [x1], x9 - ld1 {v16.16b}, [x5] - ld1 {v1.s}[0], [x2], x4 - ld1 {v1.s}[1], [x2], x4 - ld1 {v1.s}[2], [x2], x4 - ld1 {v1.s}[3], [x2], x4 + ld1 {v0.s}[0], [x1], x9 + ld1 {v0.s}[1], [x1], x9 + ld1 {v0.s}[2], [x1], x9 + ld1 {v0.s}[3], [x1], x9 + ld1 {v16.16b}, [x5] + ld1 {v1.s}[0], [x2], x4 + ld1 {v1.s}[1], [x2], x4 + ld1 {v1.s}[2], [x2], x4 + ld1 {v1.s}[3], [x2], x4 tbl v2.16b, {v0.16b}, v16.16b tbl v3.16b, {v1.16b}, v16.16b - st1 {v0.s}[0], [x6], x4 + st1 {v0.s}[0], [x6], x4 usubl v4.8h, v2.8b, v3.8b .ifc \ac, ac dup h7, v4.h[0] @@ -754,14 +744,14 @@ function zigzag_sub_4x4\ac\()_\f\()_neon, export=1 strh w5, [x3] .endif usubl2 v5.8h, v2.16b, v3.16b - st1 {v0.s}[1], [x6], x4 + st1 {v0.s}[1], [x6], x4 umax v6.8h, v4.8h, v5.8h umaxv h6, v6.8h - st1 {v0.s}[2], [x6], x4 + st1 {v0.s}[2], [x6], x4 fmov w7, s6 - st1 {v0.s}[3], [x6], x4 + st1 {v0.s}[3], [x6], x4 cmp w7, #0 - st1 {v4.8h,v5.8h}, [x0] + st1 {v4.8h,v5.8h}, [x0] cset w0, ne ret endfunc @@ -774,23 +764,23 @@ zigzag_sub_4x4 frame, ac function zigzag_scan_4x4_field_neon, export=1 movrel x2, scan4x4_field - ld1 {v0.8h,v1.8h}, [x1] - ld1 {v16.16b}, [x2] + ld1 {v0.8h,v1.8h}, [x1] + ld1 {v16.16b}, [x2] tbl v0.16b, {v0.16b}, v16.16b - st1 {v0.8h,v1.8h}, [x0] + st1 {v0.8h,v1.8h}, [x0] ret endfunc function zigzag_scan_8x8_frame_neon, export=1 movrel x2, scan8x8_frame - ld1 {v0.8h,v1.8h}, [x1], #32 - ld1 {v2.8h,v3.8h}, [x1], #32 - ld1 {v4.8h,v5.8h}, [x1], #32 - ld1 {v6.8h,v7.8h}, [x1] - ld1 {v16.16b,v17.16b}, [x2], #32 - ld1 {v18.16b,v19.16b}, [x2], #32 - ld1 {v20.16b,v21.16b}, [x2], #32 - ld1 {v22.16b,v23.16b}, [x2], #32 + ld1 {v0.8h,v1.8h}, [x1], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 + ld1 {v4.8h,v5.8h}, [x1], #32 + ld1 {v6.8h,v7.8h}, [x1] + ld1 {v16.16b,v17.16b}, [x2], #32 + ld1 {v18.16b,v19.16b}, [x2], #32 + ld1 {v20.16b,v21.16b}, [x2], #32 + ld1 {v22.16b,v23.16b}, [x2], #32 tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b @@ -807,10 +797,10 @@ function zigzag_scan_8x8_frame_neon, export=1 mov v29.h[7], v3.h[6] mov v30.h[0], v2.h[7] mov v30.h[1], v3.h[7] - st1 {v24.8h,v25.8h}, [x0], #32 - st1 {v26.8h,v27.8h}, [x0], #32 - st1 {v28.8h,v29.8h}, [x0], #32 - st1 {v30.8h,v31.8h}, [x0] + st1 {v24.8h,v25.8h}, [x0], #32 + st1 {v26.8h,v27.8h}, [x0], #32 + st1 {v28.8h,v29.8h}, [x0], #32 + st1 {v30.8h,v31.8h}, [x0] ret endfunc @@ -843,14 +833,14 @@ endconst function zigzag_scan_8x8_field_neon, export=1 movrel x2, scan8x8_field - ld1 {v0.8h,v1.8h}, [x1], #32 - ld1 {v2.8h,v3.8h}, [x1], #32 - ld1 {v4.8h,v5.8h}, [x1], #32 - ld1 {v6.8h,v7.8h}, [x1] - ld1 {v16.16b,v17.16b}, [x2], #32 - ld1 {v18.16b,v19.16b}, [x2], #32 - ld1 {v20.16b,v21.16b}, [x2], #32 - ld1 {v22.16b}, [x2] + ld1 {v0.8h,v1.8h}, [x1], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 + ld1 {v4.8h,v5.8h}, [x1], #32 + ld1 {v6.8h,v7.8h}, [x1] + ld1 {v16.16b,v17.16b}, [x2], #32 + ld1 {v18.16b,v19.16b}, [x2], #32 + ld1 {v20.16b,v21.16b}, [x2], #32 + ld1 {v22.16b}, [x2] ext v31.16b, v7.16b, v7.16b, #4 tbl v24.16b, {v0.16b,v1.16b}, v16.16b tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b @@ -860,10 +850,10 @@ function zigzag_scan_8x8_field_neon, export=1 tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b ext v31.16b, v6.16b, v31.16b, #12 - st1 {v24.8h,v25.8h}, [x0], #32 - st1 {v26.8h,v27.8h}, [x0], #32 - st1 {v28.8h,v29.8h}, [x0], #32 - st1 {v30.8h,v31.8h}, [x0] + st1 {v24.8h,v25.8h}, [x0], #32 + st1 {v26.8h,v27.8h}, [x0], #32 + st1 {v28.8h,v29.8h}, [x0], 
#32 + st1 {v30.8h,v31.8h}, [x0] ret endfunc @@ -873,24 +863,24 @@ function zigzag_sub_8x8_\f\()_neon, export=1 mov x5, #FENC_STRIDE mov x6, #FDEC_STRIDE mov x7, x2 - ld1 {v0.d}[0], [x1], x5 - ld1 {v0.d}[1], [x1], x5 - ld1 {v1.d}[0], [x1], x5 - ld1 {v1.d}[1], [x1], x5 - ld1 {v2.d}[0], [x1], x5 - ld1 {v2.d}[1], [x1], x5 - ld1 {v3.d}[0], [x1], x5 - ld1 {v3.d}[1], [x1] - ld1 {v4.d}[0], [x2], x6 - ld1 {v4.d}[1], [x2], x6 - ld1 {v5.d}[0], [x2], x6 - ld1 {v5.d}[1], [x2], x6 - ld1 {v6.d}[0], [x2], x6 - ld1 {v6.d}[1], [x2], x6 - ld1 {v7.d}[0], [x2], x6 - ld1 {v7.d}[1], [x2] - ld1 {v16.16b,v17.16b}, [x4], #32 - ld1 {v18.16b,v19.16b}, [x4], #32 + ld1 {v0.d}[0], [x1], x5 + ld1 {v0.d}[1], [x1], x5 + ld1 {v1.d}[0], [x1], x5 + ld1 {v1.d}[1], [x1], x5 + ld1 {v2.d}[0], [x1], x5 + ld1 {v2.d}[1], [x1], x5 + ld1 {v3.d}[0], [x1], x5 + ld1 {v3.d}[1], [x1] + ld1 {v4.d}[0], [x2], x6 + ld1 {v4.d}[1], [x2], x6 + ld1 {v5.d}[0], [x2], x6 + ld1 {v5.d}[1], [x2], x6 + ld1 {v6.d}[0], [x2], x6 + ld1 {v6.d}[1], [x2], x6 + ld1 {v7.d}[0], [x2], x6 + ld1 {v7.d}[1], [x2] + ld1 {v16.16b,v17.16b}, [x4], #32 + ld1 {v18.16b,v19.16b}, [x4], #32 tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b @@ -915,18 +905,18 @@ function zigzag_sub_8x8_\f\()_neon, export=1 umax v21.8h, v22.8h, v23.8h umax v20.8h, v20.8h, v21.8h umaxv h22, v20.8h - st1 {v0.d}[0], [x7], x6 - st1 {v0.d}[1], [x7], x6 - st1 {v1.d}[0], [x7], x6 - st1 {v1.d}[1], [x7], x6 - st1 {v2.d}[0], [x7], x6 - st1 {v2.d}[1], [x7], x6 - st1 {v3.d}[0], [x7], x6 - st1 {v3.d}[1], [x7] - st1 {v4.8h,v5.8h}, [x0], #32 - st1 {v6.8h,v7.8h}, [x0], #32 - st1 {v16.8h,v17.8h}, [x0], #32 - st1 {v18.8h,v19.8h}, [x0] + st1 {v0.d}[0], [x7], x6 + st1 {v0.d}[1], [x7], x6 + st1 {v1.d}[0], [x7], x6 + st1 {v1.d}[1], [x7], x6 + st1 {v2.d}[0], [x7], x6 + st1 {v2.d}[1], [x7], x6 + st1 {v3.d}[0], [x7], x6 + st1 {v3.d}[1], [x7] + st1 {v4.8h,v5.8h}, [x0], #32 + st1 {v6.8h,v7.8h}, [x0], #32 + st1 {v16.8h,v17.8h}, [x0], #32 + st1 {v18.8h,v19.8h}, [x0] fmov w9, s22 cmp w9, #0 cset w0, ne diff --git a/common/aarch64/dct.h b/common/aarch64/dct.h index 1590b0d77..40066cb63 100644 --- a/common/aarch64/dct.h +++ b/common/aarch64/dct.h @@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: aarch64 transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau @@ -91,4 +91,13 @@ int x264_zigzag_sub_8x8_frame_neon( dctcoef level[16], const pixel *p_src, pixel #define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon) void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz ); +#define x264_sub4x4_dct_sve x264_template(sub4x4_dct_sve) +void x264_sub4x4_dct_sve( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 ); + +#define x264_add4x4_idct_sve2 x264_template(add4x4_idct_sve2) +void x264_add4x4_idct_sve2( uint8_t *p_dst, int16_t dct[16] ); + +#define x264_zigzag_interleave_8x8_cavlc_sve x264_template(zigzag_interleave_8x8_cavlc_sve) +void x264_zigzag_interleave_8x8_cavlc_sve( dctcoef *dst, dctcoef *src, uint8_t *nnz ); + #endif diff --git a/common/aarch64/deblock-a-common.S b/common/aarch64/deblock-a-common.S new file mode 100644 index 000000000..c871cb739 --- /dev/null +++ b/common/aarch64/deblock-a-common.S @@ -0,0 +1,43 @@ 
+/***************************************************************************** + * deblock-a-common.S: aarch64 deblocking + ***************************************************************************** + * Copyright (C) 2009-2024 x264 project + * + * Authors: Mans Rullgard + * Janne Grunau + * David Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +// This file contains the NEON macros that are intended to be used by +// the SVE/SVE2 functions as well + +.macro h264_loop_filter_start + cmp w2, #0 + ldr w6, [x4] + ccmp w3, #0, #0, ne + mov v24.s[0], w6 + and w8, w6, w6, lsl #16 + b.eq 1f + ands w8, w8, w8, lsl #8 + b.ge 2f +1: + ret +2: +.endm diff --git a/common/aarch64/deblock-a-sve.S b/common/aarch64/deblock-a-sve.S new file mode 100644 index 000000000..e38efc9cb --- /dev/null +++ b/common/aarch64/deblock-a-sve.S @@ -0,0 +1,98 @@ +/***************************************************************************** + * deblock-a-sve.S: aarch64 deblocking + ***************************************************************************** + * Copyright (C) 2009-2024 x264 project + * + * Authors: David Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "asm.S" +#include "deblock-a-common.S" + +.arch armv8-a+sve + +.macro h264_loop_filter_chroma_sve + ptrue p0.b, vl16 + + dup v22.16b, w2 // alpha + uxtl v24.8h, v24.8b + uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0) + uxtl v4.8h, v0.8b + uxtl2 v5.8h, v0.16b + uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0) + usubw v4.8h, v4.8h, v16.8b + usubw2 v5.8h, v5.8h, v16.16b + sli v24.8h, v24.8h, #8 + shl v4.8h, v4.8h, #2 + shl v5.8h, v5.8h, #2 + uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0) + uxtl v24.4s, v24.4h + uaddw v4.8h, v4.8h, v18.8b + uaddw2 v5.8h, v5.8h, v18.16b + + cmphi p1.b, p0/z, z22.b, z26.b + usubw v4.8h, v4.8h, v2.8b + usubw2 v5.8h, v5.8h, v2.16b + sli v24.4s, v24.4s, #16 + dup v22.16b, w3 // beta + rshrn v4.8b, v4.8h, #3 + rshrn2 v4.16b, v5.8h, #3 + cmphi p2.b, p0/z, z22.b, z28.b + cmphi p3.b, p0/z, z22.b, z30.b + smin v4.16b, v4.16b, v24.16b + neg v25.16b, v24.16b + and p1.b, p0/z, p1.b, p2.b + smax v4.16b, v4.16b, v25.16b + and p1.b, p0/z, p1.b, p3.b + uxtl v22.8h, v0.8b + uxtl2 v23.8h, v0.16b + + uxtl v28.8h, v16.8b + uxtl2 v29.8h, v16.16b + saddw v28.8h, v28.8h, v4.8b + saddw2 v29.8h, v29.8h, v4.16b + ssubw v22.8h, v22.8h, v4.8b + ssubw2 v23.8h, v23.8h, v4.16b + sqxtun v16.8b, v28.8h + sqxtun v0.8b, v22.8h + sqxtun2 v16.16b, v29.8h + sqxtun2 v0.16b, v23.8h +.endm + +function deblock_v_chroma_sve, export=1 + h264_loop_filter_start + + sub x0, x0, x1, lsl #1 + // No performance improvement if sve load is used. So, continue using + // NEON load here + ld1 {v18.16b}, [x0], x1 + ld1 {v16.16b}, [x0], x1 + ld1 {v0.16b}, [x0], x1 + ld1 {v2.16b}, [x0] + + h264_loop_filter_chroma_sve + + sub x0, x0, x1, lsl #1 + st1b {z16.b}, p1, [x0] + add x0, x0, x1 + st1b {z0.b}, p1, [x0] + + ret +endfunc diff --git a/common/aarch64/deblock-a.S b/common/aarch64/deblock-a.S index 344d8458d..718fe155d 100644 --- a/common/aarch64/deblock-a.S +++ b/common/aarch64/deblock-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * deblock.S: aarch64 deblocking ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: Mans Rullgard * Janne Grunau @@ -25,20 +25,7 @@ *****************************************************************************/ #include "asm.S" - -.macro h264_loop_filter_start - cmp w2, #0 - ldr w6, [x4] - ccmp w3, #0, #0, ne - mov v24.s[0], w6 - and w8, w6, w6, lsl #16 - b.eq 1f - ands w8, w8, w8, lsl #8 - b.ge 2f -1: - ret -2: -.endm +#include "deblock-a-common.S" .macro h264_loop_filter_luma dup v22.16b, w2 // alpha @@ -714,10 +701,10 @@ function deblock_strength_neon, export=1 bframe: // load bytes ref add x2, x2, #16 - ld1 {v31.d}[1], [x1], #8 - ld1 {v1.16b}, [x1], #16 + ld1 {v31.d}[1], [x1], #8 + ld1 {v1.16b}, [x1], #16 movi v0.16b, #0 - ld1 {v2.16b}, [x1], #16 + ld1 {v2.16b}, [x1], #16 ext v3.16b, v0.16b, v1.16b, #15 ext v0.16b, v0.16b, v2.16b, #15 unzip v21.4s, v22.4s, v1.4s, v2.4s @@ -729,20 +716,20 @@ bframe: orr v4.16b, v4.16b, v0.16b orr v5.16b, v5.16b, v1.16b - ld1 {v21.8h}, [x2], #16 // mv + 0x10 - ld1 {v19.8h}, [x2], #16 // mv + 0x20 - ld1 {v22.8h}, [x2], #16 // mv + 0x30 - ld1 {v18.8h}, [x2], #16 // mv + 0x40 - ld1 {v23.8h}, [x2], #16 // mv + 0x50 + ld1 {v21.8h}, [x2], #16 // mv + 0x10 + ld1 {v19.8h}, [x2], #16 // mv + 0x20 + ld1 {v22.8h}, [x2], #16 // mv + 0x30 + ld1 {v18.8h}, [x2], #16 // mv + 0x40 + ld1 {v23.8h}, [x2], #16 // mv + 
0x50 ext v19.16b, v19.16b, v22.16b, #12 ext v18.16b, v18.16b, v23.16b, #12 sabd v0.8h, v22.8h, v19.8h - ld1 {v19.8h}, [x2], #16 // mv + 0x60 + ld1 {v19.8h}, [x2], #16 // mv + 0x60 sabd v1.8h, v23.8h, v18.8h - ld1 {v24.8h}, [x2], #16 // mv + 0x70 + ld1 {v24.8h}, [x2], #16 // mv + 0x70 uqxtn v0.8b, v0.8h - ld1 {v18.8h}, [x2], #16 // mv + 0x80 - ld1 {v25.8h}, [x2], #16 // mv + 0x90 + ld1 {v18.8h}, [x2], #16 // mv + 0x80 + ld1 {v25.8h}, [x2], #16 // mv + 0x90 uqxtn2 v0.16b, v1.8h ext v19.16b, v19.16b, v24.16b, #12 ext v18.16b, v18.16b, v25.16b, #12 @@ -777,10 +764,10 @@ bframe: movi v6.16b, #1 // load bytes nnz - ld1 {v31.d}[1], [x0], #8 - ld1 {v1.16b}, [x0], #16 + ld1 {v31.d}[1], [x0], #8 + ld1 {v1.16b}, [x0], #16 movi v0.16b, #0 - ld1 {v2.16b}, [x0], #16 + ld1 {v2.16b}, [x0], #16 ext v3.16b, v0.16b, v1.16b, #15 ext v0.16b, v0.16b, v2.16b, #15 unzip v21.4s, v22.4s, v1.4s, v2.4s @@ -788,7 +775,7 @@ bframe: ext v21.16b, v31.16b, v22.16b, #12 movrel x7, transpose_table - ld1 {v7.16b}, [x7] + ld1 {v7.16b}, [x7] orr v0.16b, v20.16b, v22.16b orr v1.16b, v21.16b, v22.16b umin v0.16b, v0.16b, v6.16b @@ -800,8 +787,8 @@ bframe: umax v4.16b, v4.16b, v0.16b umax v5.16b, v5.16b, v1.16b tbl v6.16b, {v4.16b}, v7.16b - st1 {v5.16b}, [x3], x6 // bs[1] - st1 {v6.16b}, [x3] // bs[0] + st1 {v5.16b}, [x3], x6 // bs[1] + st1 {v6.16b}, [x3] // bs[0] ret endfunc diff --git a/common/aarch64/deblock.h b/common/aarch64/deblock.h index 8eb9d036d..e4c60e346 100644 --- a/common/aarch64/deblock.h +++ b/common/aarch64/deblock.h @@ -1,7 +1,7 @@ /***************************************************************************** * deblock.h: aarch64 deblocking ***************************************************************************** - * Copyright (C) 2017-2023 x264 project + * Copyright (C) 2017-2024 x264 project * * Authors: Anton Mitrofanov * @@ -55,4 +55,7 @@ void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, i #define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon) void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta ); +#define x264_deblock_v_chroma_sve x264_template(deblock_v_chroma_sve) +void x264_deblock_v_chroma_sve( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); + #endif diff --git a/common/aarch64/mc-a-common.S b/common/aarch64/mc-a-common.S new file mode 100644 index 000000000..fa620ecc0 --- /dev/null +++ b/common/aarch64/mc-a-common.S @@ -0,0 +1,66 @@ +/**************************************************************************** + * mc-a-common.S: aarch64 motion compensation + ***************************************************************************** + * Copyright (C) 2009-2024 x264 project + * + * Authors: David Conrad + * Janne Grunau + * Mans Rullgard + * Stefan Groenroos + * David Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. 
+ * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +// This file contains the NEON macros and functions that are intended to be used by +// the SVE/SVE2 functions as well + +#if BIT_DEPTH == 8 + +// 0 < weight < 64 +.macro load_weights_add_add + mov w6, w6 +.endm + +// weight > 64 +.macro load_weights_add_sub + neg w7, w7 +.endm + +// weight < 0 +.macro load_weights_sub_add + neg w6, w6 +.endm + +function pixel_avg_w4_neon +1: subs w9, w9, #2 + ld1 {v0.s}[0], [x2], x3 + ld1 {v2.s}[0], [x4], x5 + urhadd v0.8b, v0.8b, v2.8b + ld1 {v1.s}[0], [x2], x3 + ld1 {v3.s}[0], [x4], x5 + urhadd v1.8b, v1.8b, v3.8b + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + b.gt 1b + ret +endfunc + +#else // BIT_DEPTH == 10 + +#endif diff --git a/common/aarch64/mc-a-sve.S b/common/aarch64/mc-a-sve.S new file mode 100644 index 000000000..632aa7b7c --- /dev/null +++ b/common/aarch64/mc-a-sve.S @@ -0,0 +1,108 @@ +/***************************************************************************** + * mc-a-sve.S: aarch64 motion compensation + ***************************************************************************** + * Copyright (C) 2009-2024 x264 project + * + * Authors: David Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "asm.S" +#include "mc-a-common.S" + +.arch armv8-a+sve + +#if BIT_DEPTH == 8 + +// void pixel_avg( uint8_t *dst, intptr_t dst_stride, +// uint8_t *src1, intptr_t src1_stride, +// uint8_t *src2, intptr_t src2_stride, int weight ); +.macro AVGH_SVE w h +function pixel_avg_\w\()x\h\()_sve, export=1 + mov w10, #64 + cmp w6, #32 + mov w9, #\h + b.eq pixel_avg_w\w\()_neon + subs w7, w10, w6 + b.lt pixel_avg_weight_w\w\()_add_sub_sve // weight > 64 + cmp w6, #0 + b.ge pixel_avg_weight_w\w\()_add_add_sve + b pixel_avg_weight_w\w\()_sub_add_sve // weight < 0 +endfunc +.endm + +AVGH_SVE 4, 2 +AVGH_SVE 4, 4 +AVGH_SVE 4, 8 +AVGH_SVE 4, 16 + +// 0 < weight < 64 +.macro weight_add_add_sve dst, s1, s2, h= + mul \dst, \s1, v30.8h + mla \dst, \s2, v31.8h +.endm + +// weight > 64 +.macro weight_add_sub_sve dst, s1, s2, h= + mul \dst, \s1, v30.8h + mls \dst, \s2, v31.8h +.endm + +// weight < 0 +.macro weight_sub_add_sve dst, s1, s2, h= + mul \dst, \s2, v31.8h + mls \dst, \s1, v30.8h +.endm + +.macro AVG_WEIGHT_SVE ext +function pixel_avg_weight_w4_\ext\()_sve + load_weights_\ext + ptrue p0.b, vl8 + dup v30.8h, w6 + dup v31.8h, w7 +1: // height loop + subs w9, w9, #2 + ld1b {z0.h}, p0/z, [x2] + add x2, x2, x3 + ld1b {z1.h}, p0/z, [x4] + add x4, x4, x5 + weight_\ext\()_sve v4.8h, v0.8h, v1.8h + ld1b {z2.h}, p0/z, [x2] + add x2, x2, x3 + ld1b {z3.h}, p0/z, [x4] + add x4, x4, x5 + + sqrshrun v0.8b, v4.8h, #6 + weight_\ext\()_sve v5.8h, v2.8h, v3.8h + st1 {v0.s}[0], [x0], x1 + sqrshrun v1.8b, v5.8h, #6 + st1 {v1.s}[0], [x0], x1 + b.gt 1b + ret +endfunc +.endm + +AVG_WEIGHT_SVE add_add +AVG_WEIGHT_SVE add_sub +AVG_WEIGHT_SVE sub_add + +#else // BIT_DEPTH == 10 + + +#endif diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S index 40371dfe7..9c52d0454 100644 --- a/common/aarch64/mc-a.S +++ b/common/aarch64/mc-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * mc.S: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau @@ -27,6 +27,7 @@ *****************************************************************************/ #include "asm.S" +#include "mc-a-common.S" // note: prefetch stuff assumes 64-byte cacheline @@ -85,6 +86,220 @@ endfunc prefetch_fenc 420 prefetch_fenc 422 +function mbtree_propagate_cost_neon, export=1 + ld1r {v5.4s}, [x5] +8: + subs w6, w6, #8 + ld1 {v1.8h}, [x1], #16 + ld1 {v2.8h}, [x2], #16 + ld1 {v3.8h}, [x3], #16 + ld1 {v4.8h}, [x4], #16 + bic v3.8h, #0xc0, lsl #8 + umin v3.8h, v2.8h, v3.8h + umull v20.4s, v2.4h, v4.4h // propagate_intra + umull2 v21.4s, v2.8h, v4.8h // propagate_intra + usubl v22.4s, v2.4h, v3.4h // propagate_num + usubl2 v23.4s, v2.8h, v3.8h // propagate_num + uxtl v26.4s, v2.4h // propagate_denom + uxtl2 v27.4s, v2.8h // propagate_denom + uxtl v24.4s, v1.4h + uxtl2 v25.4s, v1.8h + ucvtf v20.4s, v20.4s + ucvtf v21.4s, v21.4s + ucvtf v26.4s, v26.4s + ucvtf v27.4s, v27.4s + ucvtf v22.4s, v22.4s + ucvtf v23.4s, v23.4s + frecpe v28.4s, v26.4s + frecpe v29.4s, v27.4s + ucvtf v24.4s, v24.4s + ucvtf v25.4s, v25.4s + frecps v30.4s, v28.4s, v26.4s + frecps v31.4s, v29.4s, v27.4s + fmla v24.4s, v20.4s, v5.4s // propagate_amount + fmla v25.4s, v21.4s, v5.4s // propagate_amount + fmul v28.4s, v28.4s, v30.4s + fmul v29.4s, v29.4s, v31.4s + fmul v16.4s, v24.4s, v22.4s + fmul 
v17.4s, v25.4s, v23.4s + fmul v18.4s, v16.4s, v28.4s + fmul v19.4s, v17.4s, v29.4s + fcvtns v20.4s, v18.4s + fcvtns v21.4s, v19.4s + sqxtn v0.4h, v20.4s + sqxtn2 v0.8h, v21.4s + st1 {v0.8h}, [x0], #16 + b.gt 8b + ret +endfunc + +const pw_0to15, align=5 + .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +endconst + +function mbtree_propagate_list_internal_neon, export=1 + movrel x11, pw_0to15 + dup v31.8h, w4 // bipred_weight + movi v30.8h, #0xc0, lsl #8 + ld1 {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y + movi v28.4s, #4 + movi v27.8h, #31 + movi v26.8h, #32 + dup v24.8h, w5 // mb_y + zip1 v29.8h, v29.8h, v24.8h +8: + subs w6, w6, #8 + ld1 {v1.8h}, [x1], #16 // propagate_amount + ld1 {v2.8h}, [x2], #16 // lowres_cost + and v2.16b, v2.16b, v30.16b + cmeq v25.8h, v2.8h, v30.8h + umull v16.4s, v1.4h, v31.4h + umull2 v17.4s, v1.8h, v31.8h + rshrn v16.4h, v16.4s, #6 + rshrn2 v16.8h, v17.4s, #6 + bsl v25.16b, v16.16b, v1.16b // if( lists_used == 3 ) + // propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 + ld1 {v4.8h,v5.8h}, [x0], #32 + sshr v6.8h, v4.8h, #5 + sshr v7.8h, v5.8h, #5 + add v6.8h, v6.8h, v29.8h + add v29.8h, v29.8h, v28.8h + add v7.8h, v7.8h, v29.8h + add v29.8h, v29.8h, v28.8h + st1 {v6.8h,v7.8h}, [x3], #32 + and v4.16b, v4.16b, v27.16b + and v5.16b, v5.16b, v27.16b + uzp1 v6.8h, v4.8h, v5.8h // x & 31 + uzp2 v7.8h, v4.8h, v5.8h // y & 31 + sub v4.8h, v26.8h, v6.8h // 32 - (x & 31) + sub v5.8h, v26.8h, v7.8h // 32 - (y & 31) + mul v19.8h, v6.8h, v7.8h // idx3weight = y*x; + mul v18.8h, v4.8h, v7.8h // idx2weight = y*(32-x); + mul v17.8h, v6.8h, v5.8h // idx1weight = (32-y)*x; + mul v16.8h, v4.8h, v5.8h // idx0weight = (32-y)*(32-x) ; + umull v6.4s, v19.4h, v25.4h + umull2 v7.4s, v19.8h, v25.8h + umull v4.4s, v18.4h, v25.4h + umull2 v5.4s, v18.8h, v25.8h + umull v2.4s, v17.4h, v25.4h + umull2 v3.4s, v17.8h, v25.8h + umull v0.4s, v16.4h, v25.4h + umull2 v1.4s, v16.8h, v25.8h + rshrn v19.4h, v6.4s, #10 + rshrn2 v19.8h, v7.4s, #10 + rshrn v18.4h, v4.4s, #10 + rshrn2 v18.8h, v5.4s, #10 + rshrn v17.4h, v2.4s, #10 + rshrn2 v17.8h, v3.4s, #10 + rshrn v16.4h, v0.4s, #10 + rshrn2 v16.8h, v1.4s, #10 + zip1 v0.8h, v16.8h, v17.8h + zip2 v1.8h, v16.8h, v17.8h + zip1 v2.8h, v18.8h, v19.8h + zip2 v3.8h, v18.8h, v19.8h + st1 {v0.8h,v1.8h}, [x3], #32 + st1 {v2.8h,v3.8h}, [x3], #32 + b.ge 8b + ret +endfunc + +function memcpy_aligned_neon, export=1 + tst x2, #16 + b.eq 32f + sub x2, x2, #16 + ldr q0, [x1], #16 + str q0, [x0], #16 +32: + tst x2, #32 + b.eq 640f + sub x2, x2, #32 + ldp q0, q1, [x1], #32 + stp q0, q1, [x0], #32 +640: + cbz x2, 1f +64: + subs x2, x2, #64 + ldp q0, q1, [x1, #32] + ldp q2, q3, [x1], #64 + stp q0, q1, [x0, #32] + stp q2, q3, [x0], #64 + b.gt 64b +1: + ret +endfunc + +function memzero_aligned_neon, export=1 + movi v0.16b, #0 + movi v1.16b, #0 +1: + subs x1, x1, #128 + stp q0, q1, [x0, #96] + stp q0, q1, [x0, #64] + stp q0, q1, [x0, #32] + stp q0, q1, [x0], 128 + b.gt 1b + ret +endfunc + +// void mbtree_fix8_pack( int16_t *dst, float *src, int count ) +function mbtree_fix8_pack_neon, export=1 + subs w3, w2, #8 + b.lt 2f +1: + subs w3, w3, #8 + ld1 {v0.4s,v1.4s}, [x1], #32 + fcvtzs v0.4s, v0.4s, #8 + fcvtzs v1.4s, v1.4s, #8 + sqxtn v2.4h, v0.4s + sqxtn2 v2.8h, v1.4s + rev16 v3.16b, v2.16b + st1 {v3.8h}, [x0], #16 + b.ge 1b +2: + adds w3, w3, #8 + b.eq 4f +3: + subs w3, w3, #1 + ldr s0, [x1], #4 + fcvtzs w4, s0, #8 + rev16 w5, w4 + strh w5, [x0], #2 + b.gt 3b +4: + ret +endfunc + +// void mbtree_fix8_unpack( float *dst, int16_t *src, int count ) +function 
mbtree_fix8_unpack_neon, export=1 + subs w3, w2, #8 + b.lt 2f +1: + subs w3, w3, #8 + ld1 {v0.8h}, [x1], #16 + rev16 v1.16b, v0.16b + sxtl v2.4s, v1.4h + sxtl2 v3.4s, v1.8h + scvtf v4.4s, v2.4s, #8 + scvtf v5.4s, v3.4s, #8 + st1 {v4.4s,v5.4s}, [x0], #32 + b.ge 1b +2: + adds w3, w3, #8 + b.eq 4f +3: + subs w3, w3, #1 + ldrh w4, [x1], #2 + rev16 w5, w4 + sxth w6, w5 + scvtf s0, w6, #8 + str s0, [x0], #4 + b.gt 3b +4: + ret +endfunc + +#if BIT_DEPTH == 8 + // void pixel_avg( uint8_t *dst, intptr_t dst_stride, // uint8_t *src1, intptr_t src1_stride, // uint8_t *src2, intptr_t src2_stride, int weight ); @@ -113,9 +328,6 @@ AVGH 16, 8 AVGH 16, 16 // 0 < weight < 64 -.macro load_weights_add_add - mov w6, w6 -.endm .macro weight_add_add dst, s1, s2, h= .ifc \h, 2 umull2 \dst, \s1, v30.16b @@ -127,9 +339,6 @@ AVGH 16, 16 .endm // weight > 64 -.macro load_weights_add_sub - neg w7, w7 -.endm .macro weight_add_sub dst, s1, s2, h= .ifc \h, 2 umull2 \dst, \s1, v30.16b @@ -141,9 +350,6 @@ AVGH 16, 16 .endm // weight < 0 -.macro load_weights_sub_add - neg w6, w6 -.endm .macro weight_sub_add dst, s1, s2, h= .ifc \h, 2 umull2 \dst, \s2, v31.16b @@ -161,16 +367,16 @@ function pixel_avg_weight_w4_\ext\()_neon dup v31.8b, w7 1: // height loop subs w9, w9, #2 - ld1 {v0.s}[0], [x2], x3 - ld1 {v1.s}[0], [x4], x5 + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x4], x5 weight_\ext v4.8h, v0.8b, v1.8b - ld1 {v2.s}[0], [x2], x3 - ld1 {v3.s}[0], [x4], x5 + ld1 {v2.s}[0], [x2], x3 + ld1 {v3.s}[0], [x4], x5 sqrshrun v0.8b, v4.8h, #6 weight_\ext v5.8h, v2.8b, v3.8b - st1 {v0.s}[0], [x0], x1 + st1 {v0.s}[0], [x0], x1 sqrshrun v1.8b, v5.8h, #6 - st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 b.gt 1b ret endfunc @@ -181,26 +387,26 @@ function pixel_avg_weight_w8_\ext\()_neon dup v31.8b, w7 1: // height loop subs w9, w9, #4 - ld1 {v0.8b}, [x2], x3 - ld1 {v1.8b}, [x4], x5 + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x4], x5 weight_\ext v16.8h, v0.8b, v1.8b - ld1 {v2.8b}, [x2], x3 - ld1 {v3.8b}, [x4], x5 + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x4], x5 weight_\ext v17.8h, v2.8b, v3.8b - ld1 {v4.8b}, [x2], x3 - ld1 {v5.8b}, [x4], x5 + ld1 {v4.8b}, [x2], x3 + ld1 {v5.8b}, [x4], x5 weight_\ext v18.8h, v4.8b, v5.8b - ld1 {v6.8b}, [x2], x3 - ld1 {v7.8b}, [x4], x5 + ld1 {v6.8b}, [x2], x3 + ld1 {v7.8b}, [x4], x5 weight_\ext v19.8h, v6.8b, v7.8b sqrshrun v0.8b, v16.8h, #6 sqrshrun v1.8b, v17.8h, #6 sqrshrun v2.8b, v18.8h, #6 sqrshrun v3.8b, v19.8h, #6 - st1 {v0.8b}, [x0], x1 - st1 {v1.8b}, [x0], x1 - st1 {v2.8b}, [x0], x1 - st1 {v3.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 b.gt 1b ret endfunc @@ -211,20 +417,20 @@ function pixel_avg_weight_w16_\ext\()_neon dup v31.16b, w7 1: // height loop subs w9, w9, #2 - ld1 {v0.16b}, [x2], x3 - ld1 {v1.16b}, [x4], x5 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x4], x5 weight_\ext v16.8h, v0.8b, v1.8b weight_\ext v17.8h, v0.16b, v1.16b, 2 - ld1 {v2.16b}, [x2], x3 - ld1 {v3.16b}, [x4], x5 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x4], x5 weight_\ext v18.8h, v2.8b, v3.8b weight_\ext v19.8h, v2.16b, v3.16b, 2 sqrshrun v0.8b, v16.8h, #6 sqrshrun v1.8b, v18.8h, #6 sqrshrun2 v0.16b, v17.8h, #6 sqrshrun2 v1.16b, v19.8h, #6 - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x0], x1 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 b.gt 1b ret endfunc @@ -234,60 +440,46 @@ AVG_WEIGHT add_add AVG_WEIGHT add_sub AVG_WEIGHT sub_add -function pixel_avg_w4_neon -1: subs w9, w9, #2 - ld1 {v0.s}[0], [x2], x3 - ld1 {v2.s}[0], [x4], x5 - urhadd v0.8b, v0.8b, v2.8b - 
ld1 {v1.s}[0], [x2], x3 - ld1 {v3.s}[0], [x4], x5 - urhadd v1.8b, v1.8b, v3.8b - st1 {v0.s}[0], [x0], x1 - st1 {v1.s}[0], [x0], x1 - b.gt 1b - ret -endfunc - function pixel_avg_w8_neon 1: subs w9, w9, #4 - ld1 {v0.8b}, [x2], x3 - ld1 {v1.8b}, [x4], x5 - ld1 {v2.8b}, [x2], x3 + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x4], x5 + ld1 {v2.8b}, [x2], x3 urhadd v0.8b, v0.8b, v1.8b - ld1 {v3.8b}, [x4], x5 - st1 {v0.8b}, [x0], x1 - ld1 {v4.8b}, [x2], x3 + ld1 {v3.8b}, [x4], x5 + st1 {v0.8b}, [x0], x1 + ld1 {v4.8b}, [x2], x3 urhadd v1.8b, v2.8b, v3.8b - ld1 {v5.8b}, [x4], x5 - st1 {v1.8b}, [x0], x1 - ld1 {v6.8b}, [x2], x3 - ld1 {v7.8b}, [x4], x5 + ld1 {v5.8b}, [x4], x5 + st1 {v1.8b}, [x0], x1 + ld1 {v6.8b}, [x2], x3 + ld1 {v7.8b}, [x4], x5 urhadd v2.8b, v4.8b, v5.8b urhadd v3.8b, v6.8b, v7.8b - st1 {v2.8b}, [x0], x1 - st1 {v3.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 b.gt 1b ret endfunc function pixel_avg_w16_neon 1: subs w9, w9, #4 - ld1 {v0.16b}, [x2], x3 - ld1 {v1.16b}, [x4], x5 - ld1 {v2.16b}, [x2], x3 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x4], x5 + ld1 {v2.16b}, [x2], x3 urhadd v0.16b, v0.16b, v1.16b - ld1 {v3.16b}, [x4], x5 - st1 {v0.16b}, [x0], x1 - ld1 {v4.16b}, [x2], x3 + ld1 {v3.16b}, [x4], x5 + st1 {v0.16b}, [x0], x1 + ld1 {v4.16b}, [x2], x3 urhadd v1.16b, v2.16b, v3.16b - ld1 {v5.16b}, [x4], x5 - st1 {v1.16b}, [x0], x1 - ld1 {v6.16b}, [x2], x3 - ld1 {v7.16b}, [x4], x5 + ld1 {v5.16b}, [x4], x5 + st1 {v1.16b}, [x0], x1 + ld1 {v6.16b}, [x2], x3 + ld1 {v7.16b}, [x4], x5 urhadd v2.16b, v4.16b, v5.16b urhadd v3.16b, v6.16b, v7.16b - st1 {v2.16b}, [x0], x1 - st1 {v3.16b}, [x0], x1 + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 b.gt 1b ret endfunc @@ -295,14 +487,14 @@ endfunc function pixel_avg2_w4_neon, export=1 1: subs w5, w5, #2 - ld1 {v0.s}[0], [x2], x3 - ld1 {v2.s}[0], [x4], x3 + ld1 {v0.s}[0], [x2], x3 + ld1 {v2.s}[0], [x4], x3 urhadd v0.8b, v0.8b, v2.8b - ld1 {v1.s}[0], [x2], x3 - ld1 {v3.s}[0], [x4], x3 + ld1 {v1.s}[0], [x2], x3 + ld1 {v3.s}[0], [x4], x3 urhadd v1.8b, v1.8b, v3.8b - st1 {v0.s}[0], [x0], x1 - st1 {v1.s}[0], [x0], x1 + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 b.gt 1b ret endfunc @@ -310,14 +502,14 @@ endfunc function pixel_avg2_w8_neon, export=1 1: subs w5, w5, #2 - ld1 {v0.8b}, [x2], x3 - ld1 {v2.8b}, [x4], x3 + ld1 {v0.8b}, [x2], x3 + ld1 {v2.8b}, [x4], x3 urhadd v0.8b, v0.8b, v2.8b - ld1 {v1.8b}, [x2], x3 - ld1 {v3.8b}, [x4], x3 + ld1 {v1.8b}, [x2], x3 + ld1 {v3.8b}, [x4], x3 urhadd v1.8b, v1.8b, v3.8b - st1 {v0.8b}, [x0], x1 - st1 {v1.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x0], x1 b.gt 1b ret endfunc @@ -325,14 +517,14 @@ endfunc function pixel_avg2_w16_neon, export=1 1: subs w5, w5, #2 - ld1 {v0.16b}, [x2], x3 - ld1 {v2.16b}, [x4], x3 + ld1 {v0.16b}, [x2], x3 + ld1 {v2.16b}, [x4], x3 urhadd v0.16b, v0.16b, v2.16b - ld1 {v1.16b}, [x2], x3 - ld1 {v3.16b}, [x4], x3 + ld1 {v1.16b}, [x2], x3 + ld1 {v3.16b}, [x4], x3 urhadd v1.16b, v1.16b, v3.16b - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x0], x1 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 b.gt 1b ret endfunc @@ -341,18 +533,18 @@ function pixel_avg2_w20_neon, export=1 sub x1, x1, #16 1: subs w5, w5, #2 - ld1 {v0.16b,v1.16b}, [x2], x3 - ld1 {v2.16b,v3.16b}, [x4], x3 + ld1 {v0.16b,v1.16b}, [x2], x3 + ld1 {v2.16b,v3.16b}, [x4], x3 urhadd v0.16b, v0.16b, v2.16b urhadd v1.8b, v1.8b, v3.8b - ld1 {v4.16b,v5.16b}, [x2], x3 - ld1 {v6.16b,v7.16b}, [x4], x3 + ld1 {v4.16b,v5.16b}, [x2], x3 + ld1 {v6.16b,v7.16b}, [x4], x3 urhadd v4.16b, v4.16b, v6.16b urhadd v5.8b, v5.8b, v7.8b - 
st1 {v0.16b}, [x0], #16 - st1 {v1.s}[0], [x0], x1 - st1 {v4.16b}, [x0], #16 - st1 {v5.s}[0], [x0], x1 + st1 {v0.16b}, [x0], #16 + st1 {v1.s}[0], [x0], x1 + st1 {v4.16b}, [x0], #16 + st1 {v5.s}[0], [x0], x1 b.gt 1b ret endfunc @@ -378,8 +570,8 @@ function mc_weight_w20_neon, export=1 sub x1, x1, #16 1: subs w9, w9, #2 - ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3 - ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3 + ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3 + ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3 umull v22.8h, v16.8b, v0.8b umull v23.8h, v17.8b, v0.8b zip1 v18.2s, v18.2s, v21.2s @@ -401,10 +593,10 @@ function mc_weight_w20_neon, export=1 sqxtun v6.8b, v24.8h sqxtun v5.8b, v25.8h sqxtun2 v5.16b, v26.8h - st1 {v4.16b}, [x0], #16 - st1 {v6.s}[0], [x0], x1 - st1 {v5.16b}, [x0], #16 - st1 {v6.s}[1], [x0], x1 + st1 {v4.16b}, [x0], #16 + st1 {v6.s}[0], [x0], x1 + st1 {v5.16b}, [x0], #16 + st1 {v6.s}[1], [x0], x1 b.gt 1b ret endfunc @@ -414,8 +606,8 @@ function mc_weight_w16_neon, export=1 weight16_loop: 1: subs w9, w9, #2 - ld1 {v4.16b}, [x2], x3 - ld1 {v5.16b}, [x2], x3 + ld1 {v4.16b}, [x2], x3 + ld1 {v5.16b}, [x2], x3 umull v22.8h, v4.8b, v0.8b umull2 v23.8h, v4.16b, v0.16b umull v24.8h, v5.8b, v0.8b @@ -432,8 +624,8 @@ weight16_loop: sqxtun2 v4.16b, v23.8h sqxtun v5.8b, v24.8h sqxtun2 v5.16b, v25.8h - st1 {v4.16b}, [x0], x1 - st1 {v5.16b}, [x0], x1 + st1 {v4.16b}, [x0], x1 + st1 {v5.16b}, [x0], x1 b.gt 1b ret endfunc @@ -442,8 +634,8 @@ function mc_weight_w8_neon, export=1 weight_prologue full 1: subs w9, w9, #2 - ld1 {v16.8b}, [x2], x3 - ld1 {v17.8b}, [x2], x3 + ld1 {v16.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 umull v4.8h, v16.8b, v0.8b umull v5.8h, v17.8b, v0.8b srshl v4.8h, v4.8h, v2.8h @@ -452,8 +644,8 @@ function mc_weight_w8_neon, export=1 add v5.8h, v5.8h, v1.8h sqxtun v16.8b, v4.8h sqxtun v17.8b, v5.8h - st1 {v16.8b}, [x0], x1 - st1 {v17.8b}, [x0], x1 + st1 {v16.8b}, [x0], x1 + st1 {v17.8b}, [x0], x1 b.gt 1b ret endfunc @@ -462,14 +654,14 @@ function mc_weight_w4_neon, export=1 weight_prologue full 1: subs w9, w9, #2 - ld1 {v16.s}[0], [x2], x3 - ld1 {v16.s}[1], [x2], x3 + ld1 {v16.s}[0], [x2], x3 + ld1 {v16.s}[1], [x2], x3 umull v4.8h, v16.8b, v0.8b srshl v4.8h, v4.8h, v2.8h add v4.8h, v4.8h, v1.8h sqxtun v16.8b, v4.8h - st1 {v16.s}[0], [x0], x1 - st1 {v16.s}[1], [x0], x1 + st1 {v16.s}[0], [x0], x1 + st1 {v16.s}[1], [x0], x1 b.gt 1b ret endfunc @@ -479,10 +671,10 @@ function mc_weight_w20_nodenom_neon, export=1 sub x1, x1, #16 1: subs w9, w9, #2 - ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3 + ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3 mov v27.16b, v1.16b mov v28.16b, v1.16b - ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3 + ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3 mov v31.16b, v1.16b mov v29.16b, v1.16b mov v30.16b, v1.16b @@ -497,10 +689,10 @@ function mc_weight_w20_nodenom_neon, export=1 sqxtun v5.8b, v29.8h sqxtun2 v5.16b, v30.8h sqxtun v6.8b, v31.8h - st1 {v4.16b}, [x0], #16 - st1 {v6.s}[0], [x0], x1 - st1 {v5.16b}, [x0], #16 - st1 {v6.s}[1], [x0], x1 + st1 {v4.16b}, [x0], #16 + st1 {v6.s}[0], [x0], x1 + st1 {v5.16b}, [x0], #16 + st1 {v6.s}[1], [x0], x1 b.gt 1b ret endfunc @@ -509,10 +701,10 @@ function mc_weight_w16_nodenom_neon, export=1 weight_prologue nodenom 1: subs w9, w9, #2 - ld1 {v6.16b}, [x2], x3 + ld1 {v6.16b}, [x2], x3 mov v27.16b, v1.16b mov v28.16b, v1.16b - ld1 {v7.16b}, [x2], x3 + ld1 {v7.16b}, [x2], x3 mov v29.16b, v1.16b mov v30.16b, v1.16b umlal v27.8h, v6.8b, v0.8b @@ -523,8 +715,8 @@ function mc_weight_w16_nodenom_neon, export=1 sqxtun2 v4.16b, v28.8h sqxtun v5.8b, v29.8h sqxtun2 v5.16b, v30.8h - st1 {v4.16b}, 
[x0], x1 - st1 {v5.16b}, [x0], x1 + st1 {v4.16b}, [x0], x1 + st1 {v5.16b}, [x0], x1 b.gt 1b ret endfunc @@ -533,16 +725,16 @@ function mc_weight_w8_nodenom_neon, export=1 weight_prologue nodenom 1: subs w9, w9, #2 - ld1 {v16.8b}, [x2], x3 + ld1 {v16.8b}, [x2], x3 mov v27.16b, v1.16b - ld1 {v17.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 mov v29.16b, v1.16b umlal v27.8h, v16.8b, v0.8b umlal v29.8h, v17.8b, v0.8b sqxtun v4.8b, v27.8h sqxtun v5.8b, v29.8h - st1 {v4.8b}, [x0], x1 - st1 {v5.8b}, [x0], x1 + st1 {v4.8b}, [x0], x1 + st1 {v5.8b}, [x0], x1 b.gt 1b ret endfunc @@ -551,13 +743,13 @@ function mc_weight_w4_nodenom_neon, export=1 weight_prologue nodenom 1: subs w9, w9, #2 - ld1 {v16.s}[0], [x2], x3 - ld1 {v16.s}[1], [x2], x3 + ld1 {v16.s}[0], [x2], x3 + ld1 {v16.s}[1], [x2], x3 mov v27.16b, v1.16b umlal v27.8h, v16.8b, v0.8b sqxtun v4.8b, v27.8h - st1 {v4.s}[0], [x0], x1 - st1 {v4.s}[1], [x0], x1 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 b.gt 1b ret endfunc @@ -573,17 +765,17 @@ function mc_weight_w20_\name\()_neon, export=1 1: subs w5, w5, #2 ldr s18, [x2, #16] - ld1 {v16.16b}, [x2], x3 + ld1 {v16.16b}, [x2], x3 ldr s19, [x2, #16] - ld1 {v17.16b}, [x2], x3 + ld1 {v17.16b}, [x2], x3 \op v18.8b, v18.8b, v1.8b \op v16.16b, v16.16b, v1.16b \op v19.8b, v19.8b, v1.8b \op v17.16b, v17.16b, v1.16b str s18, [x0, #16] - st1 {v16.16b}, [x0], x1 + st1 {v16.16b}, [x0], x1 str s19, [x0, #16] - st1 {v17.16b}, [x0], x1 + st1 {v17.16b}, [x0], x1 b.gt 1b ret endfunc @@ -592,12 +784,12 @@ function mc_weight_w16_\name\()_neon, export=1 weight_simple_prologue 1: subs w5, w5, #2 - ld1 {v16.16b}, [x2], x3 - ld1 {v17.16b}, [x2], x3 + ld1 {v16.16b}, [x2], x3 + ld1 {v17.16b}, [x2], x3 \op v16.16b, v16.16b, v1.16b \op v17.16b, v17.16b, v1.16b - st1 {v16.16b}, [x0], x1 - st1 {v17.16b}, [x0], x1 + st1 {v16.16b}, [x0], x1 + st1 {v17.16b}, [x0], x1 b.gt 1b ret endfunc @@ -606,12 +798,12 @@ function mc_weight_w8_\name\()_neon, export=1 weight_simple_prologue 1: subs w5, w5, #2 - ld1 {v16.8b}, [x2], x3 - ld1 {v17.8b}, [x2], x3 + ld1 {v16.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 \op v16.8b, v16.8b, v1.8b \op v17.8b, v17.8b, v1.8b - st1 {v16.8b}, [x0], x1 - st1 {v17.8b}, [x0], x1 + st1 {v16.8b}, [x0], x1 + st1 {v17.8b}, [x0], x1 b.gt 1b ret endfunc @@ -620,11 +812,11 @@ function mc_weight_w4_\name\()_neon, export=1 weight_simple_prologue 1: subs w5, w5, #2 - ld1 {v16.s}[0], [x2], x3 - ld1 {v16.s}[1], [x2], x3 + ld1 {v16.s}[0], [x2], x3 + ld1 {v16.s}[1], [x2], x3 \op v16.8b, v16.8b, v1.8b - st1 {v16.s}[0], [x0], x1 - st1 {v16.s}[1], [x0], x1 + st1 {v16.s}[0], [x0], x1 + st1 {v16.s}[1], [x0], x1 b.gt 1b ret endfunc @@ -638,42 +830,42 @@ weight_simple offsetsub, uqsub function mc_copy_w4_neon, export=1 1: subs w4, w4, #4 - ld1 {v0.s}[0], [x2], x3 - ld1 {v1.s}[0], [x2], x3 - ld1 {v2.s}[0], [x2], x3 - ld1 {v3.s}[0], [x2], x3 - st1 {v0.s}[0], [x0], x1 - st1 {v1.s}[0], [x0], x1 - st1 {v2.s}[0], [x0], x1 - st1 {v3.s}[0], [x0], x1 + ld1 {v0.s}[0], [x2], x3 + ld1 {v1.s}[0], [x2], x3 + ld1 {v2.s}[0], [x2], x3 + ld1 {v3.s}[0], [x2], x3 + st1 {v0.s}[0], [x0], x1 + st1 {v1.s}[0], [x0], x1 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 b.gt 1b ret endfunc function mc_copy_w8_neon, export=1 1: subs w4, w4, #4 - ld1 {v0.8b}, [x2], x3 - ld1 {v1.8b}, [x2], x3 - ld1 {v2.8b}, [x2], x3 - ld1 {v3.8b}, [x2], x3 - st1 {v0.8b}, [x0], x1 - st1 {v1.8b}, [x0], x1 - st1 {v2.8b}, [x0], x1 - st1 {v3.8b}, [x0], x1 + ld1 {v0.8b}, [x2], x3 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 + st1 {v0.8b}, [x0], x1 + st1 
{v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 b.gt 1b ret endfunc function mc_copy_w16_neon, export=1 1: subs w4, w4, #4 - ld1 {v0.16b}, [x2], x3 - ld1 {v1.16b}, [x2], x3 - ld1 {v2.16b}, [x2], x3 - ld1 {v3.16b}, [x2], x3 - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x0], x1 - st1 {v2.16b}, [x0], x1 - st1 {v3.16b}, [x0], x1 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x2], x3 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x2], x3 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 b.gt 1b ret endfunc @@ -725,14 +917,14 @@ function mc_chroma_w\width\()_neon CHROMA_MC_START b.eq 2f - ld2 {v28.8b,v29.8b}, [x3], x4 + ld2 {v28.8b,v29.8b}, [x3], x4 dup v0.8b, w9 // cA dup v1.8b, w10 // cB ext v6.8b, v28.8b, v6.8b, #1 ext v7.8b, v29.8b, v7.8b, #1 - ld2 {v30.8b,v31.8b}, [x3], x4 + ld2 {v30.8b,v31.8b}, [x3], x4 dup v2.8b, w11 // cC dup v3.8b, w12 // cD @@ -753,7 +945,7 @@ function mc_chroma_w\width\()_neon umull v17.8h, v5.8b, v0.8b umlal v17.8h, v21.8b, v2.8b - ld2 {v28.8b,v29.8b}, [x3], x4 + ld2 {v28.8b,v29.8b}, [x3], x4 transpose v24.2d, v25.2d, v16.2d, v17.2d ext v6.8b, v28.8b, v6.8b, #1 @@ -769,7 +961,7 @@ function mc_chroma_w\width\()_neon umull v19.8h, v21.8b, v0.8b umlal v19.8h, v5.8b, v2.8b - ld2 {v30.8b,v31.8b}, [x3], x4 + ld2 {v30.8b,v31.8b}, [x3], x4 transpose v26.2d, v27.2d, v18.2d, v19.2d ext v22.8b, v30.8b, v22.8b, #1 @@ -785,10 +977,10 @@ function mc_chroma_w\width\()_neon //pld [x3] //pld [x3, x4] - st1 {v16.\vsize}[0], [x0], x2 - st1 {v16.\vsize}[idx2], [x1], x2 - st1 {v17.\vsize}[0], [x0], x2 - st1 {v17.\vsize}[idx2], [x1], x2 + st1 {v16.\vsize}[0], [x0], x2 + st1 {v16.\vsize}[idx2], [x1], x2 + st1 {v17.\vsize}[0], [x0], x2 + st1 {v17.\vsize}[idx2], [x1], x2 b.gt 1b ret @@ -800,15 +992,15 @@ function mc_chroma_w\width\()_neon b.eq 4f - ld1 {v4.8b}, [x3], x4 - ld1 {v6.8b}, [x3], x4 + ld1 {v4.8b}, [x3], x4 + ld1 {v6.8b}, [x3], x4 3: // vertical interpolation loop subs w15, w15, #2 umull v16.8h, v4.8b, v0.8b - ld1 {v4.8b}, [x3], x4 + ld1 {v4.8b}, [x3], x4 umlal v16.8h, v6.8b, v1.8b umull v17.8h, v6.8b, v0.8b - ld1 {v6.8b}, [x3], x4 + ld1 {v6.8b}, [x3], x4 umlal v17.8h, v4.8b, v1.8b rshrn v20.8b, v16.8h, #6 // uvuvuvuv @@ -820,17 +1012,17 @@ function mc_chroma_w\width\()_neon //pld [x3] //pld [x3, x4] - st1 {v16.\vsize}[0], [x0], x2 - st1 {v16.\vsize}[idx2], [x0], x2 - st1 {v17.\vsize}[0], [x1], x2 - st1 {v17.\vsize}[idx2], [x1], x2 + st1 {v16.\vsize}[0], [x0], x2 + st1 {v16.\vsize}[idx2], [x0], x2 + st1 {v17.\vsize}[0], [x1], x2 + st1 {v17.\vsize}[idx2], [x1], x2 b.gt 3b ret 4: // dy is 0 - ld1 {v4.8b,v5.8b}, [x3], x4 - ld1 {v6.8b,v7.8b}, [x3], x4 + ld1 {v4.8b,v5.8b}, [x3], x4 + ld1 {v6.8b,v7.8b}, [x3], x4 ext v5.8b, v4.8b, v5.8b, #2 ext v7.8b, v6.8b, v7.8b, #2 @@ -841,8 +1033,8 @@ function mc_chroma_w\width\()_neon umull v17.8h, v6.8b, v0.8b umlal v17.8h, v7.8b, v1.8b - ld1 {v4.8b,v5.8b}, [x3], x4 - ld1 {v6.8b,v7.8b}, [x3], x4 + ld1 {v4.8b,v5.8b}, [x3], x4 + ld1 {v6.8b,v7.8b}, [x3], x4 rshrn v20.8b, v16.8h, #6 rshrn v21.8b, v17.8h, #6 ext v5.8b, v4.8b, v5.8b, #2 @@ -853,24 +1045,24 @@ function mc_chroma_w\width\()_neon //pld [x3] //pld [x3, x4] - st1 {v16.\vsize}[0], [x0], x2 - st1 {v16.\vsize}[idx2], [x0], x2 - st1 {v17.\vsize}[0], [x1], x2 - st1 {v17.\vsize}[idx2], [x1], x2 + st1 {v16.\vsize}[0], [x0], x2 + st1 {v16.\vsize}[idx2], [x0], x2 + st1 {v17.\vsize}[0], [x1], x2 + st1 {v17.\vsize}[idx2], [x1], x2 b.gt 5b ret endfunc .endm - CHROMA_MC 2, h - CHROMA_MC 4, s + CHROMA_MC 2, h + CHROMA_MC 4, s function mc_chroma_w8_neon 
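+// In scalar terms, the 2xN/4xN/8xN chroma MC in this file is a bilinear
+// interpolation of interleaved NV12 chroma using the cA..cD weights computed
+// in CHROMA_MC_START. A rough C equivalent (dstu/dstv/srcp and the loop shape
+// are illustrative, not the exact reference code):
+//
+//     int dx = mvx & 7, dy = mvy & 7;
+//     int cA = (8-dx)*(8-dy), cB = dx*(8-dy), cC = (8-dx)*dy, cD = dx*dy;
+//     for( int y = 0; y < height; y++, dstu += i_dst, dstv += i_dst, srcp += i_src )
+//         for( int x = 0; x < width; x++ )
+//         {
+//             const pixel *s2 = srcp + i_src;   // row below
+//             dstu[x] = ( cA*srcp[2*x]   + cB*srcp[2*x+2] +
+//                         cC*s2[2*x]     + cD*s2[2*x+2]   + 32 ) >> 6;
+//             dstv[x] = ( cA*srcp[2*x+1] + cB*srcp[2*x+3] +
+//                         cC*s2[2*x+1]   + cD*s2[2*x+3]   + 32 ) >> 6;
+//         }
+//
+// When dx or dy is zero the kernel collapses to a two-tap filter, which is
+// what the separate vertical (3:) and horizontal (5:) loops handle.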
CHROMA_MC_START b.eq 2f - ld2 {v4.16b,v5.16b}, [x3], x4 - ld2 {v20.16b,v21.16b}, [x3], x4 + ld2 {v4.16b,v5.16b}, [x3], x4 + ld2 {v20.16b,v21.16b}, [x3], x4 dup v0.8b, w9 // cA dup v1.8b, w10 // cB @@ -895,7 +1087,7 @@ function mc_chroma_w8_neon umlal v17.8h, v21.8b, v2.8b umlal v17.8h, v23.8b, v3.8b - ld2 {v4.16b,v5.16b}, [x3], x4 + ld2 {v4.16b,v5.16b}, [x3], x4 ext v6.16b, v4.16b, v4.16b, #1 ext v7.16b, v5.16b, v5.16b, #1 @@ -910,7 +1102,7 @@ function mc_chroma_w8_neon umlal v19.8h, v5.8b, v2.8b umlal v19.8h, v7.8b, v3.8b - ld2 {v20.16b,v21.16b}, [x3], x4 + ld2 {v20.16b,v21.16b}, [x3], x4 rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 @@ -923,10 +1115,10 @@ function mc_chroma_w8_neon //pld [x3] //pld [x3, x4] - st1 {v16.8b}, [x0], x2 - st1 {v17.8b}, [x1], x2 - st1 {v18.8b}, [x0], x2 - st1 {v19.8b}, [x1], x2 + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x1], x2 + st1 {v18.8b}, [x0], x2 + st1 {v19.8b}, [x1], x2 b.gt 1b ret @@ -938,8 +1130,8 @@ function mc_chroma_w8_neon b.eq 4f - ld2 {v4.8b,v5.8b}, [x3], x4 - ld2 {v6.8b,v7.8b}, [x3], x4 + ld2 {v4.8b,v5.8b}, [x3], x4 + ld2 {v6.8b,v7.8b}, [x3], x4 3: // vertical interpolation loop subs w15, w15, #2 umull v16.8h, v4.8b, v0.8b //U @@ -947,14 +1139,14 @@ function mc_chroma_w8_neon umull v17.8h, v5.8b, v0.8b //V umlal v17.8h, v7.8b, v1.8b - ld2 {v4.8b,v5.8b}, [x3], x4 + ld2 {v4.8b,v5.8b}, [x3], x4 umull v18.8h, v6.8b, v0.8b umlal v18.8h, v4.8b, v1.8b umull v19.8h, v7.8b, v0.8b umlal v19.8h, v5.8b, v1.8b - ld2 {v6.8b,v7.8b}, [x3], x4 + ld2 {v6.8b,v7.8b}, [x3], x4 rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 @@ -964,18 +1156,18 @@ function mc_chroma_w8_neon //pld [x3] //pld [x3, x4] - st1 {v16.8b}, [x0], x2 - st1 {v17.8b}, [x1], x2 - st1 {v18.8b}, [x0], x2 - st1 {v19.8b}, [x1], x2 + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x1], x2 + st1 {v18.8b}, [x0], x2 + st1 {v19.8b}, [x1], x2 b.gt 3b ret 4: // dy is 0 - ld2 {v4.16b,v5.16b}, [x3], x4 + ld2 {v4.16b,v5.16b}, [x3], x4 ext v6.16b, v4.16b, v4.16b, #1 ext v7.16b, v5.16b, v5.16b, #1 - ld2 {v20.16b,v21.16b}, [x3], x4 + ld2 {v20.16b,v21.16b}, [x3], x4 ext v22.16b, v20.16b, v20.16b, #1 ext v23.16b, v21.16b, v21.16b, #1 5: // horizontal interpolation loop @@ -985,14 +1177,14 @@ function mc_chroma_w8_neon umull v17.8h, v5.8b, v0.8b //V umlal v17.8h, v7.8b, v1.8b - ld2 {v4.16b,v5.16b}, [x3], x4 + ld2 {v4.16b,v5.16b}, [x3], x4 umull v18.8h, v20.8b, v0.8b umlal v18.8h, v22.8b, v1.8b umull v19.8h, v21.8b, v0.8b umlal v19.8h, v23.8b, v1.8b - ld2 {v20.16b,v21.16b}, [x3], x4 + ld2 {v20.16b,v21.16b}, [x3], x4 rshrn v16.8b, v16.8h, #6 rshrn v17.8b, v17.8h, #6 @@ -1007,10 +1199,10 @@ function mc_chroma_w8_neon //pld [x3] //pld [x3, x4] - st1 {v16.8b}, [x0], x2 - st1 {v17.8b}, [x1], x2 - st1 {v18.8b}, [x0], x2 - st1 {v19.8b}, [x1], x2 + st1 {v16.8b}, [x0], x2 + st1 {v17.8b}, [x1], x2 + st1 {v18.8b}, [x0], x2 + st1 {v19.8b}, [x1], x2 b.gt 5b ret @@ -1035,16 +1227,16 @@ function hpel_filter_neon, export=1 add x7, x3, #16 // src pointer next 16b for horiz filter mov x5, x15 // restore width sub x3, x3, x4, lsl #1 // src - 2*stride - ld1 {v28.16b}, [x7], #16 // src[16:31] + ld1 {v28.16b}, [x7], #16 // src[16:31] add x9, x3, x5 // holds src - 2*stride + width - ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] - ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] - ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] - ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] - ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] - ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] + ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] + ld1 {v17.16b}, [x3], x4 // 
src-1*stride[0:15] + ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] + ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] + ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] + ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] ext v22.16b, v7.16b, v18.16b, #14 uaddl v1.8h, v16.8b, v21.8b @@ -1078,22 +1270,22 @@ function hpel_filter_neon, export=1 sqrshrun2 v4.16b, v5.8h, #5 umlsl2 v2.8h, v17.16b, v30.16b - ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] + ld1 {v16.16b}, [x3], x4 // src-2*stride[0:15] umlal2 v2.8h, v18.16b, v31.16b - ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] + ld1 {v17.16b}, [x3], x4 // src-1*stride[0:15] umlal2 v2.8h, v19.16b, v31.16b - ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] + ld1 {v18.16b}, [x3], x4 // src+0*stride[0:15] umlsl2 v2.8h, v20.16b, v30.16b - ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] - st1 {v4.16b}, [x0], #16 + ld1 {v19.16b}, [x3], x4 // src+1*stride[0:15] + st1 {v4.16b}, [x0], #16 sqrshrun2 v6.16b, v2.8h, #5 - ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] - ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] + ld1 {v20.16b}, [x3], x4 // src+2*stride[0:15] + ld1 {v21.16b}, [x3], x4 // src+3*stride[0:15] ext v22.16b, v0.16b, v1.16b, #12 ext v26.16b, v1.16b, v2.16b, #6 ext v23.16b, v0.16b, v1.16b, #14 - st1 {v6.16b}, [x1], #16 + st1 {v6.16b}, [x1], #16 uaddl v3.8h, v16.8b, v21.8b ext v25.16b, v1.16b, v2.16b, #4 umlsl v3.8h, v17.8b, v30.8b @@ -1132,7 +1324,7 @@ function hpel_filter_neon, export=1 add v22.8h, v22.8h, v24.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 sqrshrun v4.8b, v4.8h, #6 - ld1 {v28.16b}, [x7], #16 // src[16:31] + ld1 {v28.16b}, [x7], #16 // src[16:31] mov v0.16b, v2.16b ext v23.16b, v7.16b, v18.16b, #15 sqrshrun2 v4.16b, v22.8h, #6 @@ -1142,7 +1334,7 @@ function hpel_filter_neon, export=1 ext v25.16b, v18.16b, v28.16b, #2 ext v26.16b, v18.16b, v28.16b, #3 - st1 {v4.16b}, [x2], #16 + st1 {v4.16b}, [x2], #16 b.gt 2b subs w6, w6, #1 @@ -1169,9 +1361,9 @@ function frame_init_lowres_core_neon, export=1 add x12, x0, x5 // src1 = src0 + src_stride add x13, x0, x5, lsl #1 // src2 = src1 + src_stride - ld2 {v0.16b,v1.16b}, [x11], #32 - ld2 {v2.16b,v3.16b}, [x12], #32 - ld2 {v4.16b,v5.16b}, [x13], #32 + ld2 {v0.16b,v1.16b}, [x11], #32 + ld2 {v2.16b,v3.16b}, [x12], #32 + ld2 {v4.16b,v5.16b}, [x13], #32 urhadd v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x] urhadd v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x] @@ -1180,9 +1372,9 @@ function frame_init_lowres_core_neon, export=1 urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1] urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1] - ld2 {v0.16b,v1.16b}, [x11], #32 - ld2 {v2.16b,v3.16b}, [x12], #32 - ld2 {v4.16b,v5.16b}, [x13], #32 + ld2 {v0.16b,v1.16b}, [x11], #32 + ld2 {v2.16b,v3.16b}, [x12], #32 + ld2 {v4.16b,v5.16b}, [x13], #32 urhadd v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x] urhadd v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x] ext v24.16b, v20.16b, v30.16b, #1 // s0[2x+2] + s1[2x+2] @@ -1193,19 +1385,19 @@ function frame_init_lowres_core_neon, export=1 urhadd v17.16b, v21.16b, v24.16b urhadd v19.16b, v23.16b, v25.16b - st1 {v16.16b}, [x1], #16 - st1 {v18.16b}, [x3], #16 - st1 {v17.16b}, [x2], #16 - st1 {v19.16b}, [x4], #16 + st1 {v16.16b}, [x1], #16 + st1 {v18.16b}, [x3], #16 + st1 {v17.16b}, [x2], #16 + st1 {v19.16b}, [x4], #16 b.le 3f subs w9, w9, #16 urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1] urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1] - ld2 {v0.16b,v1.16b}, [x11], #32 - ld2 {v2.16b,v3.16b}, [x12], #32 - ld2 {v4.16b,v5.16b}, [x13], #32 + ld2 {v0.16b,v1.16b}, [x11], #32 + ld2 
{v2.16b,v3.16b}, [x12], #32 + ld2 {v4.16b,v5.16b}, [x13], #32 urhadd v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x] urhadd v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x] ext v24.16b, v30.16b, v20.16b, #1 // s0[2x+2] + s1[2x+2] @@ -1216,10 +1408,10 @@ function frame_init_lowres_core_neon, export=1 urhadd v17.16b, v21.16b, v24.16b urhadd v19.16b, v23.16b, v25.16b - st1 {v16.16b}, [x1], #16 - st1 {v18.16b}, [x3], #16 - st1 {v17.16b}, [x2], #16 - st1 {v19.16b}, [x4], #16 + st1 {v16.16b}, [x1], #16 + st1 {v18.16b}, [x3], #16 + st1 {v17.16b}, [x2], #16 + st1 {v19.16b}, [x4], #16 b.gt 2b 3: subs w8, w8, #1 @@ -1241,13 +1433,13 @@ endfunc function load_deinterleave_chroma_fdec_neon, export=1 mov x4, #FDEC_STRIDE/2 load_deinterleave_chroma: - ld2 {v0.8b,v1.8b}, [x1], x2 - ld2 {v2.8b,v3.8b}, [x1], x2 + ld2 {v0.8b,v1.8b}, [x1], x2 + ld2 {v2.8b,v3.8b}, [x1], x2 subs w3, w3, #2 - st1 {v0.8b}, [x0], x4 - st1 {v1.8b}, [x0], x4 - st1 {v2.8b}, [x0], x4 - st1 {v3.8b}, [x0], x4 + st1 {v0.8b}, [x0], x4 + st1 {v1.8b}, [x0], x4 + st1 {v2.8b}, [x0], x4 + st1 {v3.8b}, [x0], x4 b.gt load_deinterleave_chroma ret @@ -1317,10 +1509,10 @@ function plane_copy_deinterleave_neon, export=1 sub x3, x3, x9 sub x5, x5, x9, lsl #1 1: - ld2 {v0.16b,v1.16b}, [x4], #32 + ld2 {v0.16b,v1.16b}, [x4], #32 subs w9, w9, #16 - st1 {v0.16b}, [x0], #16 - st1 {v1.16b}, [x2], #16 + st1 {v0.16b}, [x0], #16 + st1 {v1.16b}, [x2], #16 b.gt 1b add x4, x4, x5 @@ -1328,16 +1520,16 @@ function plane_copy_deinterleave_neon, export=1 add x0, x0, x1 add x2, x2, x3 mov w9, w6 - b.gt 1b + b.gt 1b ret endfunc .macro deinterleave_rgb subs x11, x11, #8 - st1 {v0.8b}, [x0], #8 - st1 {v1.8b}, [x2], #8 - st1 {v2.8b}, [x4], #8 + st1 {v0.8b}, [x0], #8 + st1 {v1.8b}, [x2], #8 + st1 {v2.8b}, [x4], #8 b.gt 1b subs w10, w10, #1 @@ -1368,14 +1560,14 @@ function plane_copy_deinterleave_rgb_neon, export=1 sub x7, x7, x11, lsl #1 sub x7, x7, x11 1: - ld3 {v0.8b,v1.8b,v2.8b}, [x6], #24 + ld3 {v0.8b,v1.8b,v2.8b}, [x6], #24 deinterleave_rgb ret 4: sub x7, x7, x11, lsl #2 1: - ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32 + ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32 deinterleave_rgb ret @@ -1388,10 +1580,10 @@ function plane_copy_interleave_core_neon, export=1 sub x3, x3, x9 sub x5, x5, x9 1: - ld1 {v0.16b}, [x2], #16 - ld1 {v1.16b}, [x4], #16 + ld1 {v0.16b}, [x2], #16 + ld1 {v1.16b}, [x4], #16 subs w9, w9, #16 - st2 {v0.16b,v1.16b}, [x0], #32 + st2 {v0.16b,v1.16b}, [x0], #32 b.gt 1b subs w7, w7, #1 @@ -1405,17 +1597,17 @@ function plane_copy_interleave_core_neon, export=1 endfunc function store_interleave_chroma_neon, export=1 - mov x5, #FDEC_STRIDE + mov x5, #FDEC_STRIDE 1: - ld1 {v0.8b}, [x2], x5 - ld1 {v1.8b}, [x3], x5 - ld1 {v2.8b}, [x2], x5 - ld1 {v3.8b}, [x3], x5 + ld1 {v0.8b}, [x2], x5 + ld1 {v1.8b}, [x3], x5 + ld1 {v2.8b}, [x2], x5 + ld1 {v3.8b}, [x3], x5 subs w4, w4, #2 zip1 v4.16b, v0.16b, v1.16b zip1 v5.16b, v2.16b, v3.16b - st1 {v4.16b}, [x0], x1 - st1 {v5.16b}, [x0], x1 + st1 {v4.16b}, [x0], x1 + st1 {v5.16b}, [x0], x1 b.gt 1b ret @@ -1433,17 +1625,17 @@ endfunc function integral_init4h_neon, export=1 sub x3, x0, x2, lsl #1 - ld1 {v6.8b,v7.8b}, [x1], #16 + ld1 {v6.8b,v7.8b}, [x1], #16 1: subs x2, x2, #16 - ld1 {v5.8h}, [x3], #16 + ld1 {v5.8h}, [x3], #16 integral4h v6, v7 - ld1 {v6.8b}, [x1], #8 - ld1 {v5.8h}, [x3], #16 - st1 {v0.8h}, [x0], #16 + ld1 {v6.8b}, [x1], #8 + ld1 {v5.8h}, [x3], #16 + st1 {v0.8h}, [x0], #16 integral4h v7, v6 - ld1 {v7.8b}, [x1], #8 - st1 {v0.8h}, [x0], #16 + ld1 {v7.8b}, [x1], #8 + st1 {v0.8h}, [x0], #16 b.gt 1b ret endfunc @@ -1468,17 
+1660,17 @@ endfunc function integral_init8h_neon, export=1 sub x3, x0, x2, lsl #1 - ld1 {v16.8b,v17.8b}, [x1], #16 + ld1 {v16.8b,v17.8b}, [x1], #16 1: subs x2, x2, #16 - ld1 {v18.8h}, [x3], #16 + ld1 {v18.8h}, [x3], #16 integral8h v16, v17, v18 - ld1 {v16.8b}, [x1], #8 - ld1 {v18.8h}, [x3], #16 - st1 {v0.8h}, [x0], #16 + ld1 {v16.8b}, [x1], #8 + ld1 {v18.8h}, [x3], #16 + st1 {v0.8h}, [x0], #16 integral8h v17, v16, v18 - ld1 {v17.8b}, [x1], #8 - st1 {v0.8h}, [x0], #16 + ld1 {v17.8b}, [x1], #8 + st1 {v0.8h}, [x0], #16 b.gt 1b ret endfunc @@ -1488,11 +1680,11 @@ function integral_init4v_neon, export=1 add x4, x0, x2, lsl #3 add x8, x0, x2, lsl #4 sub x2, x2, #8 - ld1 {v20.8h,v21.8h,v22.8h}, [x3], #48 - ld1 {v16.8h,v17.8h,v18.8h}, [x8], #48 + ld1 {v20.8h,v21.8h,v22.8h}, [x3], #48 + ld1 {v16.8h,v17.8h,v18.8h}, [x8], #48 1: subs x2, x2, #16 - ld1 {v24.8h,v25.8h}, [x4], #32 + ld1 {v24.8h,v25.8h}, [x4], #32 ext v0.16b, v20.16b, v21.16b, #8 ext v1.16b, v21.16b, v22.16b, #8 ext v2.16b, v16.16b, v17.16b, #8 @@ -1503,16 +1695,16 @@ function integral_init4v_neon, export=1 add v1.8h, v1.8h, v21.8h add v2.8h, v2.8h, v16.8h add v3.8h, v3.8h, v17.8h - st1 {v24.8h}, [x1], #16 - st1 {v25.8h}, [x1], #16 + st1 {v24.8h}, [x1], #16 + st1 {v25.8h}, [x1], #16 mov v20.16b, v22.16b mov v16.16b, v18.16b sub v0.8h, v2.8h, v0.8h sub v1.8h, v3.8h, v1.8h - ld1 {v21.8h,v22.8h}, [x3], #32 - ld1 {v17.8h,v18.8h}, [x8], #32 - st1 {v0.8h}, [x0], #16 - st1 {v1.8h}, [x0], #16 + ld1 {v21.8h,v22.8h}, [x3], #32 + ld1 {v17.8h,v18.8h}, [x8], #32 + st1 {v0.8h}, [x0], #16 + st1 {v1.8h}, [x0], #16 b.gt 1b 2: ret @@ -1524,232 +1716,2065 @@ function integral_init8v_neon, export=1 ands x3, x1, #16 - 1 b.eq 1f subs x1, x1, #8 - ld1 {v0.8h}, [x0] - ld1 {v2.8h}, [x2], #16 + ld1 {v0.8h}, [x0] + ld1 {v2.8h}, [x2], #16 sub v4.8h, v2.8h, v0.8h - st1 {v4.8h}, [x0], #16 + st1 {v4.8h}, [x0], #16 b.le 2f 1: subs x1, x1, #16 - ld1 {v0.8h,v1.8h}, [x0] - ld1 {v2.8h,v3.8h}, [x2], #32 + ld1 {v0.8h,v1.8h}, [x0] + ld1 {v2.8h,v3.8h}, [x2], #32 sub v4.8h, v2.8h, v0.8h sub v5.8h, v3.8h, v1.8h - st1 {v4.8h}, [x0], #16 - st1 {v5.8h}, [x0], #16 + st1 {v4.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 b.gt 1b 2: ret endfunc -function mbtree_propagate_cost_neon, export=1 - ld1r {v5.4s}, [x5] -8: - subs w6, w6, #8 - ld1 {v1.8h}, [x1], #16 - ld1 {v2.8h}, [x2], #16 - ld1 {v3.8h}, [x3], #16 - ld1 {v4.8h}, [x4], #16 - bic v3.8h, #0xc0, lsl #8 - umin v3.8h, v2.8h, v3.8h - umull v20.4s, v2.4h, v4.4h // propagate_intra - umull2 v21.4s, v2.8h, v4.8h // propagate_intra - usubl v22.4s, v2.4h, v3.4h // propagate_num - usubl2 v23.4s, v2.8h, v3.8h // propagate_num - uxtl v26.4s, v2.4h // propagate_denom - uxtl2 v27.4s, v2.8h // propagate_denom - uxtl v24.4s, v1.4h - uxtl2 v25.4s, v1.8h - ucvtf v20.4s, v20.4s - ucvtf v21.4s, v21.4s - ucvtf v26.4s, v26.4s - ucvtf v27.4s, v27.4s - ucvtf v22.4s, v22.4s - ucvtf v23.4s, v23.4s - frecpe v28.4s, v26.4s - frecpe v29.4s, v27.4s - ucvtf v24.4s, v24.4s - ucvtf v25.4s, v25.4s - frecps v30.4s, v28.4s, v26.4s - frecps v31.4s, v29.4s, v27.4s - fmla v24.4s, v20.4s, v5.4s // propagate_amount - fmla v25.4s, v21.4s, v5.4s // propagate_amount - fmul v28.4s, v28.4s, v30.4s - fmul v29.4s, v29.4s, v31.4s - fmul v16.4s, v24.4s, v22.4s - fmul v17.4s, v25.4s, v23.4s - fmul v18.4s, v16.4s, v28.4s - fmul v19.4s, v17.4s, v29.4s - fcvtns v20.4s, v18.4s - fcvtns v21.4s, v19.4s - sqxtn v0.4h, v20.4s - sqxtn2 v0.8h, v21.4s +#else // BIT_DEPTH == 8 + +// void pixel_avg( pixel *dst, intptr_t dst_stride, +// pixel *src1, intptr_t src1_stride, +// pixel *src2, intptr_t 
src2_stride, int weight ); +.macro AVGH w h +function pixel_avg_\w\()x\h\()_neon, export=1 + mov w10, #64 + cmp w6, #32 + mov w9, #\h + b.eq pixel_avg_w\w\()_neon + subs w7, w10, w6 + b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64 + cmp w6, #0 + b.ge pixel_avg_weight_w\w\()_add_add_neon + b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0 +endfunc +.endm + +AVGH 4, 2 +AVGH 4, 4 +AVGH 4, 8 +AVGH 4, 16 +AVGH 8, 4 +AVGH 8, 8 +AVGH 8, 16 +AVGH 16, 8 +AVGH 16, 16 + +// 0 < weight < 64 +.macro load_weights_add_add + mov w6, w6 +.endm +.macro weight_add_add dst, s1, s2, h= +.ifc \h, 2 + umull2 \dst, \s1, v30.8h + umlal2 \dst, \s2, v31.8h +.else + umull \dst, \s1, v30.4h + umlal \dst, \s2, v31.4h +.endif +.endm + +// weight > 64 +.macro load_weights_add_sub + neg w7, w7 +.endm +.macro weight_add_sub dst, s1, s2, h= +.ifc \h, 2 + umull2 \dst, \s1, v30.8h + umlsl2 \dst, \s2, v31.8h +.else + umull \dst, \s1, v30.4h + umlsl \dst, \s2, v31.4h +.endif +.endm + +// weight < 0 +.macro load_weights_sub_add + neg w6, w6 +.endm +.macro weight_sub_add dst, s1, s2, h= +.ifc \h, 2 + umull2 \dst, \s2, v31.8h + umlsl2 \dst, \s1, v30.8h +.else + umull \dst, \s2, v31.4h + umlsl \dst, \s1, v30.4h +.endif +.endm + +.macro AVG_WEIGHT ext +function pixel_avg_weight_w4_\ext\()_neon + load_weights_\ext + dup v30.8h, w6 + dup v31.8h, w7 + lsl x3, x3, #1 + lsl x5, x5, #1 + lsl x1, x1, #1 +1: // height loop + subs w9, w9, #2 + ld1 {v0.d}[0], [x2], x3 + ld1 {v1.d}[0], [x4], x5 + weight_\ext v4.4s, v0.4h, v1.4h + ld1 {v2.d}[0], [x2], x3 + ld1 {v3.d}[0], [x4], x5 + + mvni v28.8h, #0xfc, lsl #8 + + sqrshrun v4.4h, v4.4s, #6 + weight_\ext v5.4s, v2.4h, v3.4h + smin v4.4h, v4.4h, v28.4h + sqrshrun v5.4h, v5.4s, #6 + + st1 {v4.d}[0], [x0], x1 + + smin v5.4h, v5.4h, v28.4h + + st1 {v5.d}[0], [x0], x1 + + b.gt 1b + ret +endfunc + +function pixel_avg_weight_w8_\ext\()_neon + load_weights_\ext + dup v30.8h, w6 + dup v31.8h, w7 + lsl x1, x1, #1 + lsl x3, x3, #1 + lsl x5, x5, #1 +1: // height loop + subs w9, w9, #4 + ld1 {v0.8h}, [x2], x3 + ld1 {v1.8h}, [x4], x5 + weight_\ext v16.4s, v0.4h, v1.4h + weight_\ext v17.4s, v0.8h, v1.8h, 2 + ld1 {v2.8h}, [x2], x3 + ld1 {v3.8h}, [x4], x5 + weight_\ext v18.4s, v2.4h, v3.4h + weight_\ext v19.4s, v2.8h, v3.8h, 2 + ld1 {v4.8h}, [x2], x3 + ld1 {v5.8h}, [x4], x5 + weight_\ext v20.4s, v4.4h, v5.4h + weight_\ext v21.4s, v4.8h, v5.8h, 2 + ld1 {v6.8h}, [x2], x3 + ld1 {v7.8h}, [x4], x5 + weight_\ext v22.4s, v6.4h, v7.4h + weight_\ext v23.4s, v6.8h, v7.8h, 2 + + mvni v28.8h, #0xfc, lsl #8 + + sqrshrun v0.4h, v16.4s, #6 + sqrshrun v2.4h, v18.4s, #6 + sqrshrun v4.4h, v20.4s, #6 + sqrshrun2 v0.8h, v17.4s, #6 + sqrshrun v6.4h, v22.4s, #6 + sqrshrun2 v2.8h, v19.4s, #6 + sqrshrun2 v4.8h, v21.4s, #6 + smin v0.8h, v0.8h, v28.8h + smin v2.8h, v2.8h, v28.8h + sqrshrun2 v6.8h, v23.4s, #6 + smin v4.8h, v4.8h, v28.8h + smin v6.8h, v6.8h, v28.8h + + st1 {v0.8h}, [x0], x1 + st1 {v2.8h}, [x0], x1 + st1 {v4.8h}, [x0], x1 + st1 {v6.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_weight_w16_\ext\()_neon + load_weights_\ext + dup v30.8h, w6 + dup v31.8h, w7 + lsl x1, x1, #1 + lsl x3, x3, #1 + lsl x5, x5, #1 +1: // height loop + subs w9, w9, #2 + + ld1 {v0.8h, v1.8h}, [x2], x3 + ld1 {v2.8h, v3.8h}, [x4], x5 + ld1 {v4.8h, v5.8h}, [x2], x3 + ld1 {v6.8h, v7.8h}, [x4], x5 + + weight_\ext v16.4s, v0.4h, v2.4h + weight_\ext v17.4s, v0.8h, v2.8h, 2 + weight_\ext v18.4s, v1.4h, v3.4h + weight_\ext v19.4s, v1.8h, v3.8h, 2 + weight_\ext v20.4s, v4.4h, v6.4h + weight_\ext v21.4s, v4.8h, v6.8h, 2 + weight_\ext 
v22.4s, v5.4h, v7.4h + weight_\ext v23.4s, v5.8h, v7.8h, 2 + + mvni v28.8h, #0xfc, lsl #8 + + sqrshrun v0.4h, v16.4s, #6 + sqrshrun v1.4h, v18.4s, #6 + sqrshrun v2.4h, v20.4s, #6 + sqrshrun2 v0.8h, v17.4s, #6 + sqrshrun2 v1.8h, v19.4s, #6 + sqrshrun2 v2.8h, v21.4s, #6 + smin v0.8h, v0.8h, v28.8h + smin v1.8h, v1.8h, v28.8h + sqrshrun v3.4h, v22.4s, #6 + smin v2.8h, v2.8h, v28.8h + sqrshrun2 v3.8h, v23.4s, #6 + smin v3.8h, v3.8h, v28.8h + + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc +.endm + +AVG_WEIGHT add_add +AVG_WEIGHT add_sub +AVG_WEIGHT sub_add + +function pixel_avg_w4_neon + lsl x1, x1, #1 + lsl x3, x3, #1 + lsl x5, x5, #1 + +1: subs w9, w9, #2 + ld1 {v0.d}[0], [x2], x3 + ld1 {v2.d}[0], [x4], x5 + ld1 {v0.d}[1], [x2], x3 + ld1 {v2.d}[1], [x4], x5 + urhadd v0.8h, v0.8h, v2.8h + st1 {v0.d}[0], [x0], x1 + st1 {v0.d}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_w8_neon + lsl x1, x1, #1 + lsl x3, x3, #1 + lsl x5, x5, #1 +1: subs w9, w9, #4 + ld1 {v0.8h}, [x2], x3 + ld1 {v1.8h}, [x4], x5 + ld1 {v2.8h}, [x2], x3 + urhadd v0.8h, v0.8h, v1.8h + ld1 {v3.8h}, [x4], x5 + st1 {v0.8h}, [x0], x1 + ld1 {v4.8h}, [x2], x3 + urhadd v1.8h, v2.8h, v3.8h + ld1 {v5.8h}, [x4], x5 + st1 {v1.8h}, [x0], x1 + ld1 {v6.8h}, [x2], x3 + ld1 {v7.8h}, [x4], x5 + urhadd v2.8h, v4.8h, v5.8h + urhadd v3.8h, v6.8h, v7.8h + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg_w16_neon + lsl x1, x1, #1 + lsl x3, x3, #1 + lsl x5, x5, #1 + +1: subs w9, w9, #4 + + ld1 {v0.8h, v1.8h}, [x2], x3 + ld1 {v2.8h, v3.8h}, [x4], x5 + ld1 {v4.8h, v5.8h}, [x2], x3 + urhadd v0.8h, v0.8h, v2.8h + urhadd v1.8h, v1.8h, v3.8h + ld1 {v6.8h, v7.8h}, [x4], x5 + ld1 {v20.8h, v21.8h}, [x2], x3 + st1 {v0.8h, v1.8h}, [x0], x1 + urhadd v4.8h, v4.8h, v6.8h + urhadd v5.8h, v5.8h, v7.8h + ld1 {v22.8h, v23.8h}, [x4], x5 + ld1 {v24.8h, v25.8h}, [x2], x3 + st1 {v4.8h, v5.8h}, [x0], x1 + ld1 {v26.8h, v27.8h}, [x4], x5 + urhadd v20.8h, v20.8h, v22.8h + urhadd v21.8h, v21.8h, v23.8h + urhadd v24.8h, v24.8h, v26.8h + urhadd v25.8h, v25.8h, v27.8h + st1 {v20.8h, v21.8h}, [x0], x1 + st1 {v24.8h, v25.8h}, [x0], x1 + + b.gt 1b + ret +endfunc + +function pixel_avg2_w4_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w5, w5, #2 + ld1 {v0.4h}, [x2], x3 + ld1 {v2.4h}, [x4], x3 + ld1 {v1.4h}, [x2], x3 + ld1 {v3.4h}, [x4], x3 + urhadd v0.4h, v0.4h, v2.4h + urhadd v1.4h, v1.4h, v3.4h + + st1 {v0.4h}, [x0], x1 + st1 {v1.4h}, [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg2_w8_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w5, w5, #2 + ld1 {v0.8h}, [x2], x3 + ld1 {v2.8h}, [x4], x3 + ld1 {v1.8h}, [x2], x3 + ld1 {v3.8h}, [x4], x3 + urhadd v0.8h, v0.8h, v2.8h + urhadd v1.8h, v1.8h, v3.8h + + st1 {v0.8h}, [x0], x1 + st1 {v1.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg2_w16_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w5, w5, #2 + ld1 {v0.8h, v1.8h}, [x2], x3 + ld1 {v2.8h, v3.8h}, [x4], x3 + ld1 {v4.8h, v5.8h}, [x2], x3 + ld1 {v6.8h, v7.8h}, [x4], x3 + urhadd v0.8h, v0.8h, v2.8h + urhadd v1.8h, v1.8h, v3.8h + urhadd v4.8h, v4.8h, v6.8h + urhadd v5.8h, v5.8h, v7.8h + + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v4.8h, v5.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function pixel_avg2_w20_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 + sub x1, x1, #32 +1: + subs w5, w5, #2 + + ld1 {v0.8h, v1.8h, v2.8h}, [x2], x3 + ld1 {v3.8h, v4.8h, v5.8h}, [x4], x3 + ld1 {v20.8h, v21.8h, v22.8h}, [x2], x3 + ld1 {v23.8h, v24.8h, v25.8h}, [x4], 
x3 + + urhadd v0.8h, v0.8h, v3.8h + urhadd v1.8h, v1.8h, v4.8h + urhadd v2.4h, v2.4h, v5.4h + urhadd v20.8h, v20.8h, v23.8h + urhadd v21.8h, v21.8h, v24.8h + urhadd v22.4h, v22.4h, v25.4h + + st1 {v0.8h, v1.8h}, [x0], #32 + st1 {v2.4h}, [x0], x1 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.4h}, [x0], x1 + b.gt 1b + ret +endfunc + +// void mc_copy( pixel *dst, intptr_t dst_stride, pixel *src, intptr_t src_stride, int height ) +function mc_copy_w4_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w4, w4, #4 + ld1 {v0.d}[0], [x2], x3 + ld1 {v1.d}[0], [x2], x3 + ld1 {v2.d}[0], [x2], x3 + ld1 {v3.d}[0], [x2], x3 + st1 {v0.d}[0], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v2.d}[0], [x0], x1 + st1 {v3.d}[0], [x0], x1 + b.gt 1b + ret +endfunc + +function mc_copy_w8_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 +1: subs w4, w4, #4 + ld1 {v0.8h}, [x2], x3 + ld1 {v1.8h}, [x2], x3 + ld1 {v2.8h}, [x2], x3 + ld1 {v3.8h}, [x2], x3 + st1 {v0.8h}, [x0], x1 + st1 {v1.8h}, [x0], x1 + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function mc_copy_w16_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 +1: subs w4, w4, #4 + ld1 {v0.8h, v1.8h}, [x2], x3 + ld1 {v2.8h, v3.8h}, [x2], x3 + ld1 {v4.8h, v5.8h}, [x2], x3 + ld1 {v6.8h, v7.8h}, [x2], x3 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v2.8h, v3.8h}, [x0], x1 + st1 {v4.8h, v5.8h}, [x0], x1 + st1 {v6.8h, v7.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +.macro weight_prologue type + mov w9, w5 // height +.ifc \type, full + ldr w12, [x4, #32] // denom +.endif + ldp w4, w5, [x4, #32+4] // scale, offset + dup v0.8h, w4 + lsl w5, w5, #2 + dup v1.4s, w5 +.ifc \type, full + neg w12, w12 + dup v2.4s, w12 +.endif +.endm + +// void mc_weight( pixel *src, intptr_t src_stride, pixel *dst, +// intptr_t dst_stride, const x264_weight_t *weight, int h ) +function mc_weight_w20_neon, export=1 + weight_prologue full + lsl x3, x3, #1 + lsl x1, x1, #1 + sub x1, x1, #32 +1: + subs w9, w9, #2 + ld1 {v16.8h, v17.8h, v18.8h}, [x2], x3 + ld1 {v19.8h, v20.8h, v21.8h}, [x2], x3 + + umull v22.4s, v16.4h, v0.4h + umull2 v23.4s, v16.8h, v0.8h + umull v24.4s, v17.4h, v0.4h + umull2 v25.4s, v17.8h, v0.8h + umull v26.4s, v18.4h, v0.4h + umull v27.4s, v21.4h, v0.4h + + srshl v22.4s, v22.4s, v2.4s + srshl v23.4s, v23.4s, v2.4s + srshl v24.4s, v24.4s, v2.4s + srshl v25.4s, v25.4s, v2.4s + srshl v26.4s, v26.4s, v2.4s + srshl v27.4s, v27.4s, v2.4s + add v22.4s, v22.4s, v1.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v1.4s + add v25.4s, v25.4s, v1.4s + add v26.4s, v26.4s, v1.4s + add v27.4s, v27.4s, v1.4s + + sqxtun v22.4h, v22.4s + sqxtun2 v22.8h, v23.4s + sqxtun v23.4h, v24.4s + sqxtun2 v23.8h, v25.4s + sqxtun v24.4h, v26.4s + sqxtun2 v24.8h, v27.4s + + umull v16.4s, v19.4h, v0.4h + umull2 v17.4s, v19.8h, v0.8h + umull v18.4s, v20.4h, v0.4h + umull2 v19.4s, v20.8h, v0.8h + + srshl v16.4s, v16.4s, v2.4s + srshl v17.4s, v17.4s, v2.4s + srshl v18.4s, v18.4s, v2.4s + srshl v19.4s, v19.4s, v2.4s + add v16.4s, v16.4s, v1.4s + add v17.4s, v17.4s, v1.4s + add v18.4s, v18.4s, v1.4s + add v19.4s, v19.4s, v1.4s + + sqxtun v16.4h, v16.4s + sqxtun2 v16.8h, v17.4s + sqxtun v17.4h, v18.4s + sqxtun2 v17.8h, v19.4s + + mvni v31.8h, #0xfc, lsl #8 + + umin v22.8h, v22.8h, v31.8h + umin v23.8h, v23.8h, v31.8h + umin v24.8h, v24.8h, v31.8h + umin v16.8h, v16.8h, v31.8h + umin v17.8h, v17.8h, v31.8h + + st1 {v22.8h, v23.8h}, [x0], #32 + st1 {v24.d}[0], [x0], x1 + st1 {v16.8h, v17.8h}, [x0], #32 + st1 {v24.d}[1], [x0], x1 + + b.gt 1b + ret +endfunc + +function mc_weight_w16_neon, 
export=1 + weight_prologue full + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w9, w9, #2 + ld1 {v4.8h, v5.8h}, [x2], x3 + ld1 {v6.8h, v7.8h}, [x2], x3 + + umull v22.4s, v4.4h, v0.4h + umull2 v23.4s, v4.8h, v0.8h + umull v24.4s, v5.4h, v0.4h + umull2 v25.4s, v5.8h, v0.8h + + srshl v22.4s, v22.4s, v2.4s + srshl v23.4s, v23.4s, v2.4s + srshl v24.4s, v24.4s, v2.4s + srshl v25.4s, v25.4s, v2.4s + + add v22.4s, v22.4s, v1.4s + add v23.4s, v23.4s, v1.4s + add v24.4s, v24.4s, v1.4s + add v25.4s, v25.4s, v1.4s + + sqxtun v22.4h, v22.4s + sqxtun2 v22.8h, v23.4s + sqxtun v23.4h, v24.4s + sqxtun2 v23.8h, v25.4s + + umull v26.4s, v6.4h, v0.4h + umull2 v27.4s, v6.8h, v0.8h + umull v28.4s, v7.4h, v0.4h + umull2 v29.4s, v7.8h, v0.8h + + srshl v26.4s, v26.4s, v2.4s + srshl v27.4s, v27.4s, v2.4s + srshl v28.4s, v28.4s, v2.4s + srshl v29.4s, v29.4s, v2.4s + + add v26.4s, v26.4s, v1.4s + add v27.4s, v27.4s, v1.4s + add v28.4s, v28.4s, v1.4s + add v29.4s, v29.4s, v1.4s + + sqxtun v26.4h, v26.4s + sqxtun2 v26.8h, v27.4s + sqxtun v27.4h, v28.4s + sqxtun2 v27.8h, v29.4s + + mvni v31.8h, 0xfc, lsl #8 + + umin v22.8h, v22.8h, v31.8h + umin v23.8h, v23.8h, v31.8h + umin v26.8h, v26.8h, v31.8h + umin v27.8h, v27.8h, v31.8h + + st1 {v22.8h, v23.8h}, [x0], x1 + st1 {v26.8h, v27.8h}, [x0], x1 + + b.gt 1b + ret +endfunc + +function mc_weight_w8_neon, export=1 + weight_prologue full + lsl x3, x3, #1 + lsl x1, x1, #1 +1: + subs w9, w9, #2 + ld1 {v16.8h}, [x2], x3 + ld1 {v17.8h}, [x2], x3 + + umull v4.4s, v16.4h, v0.4h + umull2 v5.4s, v16.8h, v0.8h + umull v6.4s, v17.4h, v0.4h + umull2 v7.4s, v17.8h, v0.8h + + srshl v4.4s, v4.4s, v2.4s + srshl v5.4s, v5.4s, v2.4s + srshl v6.4s, v6.4s, v2.4s + srshl v7.4s, v7.4s, v2.4s + + add v4.4s, v4.4s, v1.4s + add v5.4s, v5.4s, v1.4s + add v6.4s, v6.4s, v1.4s + add v7.4s, v7.4s, v1.4s + + sqxtun v16.4h, v4.4s + sqxtun2 v16.8h, v5.4s + sqxtun v17.4h, v6.4s + sqxtun2 v17.8h, v7.4s + + mvni v28.8h, #0xfc, lsl #8 + + umin v16.8h, v16.8h, v28.8h + umin v17.8h, v17.8h, v28.8h + + st1 {v16.8h}, [x0], x1 + st1 {v17.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function mc_weight_w4_neon, export=1 + weight_prologue full + lsl x3, x3, #1 + lsl x1, x1, #1 +1: + subs w9, w9, #2 + ld1 {v16.d}[0], [x2], x3 + ld1 {v16.d}[1], [x2], x3 + umull v4.4s, v16.4h, v0.4h + umull2 v5.4s, v16.8h, v0.8h + srshl v4.4s, v4.4s, v2.4s + srshl v5.4s, v5.4s, v2.4s + add v4.4s, v4.4s, v1.4s + add v5.4s, v5.4s, v1.4s + + sqxtun v16.4h, v4.4s + sqxtun2 v16.8h, v5.4s + + mvni v28.8h, #0xfc, lsl #8 + + umin v16.8h, v16.8h, v28.8h + + st1 {v16.d}[0], [x0], x1 + st1 {v16.d}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function mc_weight_w20_nodenom_neon, export=1 + weight_prologue nodenom + lsl x3, x3, #1 + lsl x1, x1, #1 + sub x1, x1, #32 +1: + subs w9, w9, #2 + ld1 {v16.8h, v17.8h, v18.8h}, [x2], x3 + mov v20.16b, v1.16b + mov v21.16b, v1.16b + mov v22.16b, v1.16b + mov v23.16b, v1.16b + mov v24.16b, v1.16b + mov v25.16b, v1.16b + ld1 {v2.8h, v3.8h, v4.8h}, [x2], x3 + mov v26.16b, v1.16b + mov v27.16b, v1.16b + mov v28.16b, v1.16b + mov v29.16b, v1.16b + + umlal v20.4s, v16.4h, v0.4h + umlal2 v21.4s, v16.8h, v0.8h + umlal v22.4s, v17.4h, v0.4h + umlal2 v23.4s, v17.8h, v0.8h + umlal v24.4s, v18.4h, v0.4h + umlal v25.4s, v4.4h, v0.4h + umlal v26.4s, v2.4h, v0.4h + umlal2 v27.4s, v2.8h, v0.8h + umlal v28.4s, v3.4h, v0.4h + umlal2 v29.4s, v3.8h, v0.8h + + sqxtun v2.4h, v20.4s + sqxtun2 v2.8h, v21.4s + sqxtun v3.4h, v22.4s + sqxtun2 v3.8h, v23.4s + sqxtun v4.4h, v24.4s + sqxtun2 v4.8h, v25.4s + sqxtun v5.4h, v26.4s + sqxtun2 v5.8h, 
v27.4s + sqxtun v6.4h, v28.4s + sqxtun2 v6.8h, v29.4s + + mvni v31.8h, 0xfc, lsl #8 + + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + umin v4.8h, v4.8h, v31.8h + umin v5.8h, v5.8h, v31.8h + umin v6.8h, v6.8h, v31.8h + + st1 {v2.8h, v3.8h}, [x0], #32 + st1 {v4.d}[0], [x0], x1 + st1 {v5.8h, v6.8h}, [x0], #32 + st1 {v4.d}[1], [x0], x1 + + b.gt 1b + ret +endfunc + +function mc_weight_w16_nodenom_neon, export=1 + weight_prologue nodenom + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w9, w9, #2 + ld1 {v2.8h, v3.8h}, [x2], x3 + mov v27.16b, v1.16b + mov v28.16b, v1.16b + mov v29.16b, v1.16b + mov v30.16b, v1.16b + ld1 {v4.8h, v5.8h}, [x2], x3 + mov v20.16b, v1.16b + mov v21.16b, v1.16b + mov v22.16b, v1.16b + mov v23.16b, v1.16b + + umlal v27.4s, v2.4h, v0.4h + umlal2 v28.4s, v2.8h, v0.8h + umlal v29.4s, v3.4h, v0.4h + umlal2 v30.4s, v3.8h, v0.8h + + umlal v20.4s, v4.4h, v0.4h + umlal2 v21.4s, v4.8h, v0.8h + umlal v22.4s, v5.4h, v0.4h + umlal2 v23.4s, v5.8h, v0.8h + + sqxtun v2.4h, v27.4s + sqxtun2 v2.8h, v28.4s + sqxtun v3.4h, v29.4s + sqxtun2 v3.8h, v30.4s + + sqxtun v4.4h, v20.4s + sqxtun2 v4.8h, v21.4s + sqxtun v5.4h, v22.4s + sqxtun2 v5.8h, v23.4s + + mvni v31.8h, 0xfc, lsl #8 + + umin v2.8h, v2.8h, v31.8h + umin v3.8h, v3.8h, v31.8h + umin v4.8h, v4.8h, v31.8h + umin v5.8h, v5.8h, v31.8h + + st1 {v2.8h, v3.8h}, [x0], x1 + st1 {v4.8h, v5.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function mc_weight_w8_nodenom_neon, export=1 + weight_prologue nodenom + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w9, w9, #2 + ld1 {v16.8h}, [x2], x3 + mov v27.16b, v1.16b + ld1 {v17.8h}, [x2], x3 + mov v28.16b, v1.16b + mov v29.16b, v1.16b + mov v30.16b, v1.16b + + umlal v27.4s, v16.4h, v0.4h + umlal2 v28.4s, v16.8h, v0.8h + umlal v29.4s, v17.4h, v0.4h + umlal2 v30.4s, v17.8h, v0.8h + + sqxtun v4.4h, v27.4s + sqxtun2 v4.8h, v28.4s + sqxtun v5.4h, v29.4s + sqxtun2 v5.8h, v30.4s + + mvni v31.8h, 0xfc, lsl #8 + + umin v4.8h, v4.8h, v31.8h + umin v5.8h, v5.8h, v31.8h + + st1 {v4.8h}, [x0], x1 + st1 {v5.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function mc_weight_w4_nodenom_neon, export=1 + weight_prologue nodenom + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w9, w9, #2 + ld1 {v16.d}[0], [x2], x3 + ld1 {v16.d}[1], [x2], x3 + mov v27.16b, v1.16b + mov v28.16b, v1.16b + umlal v27.4s, v16.4h, v0.4h + umlal2 v28.4s, v16.8h, v0.8h + + sqxtun v4.4h, v27.4s + sqxtun2 v4.8h, v28.4s + + mvni v31.8h, 0xfc, lsl #8 + + umin v4.8h, v4.8h, v31.8h + + st1 {v4.d}[0], [x0], x1 + st1 {v4.d}[1], [x0], x1 + b.gt 1b + ret +endfunc + +.macro weight_simple_prologue + ldr w6, [x4] // offset + lsl w6, w6, #2 + dup v1.8h, w6 +.endm + +.macro weight_simple name op +function mc_weight_w20_\name\()_neon, export=1 + weight_simple_prologue + lsl x1, x1, #1 + lsl x3, x3, #1 + sub x1, x1, #32 +1: + subs w5, w5, #2 + ld1 {v2.8h, v3.8h, v4.8h}, [x2], x3 + ld1 {v5.8h, v6.8h, v7.8h}, [x2], x3 + + zip1 v4.2d, v4.2d, v7.2d + + \op v2.8h, v2.8h, v1.8h + \op v3.8h, v3.8h, v1.8h + \op v4.8h, v4.8h, v1.8h + \op v5.8h, v5.8h, v1.8h + \op v6.8h, v6.8h, v1.8h + + mvni v31.8h, #0xfc, lsl #8 + + umin v2.8h, v2.8h, v28.8h + umin v3.8h, v3.8h, v28.8h + umin v4.8h, v4.8h, v28.8h + umin v5.8h, v5.8h, v28.8h + umin v6.8h, v6.8h, v28.8h + + st1 {v2.8h, v3.8h}, [x0], #32 + st1 {v4.d}[0], [x0], x1 + st1 {v5.8h, v6.8h}, [x0], #32 + st1 {v4.d}[1], [x0], x1 + + b.gt 1b + ret +endfunc + +function mc_weight_w16_\name\()_neon, export=1 + weight_simple_prologue + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w5, w5, #2 + ld1 {v16.8h, v17.8h}, [x2], x3 + ld1 {v18.8h, v19.8h}, 
[x2], x3 + + \op v16.8h, v16.8h, v1.8h + \op v17.8h, v17.8h, v1.8h + \op v18.8h, v18.8h, v1.8h + \op v19.8h, v19.8h, v1.8h + + mvni v28.8h, #0xfc, lsl #8 + + umin v16.8h, v16.8h, v28.8h + umin v17.8h, v17.8h, v28.8h + umin v18.8h, v18.8h, v28.8h + umin v19.8h, v19.8h, v28.8h + + st1 {v16.8h, v17.8h}, [x0], x1 + st1 {v18.8h, v19.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function mc_weight_w8_\name\()_neon, export=1 + weight_simple_prologue + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w5, w5, #2 + ld1 {v16.8h}, [x2], x3 + ld1 {v17.8h}, [x2], x3 + \op v16.8h, v16.8h, v1.8h + \op v17.8h, v17.8h, v1.8h + + mvni v28.8h, 0xfc, lsl #8 + + umin v16.8h, v16.8h, v28.8h + umin v17.8h, v17.8h, v28.8h + + st1 {v16.8h}, [x0], x1 + st1 {v17.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function mc_weight_w4_\name\()_neon, export=1 + weight_simple_prologue + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + subs w5, w5, #2 + ld1 {v16.d}[0], [x2], x3 + ld1 {v16.d}[1], [x2], x3 + \op v16.8h, v16.8h, v1.8h + mvni v28.8h, 0xfc, lsl #8 + + umin v16.8h, v16.8h, v28.8h + + st1 {v16.d}[0], [x0], x1 + st1 {v16.d}[1], [x0], x1 + b.gt 1b + ret +endfunc +.endm + +weight_simple offsetadd, uqadd +weight_simple offsetsub, uqsub + +// void mc_chroma( pixel *dst_u, pixel *dst_v, +// intptr_t i_dst_stride, +// pixel *src, intptr_t i_src_stride, +// int dx, int dy, int i_width, int i_height ); +function mc_chroma_neon, export=1 + ldr w15, [sp] // height + sbfx x12, x6, #3, #29 // asr(3) and sign extend + sbfx x11, x5, #3, #29 // asr(3) and sign extend + cmp w7, #4 + lsl x4, x4, #1 + mul x12, x12, x4 + add x3, x3, x11, lsl #2 + + and w5, w5, #7 + and w6, w6, #7 + + add x3, x3, x12 + + b.gt mc_chroma_w8_neon + b.eq mc_chroma_w4_neon +endfunc + +.macro CHROMA_MC_START r00, r01, r10, r11 + mul w12, w5, w6 // cD = d8x *d8y + lsl w13, w5, #3 + add w9, w12, #64 + lsl w14, w6, #3 + tst w12, w12 + sub w9, w9, w13 + sub w10, w13, w12 // cB = d8x *(8-d8y); + sub w11, w14, w12 // cC = (8-d8x)*d8y + sub w9, w9, w14 // cA = (8-d8x)*(8-d8y); +.endm + +.macro CHROMA_MC width, vsize +function mc_chroma_w\width\()_neon + lsl x2, x2, #1 +// since the element size varies, there's a different index for the 2nd store +.if \width == 4 + .set idx2, 1 +.else + .set idx2, 2 +.endif + CHROMA_MC_START + b.eq 2f + + ld2 {v28.8h, v29.8h}, [x3], x4 + dup v0.8h, w9 // cA + dup v1.8h, w10 // cB + + ext v6.16b, v28.16b, v28.16b, #2 + ext v7.16b, v29.16b, v29.16b, #2 + + ld2 {v30.8h, v31.8h}, [x3], x4 + dup v2.8h, w11 // cC + dup v3.8h, w12 // cD + + ext v22.16b, v30.16b, v30.16b, #2 + ext v23.16b, v31.16b, v31.16b, #2 + + trn1 v0.2d, v0.2d, v1.2d + trn1 v2.2d, v2.2d, v3.2d + + trn1 v4.2d, v28.2d, v6.2d + trn1 v5.2d, v29.2d, v7.2d + trn1 v20.2d, v30.2d, v22.2d + trn1 v21.2d, v31.2d, v23.2d +1: // height loop, interpolate xy + subs w15, w15, #2 + + mul v16.8h, v4.8h, v0.8h + mul v17.8h, v5.8h, v0.8h + mla v16.8h, v20.8h, v2.8h + mla v17.8h, v21.8h, v2.8h + + ld2 {v28.8h, v29.8h}, [x3], x4 + transpose v24.2d, v25.2d, v16.2d, v17.2d + + ext v6.16b, v28.16b, v28.16b, #2 + ext v7.16b, v29.16b, v29.16b, #2 + trn1 v4.2d, v28.2d, v6.2d + trn1 v5.2d, v29.2d, v7.2d + + add v16.8h, v24.8h, v25.8h + urshr v16.8h, v16.8h, #6 + + mul v18.8h, v20.8h, v0.8h + mul v19.8h, v21.8h, v0.8h + mla v18.8h, v4.8h, v2.8h + mla v19.8h, v5.8h, v2.8h + + ld2 {v30.8h, v31.8h}, [x3], x4 + + transpose v26.2d, v27.2d, v18.2d, v19.2d + add v18.8h, v26.8h, v27.8h + urshr v18.8h, v18.8h, #6 + + ext v22.16b, v30.16b, v30.16b, #2 + ext v23.16b, v31.16b, v31.16b, #2 + trn1 v20.2d, v30.2d, v22.2d + trn1 v21.2d, 
v31.2d, v23.2d + + st1 {v16.\vsize}[0], [x0], x2 + st1 {v16.\vsize}[idx2], [x1], x2 + st1 {v18.\vsize}[0], [x0], x2 + st1 {v18.\vsize}[idx2], [x1], x2 + b.gt 1b + + ret +2: // dx or dy are 0 + tst w11, w11 + add w10, w10, w11 + dup v0.8h, w9 + dup v1.8h, w10 + + b.eq 4f + + ld1 {v4.8h}, [x3], x4 + ld1 {v6.8h}, [x3], x4 +3: // vertical interpolation loop + subs w15, w15, #2 + + mul v16.8h, v4.8h, v0.8h + mla v16.8h, v6.8h, v1.8h + ld1 {v4.8h}, [x3], x4 + mul v17.8h, v6.8h, v0.8h + mla v17.8h, v4.8h, v1.8h + ld1 {v6.8h}, [x3], x4 + + urshr v16.8h, v16.8h, #6 + urshr v17.8h, v17.8h, #6 + + uzp1 v18.8h, v16.8h, v17.8h // d16=uuuu|uuuu, d17=vvvv|vvvv + uzp2 v19.8h, v16.8h, v17.8h // d16=uuuu|uuuu, d17=vvvv|vvvv + + st1 {v18.\vsize}[0], [x0], x2 + st1 {v18.\vsize}[idx2], [x0], x2 + st1 {v19.\vsize}[0], [x1], x2 + st1 {v19.\vsize}[idx2], [x1], x2 + b.gt 3b + + ret + +4: // dy is 0 + ld1 {v4.8h, v5.8h}, [x3], x4 + ld1 {v6.8h, v7.8h}, [x3], x4 + + ext v5.16b, v4.16b, v5.16b, #4 + ext v7.16b, v6.16b, v7.16b, #4 +5: // horizontal interpolation loop + subs w15, w15, #2 + + mul v16.8h, v4.8h, v0.8h + mla v16.8h, v5.8h, v1.8h + mul v17.8h, v6.8h, v0.8h + mla v17.8h, v7.8h, v1.8h + + ld1 {v4.8h, v5.8h}, [x3], x4 + ld1 {v6.8h, v7.8h}, [x3], x4 + + urshr v16.8h, v16.8h, #6 + urshr v17.8h, v17.8h, #6 + + ext v5.16b, v4.16b, v5.16b, #4 + ext v7.16b, v6.16b, v7.16b, #4 + uzp1 v18.8h, v16.8h, v17.8h // d16=uuuu|uuuu, d17=vvvv|vvvv + uzp2 v19.8h, v16.8h, v17.8h // d16=uuuu|uuuu, d17=vvvv|vvvv + + st1 {v18.\vsize}[0], [x0], x2 + st1 {v18.\vsize}[idx2], [x0], x2 + st1 {v19.\vsize}[0], [x1], x2 + st1 {v19.\vsize}[idx2], [x1], x2 + b.gt 5b + + ret +endfunc +.endm + + CHROMA_MC 2, s + CHROMA_MC 4, d + +function mc_chroma_w8_neon + lsl x2, x2, #1 + CHROMA_MC_START + + b.eq 2f + sub x4, x4, #32 + ld2 {v4.8h, v5.8h}, [x3], #32 + ld2 {v6.8h, v7.8h}, [x3], x4 + + ld2 {v20.8h, v21.8h}, [x3], #32 + ld2 {v22.8h, v23.8h}, [x3], x4 + + dup v0.8h, w9 // cA + dup v1.8h, w10 // cB + + ext v24.16b, v4.16b, v6.16b, #2 + ext v26.16b, v6.16b, v4.16b, #2 + ext v28.16b, v20.16b, v22.16b, #2 + ext v30.16b, v22.16b, v20.16b, #2 + + ext v25.16b, v5.16b, v7.16b, #2 + ext v27.16b, v7.16b, v5.16b, #2 + ext v29.16b, v21.16b, v23.16b, #2 + ext v31.16b, v23.16b, v21.16b, #2 + + dup v2.8h, w11 // cC + dup v3.8h, w12 // cD + +1: // height loop, interpolate xy + subs w15, w15, #2 + + mul v16.8h, v4.8h, v0.8h + mul v17.8h, v5.8h, v0.8h + mla v16.8h, v24.8h, v1.8h + mla v17.8h, v25.8h, v1.8h + mla v16.8h, v20.8h, v2.8h + mla v17.8h, v21.8h, v2.8h + mla v16.8h, v28.8h, v3.8h + mla v17.8h, v29.8h, v3.8h + + urshr v16.8h, v16.8h, #6 + urshr v17.8h, v17.8h, #6 + + st1 {v16.8h}, [x0], x2 + st1 {v17.8h}, [x1], x2 + + ld2 {v4.8h, v5.8h}, [x3], #32 + ld2 {v6.8h, v7.8h}, [x3], x4 + + mul v16.8h, v20.8h, v0.8h + mul v17.8h, v21.8h, v0.8h + ext v24.16b, v4.16b, v6.16b, #2 + ext v26.16b, v6.16b, v4.16b, #2 + mla v16.8h, v28.8h, v1.8h + mla v17.8h, v29.8h, v1.8h + ext v25.16b, v5.16b, v7.16b, #2 + ext v27.16b, v7.16b, v5.16b, #2 + mla v16.8h, v4.8h, v2.8h + mla v17.8h, v5.8h, v2.8h + mla v16.8h, v24.8h, v3.8h + mla v17.8h, v25.8h, v3.8h + + urshr v16.8h, v16.8h, #6 + urshr v17.8h, v17.8h, #6 + + ld2 {v20.8h, v21.8h}, [x3], #32 + ld2 {v22.8h, v23.8h}, [x3], x4 + ext v28.16b, v20.16b, v22.16b, #2 + ext v30.16b, v22.16b, v20.16b, #2 + ext v29.16b, v21.16b, v23.16b, #2 + ext v31.16b, v23.16b, v21.16b, #2 + + st1 {v16.8h}, [x0], x2 + st1 {v17.8h}, [x1], x2 + b.gt 1b + + ret +2: // dx or dy are 0 + tst w11, w11 + add w10, w10, w11 + dup v0.8h, w9 + dup v1.8h, w10 + + 
b.eq 4f + + ld2 {v4.8h, v5.8h}, [x3], x4 + ld2 {v6.8h, v7.8h}, [x3], x4 +3: // vertical interpolation loop + subs w15, w15, #2 + + mul v16.8h, v4.8h, v0.8h + mul v17.8h, v5.8h, v0.8h + mla v16.8h, v6.8h, v1.8h + mla v17.8h, v7.8h, v1.8h + urshr v16.8h, v16.8h, #6 + urshr v17.8h, v17.8h, #6 + + st1 {v16.8h}, [x0], x2 + st1 {v17.8h}, [x1], x2 + + ld2 {v4.8h, v5.8h}, [x3], x4 + + mul v16.8h, v6.8h, v0.8h + mul v17.8h, v7.8h, v0.8h + ld2 {v6.8h, v7.8h}, [x3], x4 + mla v16.8h, v4.8h, v1.8h + mla v17.8h, v5.8h, v1.8h + urshr v16.8h, v16.8h, #6 + urshr v17.8h, v17.8h, #6 + + st1 {v16.8h}, [x0], x2 + st1 {v17.8h}, [x1], x2 + b.gt 3b + + ret +4: // dy is 0 + sub x4, x4, #32 + + ld2 {v4.8h, v5.8h}, [x3], #32 + ld2 {v6.8h, v7.8h}, [x3], x4 + ext v24.16b, v4.16b, v6.16b, #2 + ext v26.16b, v6.16b, v4.16b, #2 + ld2 {v20.8h, v21.8h}, [x3], #32 + ld2 {v22.8h, v23.8h}, [x3], x4 + ext v28.16b, v20.16b, v22.16b, #2 + ext v30.16b, v22.16b, v20.16b, #2 + + ext v25.16b, v5.16b, v7.16b, #2 + ext v27.16b, v7.16b, v5.16b, #2 + ext v29.16b, v21.16b, v23.16b, #2 + ext v31.16b, v23.16b, v21.16b, #2 + +5: // horizontal interpolation loop + subs w15, w15, #2 + + mul v16.8h, v4.8h, v0.8h + mul v17.8h, v5.8h, v0.8h + mla v16.8h, v24.8h, v1.8h + mla v17.8h, v25.8h, v1.8h + + urshr v16.8h, v16.8h, #6 + urshr v17.8h, v17.8h, #6 + + st1 {v16.8h}, [x0], x2 + st1 {v17.8h}, [x1], x2 + + mul v16.8h, v20.8h, v0.8h + mul v17.8h, v21.8h, v0.8h + ld2 {v4.8h, v5.8h}, [x3], #32 + ld2 {v6.8h, v7.8h}, [x3], x4 + mla v16.8h, v28.8h, v1.8h + mla v17.8h, v29.8h, v1.8h + ld2 {v20.8h,v21.8h}, [x3], #32 + ld2 {v22.8h,v23.8h}, [x3], x4 + + urshr v16.8h, v16.8h, #6 + urshr v17.8h, v17.8h, #6 + + ext v24.16b, v4.16b, v6.16b, #2 + ext v26.16b, v6.16b, v4.16b, #2 + ext v28.16b, v20.16b, v22.16b, #2 + ext v30.16b, v22.16b, v20.16b, #2 + ext v29.16b, v21.16b, v23.16b, #2 + ext v31.16b, v23.16b, v21.16b, #2 + ext v25.16b, v5.16b, v7.16b, #2 + ext v27.16b, v7.16b, v5.16b, #2 + + st1 {v16.8h}, [x0], x2 + st1 {v17.8h}, [x1], x2 + b.gt 5b + + ret +endfunc + +.macro integral4h p1, p2 + ext v1.16b, \p1\().16b, \p2\().16b, #2 + ext v2.16b, \p1\().16b, \p2\().16b, #4 + ext v3.16b, \p1\().16b, \p2\().16b, #6 + add v0.8h, \p1\().8h, v1.8h + add v4.8h, v2.8h, v3.8h + add v0.8h, v0.8h, v4.8h + add v0.8h, v0.8h, v5.8h +.endm + +function integral_init4h_neon, export=1 + sub x3, x0, x2, lsl #1 + lsl x2, x2, #1 + ld1 {v6.8h,v7.8h}, [x1], #32 +1: + subs x2, x2, #32 + ld1 {v5.8h}, [x3], #16 + integral4h v6, v7 + ld1 {v6.8h}, [x1], #16 + ld1 {v5.8h}, [x3], #16 + st1 {v0.8h}, [x0], #16 + integral4h v7, v6 + ld1 {v7.8h}, [x1], #16 + st1 {v0.8h}, [x0], #16 + b.gt 1b + ret +endfunc + +.macro integral8h p1, p2, s + ext v1.16b, \p1\().16b, \p2\().16b, #2 + ext v2.16b, \p1\().16b, \p2\().16b, #4 + ext v3.16b, \p1\().16b, \p2\().16b, #6 + ext v4.16b, \p1\().16b, \p2\().16b, #8 + ext v5.16b, \p1\().16b, \p2\().16b, #10 + ext v6.16b, \p1\().16b, \p2\().16b, #12 + ext v7.16b, \p1\().16b, \p2\().16b, #14 + add v0.8h, \p1\().8h, v1.8h + add v2.8h, v2.8h, v3.8h + add v4.8h, v4.8h, v5.8h + add v6.8h, v6.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + add v0.8h, v0.8h, v4.8h + add v0.8h, v0.8h, \s\().8h +.endm + +function integral_init8h_neon, export=1 + sub x3, x0, x2, lsl #1 + lsl x2, x2, #1 + + ld1 {v16.8h, v17.8h}, [x1], #32 +1: + subs x2, x2, #32 + ld1 {v18.8h}, [x3], #16 + integral8h v16, v17, v18 + ld1 {v16.8h}, [x1], #16 + ld1 {v18.8h}, [x3], #16 + st1 {v0.8h}, [x0], #16 + integral8h v17, v16, v18 + ld1 {v17.8h}, [x1], #16 st1 {v0.8h}, [x0], #16 - b.gt 8b + 
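+// integral_init4h/integral_init8h build one row of a partial integral image:
+// each output is the sum of an N-wide horizontal window of pixels plus the
+// value directly above it in the previous sum row. A scalar sketch of the
+// 4-wide case (variable names and the loop bound are illustrative):
+//
+//     for( int x = 0; x < stride - 4; x++ )
+//         sum[x] = pix[x] + pix[x+1] + pix[x+2] + pix[x+3] + sum[x - stride];
+//
+// The NEON versions compute the same window sums with ext-shifted adds and a
+// load of the previous sum row through x3.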
b.gt 1b + ret +endfunc + +function integral_init4v_neon, export=1 + mov x3, x0 + add x4, x0, x2, lsl #3 + add x8, x0, x2, lsl #4 + lsl x2, x2, #1 + sub x2, x2, #16 + ld1 {v20.8h, v21.8h, v22.8h}, [x3], #48 + ld1 {v16.8h, v17.8h, v18.8h}, [x8], #48 +1: + subs x2, x2, #32 + ld1 {v24.8h, v25.8h}, [x4], #32 + ext v0.16b, v20.16b, v21.16b, #8 + ext v1.16b, v21.16b, v22.16b, #8 + ext v2.16b, v16.16b, v17.16b, #8 + ext v3.16b, v17.16b, v18.16b, #8 + sub v24.8h, v24.8h, v20.8h + sub v25.8h, v25.8h, v21.8h + add v0.8h, v0.8h, v20.8h + add v1.8h, v1.8h, v21.8h + add v2.8h, v2.8h, v16.8h + add v3.8h, v3.8h, v17.8h + st1 {v24.8h}, [x1], #16 + st1 {v25.8h}, [x1], #16 + mov v20.16b, v22.16b + mov v16.16b, v18.16b + sub v0.8h, v2.8h, v0.8h + sub v1.8h, v3.8h, v1.8h + ld1 {v21.8h, v22.8h}, [x3], #32 + ld1 {v17.8h, v18.8h}, [x8], #32 + st1 {v0.8h}, [x0], #16 + st1 {v1.8h}, [x0], #16 + b.gt 1b +2: + ret +endfunc + +function integral_init8v_neon, export=1 + add x2, x0, x1, lsl #4 + sub x1, x1, #8 + ands x3, x1, #16 - 1 + b.eq 1f + subs x1, x1, #8 + ld1 {v0.8h}, [x0] + ld1 {v2.8h}, [x2], #16 + sub v4.8h, v2.8h, v0.8h + st1 {v4.8h}, [x0], #16 + b.le 2f +1: + subs x1, x1, #16 + ld1 {v0.8h,v1.8h}, [x0] + ld1 {v2.8h,v3.8h}, [x2], #32 + sub v4.8h, v2.8h, v0.8h + sub v5.8h, v3.8h, v1.8h + st1 {v4.8h}, [x0], #16 + st1 {v5.8h}, [x0], #16 + b.gt 1b +2: ret endfunc -const pw_0to15, align=5 - .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 -endconst +// frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, +// pixel *dstv, pixel *dstc, intptr_t src_stride, +// intptr_t dst_stride, int width, int height ) +function frame_init_lowres_core_neon, export=1 + ldr w8, [sp] + lsl x5, x5, #1 + sub x10, x6, w7, uxtw // dst_stride - width + lsl x10, x10, #1 + and x10, x10, #~31 + + stp d8, d9, [sp, #-0x40]! 
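+    // v8-v15 are callee-saved under AAPCS64 (low 64 bits only, hence d-register
+    // spills): the lowres loop below uses them as scratch, so d8-d15 are saved
+    // into the 0x40-byte frame opened above and restored before the final ret.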
+ stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + +1: + mov w9, w7 // width + mov x11, x0 // src0 + add x12, x0, x5 // src1 = src0 + src_stride + add x13, x0, x5, lsl #1 // src2 = src1 + src_stride + + ld2 {v0.8h, v1.8h}, [x11], #32 + ld2 {v2.8h, v3.8h}, [x11], #32 + ld2 {v4.8h, v5.8h}, [x12], #32 + ld2 {v6.8h, v7.8h}, [x12], #32 + ld2 {v28.8h, v29.8h}, [x13], #32 + ld2 {v30.8h, v31.8h}, [x13], #32 + + urhadd v20.8h, v0.8h, v4.8h + urhadd v21.8h, v2.8h, v6.8h + urhadd v22.8h, v4.8h, v28.8h + urhadd v23.8h, v6.8h, v30.8h +2: + subs w9, w9, #16 + + urhadd v24.8h, v1.8h, v5.8h + urhadd v25.8h, v3.8h, v7.8h + urhadd v26.8h, v5.8h, v29.8h + urhadd v27.8h, v7.8h, v31.8h + + ld2 {v0.8h, v1.8h}, [x11], #32 + ld2 {v2.8h, v3.8h}, [x11], #32 + ld2 {v4.8h, v5.8h}, [x12], #32 + ld2 {v6.8h, v7.8h}, [x12], #32 + ld2 {v28.8h, v29.8h}, [x13], #32 + ld2 {v30.8h, v31.8h}, [x13], #32 + + urhadd v16.8h, v0.8h, v4.8h + urhadd v17.8h, v2.8h, v6.8h + urhadd v18.8h, v4.8h, v28.8h + urhadd v19.8h, v6.8h, v30.8h + + ext v8.16b, v20.16b, v21.16b, #2 + ext v9.16b, v21.16b, v16.16b, #2 + ext v10.16b, v22.16b, v23.16b, #2 + ext v11.16b, v23.16b, v18.16b, #2 + + urhadd v12.8h, v20.8h, v24.8h + urhadd v8.8h, v24.8h, v8.8h + + urhadd v24.8h, v21.8h, v25.8h + urhadd v22.8h, v22.8h, v26.8h + urhadd v10.8h, v26.8h, v10.8h + urhadd v26.8h, v23.8h, v27.8h + urhadd v9.8h, v25.8h, v9.8h + urhadd v11.8h, v27.8h, v11.8h + + st1 {v12.8h}, [x1], #16 + st1 {v24.8h}, [x1], #16 + st1 {v22.8h}, [x3], #16 + st1 {v26.8h}, [x3], #16 + st1 {v8.8h, v9.8h}, [x2], #32 + st1 {v10.8h, v11.8h}, [x4], #32 + + b.le 3f + + subs w9, w9, #16 + + urhadd v24.8h, v1.8h, v5.8h + urhadd v25.8h, v3.8h, v7.8h + urhadd v26.8h, v5.8h, v29.8h + urhadd v27.8h, v7.8h, v31.8h + + ld2 {v0.8h, v1.8h}, [x11], #32 + ld2 {v2.8h, v3.8h}, [x11], #32 + ld2 {v4.8h, v5.8h}, [x12], #32 + ld2 {v6.8h, v7.8h}, [x12], #32 + ld2 {v28.8h, v29.8h}, [x13], #32 + ld2 {v30.8h, v31.8h}, [x13], #32 + + urhadd v20.8h, v0.8h, v4.8h + urhadd v21.8h, v2.8h, v6.8h + urhadd v22.8h, v4.8h, v28.8h + urhadd v23.8h, v6.8h, v30.8h + + ext v8.16b, v16.16b, v17.16b, #2 + ext v9.16b, v17.16b, v20.16b, #2 + ext v10.16b, v18.16b, v19.16b, #2 + ext v11.16b, v19.16b, v22.16b, #2 + + urhadd v12.8h, v16.8h, v24.8h + urhadd v13.8h, v17.8h, v25.8h + + urhadd v14.8h, v18.8h, v26.8h + urhadd v15.8h, v19.8h, v27.8h + + urhadd v16.8h, v24.8h, v8.8h + urhadd v17.8h, v25.8h, v9.8h + + urhadd v18.8h, v26.8h, v10.8h + urhadd v19.8h, v27.8h, v11.8h + + st1 {v12.8h, v13.8h}, [x1], #32 + st1 {v14.8h, v15.8h}, [x3], #32 + st1 {v16.8h, v17.8h}, [x2], #32 + st1 {v18.8h, v19.8h}, [x4], #32 + b.gt 2b +3: + subs w8, w8, #1 + add x0, x0, x5, lsl #1 + add x1, x1, x10 + add x2, x2, x10 + add x3, x3, x10 + add x4, x4, x10 + b.gt 1b + + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + + add sp, sp, #0x40 -function mbtree_propagate_list_internal_neon, export=1 - movrel x11, pw_0to15 - dup v31.8h, w4 // bipred_weight - movi v30.8h, #0xc0, lsl #8 - ld1 {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y - movi v28.4s, #4 - movi v27.8h, #31 - movi v26.8h, #32 - dup v24.8h, w5 // mb_y - zip1 v29.8h, v29.8h, v24.8h -8: - subs w6, w6, #8 - ld1 {v1.8h}, [x1], #16 // propagate_amount - ld1 {v2.8h}, [x2], #16 // lowres_cost - and v2.16b, v2.16b, v30.16b - cmeq v25.8h, v2.8h, v30.8h - umull v16.4s, v1.4h, v31.4h - umull2 v17.4s, v1.8h, v31.8h - rshrn v16.4h, v16.4s, #6 - rshrn2 v16.8h, v17.4s, #6 - bsl v25.16b, v16.16b, v1.16b // if( lists_used == 3 ) - // 
propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 - ld1 {v4.8h,v5.8h}, [x0], #32 - sshr v6.8h, v4.8h, #5 - sshr v7.8h, v5.8h, #5 - add v6.8h, v6.8h, v29.8h - add v29.8h, v29.8h, v28.8h - add v7.8h, v7.8h, v29.8h - add v29.8h, v29.8h, v28.8h - st1 {v6.8h,v7.8h}, [x3], #32 - and v4.16b, v4.16b, v27.16b - and v5.16b, v5.16b, v27.16b - uzp1 v6.8h, v4.8h, v5.8h // x & 31 - uzp2 v7.8h, v4.8h, v5.8h // y & 31 - sub v4.8h, v26.8h, v6.8h // 32 - (x & 31) - sub v5.8h, v26.8h, v7.8h // 32 - (y & 31) - mul v19.8h, v6.8h, v7.8h // idx3weight = y*x; - mul v18.8h, v4.8h, v7.8h // idx2weight = y*(32-x); - mul v17.8h, v6.8h, v5.8h // idx1weight = (32-y)*x; - mul v16.8h, v4.8h, v5.8h // idx0weight = (32-y)*(32-x) ; - umull v6.4s, v19.4h, v25.4h - umull2 v7.4s, v19.8h, v25.8h - umull v4.4s, v18.4h, v25.4h - umull2 v5.4s, v18.8h, v25.8h - umull v2.4s, v17.4h, v25.4h - umull2 v3.4s, v17.8h, v25.8h - umull v0.4s, v16.4h, v25.4h - umull2 v1.4s, v16.8h, v25.8h - rshrn v19.4h, v6.4s, #10 - rshrn2 v19.8h, v7.4s, #10 - rshrn v18.4h, v4.4s, #10 - rshrn2 v18.8h, v5.4s, #10 - rshrn v17.4h, v2.4s, #10 - rshrn2 v17.8h, v3.4s, #10 - rshrn v16.4h, v0.4s, #10 - rshrn2 v16.8h, v1.4s, #10 - zip1 v0.8h, v16.8h, v17.8h - zip2 v1.8h, v16.8h, v17.8h - zip1 v2.8h, v18.8h, v19.8h - zip2 v3.8h, v18.8h, v19.8h - st1 {v0.8h,v1.8h}, [x3], #32 - st1 {v2.8h,v3.8h}, [x3], #32 - b.ge 8b ret endfunc -function memcpy_aligned_neon, export=1 - tst x2, #16 +function load_deinterleave_chroma_fenc_neon, export=1 + mov x4, #FENC_STRIDE/2 + lsl x4, x4, #1 + lsl x2, x2, #1 + b load_deinterleave_chroma +endfunc + +function load_deinterleave_chroma_fdec_neon, export=1 + mov x4, #FDEC_STRIDE/2 + lsl x4, x4, #1 + lsl x2, x2, #1 +load_deinterleave_chroma: + ld2 {v0.8h, v1.8h}, [x1], x2 + ld2 {v2.8h, v3.8h}, [x1], x2 + subs w3, w3, #2 + st1 {v0.8h}, [x0], x4 + st1 {v1.8h}, [x0], x4 + st1 {v2.8h}, [x0], x4 + st1 {v3.8h}, [x0], x4 + b.gt load_deinterleave_chroma + + ret +endfunc + +function store_interleave_chroma_neon, export=1 + mov x5, #FDEC_STRIDE + lsl x5, x5, #1 + lsl x1, x1, #1 +1: + ld1 {v0.8h}, [x2], x5 + ld1 {v1.8h}, [x3], x5 + ld1 {v2.8h}, [x2], x5 + ld1 {v3.8h}, [x3], x5 + subs w4, w4, #2 + zip1 v4.8h, v0.8h, v1.8h + zip1 v6.8h, v2.8h, v3.8h + zip2 v5.8h, v0.8h, v1.8h + zip2 v7.8h, v2.8h, v3.8h + + st1 {v4.8h, v5.8h}, [x0], x1 + st1 {v6.8h, v7.8h}, [x0], x1 + b.gt 1b + + ret +endfunc + +function plane_copy_core_neon, export=1 + add w8, w4, #31 // 32-bit write clears the upper 32-bit the register + and w4, w8, #~31 + // safe use of the full reg since negative width makes no sense + sub x1, x1, x4 + sub x3, x3, x4 + lsl x1, x1, #1 + lsl x3, x3, #1 +1: + mov w8, w4 +16: + tst w8, #16 b.eq 32f - sub x2, x2, #16 - ldr q0, [x1], #16 - str q0, [x0], #16 + subs w8, w8, #16 + ldp q0, q1, [x2], #32 + stp q0, q1, [x0], #32 + b.eq 0f 32: - tst x2, #32 - b.eq 640f - sub x2, x2, #32 - ldp q0, q1, [x1], #32 - stp q0, q1, [x0], #32 -640: - cbz x2, 1f -64: - subs x2, x2, #64 - ldp q0, q1, [x1, #32] - ldp q2, q3, [x1], #64 - stp q0, q1, [x0, #32] - stp q2, q3, [x0], #64 - b.gt 64b + subs w8, w8, #32 + ldp q0, q1, [x2], #32 + ldp q2, q3, [x2], #32 + stp q0, q1, [x0], #32 + stp q2, q3, [x0], #32 + b.gt 32b +0: + subs w5, w5, #1 + add x2, x2, x3 + add x0, x0, x1 + b.gt 1b + + ret +endfunc + +function plane_copy_swap_core_neon, export=1 + lsl w4, w4, #1 + add w8, w4, #31 // 32-bit write clears the upper 32-bit the register + and w4, w8, #~31 + sub x1, x1, x4 + sub x3, x3, x4 + lsl x1, x1, #1 + lsl x3, x3, #1 1: + mov w8, w4 + tbz w4, #4, 32f + subs w8, 
w8, #16 + ld1 {v0.8h, v1.8h}, [x2], #32 + rev32 v0.8h, v0.8h + rev32 v1.8h, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.eq 0f +32: + subs w8, w8, #32 + ld1 {v0.8h ,v1.8h, v2.8h, v3.8h}, [x2], #64 + rev32 v20.8h, v0.8h + rev32 v21.8h, v1.8h + rev32 v22.8h, v2.8h + rev32 v23.8h, v3.8h + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + b.gt 32b +0: + subs w5, w5, #1 + add x2, x2, x3 + add x0, x0, x1 + b.gt 1b + ret endfunc -function memzero_aligned_neon, export=1 - movi v0.16b, #0 - movi v1.16b, #0 +function plane_copy_deinterleave_neon, export=1 + add w9, w6, #15 + and w9, w9, #~15 + sub x1, x1, x9 + sub x3, x3, x9 + sub x5, x5, x9, lsl #1 + lsl x1, x1, #1 + lsl x3, x3, #1 + lsl x5, x5, #1 1: - subs x1, x1, #128 - stp q0, q1, [x0, #96] - stp q0, q1, [x0, #64] - stp q0, q1, [x0, #32] - stp q0, q1, [x0], 128 + ld2 {v0.8h, v1.8h}, [x4], #32 + ld2 {v2.8h, v3.8h}, [x4], #32 + subs w9, w9, #16 + st1 {v0.8h}, [x0], #16 + st1 {v2.8h}, [x0], #16 + st1 {v1.8h}, [x2], #16 + st1 {v3.8h}, [x2], #16 b.gt 1b + + add x4, x4, x5 + subs w7, w7, #1 + add x0, x0, x1 + add x2, x2, x3 + mov w9, w6 + b.gt 1b + ret endfunc -// void mbtree_fix8_pack( int16_t *dst, float *src, int count ) -function mbtree_fix8_pack_neon, export=1 - subs w3, w2, #8 - b.lt 2f +function plane_copy_interleave_core_neon, export=1 + add w9, w6, #15 + and w9, w9, #0xfffffff0 + sub x1, x1, x9, lsl #1 + sub x3, x3, x9 + sub x5, x5, x9 + lsl x1, x1, #1 + lsl x3, x3, #1 + lsl x5, x5, #1 1: - subs w3, w3, #8 - ld1 {v0.4s,v1.4s}, [x1], #32 - fcvtzs v0.4s, v0.4s, #8 - fcvtzs v1.4s, v1.4s, #8 - sqxtn v2.4h, v0.4s - sqxtn2 v2.8h, v1.4s - rev16 v3.16b, v2.16b - st1 {v3.8h}, [x0], #16 - b.ge 1b -2: - adds w3, w3, #8 - b.eq 4f -3: - subs w3, w3, #1 - ldr s0, [x1], #4 - fcvtzs w4, s0, #8 - rev16 w5, w4 - strh w5, [x0], #2 - b.gt 3b -4: + ld1 {v0.8h}, [x2], #16 + ld1 {v1.8h}, [x4], #16 + ld1 {v2.8h}, [x2], #16 + ld1 {v3.8h}, [x4], #16 + subs w9, w9, #16 + st2 {v0.8h, v1.8h}, [x0], #32 + st2 {v2.8h, v3.8h}, [x0], #32 + b.gt 1b + + subs w7, w7, #1 + add x0, x0, x1 + add x2, x2, x3 + add x4, x4, x5 + mov w9, w6 + b.gt 1b + ret endfunc -// void mbtree_fix8_unpack( float *dst, int16_t *src, int count ) -function mbtree_fix8_unpack_neon, export=1 - subs w3, w2, #8 - b.lt 2f +.macro deinterleave_rgb + subs x11, x11, #8 + st1 {v0.8h}, [x0], #16 + st1 {v1.8h}, [x2], #16 + st1 {v2.8h}, [x4], #16 + b.gt 1b + + subs w10, w10, #1 + add x0, x0, x1 + add x2, x2, x3 + add x4, x4, x5 + add x6, x6, x7 + mov x11, x9 + b.gt 1b +.endm + +function plane_copy_deinterleave_rgb_neon, export=1 +#if SYS_MACOSX + ldr w8, [sp] + ldp w9, w10, [sp, #4] +#else + ldr x8, [sp] + ldp x9, x10, [sp, #8] +#endif + cmp w8, #3 + uxtw x9, w9 + add x11, x9, #7 + and x11, x11, #~7 + sub x1, x1, x11 + sub x3, x3, x11 + sub x5, x5, x11 + lsl x1, x1, #1 + lsl x3, x3, #1 + lsl x5, x5, #1 + b.ne 4f + sub x7, x7, x11, lsl #1 + sub x7, x7, x11 + lsl x7, x7, #1 1: - subs w3, w3, #8 - ld1 {v0.8h}, [x1], #16 - rev16 v1.16b, v0.16b - sxtl v2.4s, v1.4h - sxtl2 v3.4s, v1.8h - scvtf v4.4s, v2.4s, #8 - scvtf v5.4s, v3.4s, #8 - st1 {v4.4s,v5.4s}, [x0], #32 - b.ge 1b -2: - adds w3, w3, #8 - b.eq 4f -3: - subs w3, w3, #1 - ldrh w4, [x1], #2 - rev16 w5, w4 - sxth w6, w5 - scvtf s0, w6, #8 - str s0, [x0], #4 - b.gt 3b + ld3 {v0.8h, v1.8h, v2.8h}, [x6], #48 + deinterleave_rgb + + ret 4: + sub x7, x7, x11, lsl #2 + lsl x7, x7, #1 +1: + ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + deinterleave_rgb + + ret +endfunc + +// void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src, +// intptr_t stride, int width, 
int height, int16_t *buf ) +function hpel_filter_neon, export=1 + lsl x5, x5, #1 + ubfm x9, x3, #3, #7 + add w15, w5, w9 + sub x13, x3, x9 // align src + sub x10, x0, x9 + sub x11, x1, x9 + sub x12, x2, x9 + movi v30.8h, #5 + movi v31.8h, #20 + + lsl x4, x4, #1 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + str q0, [sp, #-0x50]! + +1: // line start + mov x3, x13 + mov x2, x12 + mov x1, x11 + mov x0, x10 + add x7, x3, #32 // src pointer next 16b for horiz filter + mov x5, x15 // restore width + sub x3, x3, x4, lsl #1 // src - 2*stride + ld1 {v28.8h, v29.8h}, [x7], #32 // src[16:31] + add x9, x3, x5 // holds src - 2*stride + width + + ld1 {v8.8h, v9.8h}, [x3], x4 // src-2*stride[0:15] + ld1 {v10.8h, v11.8h}, [x3], x4 // src-1*stride[0:15] + ld1 {v12.8h, v13.8h}, [x3], x4 // src-0*stride[0:15] + ld1 {v14.8h, v15.8h}, [x3], x4 // src+1*stride[0:15] + ld1 {v16.8h, v17.8h}, [x3], x4 // src+2*stride[0:15] + ld1 {v18.8h, v19.8h}, [x3], x4 // src+3*stride[0:15] + + ext v22.16b, v7.16b, v12.16b, #12 + ext v23.16b, v12.16b, v13.16b, #12 + uaddl v1.4s, v8.4h, v18.4h + uaddl2 v20.4s, v8.8h, v18.8h + ext v24.16b, v12.16b, v13.16b, #6 + ext v25.16b, v13.16b, v28.16b, #6 + umlsl v1.4s, v10.4h, v30.4h + umlsl2 v20.4s, v10.8h, v30.8h + ext v26.16b, v7.16b, v12.16b, #14 + ext v27.16b, v12.16b, v13.16b, #14 + umlal v1.4s, v12.4h, v31.4h + umlal2 v20.4s, v12.8h, v31.8h + ext v3.16b, v12.16b, v13.16b, #2 + ext v4.16b, v13.16b, v28.16b, #2 + umlal v1.4s, v14.4h, v31.4h + umlal2 v20.4s, v14.8h, v31.8h + ext v21.16b, v12.16b, v13.16b, #4 + ext v5.16b, v13.16b, v28.16b, #4 + umlsl v1.4s, v16.4h, v30.4h + umlsl2 v20.4s, v16.8h, v30.8h + +2: // next 16 pixel of line + subs x5, x5, #32 + sub x3, x9, x5 // src - 2*stride += 16 + + uaddl v8.4s, v22.4h, v24.4h + uaddl2 v22.4s, v22.8h, v24.8h + uaddl v10.4s, v23.4h, v25.4h + uaddl2 v23.4s, v23.8h, v25.8h + + umlsl v8.4s, v26.4h, v30.4h + umlsl2 v22.4s, v26.8h, v30.8h + umlsl v10.4s, v27.4h, v30.4h + umlsl2 v23.4s, v27.8h, v30.8h + + umlal v8.4s, v12.4h, v31.4h + umlal2 v22.4s, v12.8h, v31.8h + umlal v10.4s, v13.4h, v31.4h + umlal2 v23.4s, v13.8h, v31.8h + + umlal v8.4s, v3.4h, v31.4h + umlal2 v22.4s, v3.8h, v31.8h + umlal v10.4s, v4.4h, v31.4h + umlal2 v23.4s, v4.8h, v31.8h + + umlsl v8.4s, v21.4h, v30.4h + umlsl2 v22.4s, v21.8h, v30.8h + umlsl v10.4s, v5.4h, v30.4h + umlsl2 v23.4s, v5.8h, v30.8h + + uaddl v5.4s, v9.4h, v19.4h + uaddl2 v2.4s, v9.8h, v19.8h + + sqrshrun v8.4h, v8.4s, #5 + sqrshrun2 v8.8h, v22.4s, #5 + sqrshrun v10.4h, v10.4s, #5 + sqrshrun2 v10.8h, v23.4s, #5 + + mov v6.16b, v12.16b + mov v7.16b, v13.16b + + mvni v23.8h, #0xfc, lsl #8 + + umin v8.8h, v8.8h, v23.8h + umin v10.8h, v10.8h, v23.8h + + st1 {v8.8h}, [x0], #16 + st1 {v10.8h}, [x0], #16 + + umlsl v5.4s, v11.4h, v30.4h + umlsl2 v2.4s, v11.8h, v30.8h + + ld1 {v8.8h, v9.8h}, [x3], x4 + umlal v5.4s, v13.4h, v31.4h + umlal2 v2.4s, v13.8h, v31.8h + ld1 {v10.8h, v11.8h}, [x3], x4 + umlal v5.4s, v15.4h, v31.4h + umlal2 v2.4s, v15.8h, v31.8h + ld1 {v12.8h, v13.8h}, [x3], x4 + umlsl v5.4s, v17.4h, v30.4h + umlsl2 v2.4s, v17.8h, v30.8h + ld1 {v14.8h, v15.8h}, [x3], x4 + + sqrshrun v4.4h, v5.4s, #5 + sqrshrun2 v4.8h, v2.4s, #5 + sqrshrun v18.4h, v1.4s, #5 + sqrshrun2 v18.8h, v20.4s, #5 + + mvni v17.8h, #0xfc, lsl #8 + + smin v4.8h, v4.8h, v17.8h + smin v18.8h, v18.8h, v17.8h + + st1 {v18.8h}, [x1], #16 + st1 {v4.8h}, [x1], #16 + + ld1 {v16.8h, v17.8h}, [x3], x4 // src+2*stride[0:15] + ld1 {v18.8h, v19.8h}, [x3], x4 // src+3*stride[0:15] + + 
str q9, [sp, #0x10] + str q15, [sp, #0x20] + str q17, [sp, #0x30] + str q19, [sp, #0x40] + + ldr q28, [sp] + + ext v22.16b, v28.16b, v1.16b, #8 + ext v9.16b, v1.16b, v20.16b, #8 + ext v26.16b, v1.16b, v20.16b, #12 + ext v17.16b, v20.16b, v5.16b, #12 + ext v23.16b, v28.16b, v1.16b, #12 + ext v19.16b, v1.16b, v20.16b, #12 + + uaddl v3.4s, v8.4h, v18.4h + uaddl2 v15.4s, v8.8h, v18.8h + umlsl v3.4s, v10.4h, v30.4h + umlsl2 v15.4s, v10.8h, v30.8h + umlal v3.4s, v12.4h, v31.4h + umlal2 v15.4s, v12.8h, v31.8h + umlal v3.4s, v14.4h, v31.4h + umlal2 v15.4s, v14.8h, v31.8h + umlsl v3.4s, v16.4h, v30.4h + umlsl2 v15.4s, v16.8h, v30.8h + + add v4.4s, v22.4s, v26.4s + add v26.4s, v9.4s, v17.4s + + ext v25.16b, v1.16b, v20.16b, #8 + ext v22.16b, v20.16b, v5.16b, #8 + ext v24.16b, v1.16b, v20.16b, #4 + ext v9.16b, v20.16b, v5.16b, #4 + + add v31.4s, v23.4s, v25.4s + add v19.4s, v19.4s, v22.4s + add v6.4s, v24.4s, v1.4s + add v17.4s, v9.4s, v20.4s + sub v4.4s, v4.4s, v31.4s // a-b + sub v26.4s, v26.4s, v19.4s // a-b + sub v31.4s, v31.4s, v6.4s // b-c + sub v19.4s, v19.4s, v17.4s // b-c + + ext v22.16b, v20.16b, v5.16b, #8 + ext v9.16b, v5.16b, v2.16b, #8 + ext v24.16b, v5.16b, v2.16b, #12 + ext v28.16b, v2.16b, v3.16b, #12 + ext v23.16b, v20.16b, v5.16b, #12 + ext v30.16b, v5.16b, v2.16b, #12 + ext v25.16b, v5.16b, v2.16b, #8 + ext v29.16b, v2.16b, v3.16b, #8 + + add v22.4s, v22.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v23.4s, v23.4s, v25.4s + add v29.4s, v29.4s, v30.4s + + ext v24.16b, v5.16b, v2.16b, #4 + ext v28.16b, v2.16b, v3.16b, #4 + + add v24.4s, v24.4s, v5.4s + add v28.4s, v28.4s, v2.4s + + sub v22.4s, v22.4s, v23.4s + sub v9.4s, v9.4s, v29.4s + sub v23.4s, v23.4s, v24.4s + sub v29.4s, v29.4s, v28.4s + + sshr v4.4s, v4.4s, #2 + sshr v0.4s, v26.4s, #2 + sshr v22.4s, v22.4s, #2 + sshr v9.4s, v9.4s, #2 + + sub v4.4s, v4.4s, v31.4s + sub v0.4s, v0.4s, v19.4s + sub v22.4s, v22.4s, v23.4s + sub v9.4s, v9.4s, v29.4s + + sshr v4.4s, v4.4s, #2 + sshr v0.4s, v0.4s, #2 + sshr v22.4s, v22.4s, #2 + sshr v9.4s, v9.4s, #2 + + add v4.4s, v4.4s, v6.4s + add v0.4s, v0.4s, v17.4s + add v22.4s, v22.4s, v24.4s + add v9.4s, v9.4s, v28.4s + + str q2, [sp] + + sqrshrun v4.4h, v4.4s, #6 + sqrshrun2 v4.8h, v0.4s, #6 + sqrshrun v22.4h, v22.4s, #6 + sqrshrun2 v22.8h, v9.4s, #6 + + mov v0.16b, v5.16b + + ld1 {v28.8h, v29.8h}, [x7], #32 // src[16:31] + + ldr q9, [sp, #0x10] + ldr q17, [sp, #0x30] + ldr q19, [sp, #0x40] + + ext v26.16b, v7.16b, v12.16b, #14 + ext v27.16b, v12.16b, v13.16b, #14 + + mvni v25.8h, 0xfc, lsl #8 + + smin v22.8h, v22.8h, v25.8h + smin v4.8h, v4.8h, v25.8h + + st1 {v4.8h}, [x2], #16 + st1 {v22.8h}, [x2], #16 + + mov v1.16b, v3.16b + mov v20.16b, v15.16b + + ldr q15, [sp, #0x20] + + ext v22.16b, v7.16b, v12.16b, #12 + ext v23.16b, v12.16b, v13.16b, #12 + ext v3.16b, v12.16b, v13.16b, #2 + ext v4.16b, v13.16b, v28.16b, #2 + ext v21.16b, v12.16b, v13.16b, #4 + ext v5.16b, v13.16b, v28.16b, #4 + ext v24.16b, v12.16b, v13.16b, #6 + ext v25.16b, v13.16b, v28.16b, #6 + + movi v30.8h, #5 + movi v31.8h, #20 + + b.gt 2b + + subs w6, w6, #1 + add x10, x10, x4 + add x11, x11, x4 + add x12, x12, x4 + add x13, x13, x4 + b.gt 1b + + add sp, sp, #0x50 + + ldp d8, d9, [sp] + ldp d10, d11, [sp, #0x10] + ldp d12, d13, [sp, #0x20] + ldp d14, d15, [sp, #0x30] + add sp, sp, #0x40 + ret endfunc + +#endif diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c index 489ca70e6..8551294bd 100644 --- a/common/aarch64/mc-c.c +++ b/common/aarch64/mc-c.c @@ -1,7 +1,7 @@ 
/***************************************************************************** * mc-c.c: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau @@ -28,11 +28,11 @@ #include "mc.h" #define x264_prefetch_ref_aarch64 x264_template(prefetch_ref_aarch64) -void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int ); +void x264_prefetch_ref_aarch64( pixel *, intptr_t, int ); #define x264_prefetch_fenc_420_aarch64 x264_template(prefetch_fenc_420_aarch64) -void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_prefetch_fenc_420_aarch64( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_prefetch_fenc_422_aarch64 x264_template(prefetch_fenc_422_aarch64) -void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_prefetch_fenc_422_aarch64( pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon) void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n ); @@ -40,32 +40,41 @@ void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n ); void x264_memzero_aligned_neon( void *dst, size_t n ); #define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon) -void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_16x16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon) -void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_16x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon) -void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon) -void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon) -void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_8x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon) -void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon) -void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon) -void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon) -void 
x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x2_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); + +#define x264_pixel_avg_4x16_sve x264_template(pixel_avg_4x16_sve) +void x264_pixel_avg_4x16_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); +#define x264_pixel_avg_4x8_sve x264_template(pixel_avg_4x8_sve) +void x264_pixel_avg_4x8_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); +#define x264_pixel_avg_4x4_sve x264_template(pixel_avg_4x4_sve) +void x264_pixel_avg_4x4_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); +#define x264_pixel_avg_4x2_sve x264_template(pixel_avg_4x2_sve) +void x264_pixel_avg_4x2_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); #define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon) -void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ); #define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon) -void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ); #define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon) -void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ); #define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon) -void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_pixel_avg2_w20_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ); #define x264_plane_copy_core_neon x264_template(plane_copy_core_neon) void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst, @@ -111,12 +120,12 @@ void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i #define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon) #define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon) #define MC_WEIGHT(func)\ -void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ -void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ -void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ -void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w20##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w16##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w8##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\ +void x264_mc_weight_w4##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\ \ -static void (* mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\ +static void (* mc##func##_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\ {\ x264_mc_weight_w4##func##_neon,\ x264_mc_weight_w4##func##_neon,\ @@ -126,32 +135,30 @@ static void (* mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_ 
    x264_mc_weight_w20##func##_neon,\
 };
-#if !HIGH_BIT_DEPTH
 MC_WEIGHT()
 MC_WEIGHT(_nodenom)
 MC_WEIGHT(_offsetadd)
 MC_WEIGHT(_offsetsub)
-#endif
 #define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
-void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
-void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
-void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w16_neon( pixel *, intptr_t, pixel *, intptr_t, int );
 #define x264_mc_chroma_neon x264_template(mc_chroma_neon)
-void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void x264_mc_chroma_neon( pixel *, pixel *, intptr_t, pixel *, intptr_t, int, int, int, int );
 #define x264_integral_init4h_neon x264_template(integral_init4h_neon)
-void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
+void x264_integral_init4h_neon( uint16_t *, pixel *, intptr_t );
 #define x264_integral_init4v_neon x264_template(integral_init4v_neon)
 void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
 #define x264_integral_init8h_neon x264_template(integral_init8h_neon)
-void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
+void x264_integral_init8h_neon( uint16_t *, pixel *, intptr_t );
 #define x264_integral_init8v_neon x264_template(integral_init8v_neon)
 void x264_integral_init8v_neon( uint16_t *, intptr_t );
 #define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
-void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
+void x264_frame_init_lowres_core_neon( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, intptr_t, int, int );
 #define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
 void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
@@ -161,7 +168,25 @@ void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
 #define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
 void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
-#if !HIGH_BIT_DEPTH
+static void (* const pixel_avg_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ) =
+{
+    NULL,
+    x264_pixel_avg2_w4_neon,
+    x264_pixel_avg2_w8_neon,
+    x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
+    x264_pixel_avg2_w16_neon,
+    x264_pixel_avg2_w20_neon,
+};
+
+static void (* const mc_copy_wtab_neon[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =
+{
+    NULL,
+    x264_mc_copy_w4_neon,
+    x264_mc_copy_w8_neon,
+    NULL,
+    x264_mc_copy_w16_neon,
+};
+
 static void weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
     if( w->i_scale == 1<<w->i_denom )
@@ -183,39 +208,20 @@ static void weight_cache_neon( x264_t *h, x264_weight_t *w )
     w->weightfn = mc_wtab_neon;
 }
-static void (* const pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
-{
-    NULL,
-    x264_pixel_avg2_w4_neon,
-    x264_pixel_avg2_w8_neon,
-    x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
-    x264_pixel_avg2_w16_neon,
-    x264_pixel_avg2_w20_neon,
-};
-
-static void (* const 
mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) = -{ - NULL, - x264_mc_copy_w4_neon, - x264_mc_copy_w8_neon, - NULL, - x264_mc_copy_w16_neon, -}; - -static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, - uint8_t *src[4], intptr_t i_src_stride, +static void mc_luma_neon( pixel *dst, intptr_t i_dst_stride, + pixel *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if( (mvy&3) == 3 ) // explicit if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -228,20 +234,20 @@ static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride, mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height ); } -static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride, - uint8_t *src[4], intptr_t i_src_stride, +static pixel *get_ref_neon( pixel *dst, intptr_t *i_dst_stride, + pixel *src[4], intptr_t i_src_stride, int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight ) { int qpel_idx = ((mvy&3)<<2) + (mvx&3); intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2); - uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; + pixel *src1 = src[x264_hpel_ref0[qpel_idx]] + offset; if( (mvy&3) == 3 ) // explicit if() to force conditional add src1 += i_src_stride; if( qpel_idx & 5 ) /* qpel interpolation needed */ { - uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); + pixel *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3); pixel_avg_wtab_neon[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, src2, i_height ); @@ -262,19 +268,18 @@ static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride, } #define x264_hpel_filter_neon x264_template(hpel_filter_neon) -void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, - uint8_t *src, intptr_t stride, int width, +void x264_hpel_filter_neon( pixel *dsth, pixel *dstv, pixel *dstc, + pixel *src, intptr_t stride, int width, int height, int16_t *buf ); PLANE_COPY(16, neon) PLANE_COPY_SWAP(16, neon) PLANE_INTERLEAVE(neon) PROPAGATE_LIST(neon) -#endif // !HIGH_BIT_DEPTH void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf ) { -#if !HIGH_BIT_DEPTH + if( cpu&X264_CPU_ARMV8 ) { pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64; @@ -282,56 +287,70 @@ void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf ) pf->prefetch_ref = x264_prefetch_ref_aarch64; } - if( !(cpu&X264_CPU_NEON) ) - return; - - pf->copy_16x16_unaligned = x264_mc_copy_w16_neon; - pf->copy[PIXEL_16x16] = x264_mc_copy_w16_neon; - pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; - pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; - - pf->plane_copy = plane_copy_neon; - pf->plane_copy_swap = plane_copy_swap_neon; - pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; - pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; - pf->plane_copy_interleave = plane_copy_interleave_neon; - - pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon; - pf->load_deinterleave_chroma_fenc = 
x264_load_deinterleave_chroma_fenc_neon; - pf->store_interleave_chroma = x264_store_interleave_chroma_neon; - - pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon; - pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon; - pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon; - pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon; - pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon; - pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon; - pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon; - pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon; - pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon; - - pf->weight = mc_wtab_neon; - pf->offsetadd = mc_offsetadd_wtab_neon; - pf->offsetsub = mc_offsetsub_wtab_neon; - pf->weight_cache = weight_cache_neon; - - pf->mc_chroma = x264_mc_chroma_neon; - pf->mc_luma = mc_luma_neon; - pf->get_ref = get_ref_neon; - pf->hpel_filter = x264_hpel_filter_neon; - pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; - - pf->integral_init4h = x264_integral_init4h_neon; - pf->integral_init8h = x264_integral_init8h_neon; - pf->integral_init4v = x264_integral_init4v_neon; - pf->integral_init8v = x264_integral_init8v_neon; - - pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon; - pf->mbtree_propagate_list = mbtree_propagate_list_neon; - pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon; - pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon; - - pf->memcpy_aligned = x264_memcpy_aligned_neon; - pf->memzero_aligned = x264_memzero_aligned_neon; + if( cpu&X264_CPU_NEON ) + { + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon; + pf->mbtree_propagate_list = mbtree_propagate_list_neon; + pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon; + pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon; + + pf->memcpy_aligned = x264_memcpy_aligned_neon; + pf->memzero_aligned = x264_memzero_aligned_neon; + + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon; + pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon; + pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon; + pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon; + pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon; + + pf->copy_16x16_unaligned = x264_mc_copy_w16_neon; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_neon; + pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; + pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; + + pf->weight = mc_wtab_neon; + pf->offsetadd = mc_offsetadd_wtab_neon; + pf->offsetsub = mc_offsetsub_wtab_neon; + pf->weight_cache = weight_cache_neon; + + pf->mc_chroma = x264_mc_chroma_neon; + pf->mc_luma = mc_luma_neon; + pf->get_ref = get_ref_neon; + + pf->integral_init4h = x264_integral_init4h_neon; + pf->integral_init8h = x264_integral_init8h_neon; + pf->integral_init4v = x264_integral_init4v_neon; + pf->integral_init8v = x264_integral_init8v_neon; + + pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon; + + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon; + + pf->store_interleave_chroma = x264_store_interleave_chroma_neon; + + pf->plane_copy = plane_copy_neon; + pf->plane_copy_swap = plane_copy_swap_neon; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; + pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; + pf->plane_copy_interleave = plane_copy_interleave_neon; + + pf->hpel_filter = x264_hpel_filter_neon; + } + 
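+    /* A minimal sketch of how the dispatch above is consumed; x264_cpu_detect()
+     * and the zero-initialised table are illustrative assumptions rather than
+     * code from this file:
+     *
+     *     x264_mc_functions_t mcf = {0};
+     *     uint32_t cpu = x264_cpu_detect();  // runtime flags: X264_CPU_NEON, X264_CPU_SVE, ...
+     *     x264_mc_init_aarch64( cpu, &mcf );
+     *     // On an 8-bit build with SVE compiled in and both flags set,
+     *     // mcf.avg[PIXEL_4x4] ends up on x264_pixel_avg_4x4_sve, while wider
+     *     // sizes such as mcf.avg[PIXEL_16x16] keep the NEON pointers set above.
+     */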
+#if !HIGH_BIT_DEPTH +#if HAVE_SVE + if( cpu&X264_CPU_SVE ) + { + pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_sve; + pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_sve; + pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_sve; + pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_sve; + } +#endif #endif // !HIGH_BIT_DEPTH } diff --git a/common/aarch64/mc.h b/common/aarch64/mc.h index ab59f57b9..05a15c169 100644 --- a/common/aarch64/mc.h +++ b/common/aarch64/mc.h @@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: aarch64 motion compensation ***************************************************************************** - * Copyright (C) 2014-2023 x264 project + * Copyright (C) 2014-2024 x264 project * * Authors: Janne Grunau * diff --git a/common/aarch64/pixel-a-common.S b/common/aarch64/pixel-a-common.S new file mode 100644 index 000000000..9e925e2fa --- /dev/null +++ b/common/aarch64/pixel-a-common.S @@ -0,0 +1,44 @@ +/**************************************************************************** + * pixel-a-common.S: aarch64 pixel metrics + ***************************************************************************** + * Copyright (C) 2009-2024 x264 project + * + * Authors: David Conrad + * Janne Grunau + * David Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +// This file contains the NEON macros and constants that are intended to be used by +// the SVE/SVE2 functions as well + +const mask_ac_4_8 +.short 0, -1, -1, -1, 0, -1, -1, -1 +.short 0, -1, -1, -1, -1, -1, -1, -1 +endconst + +.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d + SUMSUB_AB \s1, \d1, \a, \b + SUMSUB_AB \s2, \d2, \c, \d +.endm + +.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4 + SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 + SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 +.endm diff --git a/common/aarch64/pixel-a-sve.S b/common/aarch64/pixel-a-sve.S new file mode 100644 index 000000000..f3a7690c4 --- /dev/null +++ b/common/aarch64/pixel-a-sve.S @@ -0,0 +1,523 @@ +/***************************************************************************** + * pixel-a-sve.S: aarch64 pixel metrics + ***************************************************************************** + * Copyright (C) 2009-2024 x264 project + * + * Authors: David Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "asm.S" +#include "pixel-a-common.S" + +.arch armv8-a+sve + +#if BIT_DEPTH == 8 + +.macro SSD_START_SVE_4 + ptrue p0.h, vl4 + ld1b {z16.h}, p0/z, [x0] + ld1b {z17.h}, p0/z, [x2] + add x0, x0, x1 + add x2, x2, x3 + sub v2.4h, v16.4h, v17.4h + ld1b {z16.h}, p0/z, [x0] + ld1b {z17.h}, p0/z, [x2] + add x0, x0, x1 + add x2, x2, x3 + smull v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_SVE_4 + sub v2.4h, v16.4h, v17.4h + ld1b {z16.h}, p0/z, [x0] + ld1b {z17.h}, p0/z, [x2] + add x0, x0, x1 + add x2, x2, x3 + smlal v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_END_SVE_4 + sub v2.4h, v16.4h, v17.4h + smlal v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_START_SVE_8 + ptrue p0.h, vl8 + ld1b {z16.h}, p0/z, [x0] + ld1b {z17.h}, p0/z, [x2] + add x0, x0, x1 + add x2, x2, x3 + sub v2.8h, v16.8h, v17.8h + ld1b {z16.h}, p0/z, [x0] + smull v0.4s, v2.4h, v2.4h + ld1b {z17.h}, p0/z, [x2] + smlal2 v0.4s, v2.8h, v2.8h + add x0, x0, x1 + add x2, x2, x3 +.endm + +.macro SSD_SVE_8 + sub v2.8h, v16.8h, v17.8h + ld1b {z16.h}, p0/z, [x0] + smlal v0.4s, v2.4h, v2.4h + ld1b {z17.h}, p0/z, [x2] + smlal2 v0.4s, v2.8h, v2.8h + add x0, x0, x1 + add x2, x2, x3 +.endm + +.macro SSD_END_SVE_8 + sub v2.8h, v16.8h, v17.8h + smlal v0.4s, v2.4h, v2.4h + smlal2 v0.4s, v2.8h, v2.8h +.endm + +.macro SSD_FUNC_SVE w h +function pixel_ssd_\w\()x\h\()_sve, export=1 + SSD_START_SVE_\w +.rept \h-2 + SSD_SVE_\w +.endr + SSD_END_SVE_\w + + addv s0, v0.4s + mov w0, v0.s[0] + ret +endfunc +.endm + +.macro load_diff_fly_sve_8x8 + ld1b {z1.h}, p0/z, [x2] + ld1b {z0.h}, p0/z, [x0] + add x2, x2, x3 + add x0, x0, x1 + ld1b {z3.h}, p0/z, [x2] + ld1b {z2.h}, p0/z, [x0] + add x2, x2, x3 + add x0, x0, x1 + sub v16.8h, v0.8h, v1.8h + sub v17.8h, v2.8h, v3.8h + ld1b {z5.h}, p0/z, [x2] + ld1b {z4.h}, p0/z, [x0] + add x2, x2, x3 + add x0, x0, x1 + ld1b {z7.h}, p0/z, [x2] + ld1b {z6.h}, p0/z, [x0] + add x2, x2, x3 + add x0, x0, x1 + sub v18.8h, v4.8h, v5.8h + sub v19.8h, v6.8h, v7.8h + ld1b {z1.h}, p0/z, [x2] + ld1b {z0.h}, p0/z, [x0] + add x2, x2, x3 + add x0, x0, x1 + ld1b {z3.h}, p0/z, [x2] + ld1b {z2.h}, p0/z, [x0] + add x2, x2, x3 + add x0, x0, x1 + sub v20.8h, v0.8h, v1.8h + sub v21.8h, v2.8h, v3.8h + ld1b {z5.h}, p0/z, [x2] + ld1b {z4.h}, p0/z, [x0] + add x2, x2, x3 + add x0, x0, x1 + ld1b {z7.h}, p0/z, [x2] + ld1b {z6.h}, p0/z, [x0] + add x2, x2, x3 + add x0, x0, x1 + + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + + sub v22.8h, v4.8h, v5.8h + sub v23.8h, v6.8h, v7.8h +.endm + +.macro pixel_var_sve_8 h +function pixel_var_8x\h\()_sve, export=1 + ptrue p0.h, vl8 + ld1b {z16.h}, p0/z, [x0] + add x0, x0, x1 + ld1b {z17.h}, p0/z, [x0] + add x0, x0, x1 + mov x2, \h - 4 + mul v1.8h, v16.8h, v16.8h + mul v2.8h, v17.8h, v17.8h + add v0.8h, v16.8h, v17.8h + ld1b {z18.h}, p0/z, [x0] + add x0, x0, x1 + uaddlp v1.4s, v1.8h + uaddlp v2.4s, v2.8h + ld1b 
{z19.h}, p0/z, [x0] + add x0, x0, x1 + +1: subs x2, x2, #4 + add v0.8h, v0.8h, v18.8h + mul v24.8h, v18.8h, v18.8h + ld1b {z20.h}, p0/z, [x0] + add x0, x0, x1 + add v0.8h, v0.8h, v19.8h + mul v25.8h, v19.8h, v19.8h + uadalp v1.4s, v24.8h + ld1b {z21.h}, p0/z, [x0] + add x0, x0, x1 + add v0.8h, v0.8h, v20.8h + mul v26.8h, v20.8h, v20.8h + uadalp v2.4s, v25.8h + ld1b {z18.h}, p0/z, [x0] + add x0, x0, x1 + add v0.8h, v0.8h, v21.8h + mul v27.8h, v21.8h, v21.8h + uadalp v1.4s, v26.8h + ld1b {z19.h}, p0/z, [x0] + add x0, x0, x1 + uadalp v2.4s, v27.8h + b.gt 1b + + add v0.8h, v0.8h, v18.8h + mul v28.8h, v18.8h, v18.8h + add v0.8h, v0.8h, v19.8h + mul v29.8h, v19.8h, v19.8h + uadalp v1.4s, v28.8h + uadalp v2.4s, v29.8h + + b var_end +endfunc +.endm + +function var_end + add v1.4s, v1.4s, v2.4s + uaddlv s0, v0.8h + uaddlv d1, v1.4s + mov w0, v0.s[0] + mov x1, v1.d[0] + orr x0, x0, x1, lsl #32 + ret +endfunc + +.macro SUMSUBL_AB_SVE sum, sub, a, b + add \sum, \a, \b + sub \sub, \a, \b +.endm + +function pixel_sa8d_8x8_sve, export=1 + ptrue p0.h, vl8 + mov x4, x30 + bl pixel_sa8d_8x8_sve + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + add w0, w0, #1 + lsr w0, w0, #1 + ret x4 +endfunc + +.macro sa8d_satd_sve_8x8 satd= +function pixel_sa8d_\satd\()8x8_sve + load_diff_fly_sve_8x8 + + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + + HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h +.ifc \satd, satd_ + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h + SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h + SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h + SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h + + transpose v4.4s, v6.4s, v24.4s, v26.4s + transpose v5.4s, v7.4s, v25.4s, v27.4s + transpose v24.4s, v26.4s, v0.4s, v2.4s + transpose v25.4s, v27.4s, v1.4s, v3.4s + + abs v0.8h, v4.8h + abs v1.8h, v5.8h + abs v2.8h, v6.8h + abs v3.8h, v7.8h + abs v4.8h, v24.8h + abs v5.8h, v25.8h + abs v6.8h, v26.8h + abs v7.8h, v27.8h + + umax v0.8h, v0.8h, v2.8h + umax v1.8h, v1.8h, v3.8h + umax v2.8h, v4.8h, v6.8h + umax v3.8h, v5.8h, v7.8h + + add v26.8h, v0.8h, v1.8h + add v27.8h, v2.8h, v3.8h +.endif + + SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h + SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h + SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h + SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h + + transpose v20.8h, v21.8h, v16.8h, v17.8h + transpose v4.8h, v5.8h, v0.8h, v1.8h + transpose v22.8h, v23.8h, v18.8h, v19.8h + transpose v6.8h, v7.8h, v2.8h, v3.8h + + SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h + SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h + SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h + SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h + + transpose v20.4s, v22.4s, v2.4s, v0.4s + transpose v21.4s, v23.4s, v3.4s, v1.4s + transpose v16.4s, v18.4s, v24.4s, v4.4s + transpose v17.4s, v19.4s, v25.4s, v5.4s + + SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h + SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h + SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h + SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h + + transpose v16.2d, v20.2d, v0.2d, v4.2d + transpose v17.2d, v21.2d, v1.2d, v5.2d + transpose v18.2d, v22.2d, v2.2d, v6.2d + transpose v19.2d, v23.2d, v3.2d, v7.2d + + abs v16.8h, v16.8h + abs v20.8h, v20.8h + abs v17.8h, v17.8h + abs v21.8h, v21.8h + abs v18.8h, v18.8h + abs v22.8h, v22.8h + abs v19.8h, v19.8h + abs v23.8h, v23.8h + + umax v16.8h, v16.8h, v20.8h + umax v17.8h, v17.8h, v21.8h + umax v18.8h, 
v18.8h, v22.8h + umax v19.8h, v19.8h, v23.8h + + add v0.8h, v16.8h, v17.8h + add v1.8h, v18.8h, v19.8h + + ret +endfunc +.endm + +.macro HADAMARD_AC_SVE w h +function pixel_hadamard_ac_\w\()x\h\()_sve, export=1 + ptrue p0.h, vl8 + movrel x5, mask_ac_4_8 + mov x4, x30 + ld1 {v30.8h,v31.8h}, [x5] + movi v28.16b, #0 + movi v29.16b, #0 + + bl hadamard_ac_8x8_sve +.if \h > 8 + bl hadamard_ac_8x8_sve +.endif +.if \w > 8 + sub x0, x0, x1, lsl #3 + add x0, x0, #8 + bl hadamard_ac_8x8_sve +.endif +.if \w * \h == 256 + sub x0, x0, x1, lsl #4 + bl hadamard_ac_8x8_sve +.endif + + addv s1, v29.4s + addv s0, v28.4s + mov w1, v1.s[0] + mov w0, v0.s[0] + lsr w1, w1, #2 + lsr w0, w0, #1 + orr x0, x0, x1, lsl #32 + ret x4 +endfunc +.endm + +// v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8 +function hadamard_ac_8x8_sve + ld1b {z16.h}, p0/z, [x0] + add x0, x0, x1 + ld1b {z17.h}, p0/z, [x0] + add x0, x0, x1 + ld1b {z18.h}, p0/z, [x0] + add x0, x0, x1 + ld1b {z19.h}, p0/z, [x0] + add x0, x0, x1 + SUMSUBL_AB_SVE v0.8h, v1.8h, v16.8h, v17.8h + ld1b {z20.h}, p0/z, [x0] + add x0, x0, x1 + ld1b {z21.h}, p0/z, [x0] + add x0, x0, x1 + SUMSUBL_AB_SVE v2.8h, v3.8h, v18.8h, v19.8h + ld1b {z22.h}, p0/z, [x0] + add x0, x0, x1 + ld1b {z23.h}, p0/z, [x0] + add x0, x0, x1 + SUMSUBL_AB_SVE v4.8h, v5.8h, v20.8h, v21.8h + SUMSUBL_AB_SVE v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h + SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h + + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h + SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h + + transpose v0.4s, v2.4s, v16.4s, v18.4s + transpose v1.4s, v3.4s, v17.4s, v19.4s + transpose v4.4s, v6.4s, v20.4s, v22.4s + transpose v5.4s, v7.4s, v21.4s, v23.4s + + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h + + abs v0.8h, v16.8h + abs v4.8h, v20.8h + abs v1.8h, v17.8h + abs v5.8h, v21.8h + abs v2.8h, v18.8h + abs v6.8h, v22.8h + abs v3.8h, v19.8h + abs v7.8h, v23.8h + + add v0.8h, v0.8h, v4.8h + add v1.8h, v1.8h, v5.8h + and v0.16b, v0.16b, v30.16b + add v2.8h, v2.8h, v6.8h + add v3.8h, v3.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + uadalp v28.4s, v0.8h + uadalp v28.4s, v1.8h + + SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h + SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h + SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h + SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h + + transpose v16.2d, v17.2d, v6.2d, v7.2d + transpose v18.2d, v19.2d, v4.2d, v5.2d + transpose v20.2d, v21.2d, v2.2d, v3.2d + + abs v16.8h, v16.8h + abs v17.8h, v17.8h + abs v18.8h, v18.8h + abs v19.8h, v19.8h + abs v20.8h, v20.8h + abs v21.8h, v21.8h + + transpose v7.2d, v6.2d, v1.2d, v0.2d + + umax v3.8h, v16.8h, v17.8h + umax v2.8h, v18.8h, v19.8h + umax v1.8h, v20.8h, v21.8h + + SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h + + add v2.8h, v2.8h, v3.8h + add v2.8h, v2.8h, v1.8h + and v4.16b, v4.16b, v31.16b + add v2.8h, v2.8h, v2.8h + abs v5.8h, v5.8h + abs v4.8h, v4.8h + add v2.8h, v2.8h, v5.8h + add v2.8h, v2.8h, v4.8h + uadalp v29.4s, v2.8h + ret +endfunc + +SSD_FUNC_SVE 4, 4 +SSD_FUNC_SVE 4, 8 +SSD_FUNC_SVE 4, 16 +SSD_FUNC_SVE 8, 4 +SSD_FUNC_SVE 8, 8 + +pixel_var_sve_8 8 +pixel_var_sve_8 16 + +sa8d_satd_sve_8x8 + 
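+// Scalar reference for the SSD kernels instantiated above: for 8-bit pixels,
+// pixel_ssd_WxH_sve returns the sum of squared differences over a WxH block
+// (names below are illustrative; x0/x1 and x2/x3 carry the two pixel/stride pairs):
+//     int ssd = 0;
+//     for( int y = 0; y < h; y++ )
+//         for( int x = 0; x < w; x++ )
+//         {
+//             int d = pix1[y*stride1 + x] - pix2[y*stride2 + x];
+//             ssd += d * d;
+//         }
+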
+HADAMARD_AC_SVE 8, 8 +HADAMARD_AC_SVE 8, 16 +HADAMARD_AC_SVE 16, 8 +HADAMARD_AC_SVE 16, 16 + +#else /* BIT_DEPTH == 10 */ + +.macro SSD_START_SVE_4 + ptrue p0.s, vl4 + ld1h {z16.s}, p0/z, [x0] + ld1h {z17.s}, p0/z, [x2] + add x0, x0, x1, lsl #1 + add x2, x2, x3, lsl #1 + sub v2.4s, v16.4s, v17.4s + ld1h {z16.s}, p0/z, [x0] + ld1h {z17.s}, p0/z, [x2] + add x0, x0, x1, lsl #1 + add x2, x2, x3, lsl #1 + mul v0.4s, v2.4s, v2.4s +.endm + +.macro SSD_SVE_4 + sub v2.4s, v16.4s, v17.4s + ld1h {z16.s}, p0/z, [x0] + ld1h {z17.s}, p0/z, [x2] + add x0, x0, x1, lsl #1 + add x2, x2, x3, lsl #1 + mla v0.4s, v2.4s, v2.4s +.endm + +.macro SSD_END_SVE_4 + sub v2.4s, v16.4s, v17.4s + mla v0.4s, v2.4s, v2.4s +.endm + +.macro SSD_FUNC_SVE w h +function pixel_ssd_\w\()x\h\()_sve, export=1 + SSD_START_SVE_\w +.rept \h-2 + SSD_SVE_\w +.endr + SSD_END_SVE_\w + + addv s0, v0.4s + fmov w0, s0 + ret +endfunc +.endm + +SSD_FUNC_SVE 4, 4 +SSD_FUNC_SVE 4, 8 +SSD_FUNC_SVE 4, 16 + +#endif /* BIT_DEPTH == 8 */ diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S index b2250ba6c..449919b57 100644 --- a/common/aarch64/pixel-a.S +++ b/common/aarch64/pixel-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * pixel.S: aarch64 pixel metrics ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau @@ -25,6 +25,7 @@ *****************************************************************************/ #include "asm.S" +#include "pixel-a-common.S" const mask .rept 16 @@ -35,25 +36,26 @@ const mask .endr endconst -const mask_ac_4_8 -.short 0, -1, -1, -1, 0, -1, -1, -1 -.short 0, -1, -1, -1, -1, -1, -1, -1 -endconst +.macro SUMSUBL_AB sum, sub, a, b + uaddl \sum, \a, \b + usubl \sub, \a, \b +.endm #if BIT_DEPTH == 8 + .macro SAD_START_4 - ld1 {v1.s}[0], [x2], x3 - ld1 {v0.s}[0], [x0], x1 - ld1 {v1.s}[1], [x2], x3 - ld1 {v0.s}[1], [x0], x1 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 uabdl v16.8h, v0.8b, v1.8b .endm .macro SAD_4 - ld1 {v1.s}[0], [x2], x3 - ld1 {v0.s}[0], [x0], x1 - ld1 {v1.s}[1], [x2], x3 - ld1 {v0.s}[1], [x0], x1 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 uabal v16.8h, v0.8b, v1.8b .endm @@ -113,189 +115,82 @@ function pixel_sad\name\()_\w\()x\h\()_neon, export=1 endfunc .endm -#else /* BIT_DEPTH == 8 */ - -.macro SAD_START_4 - lsl x1, x1, #1 - lsl x3, x3, #1 - ld1 {v1.d}[0], [x2], x3 - ld1 {v0.d}[0], [x0], x1 - ld1 {v1.d}[1], [x2], x3 - ld1 {v0.d}[1], [x0], x1 - uabdl v16.4s, v0.4h, v1.4h - uabdl2 v18.4s, v0.8h, v1.8h -.endm - -.macro SAD_4 - ld1 {v1.d}[0], [x2], x3 - ld1 {v0.d}[0], [x0], x1 - ld1 {v1.d}[1], [x2], x3 - ld1 {v0.d}[1], [x0], x1 - uabal v16.4s, v0.4h, v1.4h - uabal2 v18.4s, v0.8h, v1.8h -.endm - -.macro SAD_START_8 - lsl x1, x1, #1 - lsl x3, x3, #1 - ld1 {v1.8h}, [x2], x3 - ld1 {v0.8h}, [x0], x1 - ld1 {v3.8h}, [x2], x3 - ld1 {v2.8h}, [x0], x1 - uabdl v16.4s, v0.4h, v1.4h - uabdl2 v17.4s, v0.8h, v1.8h - uabdl v18.4s, v2.4h, v3.4h - uabdl2 v19.4s, v2.8h, v3.8h -.endm - -.macro SAD_8 - ld1 {v1.8h}, [x2], x3 - ld1 {v0.8h}, [x0], x1 - ld1 {v3.8h}, [x2], x3 - ld1 {v2.8h}, [x0], x1 - uabal v16.4s, v0.4h, v1.4h - uabal2 v17.4s, v0.8h, v1.8h - uabal v18.4s, v2.4h, v3.4h - uabal2 v19.4s, v2.8h, v3.8h -.endm - -.macro SAD_START_16 - lsl x1, x1, #1 - lsl x3, x3, #1 - ld2 {v0.8h, v1.8h}, [x2], x3 - ld2 
{v2.8h, v3.8h}, [x0], x1 - ld2 {v4.8h, v5.8h}, [x2], x3 - ld2 {v6.8h, v7.8h}, [x0], x1 - uabdl v16.4s, v0.4h, v2.4h - uabdl2 v17.4s, v0.8h, v2.8h - uabdl v20.4s, v1.4h, v3.4h - uabdl2 v21.4s, v1.8h, v3.8h - uabdl v18.4s, v4.4h, v6.4h - uabdl2 v19.4s, v4.8h, v6.8h - uabdl v22.4s, v5.4h, v7.4h - uabdl2 v23.4s, v5.8h, v7.8h -.endm - -.macro SAD_16 - ld2 {v0.8h, v1.8h}, [x2], x3 - ld2 {v2.8h, v3.8h}, [x0], x1 - ld2 {v4.8h, v5.8h}, [x2], x3 - ld2 {v6.8h, v7.8h}, [x0], x1 - uabal v16.4s, v0.4h, v2.4h - uabal2 v17.4s, v0.8h, v2.8h - uabal v20.4s, v1.4h, v3.4h - uabal2 v21.4s, v1.8h, v3.8h - uabal v18.4s, v4.4h, v6.4h - uabal2 v19.4s, v4.8h, v6.8h - uabal v22.4s, v5.4h, v7.4h - uabal2 v23.4s, v5.8h, v7.8h -.endm - -.macro SAD_FUNC w, h, name -function pixel_sad\name\()_\w\()x\h\()_neon, export=1 - SAD_START_\w - -.rept \h / 2 - 1 - SAD_\w -.endr -.if \w > 8 - add v20.4s, v20.4s, v21.4s - add v16.4s, v16.4s, v20.4s - add v22.4s, v22.4s, v23.4s - add v18.4s, v18.4s, v22.4s -.endif -.if \w > 4 - add v16.4s, v16.4s, v17.4s - add v18.4s, v18.4s, v19.4s -.endif - add v16.4s, v16.4s, v18.4s - uaddlv s0, v16.8h - fmov w0, s0 - ret -endfunc -.endm - -#endif /* BIT_DEPTH == 8 */ - -SAD_FUNC 4, 4 -SAD_FUNC 4, 8 -SAD_FUNC 4, 16 -SAD_FUNC 8, 4 -SAD_FUNC 8, 8 -SAD_FUNC 8, 16 -SAD_FUNC 16, 8 -SAD_FUNC 16, 16 - .macro SAD_X_4 x, first=uabal - ld1 {v0.s}[0], [x0], x7 - ld1 {v1.s}[0], [x1], x5 - ld1 {v0.s}[1], [x0], x7 - ld1 {v1.s}[1], [x1], x5 + ld1 {v0.s}[0], [x0], x7 + ld1 {v1.s}[0], [x1], x5 + ld1 {v0.s}[1], [x0], x7 + ld1 {v1.s}[1], [x1], x5 + ld1 {v2.s}[0], [x2], x5 + ld1 {v2.s}[1], [x2], x5 \first v16.8h, v1.8b, v0.8b - ld1 {v2.s}[0], [x2], x5 - ld1 {v2.s}[1], [x2], x5 + ld1 {v3.s}[0], [x3], x5 + ld1 {v3.s}[1], [x3], x5 \first v17.8h, v2.8b, v0.8b - ld1 {v3.s}[0], [x3], x5 - ld1 {v3.s}[1], [x3], x5 +.if \x == 4 + ld1 {v4.s}[0], [x4], x5 + ld1 {v4.s}[1], [x4], x5 +.endif \first v18.8h, v3.8b, v0.8b .if \x == 4 - ld1 {v4.s}[0], [x4], x5 - ld1 {v4.s}[1], [x4], x5 \first v19.8h, v4.8b, v0.8b .endif .endm .macro SAD_X_8 x, first=uabal - ld1 {v0.8b}, [x0], x7 - ld1 {v1.8b}, [x1], x5 + ld1 {v0.8b}, [x0], x7 + ld1 {v1.8b}, [x1], x5 + ld1 {v2.8b}, [x2], x5 \first v16.8h, v1.8b, v0.8b - ld1 {v2.8b}, [x2], x5 - ld1 {v5.8b}, [x0], x7 + ld1 {v3.8b}, [x3], x5 \first v17.8h, v2.8b, v0.8b - ld1 {v3.8b}, [x3], x5 - ld1 {v1.8b}, [x1], x5 + ld1 {v5.8b}, [x0], x7 + ld1 {v1.8b}, [x1], x5 \first v18.8h, v3.8b, v0.8b + ld1 {v2.8b}, [x2], x5 uabal v16.8h, v1.8b, v5.8b - ld1 {v2.8b}, [x2], x5 - ld1 {v3.8b}, [x3], x5 + ld1 {v3.8b}, [x3], x5 uabal v17.8h, v2.8b, v5.8b +.if \x == 4 + ld1 {v4.8b}, [x4], x5 + ld1 {v1.8b}, [x4], x5 +.endif uabal v18.8h, v3.8b, v5.8b .if \x == 4 - ld1 {v4.8b}, [x4], x5 \first v19.8h, v4.8b, v0.8b - ld1 {v4.8b}, [x4], x5 - uabal v19.8h, v4.8b, v5.8b + uabal v19.8h, v1.8b, v5.8b .endif .endm .macro SAD_X_16 x, first=uabal - ld1 {v0.16b}, [x0], x7 - ld1 {v1.16b}, [x1], x5 + ld1 {v0.16b}, [x0], x7 + ld1 {v1.16b}, [x1], x5 + ld1 {v2.16b}, [x2], x5 \first v16.8h, v1.8b, v0.8b \first\()2 v20.8h, v1.16b, v0.16b - ld1 {v2.16b}, [x2], x5 - ld1 {v5.16b}, [x0], x7 + ld1 {v3.16b}, [x3], x5 \first v17.8h, v2.8b, v0.8b \first\()2 v21.8h, v2.16b, v0.16b - ld1 {v3.16b}, [x3], x5 - ld1 {v1.16b}, [x1], x5 + ld1 {v5.16b}, [x0], x7 + ld1 {v1.16b}, [x1], x5 \first v18.8h, v3.8b, v0.8b \first\()2 v22.8h, v3.16b, v0.16b + ld1 {v2.16b}, [x2], x5 uabal v16.8h, v1.8b, v5.8b uabal2 v20.8h, v1.16b, v5.16b - ld1 {v2.16b}, [x2], x5 - ld1 {v3.16b}, [x3], x5 + ld1 {v3.16b}, [x3], x5 uabal v17.8h, v2.8b, v5.8b uabal2 v21.8h, v2.16b, v5.16b 
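+    // (x == 4 only) both rows of the fourth candidate are loaded from x4 into
+    // v4 and v1 here, ahead of the accumulates below that consume them.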
+.if \x == 4 + ld1 {v4.16b}, [x4], x5 + ld1 {v1.16b}, [x4], x5 +.endif uabal v18.8h, v3.8b, v5.8b uabal2 v22.8h, v3.16b, v5.16b .if \x == 4 - ld1 {v4.16b}, [x4], x5 \first v19.8h, v4.8b, v0.8b \first\()2 v23.8h, v4.16b, v0.16b - ld1 {v4.16b}, [x4], x5 - uabal v19.8h, v4.8b, v5.8b - uabal2 v23.8h, v4.16b, v5.16b + uabal v19.8h, v1.8b, v5.8b + uabal2 v23.8h, v1.16b, v5.16b .endif .endm @@ -307,10 +202,10 @@ function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1 .endif mov x7, #FENC_STRIDE - SAD_X_\w \x, uabdl + SAD_X_\w \x, uabdl .rept \h / 2 - 1 - SAD_X_\w \x + SAD_X_\w \x .endr .if \w > 8 @@ -337,36 +232,19 @@ function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1 endfunc .endm -SAD_X_FUNC 3, 4, 4 -SAD_X_FUNC 3, 4, 8 -SAD_X_FUNC 3, 8, 4 -SAD_X_FUNC 3, 8, 8 -SAD_X_FUNC 3, 8, 16 -SAD_X_FUNC 3, 16, 8 -SAD_X_FUNC 3, 16, 16 - -SAD_X_FUNC 4, 4, 4 -SAD_X_FUNC 4, 4, 8 -SAD_X_FUNC 4, 8, 4 -SAD_X_FUNC 4, 8, 8 -SAD_X_FUNC 4, 8, 16 -SAD_X_FUNC 4, 16, 8 -SAD_X_FUNC 4, 16, 16 - - function pixel_vsad_neon, export=1 subs w2, w2, #2 - ld1 {v0.16b}, [x0], x1 - ld1 {v1.16b}, [x0], x1 + ld1 {v0.16b}, [x0], x1 + ld1 {v1.16b}, [x0], x1 uabdl v6.8h, v0.8b, v1.8b uabdl2 v7.8h, v0.16b, v1.16b b.le 2f 1: subs w2, w2, #2 - ld1 {v0.16b}, [x0], x1 + ld1 {v0.16b}, [x0], x1 uabal v6.8h, v1.8b, v0.8b uabal2 v7.8h, v1.16b, v0.16b - ld1 {v1.16b}, [x0], x1 + ld1 {v1.16b}, [x0], x1 b.lt 2f uabal v6.8h, v0.8b, v1.8b uabal2 v7.8h, v0.16b, v1.16b @@ -380,20 +258,20 @@ endfunc function pixel_asd8_neon, export=1 sub w4, w4, #2 - ld1 {v0.8b}, [x0], x1 - ld1 {v1.8b}, [x2], x3 - ld1 {v2.8b}, [x0], x1 - ld1 {v3.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 usubl v16.8h, v0.8b, v1.8b 1: subs w4, w4, #2 - ld1 {v4.8b}, [x0], x1 - ld1 {v5.8b}, [x2], x3 + ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x2], x3 usubl v17.8h, v2.8b, v3.8b usubl v18.8h, v4.8b, v5.8b add v16.8h, v16.8h, v17.8h - ld1 {v2.8b}, [x0], x1 - ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 add v16.8h, v16.8h, v18.8h b.gt 1b usubl v17.8h, v2.8b, v3.8b @@ -405,18 +283,18 @@ function pixel_asd8_neon, export=1 endfunc .macro SSD_START_4 - ld1 {v16.s}[0], [x0], x1 - ld1 {v17.s}[0], [x2], x3 + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 usubl v2.8h, v16.8b, v17.8b - ld1 {v16.s}[0], [x0], x1 - ld1 {v17.s}[0], [x2], x3 + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 smull v0.4s, v2.4h, v2.4h .endm .macro SSD_4 usubl v2.8h, v16.8b, v17.8b - ld1 {v16.s}[0], [x0], x1 - ld1 {v17.s}[0], [x2], x3 + ld1 {v16.s}[0], [x0], x1 + ld1 {v17.s}[0], [x2], x3 smlal v0.4s, v2.4h, v2.4h .endm @@ -426,20 +304,20 @@ endfunc .endm .macro SSD_START_8 - ld1 {v16.8b}, [x0], x1 - ld1 {v17.8b}, [x2], x3 + ld1 {v16.8b}, [x0], x1 + ld1 {v17.8b}, [x2], x3 usubl v2.8h, v16.8b, v17.8b - ld1 {v16.8b}, [x0], x1 + ld1 {v16.8b}, [x0], x1 smull v0.4s, v2.4h, v2.4h - ld1 {v17.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 smlal2 v0.4s, v2.8h, v2.8h .endm .macro SSD_8 usubl v2.8h, v16.8b, v17.8b - ld1 {v16.8b}, [x0], x1 + ld1 {v16.8b}, [x0], x1 smlal v0.4s, v2.4h, v2.4h - ld1 {v17.8b}, [x2], x3 + ld1 {v17.8b}, [x2], x3 smlal2 v0.4s, v2.8h, v2.8h .endm @@ -450,8 +328,8 @@ endfunc .endm .macro SSD_START_16 - ld1 {v16.16b}, [x0], x1 - ld1 {v17.16b}, [x2], x3 + ld1 {v16.16b}, [x0], x1 + ld1 {v17.16b}, [x2], x3 usubl v2.8h, v16.8b, v17.8b usubl2 v3.8h, v16.16b, v17.16b ld1 {v16.16b}, [x0], x1 @@ -497,334 +375,89 @@ function pixel_ssd_\w\()x\h\()_neon, export=1 endfunc .endm -SSD_FUNC 4, 4 -SSD_FUNC 4, 8 -SSD_FUNC 4, 16 -SSD_FUNC 8, 4 
-SSD_FUNC 8, 8 -SSD_FUNC 8, 16 -SSD_FUNC 16, 8 -SSD_FUNC 16, 16 - - -function pixel_ssd_nv12_core_neon, export=1 - sxtw x8, w4 - add x8, x8, #8 - and x8, x8, #~15 - movi v6.2d, #0 - movi v7.2d, #0 - sub x1, x1, x8, lsl #1 - sub x3, x3, x8, lsl #1 -1: - subs w8, w4, #16 - ld2 {v0.8b,v1.8b}, [x0], #16 - ld2 {v2.8b,v3.8b}, [x2], #16 - ld2 {v24.8b,v25.8b}, [x0], #16 - ld2 {v26.8b,v27.8b}, [x2], #16 - - usubl v16.8h, v0.8b, v2.8b - usubl v17.8h, v1.8b, v3.8b - smull v20.4s, v16.4h, v16.4h - smull v21.4s, v17.4h, v17.4h - usubl v18.8h, v24.8b, v26.8b - usubl v19.8h, v25.8b, v27.8b - smlal2 v20.4s, v16.8h, v16.8h - smlal2 v21.4s, v17.8h, v17.8h +function pixel_satd_4x4_neon, export=1 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 - b.lt 4f - b.eq 3f -2: - smlal v20.4s, v18.4h, v18.4h - smlal v21.4s, v19.4h, v19.4h - ld2 {v0.8b,v1.8b}, [x0], #16 - ld2 {v2.8b,v3.8b}, [x2], #16 - smlal2 v20.4s, v18.8h, v18.8h - smlal2 v21.4s, v19.8h, v19.8h + usubl v0.8h, v0.8b, v1.8b + usubl v1.8h, v2.8b, v3.8b + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h - subs w8, w8, #16 - usubl v16.8h, v0.8b, v2.8b - usubl v17.8h, v1.8b, v3.8b - smlal v20.4s, v16.4h, v16.4h - smlal v21.4s, v17.4h, v17.4h - ld2 {v24.8b,v25.8b}, [x0], #16 - ld2 {v26.8b,v27.8b}, [x2], #16 - smlal2 v20.4s, v16.8h, v16.8h - smlal2 v21.4s, v17.8h, v17.8h - b.lt 4f + zip1 v0.2d, v2.2d, v3.2d + zip2 v1.2d, v2.2d, v3.2d + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h - usubl v18.8h, v24.8b, v26.8b - usubl v19.8h, v25.8b, v27.8b - b.gt 2b -3: - smlal v20.4s, v18.4h, v18.4h - smlal v21.4s, v19.4h, v19.4h - smlal2 v20.4s, v18.8h, v18.8h - smlal2 v21.4s, v19.8h, v19.8h -4: - subs w5, w5, #1 - uaddw v6.2d, v6.2d, v20.2s - uaddw v7.2d, v7.2d, v21.2s - add x0, x0, x1 - add x2, x2, x3 - uaddw2 v6.2d, v6.2d, v20.4s - uaddw2 v7.2d, v7.2d, v21.4s - b.gt 1b + trn1 v0.8h, v2.8h, v3.8h + trn2 v1.8h, v2.8h, v3.8h + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h - addp v6.2d, v6.2d, v7.2d - st1 {v6.d}[0], [x6] - st1 {v6.d}[1], [x7] + trn1 v0.4s, v2.4s, v3.4s + trn2 v1.4s, v2.4s, v3.4s + abs v0.8h, v0.8h + abs v1.8h, v1.8h + umax v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] ret endfunc -.macro pixel_var_8 h -function pixel_var_8x\h\()_neon, export=1 - ld1 {v16.8b}, [x0], x1 - ld1 {v17.8b}, [x0], x1 - mov x2, \h - 4 - umull v1.8h, v16.8b, v16.8b - uxtl v0.8h, v16.8b - umull v2.8h, v17.8b, v17.8b - uaddw v0.8h, v0.8h, v17.8b - ld1 {v18.8b}, [x0], x1 - uaddlp v1.4s, v1.8h - uaddlp v2.4s, v2.8h - ld1 {v19.8b}, [x0], x1 +function pixel_satd_4x8_neon, export=1 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v7.s}[0], [x2], x3 + ld1 {v6.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + ld1 {v5.s}[1], [x2], x3 + ld1 {v4.s}[1], [x0], x1 + ld1 {v7.s}[1], [x2], x3 + ld1 {v6.s}[1], [x0], x1 + b satd_4x8_8x4_end_neon +endfunc -1: subs x2, x2, #4 - uaddw v0.8h, v0.8h, v18.8b - umull v24.8h, v18.8b, v18.8b - ld1 {v20.8b}, [x0], x1 - uaddw v0.8h, v0.8h, v19.8b - umull v25.8h, v19.8b, v19.8b - uadalp v1.4s, v24.8h - ld1 {v21.8b}, [x0], x1 - uaddw v0.8h, v0.8h, v20.8b - umull v26.8h, v20.8b, v20.8b - uadalp v2.4s, v25.8h - ld1 {v18.8b}, [x0], x1 - uaddw v0.8h, v0.8h, v21.8b - umull v27.8h, v21.8b, v21.8b - uadalp v1.4s, v26.8h - ld1 {v19.8b}, [x0], x1 - 
uadalp v2.4s, v27.8h - b.gt 1b +function pixel_satd_8x4_neon, export=1 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 + ld1 {v5.8b}, [x2], x3 + ld1 {v4.8b}, [x0], x1 + ld1 {v7.8b}, [x2], x3 + ld1 {v6.8b}, [x0], x1 +endfunc - uaddw v0.8h, v0.8h, v18.8b - umull v28.8h, v18.8b, v18.8b - uaddw v0.8h, v0.8h, v19.8b - umull v29.8h, v19.8b, v19.8b - uadalp v1.4s, v28.8h - uadalp v2.4s, v29.8h +function satd_4x8_8x4_end_neon + usubl v0.8h, v0.8b, v1.8b + usubl v1.8h, v2.8b, v3.8b + usubl v2.8h, v4.8b, v5.8b + usubl v3.8h, v6.8b, v7.8b - b var_end -endfunc -.endm + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h -pixel_var_8 8 -pixel_var_8 16 + SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h + SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h -function pixel_var_16x16_neon, export=1 - ld1 {v16.16b}, [x0], x1 - ld1 {v17.16b}, [x0], x1 - mov x2, #14 - umull v1.8h, v16.8b, v16.8b - umull2 v2.8h, v16.16b, v16.16b - uxtl v0.8h, v16.8b - uaddlp v1.4s, v1.8h - uaddlp v2.4s, v2.8h - uaddw2 v0.8h, v0.8h, v16.16b + trn1 v0.8h, v4.8h, v5.8h + trn2 v1.8h, v4.8h, v5.8h + trn1 v2.8h, v6.8h, v7.8h + trn2 v3.8h, v6.8h, v7.8h -1: subs x2, x2, #2 - ld1 {v18.16b}, [x0], x1 - uaddw v0.8h, v0.8h, v17.8b - umull v3.8h, v17.8b, v17.8b - uaddw2 v0.8h, v0.8h, v17.16b - umull2 v4.8h, v17.16b, v17.16b - uadalp v1.4s, v3.8h - uadalp v2.4s, v4.8h - - ld1 {v17.16b}, [x0], x1 - uaddw v0.8h, v0.8h, v18.8b - umull v5.8h, v18.8b, v18.8b - uaddw2 v0.8h, v0.8h, v18.16b - umull2 v6.8h, v18.16b, v18.16b - uadalp v1.4s, v5.8h - uadalp v2.4s, v6.8h - b.gt 1b - - uaddw v0.8h, v0.8h, v17.8b - umull v3.8h, v17.8b, v17.8b - uaddw2 v0.8h, v0.8h, v17.16b - umull2 v4.8h, v17.16b, v17.16b - uadalp v1.4s, v3.8h - uadalp v2.4s, v4.8h -endfunc - -function var_end - add v1.4s, v1.4s, v2.4s - uaddlv s0, v0.8h - uaddlv d1, v1.4s - mov w0, v0.s[0] - mov x1, v1.d[0] - orr x0, x0, x1, lsl #32 - ret -endfunc - - -.macro pixel_var2_8 h -function pixel_var2_8x\h\()_neon, export=1 - mov x3, #16 - ld1 {v16.8b}, [x0], #8 - ld1 {v18.8b}, [x1], x3 - ld1 {v17.8b}, [x0], #8 - ld1 {v19.8b}, [x1], x3 - mov x5, \h - 2 - usubl v0.8h, v16.8b, v18.8b - usubl v1.8h, v17.8b, v19.8b - ld1 {v16.8b}, [x0], #8 - ld1 {v18.8b}, [x1], x3 - smull v2.4s, v0.4h, v0.4h - smull2 v3.4s, v0.8h, v0.8h - smull v4.4s, v1.4h, v1.4h - smull2 v5.4s, v1.8h, v1.8h - - usubl v6.8h, v16.8b, v18.8b - -1: subs x5, x5, #1 - ld1 {v17.8b}, [x0], #8 - ld1 {v19.8b}, [x1], x3 - smlal v2.4s, v6.4h, v6.4h - smlal2 v3.4s, v6.8h, v6.8h - usubl v7.8h, v17.8b, v19.8b - add v0.8h, v0.8h, v6.8h - ld1 {v16.8b}, [x0], #8 - ld1 {v18.8b}, [x1], x3 - smlal v4.4s, v7.4h, v7.4h - smlal2 v5.4s, v7.8h, v7.8h - usubl v6.8h, v16.8b, v18.8b - add v1.8h, v1.8h, v7.8h - b.gt 1b - - ld1 {v17.8b}, [x0], #8 - ld1 {v19.8b}, [x1], x3 - smlal v2.4s, v6.4h, v6.4h - smlal2 v3.4s, v6.8h, v6.8h - usubl v7.8h, v17.8b, v19.8b - add v0.8h, v0.8h, v6.8h - smlal v4.4s, v7.4h, v7.4h - add v1.8h, v1.8h, v7.8h - smlal2 v5.4s, v7.8h, v7.8h - - saddlv s0, v0.8h - saddlv s1, v1.8h - add v2.4s, v2.4s, v3.4s - add v4.4s, v4.4s, v5.4s - mov w0, v0.s[0] - mov w1, v1.s[0] - addv s2, v2.4s - addv s4, v4.4s - mul w0, w0, w0 - mul w1, w1, w1 - mov w3, v2.s[0] - mov w4, v4.s[0] - sub w0, w3, w0, lsr # 6 + (\h >> 4) - sub w1, w4, w1, lsr # 6 + (\h >> 4) - str w3, [x2] - add w0, w0, w1 - str w4, [x2, #4] - - ret -endfunc -.endm - -pixel_var2_8 8 -pixel_var2_8 16 - - -function pixel_satd_4x4_neon, export=1 - ld1 {v1.s}[0], [x2], x3 - ld1 {v0.s}[0], [x0], x1 - ld1 {v3.s}[0], [x2], x3 - ld1 
{v2.s}[0], [x0], x1 - ld1 {v1.s}[1], [x2], x3 - ld1 {v0.s}[1], [x0], x1 - ld1 {v3.s}[1], [x2], x3 - ld1 {v2.s}[1], [x0], x1 - - usubl v0.8h, v0.8b, v1.8b - usubl v1.8h, v2.8b, v3.8b - SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h - - zip1 v0.2d, v2.2d, v3.2d - zip2 v1.2d, v2.2d, v3.2d - SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h - - trn1 v0.8h, v2.8h, v3.8h - trn2 v1.8h, v2.8h, v3.8h - SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h - - trn1 v0.4s, v2.4s, v3.4s - trn2 v1.4s, v2.4s, v3.4s - abs v0.8h, v0.8h - abs v1.8h, v1.8h - umax v0.8h, v0.8h, v1.8h - - uaddlv s0, v0.8h - mov w0, v0.s[0] - ret -endfunc - -function pixel_satd_4x8_neon, export=1 - ld1 {v1.s}[0], [x2], x3 - ld1 {v0.s}[0], [x0], x1 - ld1 {v3.s}[0], [x2], x3 - ld1 {v2.s}[0], [x0], x1 - ld1 {v5.s}[0], [x2], x3 - ld1 {v4.s}[0], [x0], x1 - ld1 {v7.s}[0], [x2], x3 - ld1 {v6.s}[0], [x0], x1 - ld1 {v1.s}[1], [x2], x3 - ld1 {v0.s}[1], [x0], x1 - ld1 {v3.s}[1], [x2], x3 - ld1 {v2.s}[1], [x0], x1 - ld1 {v5.s}[1], [x2], x3 - ld1 {v4.s}[1], [x0], x1 - ld1 {v7.s}[1], [x2], x3 - ld1 {v6.s}[1], [x0], x1 - b satd_4x8_8x4_end_neon -endfunc - -function pixel_satd_8x4_neon, export=1 - ld1 {v1.8b}, [x2], x3 - ld1 {v0.8b}, [x0], x1 - ld1 {v3.8b}, [x2], x3 - ld1 {v2.8b}, [x0], x1 - ld1 {v5.8b}, [x2], x3 - ld1 {v4.8b}, [x0], x1 - ld1 {v7.8b}, [x2], x3 - ld1 {v6.8b}, [x0], x1 -endfunc - -function satd_4x8_8x4_end_neon - usubl v0.8h, v0.8b, v1.8b - usubl v1.8h, v2.8b, v3.8b - usubl v2.8h, v4.8b, v5.8b - usubl v3.8h, v6.8b, v7.8b - - SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h - SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h - - SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h - SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h - - trn1 v0.8h, v4.8h, v5.8h - trn2 v1.8h, v4.8h, v5.8h - trn1 v2.8h, v6.8h, v7.8h - trn2 v3.8h, v6.8h, v7.8h - - SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h - SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h trn1 v0.4s, v16.4s, v18.4s trn2 v1.4s, v16.4s, v18.4s @@ -842,64 +475,85 @@ function satd_4x8_8x4_end_neon ret endfunc -function pixel_satd_8x8_neon, export=1 +function pixel_satd_4x16_neon, export=1 mov x4, x30 + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v7.s}[0], [x2], x3 + ld1 {v6.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + ld1 {v5.s}[1], [x2], x3 + ld1 {v4.s}[1], [x0], x1 + ld1 {v7.s}[1], [x2], x3 + ld1 {v6.s}[1], [x0], x1 + usubl v16.8h, v0.8b, v1.8b + usubl v17.8h, v2.8b, v3.8b + usubl v18.8h, v4.8b, v5.8b + usubl v19.8h, v6.8b, v7.8b + ld1 {v1.s}[0], [x2], x3 + ld1 {v0.s}[0], [x0], x1 + ld1 {v3.s}[0], [x2], x3 + ld1 {v2.s}[0], [x0], x1 + ld1 {v5.s}[0], [x2], x3 + ld1 {v4.s}[0], [x0], x1 + ld1 {v7.s}[0], [x2], x3 + ld1 {v6.s}[0], [x0], x1 + ld1 {v1.s}[1], [x2], x3 + ld1 {v0.s}[1], [x0], x1 + ld1 {v3.s}[1], [x2], x3 + ld1 {v2.s}[1], [x0], x1 + ld1 {v5.s}[1], [x2], x3 + ld1 {v4.s}[1], [x0], x1 + ld1 {v7.s}[1], [x2], x3 + ld1 {v6.s}[1], [x0], x1 + usubl v20.8h, v0.8b, v1.8b + usubl v21.8h, v2.8b, v3.8b + usubl v22.8h, v4.8b, v5.8b + usubl v23.8h, v6.8b, v7.8b - bl satd_8x8_neon - add v0.8h, v0.8h, v1.8h - add v1.8h, v2.8h, v3.8h - add v0.8h, v0.8h, v1.8h - uaddlv s0, v0.8h - mov w0, v0.s[0] - ret x4 -endfunc + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h -function pixel_satd_8x16_neon, export=1 - mov x4, x30 + bl satd_8x4v_8x8h_neon - bl satd_8x8_neon - add v0.8h, v0.8h, v1.8h - add 
v1.8h, v2.8h, v3.8h add v30.8h, v0.8h, v1.8h - - bl satd_8x8_neon - add v0.8h, v0.8h, v1.8h - add v1.8h, v2.8h, v3.8h - add v31.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h add v0.8h, v30.8h, v31.8h uaddlv s0, v0.8h mov w0, v0.s[0] ret x4 endfunc -.macro SUMSUBL_AB sum, sub, a, b - uaddl \sum, \a, \b - usubl \sub, \a, \b -.endm - .macro load_diff_fly_8x8 - ld1 {v1.8b}, [x2], x3 - ld1 {v0.8b}, [x0], x1 - ld1 {v3.8b}, [x2], x3 - ld1 {v2.8b}, [x0], x1 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 usubl v16.8h, v0.8b, v1.8b - ld1 {v5.8b}, [x2], x3 - ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x2], x3 + ld1 {v4.8b}, [x0], x1 usubl v17.8h, v2.8b, v3.8b - ld1 {v7.8b}, [x2], x3 - ld1 {v6.8b}, [x0], x1 + ld1 {v7.8b}, [x2], x3 + ld1 {v6.8b}, [x0], x1 usubl v18.8h, v4.8b, v5.8b - ld1 {v1.8b}, [x2], x3 - ld1 {v0.8b}, [x0], x1 + ld1 {v1.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 usubl v19.8h, v6.8b, v7.8b - ld1 {v3.8b}, [x2], x3 - ld1 {v2.8b}, [x0], x1 + ld1 {v3.8b}, [x2], x3 + ld1 {v2.8b}, [x0], x1 usubl v20.8h, v0.8b, v1.8b - ld1 {v5.8b}, [x2], x3 - ld1 {v4.8b}, [x0], x1 + ld1 {v5.8b}, [x2], x3 + ld1 {v4.8b}, [x0], x1 usubl v21.8h, v2.8b, v3.8b - ld1 {v7.8b}, [x2], x3 - ld1 {v6.8b}, [x0], x1 + ld1 {v7.8b}, [x2], x3 + ld1 {v6.8b}, [x0], x1 SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h @@ -908,15 +562,35 @@ endfunc usubl v23.8h, v6.8b, v7.8b .endm -.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d - SUMSUB_AB \s1, \d1, \a, \b - SUMSUB_AB \s2, \d2, \c, \d -.endm +function pixel_satd_8x8_neon, export=1 + mov x4, x30 -.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4 - SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4 - SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 -.endm + bl satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function pixel_satd_8x16_neon, export=1 + mov x4, x30 + + bl satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v0.8h, v1.8h + + bl satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v31.8h, v0.8h, v1.8h + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc function satd_8x8_neon load_diff_fly_8x8 @@ -926,14 +600,13 @@ endfunc function satd_8x4v_8x8h_neon SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h - HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h - transpose v0.8h, v1.8h, v16.8h, v17.8h transpose v2.8h, v3.8h, v18.8h, v19.8h transpose v4.8h, v5.8h, v20.8h, v21.8h transpose v6.8h, v7.8h, v22.8h, v23.8h + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h @@ -961,37 +634,264 @@ function satd_8x4v_8x8h_neon ret endfunc -function pixel_satd_16x8_neon, export=1 - mov x4, x30 - - bl satd_16x4_neon - add v30.8h, v0.8h, v1.8h - add v31.8h, v2.8h, v3.8h - - bl satd_16x4_neon - add v0.8h, v0.8h, v1.8h - add v1.8h, v2.8h, v3.8h - add v30.8h, v30.8h, v0.8h - add v31.8h, v31.8h, v1.8h - - add v0.8h, v30.8h, v31.8h - uaddlv s0, v0.8h - mov w0, v0.s[0] - ret x4 -endfunc +function pixel_ssd_nv12_core_neon, export=1 + sxtw x8, w4 + add x8, x8, #8 + and x8, x8, #~15 + movi v6.2d, #0 + movi v7.2d, #0 + sub x1, x1, x8, lsl #1 + sub x3, x3, x8, lsl #1 +1: + subs w8, w4, #16 + ld2 {v0.8b,v1.8b}, [x0], #16 + ld2 {v2.8b,v3.8b}, [x2], #16 + ld2 {v24.8b,v25.8b}, [x0], #16 + ld2 {v26.8b,v27.8b}, [x2], #16 -function pixel_satd_16x16_neon, 
export=1 - mov x4, x30 + usubl v16.8h, v0.8b, v2.8b + usubl v17.8h, v1.8b, v3.8b + smull v20.4s, v16.4h, v16.4h + smull v21.4s, v17.4h, v17.4h + usubl v18.8h, v24.8b, v26.8b + usubl v19.8h, v25.8b, v27.8b + smlal2 v20.4s, v16.8h, v16.8h + smlal2 v21.4s, v17.8h, v17.8h - bl satd_16x4_neon - add v30.8h, v0.8h, v1.8h - add v31.8h, v2.8h, v3.8h + b.lt 4f + b.eq 3f +2: + smlal v20.4s, v18.4h, v18.4h + smlal v21.4s, v19.4h, v19.4h + ld2 {v0.8b,v1.8b}, [x0], #16 + ld2 {v2.8b,v3.8b}, [x2], #16 + smlal2 v20.4s, v18.8h, v18.8h + smlal2 v21.4s, v19.8h, v19.8h - bl satd_16x4_neon - add v0.8h, v0.8h, v1.8h - add v1.8h, v2.8h, v3.8h - add v30.8h, v30.8h, v0.8h - add v31.8h, v31.8h, v1.8h + subs w8, w8, #16 + usubl v16.8h, v0.8b, v2.8b + usubl v17.8h, v1.8b, v3.8b + smlal v20.4s, v16.4h, v16.4h + smlal v21.4s, v17.4h, v17.4h + ld2 {v24.8b,v25.8b}, [x0], #16 + ld2 {v26.8b,v27.8b}, [x2], #16 + smlal2 v20.4s, v16.8h, v16.8h + smlal2 v21.4s, v17.8h, v17.8h + b.lt 4f + + usubl v18.8h, v24.8b, v26.8b + usubl v19.8h, v25.8b, v27.8b + b.gt 2b +3: + smlal v20.4s, v18.4h, v18.4h + smlal v21.4s, v19.4h, v19.4h + smlal2 v20.4s, v18.8h, v18.8h + smlal2 v21.4s, v19.8h, v19.8h +4: + subs w5, w5, #1 + uaddw v6.2d, v6.2d, v20.2s + uaddw v7.2d, v7.2d, v21.2s + add x0, x0, x1 + add x2, x2, x3 + uaddw2 v6.2d, v6.2d, v20.4s + uaddw2 v7.2d, v7.2d, v21.4s + b.gt 1b + + addp v6.2d, v6.2d, v7.2d + st1 {v6.d}[0], [x6] + st1 {v6.d}[1], [x7] + + ret +endfunc + +.macro pixel_var_8 h +function pixel_var_8x\h\()_neon, export=1 + ld1 {v16.8b}, [x0], x1 + ld1 {v17.8b}, [x0], x1 + mov x2, \h - 4 + umull v1.8h, v16.8b, v16.8b + uxtl v0.8h, v16.8b + umull v2.8h, v17.8b, v17.8b + uaddw v0.8h, v0.8h, v17.8b + ld1 {v18.8b}, [x0], x1 + uaddlp v1.4s, v1.8h + uaddlp v2.4s, v2.8h + ld1 {v19.8b}, [x0], x1 + +1: subs x2, x2, #4 + uaddw v0.8h, v0.8h, v18.8b + umull v24.8h, v18.8b, v18.8b + ld1 {v20.8b}, [x0], x1 + uaddw v0.8h, v0.8h, v19.8b + umull v25.8h, v19.8b, v19.8b + uadalp v1.4s, v24.8h + ld1 {v21.8b}, [x0], x1 + uaddw v0.8h, v0.8h, v20.8b + umull v26.8h, v20.8b, v20.8b + uadalp v2.4s, v25.8h + ld1 {v18.8b}, [x0], x1 + uaddw v0.8h, v0.8h, v21.8b + umull v27.8h, v21.8b, v21.8b + uadalp v1.4s, v26.8h + ld1 {v19.8b}, [x0], x1 + uadalp v2.4s, v27.8h + b.gt 1b + + uaddw v0.8h, v0.8h, v18.8b + umull v28.8h, v18.8b, v18.8b + uaddw v0.8h, v0.8h, v19.8b + umull v29.8h, v19.8b, v19.8b + uadalp v1.4s, v28.8h + uadalp v2.4s, v29.8h + + b var_end +endfunc +.endm + +function pixel_var_16x16_neon, export=1 + ld1 {v16.16b}, [x0], x1 + ld1 {v17.16b}, [x0], x1 + mov x2, #14 + umull v1.8h, v16.8b, v16.8b + umull2 v2.8h, v16.16b, v16.16b + uxtl v0.8h, v16.8b + uaddlp v1.4s, v1.8h + uaddlp v2.4s, v2.8h + uaddw2 v0.8h, v0.8h, v16.16b + +1: subs x2, x2, #2 + ld1 {v18.16b}, [x0], x1 + uaddw v0.8h, v0.8h, v17.8b + umull v3.8h, v17.8b, v17.8b + uaddw2 v0.8h, v0.8h, v17.16b + umull2 v4.8h, v17.16b, v17.16b + uadalp v1.4s, v3.8h + uadalp v2.4s, v4.8h + + ld1 {v17.16b}, [x0], x1 + uaddw v0.8h, v0.8h, v18.8b + umull v5.8h, v18.8b, v18.8b + uaddw2 v0.8h, v0.8h, v18.16b + umull2 v6.8h, v18.16b, v18.16b + uadalp v1.4s, v5.8h + uadalp v2.4s, v6.8h + b.gt 1b + + uaddw v0.8h, v0.8h, v17.8b + umull v3.8h, v17.8b, v17.8b + uaddw2 v0.8h, v0.8h, v17.16b + umull2 v4.8h, v17.16b, v17.16b + uadalp v1.4s, v3.8h + uadalp v2.4s, v4.8h +endfunc + +function var_end + add v1.4s, v1.4s, v2.4s + uaddlv s0, v0.8h + uaddlv d1, v1.4s + mov w0, v0.s[0] + mov x1, v1.d[0] + orr x0, x0, x1, lsl #32 + ret +endfunc + +.macro pixel_var2_8 h +function pixel_var2_8x\h\()_neon, export=1 + mov x3, #16 + 
ld1 {v16.8b}, [x0], #8 + ld1 {v18.8b}, [x1], x3 + ld1 {v17.8b}, [x0], #8 + ld1 {v19.8b}, [x1], x3 + mov x5, \h - 2 + usubl v0.8h, v16.8b, v18.8b + usubl v1.8h, v17.8b, v19.8b + ld1 {v16.8b}, [x0], #8 + ld1 {v18.8b}, [x1], x3 + smull v2.4s, v0.4h, v0.4h + smull2 v3.4s, v0.8h, v0.8h + smull v4.4s, v1.4h, v1.4h + smull2 v5.4s, v1.8h, v1.8h + + usubl v6.8h, v16.8b, v18.8b + +1: subs x5, x5, #1 + ld1 {v17.8b}, [x0], #8 + ld1 {v19.8b}, [x1], x3 + smlal v2.4s, v6.4h, v6.4h + smlal2 v3.4s, v6.8h, v6.8h + usubl v7.8h, v17.8b, v19.8b + add v0.8h, v0.8h, v6.8h + ld1 {v16.8b}, [x0], #8 + ld1 {v18.8b}, [x1], x3 + smlal v4.4s, v7.4h, v7.4h + smlal2 v5.4s, v7.8h, v7.8h + usubl v6.8h, v16.8b, v18.8b + add v1.8h, v1.8h, v7.8h + b.gt 1b + + ld1 {v17.8b}, [x0], #8 + ld1 {v19.8b}, [x1], x3 + smlal v2.4s, v6.4h, v6.4h + smlal2 v3.4s, v6.8h, v6.8h + usubl v7.8h, v17.8b, v19.8b + add v0.8h, v0.8h, v6.8h + smlal v4.4s, v7.4h, v7.4h + add v1.8h, v1.8h, v7.8h + smlal2 v5.4s, v7.8h, v7.8h + + saddlv s0, v0.8h + saddlv s1, v1.8h + add v2.4s, v2.4s, v3.4s + add v4.4s, v4.4s, v5.4s + mov w0, v0.s[0] + mov w1, v1.s[0] + addv s2, v2.4s + addv s4, v4.4s + mul w0, w0, w0 + mul w1, w1, w1 + mov w3, v2.s[0] + mov w4, v4.s[0] + sub w0, w3, w0, lsr # 6 + (\h >> 4) + sub w1, w4, w1, lsr # 6 + (\h >> 4) + str w3, [x2] + add w0, w0, w1 + str w4, [x2, #4] + + ret +endfunc +.endm + +function pixel_satd_16x8_neon, export=1 + mov x4, x30 + + bl satd_16x4_neon + add v30.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h + + bl satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h + + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function pixel_satd_16x16_neon, export=1 + mov x4, x30 + + bl satd_16x4_neon + add v30.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h + + bl satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h bl satd_16x4_neon add v0.8h, v0.8h, v1.8h @@ -1012,18 +912,18 @@ function pixel_satd_16x16_neon, export=1 endfunc function satd_16x4_neon - ld1 {v1.16b}, [x2], x3 - ld1 {v0.16b}, [x0], x1 - ld1 {v3.16b}, [x2], x3 - ld1 {v2.16b}, [x0], x1 + ld1 {v1.16b}, [x2], x3 + ld1 {v0.16b}, [x0], x1 + ld1 {v3.16b}, [x2], x3 + ld1 {v2.16b}, [x0], x1 usubl v16.8h, v0.8b, v1.8b usubl2 v20.8h, v0.16b, v1.16b - ld1 {v5.16b}, [x2], x3 - ld1 {v4.16b}, [x0], x1 + ld1 {v5.16b}, [x2], x3 + ld1 {v4.16b}, [x0], x1 usubl v17.8h, v2.8b, v3.8b usubl2 v21.8h, v2.16b, v3.16b - ld1 {v7.16b}, [x2], x3 - ld1 {v6.16b}, [x0], x1 + ld1 {v7.16b}, [x2], x3 + ld1 {v6.16b}, [x0], x1 usubl v18.8h, v4.8b, v5.8b usubl2 v22.8h, v4.16b, v5.16b @@ -1036,62 +936,6 @@ function satd_16x4_neon b satd_8x4v_8x8h_neon endfunc -function pixel_satd_4x16_neon, export=1 - mov x4, x30 - ld1 {v1.s}[0], [x2], x3 - ld1 {v0.s}[0], [x0], x1 - ld1 {v3.s}[0], [x2], x3 - ld1 {v2.s}[0], [x0], x1 - ld1 {v5.s}[0], [x2], x3 - ld1 {v4.s}[0], [x0], x1 - ld1 {v7.s}[0], [x2], x3 - ld1 {v6.s}[0], [x0], x1 - ld1 {v1.s}[1], [x2], x3 - ld1 {v0.s}[1], [x0], x1 - ld1 {v3.s}[1], [x2], x3 - ld1 {v2.s}[1], [x0], x1 - ld1 {v5.s}[1], [x2], x3 - ld1 {v4.s}[1], [x0], x1 - ld1 {v7.s}[1], [x2], x3 - ld1 {v6.s}[1], [x0], x1 - usubl v16.8h, v0.8b, v1.8b - usubl v17.8h, v2.8b, v3.8b - usubl v18.8h, v4.8b, v5.8b - usubl v19.8h, v6.8b, v7.8b - ld1 {v1.s}[0], [x2], x3 - ld1 {v0.s}[0], [x0], x1 - ld1 {v3.s}[0], [x2], x3 - ld1 {v2.s}[0], [x0], x1 - ld1 {v5.s}[0], [x2], x3 - ld1 {v4.s}[0], [x0], x1 - ld1 {v7.s}[0], [x2], x3 - ld1 {v6.s}[0], [x0], x1 - 
ld1 {v1.s}[1], [x2], x3 - ld1 {v0.s}[1], [x0], x1 - ld1 {v3.s}[1], [x2], x3 - ld1 {v2.s}[1], [x0], x1 - ld1 {v5.s}[1], [x2], x3 - ld1 {v4.s}[1], [x0], x1 - ld1 {v7.s}[1], [x2], x3 - ld1 {v6.s}[1], [x0], x1 - usubl v20.8h, v0.8b, v1.8b - usubl v21.8h, v2.8b, v3.8b - usubl v22.8h, v4.8b, v5.8b - usubl v23.8h, v6.8b, v7.8b - - SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h - SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h - - bl satd_8x4v_8x8h_neon - - add v30.8h, v0.8h, v1.8h - add v31.8h, v2.8h, v3.8h - add v0.8h, v30.8h, v31.8h - uaddlv s0, v0.8h - mov w0, v0.s[0] - ret x4 -endfunc - function pixel_sa8d_8x8_neon, export=1 mov x4, x30 bl pixel_sa8d_8x8_neon @@ -1222,9 +1066,6 @@ function pixel_sa8d_\satd\()8x8_neon endfunc .endm -sa8d_satd_8x8 -sa8d_satd_8x8 satd_ - function pixel_sa8d_satd_16x16_neon, export=1 mov x4, x30 bl pixel_sa8d_satd_8x8_neon @@ -1295,11 +1136,6 @@ function pixel_hadamard_ac_\w\()x\h\()_neon, export=1 endfunc .endm -HADAMARD_AC 8, 8 -HADAMARD_AC 8, 16 -HADAMARD_AC 16, 8 -HADAMARD_AC 16, 16 - // v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8 function hadamard_ac_8x8_neon ld1 {v16.8b}, [x0], x1 @@ -1392,16 +1228,15 @@ function hadamard_ac_8x8_neon ret endfunc - function pixel_ssim_4x4x2_core_neon, export=1 - ld1 {v0.8b}, [x0], x1 - ld1 {v2.8b}, [x2], x3 + ld1 {v0.8b}, [x0], x1 + ld1 {v2.8b}, [x2], x3 umull v16.8h, v0.8b, v0.8b umull v17.8h, v0.8b, v2.8b umull v18.8h, v2.8b, v2.8b - ld1 {v28.8b}, [x0], x1 - ld1 {v29.8b}, [x2], x3 + ld1 {v28.8b}, [x0], x1 + ld1 {v29.8b}, [x2], x3 umull v20.8h, v28.8b, v28.8b umull v21.8h, v28.8b, v29.8b umull v22.8h, v29.8b, v29.8b @@ -1412,8 +1247,8 @@ function pixel_ssim_4x4x2_core_neon, export=1 uadalp v16.4s, v18.8h uaddl v1.8h, v2.8b, v29.8b - ld1 {v26.8b}, [x0], x1 - ld1 {v27.8b}, [x2], x3 + ld1 {v26.8b}, [x0], x1 + ld1 {v27.8b}, [x2], x3 umull v23.8h, v26.8b, v26.8b umull v24.8h, v26.8b, v27.8b umull v25.8h, v27.8b, v27.8b @@ -1424,8 +1259,8 @@ function pixel_ssim_4x4x2_core_neon, export=1 uaddw v1.8h, v1.8h, v27.8b uadalp v16.4s, v22.8h - ld1 {v28.8b}, [x0], x1 - ld1 {v29.8b}, [x2], x3 + ld1 {v28.8b}, [x0], x1 + ld1 {v29.8b}, [x2], x3 umull v20.8h, v28.8b, v28.8b umull v21.8h, v28.8b, v29.8b umull v22.8h, v29.8b, v29.8b @@ -1448,14 +1283,14 @@ function pixel_ssim_4x4x2_core_neon, export=1 addp v2.4s, v16.4s, v16.4s addp v3.4s, v17.4s, v17.4s - st4 {v0.2s,v1.2s,v2.2s,v3.2s}, [x4] + st4 {v0.2s,v1.2s,v2.2s,v3.2s}, [x4] ret endfunc function pixel_ssim_end4_neon, export=1 mov x5, #4 - ld1 {v16.4s,v17.4s}, [x0], #32 - ld1 {v18.4s,v19.4s}, [x1], #32 + ld1 {v16.4s,v17.4s}, [x0], #32 + ld1 {v18.4s,v19.4s}, [x1], #32 mov w4, #0x99bb subs x2, x5, w2, uxtw mov w3, #416 // ssim_c1 = .01*.01*255*255*64 @@ -1463,13 +1298,13 @@ function pixel_ssim_end4_neon, export=1 add v0.4s, v16.4s, v18.4s add v1.4s, v17.4s, v19.4s add v0.4s, v0.4s, v1.4s - ld1 {v20.4s,v21.4s}, [x0], #32 - ld1 {v22.4s,v23.4s}, [x1], #32 + ld1 {v20.4s,v21.4s}, [x0], #32 + ld1 {v22.4s,v23.4s}, [x1], #32 add v2.4s, v20.4s, v22.4s add v3.4s, v21.4s, v23.4s add v1.4s, v1.4s, v2.4s - ld1 {v16.4s}, [x0], #16 - ld1 {v18.4s}, [x1], #16 + ld1 {v16.4s}, [x0], #16 + ld1 {v18.4s}, [x1], #16 add v16.4s, v16.4s, v18.4s add v2.4s, v2.4s, v3.4s add v3.4s, v3.4s, v16.4s @@ -1510,10 +1345,1513 @@ function pixel_ssim_end4_neon, export=1 b.eq 1f movrel x3, mask add x3, x3, x2, lsl #2 - ld1 {v29.4s}, [x3] + ld1 {v29.4s}, [x3] and v0.16b, v0.16b, v29.16b 1: faddp v0.4s, v0.4s, v0.4s faddp s0, v0.2s ret endfunc + +#else /* BIT_DEPTH == 8 */ + +.macro SAD_START_4 + lsl x1, x1, #1 + lsl x3, x3, #1 + ld1 {v1.d}[0], 
[x2], x3 + ld1 {v0.d}[0], [x0], x1 + ld1 {v1.d}[1], [x2], x3 + ld1 {v0.d}[1], [x0], x1 + uabdl v16.4s, v0.4h, v1.4h + uabdl2 v18.4s, v0.8h, v1.8h +.endm + +.macro SAD_4 + ld1 {v1.d}[0], [x2], x3 + ld1 {v0.d}[0], [x0], x1 + ld1 {v1.d}[1], [x2], x3 + ld1 {v0.d}[1], [x0], x1 + uabal v16.4s, v0.4h, v1.4h + uabal2 v18.4s, v0.8h, v1.8h +.endm + +.macro SAD_START_8 + lsl x1, x1, #1 + lsl x3, x3, #1 + ld1 {v1.8h}, [x2], x3 + ld1 {v0.8h}, [x0], x1 + ld1 {v3.8h}, [x2], x3 + ld1 {v2.8h}, [x0], x1 + uabdl v16.4s, v0.4h, v1.4h + uabdl2 v17.4s, v0.8h, v1.8h + uabdl v18.4s, v2.4h, v3.4h + uabdl2 v19.4s, v2.8h, v3.8h +.endm + +.macro SAD_8 + ld1 {v1.8h}, [x2], x3 + ld1 {v0.8h}, [x0], x1 + ld1 {v3.8h}, [x2], x3 + ld1 {v2.8h}, [x0], x1 + uabal v16.4s, v0.4h, v1.4h + uabal2 v17.4s, v0.8h, v1.8h + uabal v18.4s, v2.4h, v3.4h + uabal2 v19.4s, v2.8h, v3.8h +.endm + +.macro SAD_START_16 + lsl x1, x1, #1 + lsl x3, x3, #1 + ld2 {v0.8h, v1.8h}, [x2], x3 + ld2 {v2.8h, v3.8h}, [x0], x1 + ld2 {v4.8h, v5.8h}, [x2], x3 + ld2 {v6.8h, v7.8h}, [x0], x1 + uabdl v16.4s, v0.4h, v2.4h + uabdl2 v17.4s, v0.8h, v2.8h + uabdl v20.4s, v1.4h, v3.4h + uabdl2 v21.4s, v1.8h, v3.8h + uabdl v18.4s, v4.4h, v6.4h + uabdl2 v19.4s, v4.8h, v6.8h + uabdl v22.4s, v5.4h, v7.4h + uabdl2 v23.4s, v5.8h, v7.8h +.endm + +.macro SAD_16 + ld2 {v0.8h, v1.8h}, [x2], x3 + ld2 {v2.8h, v3.8h}, [x0], x1 + ld2 {v4.8h, v5.8h}, [x2], x3 + ld2 {v6.8h, v7.8h}, [x0], x1 + uabal v16.4s, v0.4h, v2.4h + uabal2 v17.4s, v0.8h, v2.8h + uabal v20.4s, v1.4h, v3.4h + uabal2 v21.4s, v1.8h, v3.8h + uabal v18.4s, v4.4h, v6.4h + uabal2 v19.4s, v4.8h, v6.8h + uabal v22.4s, v5.4h, v7.4h + uabal2 v23.4s, v5.8h, v7.8h +.endm + +.macro SAD_FUNC w, h, name +function pixel_sad\name\()_\w\()x\h\()_neon, export=1 + SAD_START_\w + +.rept \h / 2 - 1 + SAD_\w +.endr +.if \w > 8 + add v20.4s, v20.4s, v21.4s + add v16.4s, v16.4s, v20.4s + add v22.4s, v22.4s, v23.4s + add v18.4s, v18.4s, v22.4s +.endif +.if \w > 4 + add v16.4s, v16.4s, v17.4s + add v18.4s, v18.4s, v19.4s +.endif + add v16.4s, v16.4s, v18.4s + uaddlv s0, v16.8h + fmov w0, s0 + ret +endfunc +.endm + +.macro SAD_X_4 x, first=uaba + ld1 {v0.d}[0], [x0], x7 + ld1 {v1.d}[0], [x1], x5 + ld1 {v0.d}[1], [x0], x7 + ld1 {v1.d}[1], [x1], x5 + ld1 {v2.d}[0], [x2], x5 + ld1 {v2.d}[1], [x2], x5 + \first v16.8h, v1.8h, v0.8h + ld1 {v3.d}[0], [x3], x5 + ld1 {v3.d}[1], [x3], x5 + \first v17.8h, v2.8h, v0.8h +.if \x == 4 + ld1 {v4.d}[0], [x4], x5 + ld1 {v4.d}[1], [x4], x5 +.endif + \first v18.8h, v3.8h, v0.8h +.if \x == 4 + \first v19.8h, v4.8h, v0.8h +.endif +.endm + +.macro SAD_X_8 x, first=uaba + ld1 {v0.8h}, [x0], x7 + ld1 {v1.8h}, [x1], x5 + \first v16.8h, v1.8h, v0.8h + ld1 {v2.8h}, [x2], x5 + ld1 {v3.8h}, [x3], x5 + \first v17.8h, v2.8h, v0.8h + ld1 {v5.8h}, [x0], x7 + ld1 {v1.8h}, [x1], x5 + \first v18.8h, v3.8h, v0.8h + ld1 {v2.8h}, [x2], x5 + uaba v16.8h, v1.8h, v5.8h + ld1 {v3.8h}, [x3], x5 + uaba v17.8h, v2.8h, v5.8h +.if \x == 4 + ld1 {v4.8h}, [x4], x5 + ld1 {v1.8h}, [x4], x5 +.endif + uaba v18.8h, v3.8h, v5.8h +.if \x == 4 + \first v19.8h, v4.8h, v0.8h + uaba v19.8h, v1.8h, v5.8h +.endif +.endm + +.macro SAD_X_16 x, first=uaba + ld1 {v0.8h, v1.8h}, [x0], x7 + ld1 {v2.8h, v3.8h}, [x1], x5 + + ld1 {v4.8h, v5.8h}, [x2], x5 + \first v16.8h, v2.8h, v0.8h + \first v20.8h, v3.8h, v1.8h + ld1 {v24.8h, v25.8h}, [x3], x5 + \first v17.8h, v4.8h, v0.8h + \first v21.8h, v5.8h, v1.8h + + ld1 {v6.8h, v7.8h}, [x0], x7 + ld1 {v2.8h, v3.8h}, [x1], x5 + \first v18.8h, v24.8h, v0.8h + \first v22.8h, v25.8h, v1.8h + ld1 {v4.8h, v5.8h}, [x2], x5 + uaba 
v16.8h, v2.8h, v6.8h + uaba v20.8h, v3.8h, v7.8h + + ld1 {v24.8h, v25.8h}, [x3], x5 + uaba v17.8h, v4.8h, v6.8h + uaba v21.8h, v5.8h, v7.8h + +.if \x == 4 + ld1 {v26.8h, v27.8h}, [x4], x5 + ld1 {v28.8h, v29.8h}, [x4], x5 +.endif + uaba v18.8h, v24.8h, v6.8h + uaba v22.8h, v25.8h, v7.8h +.if \x == 4 + \first v19.8h, v26.8h, v0.8h + \first v23.8h, v27.8h, v1.8h + + uaba v19.8h, v28.8h, v6.8h + uaba v23.8h, v29.8h, v7.8h +.endif +.endm + +.macro SAD_X_FUNC x, w, h +function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1 +.if \x == 3 + mov x6, x5 + mov x5, x4 +.endif + mov x7, #FENC_STRIDE + lsl x5, x5, #1 + lsl x7, x7, #1 + + SAD_X_\w \x, uabd + +.rept \h / 2 - 1 + SAD_X_\w \x +.endr + +.if \w > 8 + add v16.8h, v16.8h, v20.8h + add v17.8h, v17.8h, v21.8h + add v18.8h, v18.8h, v22.8h +.if \x == 4 + add v19.8h, v19.8h, v23.8h +.endif +.endif +// add up the sads + uaddlv s0, v16.8h + uaddlv s1, v17.8h + uaddlv s2, v18.8h + + stp s0, s1, [x6], #8 +.if \x == 3 + str s2, [x6] +.else + uaddlv s3, v19.8h + stp s2, s3, [x6] +.endif + ret +endfunc +.endm + +function pixel_vsad_neon, export=1 + subs w2, w2, #2 + lsl x1, x1, #1 + + ld1 {v0.8h, v1.8h}, [x0], x1 + ld1 {v2.8h, v3.8h}, [x0], x1 + uabd v6.8h, v0.8h, v2.8h + uabd v7.8h, v1.8h, v3.8h + b.le 2f +1: + subs w2, w2, #2 + + ld1 {v0.8h, v1.8h}, [x0], x1 + uaba v6.8h, v2.8h, v0.8h + uaba v7.8h, v3.8h, v1.8h + ld1 {v2.8h, v3.8h}, [x0], x1 + b.lt 2f + uaba v6.8h, v0.8h, v2.8h + uaba v7.8h, v1.8h, v3.8h + b.gt 1b +2: + add v5.8h, v6.8h, v7.8h + uaddlv s0, v5.8h + fmov w0, s0 + ret +endfunc + +function pixel_asd8_neon, export=1 + sub w4, w4, #2 + lsl x1, x1, #1 + lsl x3, x3, #1 + + ld1 {v0.8h}, [x0], x1 + ld1 {v1.8h}, [x2], x3 + ld1 {v2.8h}, [x0], x1 + ld1 {v3.8h}, [x2], x3 + + sub v16.8h, v0.8h, v1.8h + +1: + subs w4, w4, #2 + ld1 {v4.8h}, [x0], x1 + ld1 {v5.8h}, [x2], x3 + + sub v17.8h, v2.8h, v3.8h + sub v18.8h, v4.8h, v5.8h + add v16.8h, v16.8h, v17.8h + + ld1 {v2.8h}, [x0], x1 + ld1 {v3.8h}, [x2], x3 + add v16.8h, v16.8h, v18.8h + + b.gt 1b + + sub v17.8h, v2.8h, v3.8h + add v16.8h, v16.8h, v17.8h + + saddlv s0, v16.8h + abs v0.4s, v0.4s + fmov w0, s0 + ret +endfunc + +.macro SSD_START_4 + ld1 {v16.d}[0], [x0], x1 + ld1 {v17.d}[0], [x2], x3 + sub v2.4h, v16.4h, v17.4h + ld1 {v16.d}[0], [x0], x1 + ld1 {v17.d}[0], [x2], x3 + smull v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_4 + sub v2.4h, v16.4h, v17.4h + ld1 {v16.d}[0], [x0], x1 + ld1 {v17.d}[0], [x2], x3 + smlal v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_END_4 + sub v2.4h, v16.4h, v17.4h + smlal v0.4s, v2.4h, v2.4h +.endm + +.macro SSD_START_8 + ld1 {v16.8h}, [x0], x1 + ld1 {v17.8h}, [x2], x3 + sub v2.8h, v16.8h, v17.8h + ld1 {v16.8h}, [x0], x1 + ld1 {v17.8h}, [x2], x3 + smull v0.4s, v2.4h, v2.4h + smull2 v20.4s, v2.8h, v2.8h +.endm + +.macro SSD_8 + sub v2.8h, v16.8h, v17.8h + ld1 {v16.8h}, [x0], x1 + ld1 {v17.8h}, [x2], x3 + smlal v0.4s, v2.4h, v2.4h + smlal2 v20.4s, v2.8h, v2.8h +.endm + +.macro SSD_END_8 + sub v2.8h, v16.8h, v17.8h + smlal v0.4s, v2.4h, v2.4h + smlal2 v20.4s, v2.8h, v2.8h + add v0.4s, v0.4s, v20.4s +.endm + +.macro SSD_START_16 + ld1 {v16.8h, v17.8h}, [x0], x1 + ld1 {v18.8h, v19.8h}, [x2], x3 + sub v2.8h, v16.8h, v18.8h + sub v3.8h, v17.8h, v19.8h + ld1 {v16.8h, v17.8h}, [x0], x1 + smull v0.4s, v2.4h, v2.4h + smull2 v20.4s, v2.8h, v2.8h + ld1 {v18.8h, v19.8h}, [x2], x3 + smlal v0.4s, v3.4h, v3.4h + smlal2 v20.4s, v3.8h, v3.8h +.endm + +.macro SSD_16 + sub v2.8h, v16.8h, v18.8h + sub v3.8h, v17.8h, v19.8h + ld1 {v16.8h, v17.8h}, [x0], x1 + smlal v0.4s, v2.4h, v2.4h + smlal2 v20.4s, v2.8h, 
v2.8h + ld1 {v18.8h, v19.8h}, [x2], x3 + smlal v0.4s, v3.4h, v3.4h + smlal2 v20.4s, v3.8h, v3.8h +.endm + +.macro SSD_END_16 + sub v2.8h, v16.8h, v18.8h + sub v3.8h, v17.8h, v19.8h + smlal v0.4s, v2.4h, v2.4h + smlal2 v20.4s, v2.8h, v2.8h + smlal v0.4s, v3.4h, v3.4h + smlal2 v20.4s, v3.8h, v3.8h + add v0.4s, v0.4s, v20.4s +.endm + +.macro SSD_FUNC w h +function pixel_ssd_\w\()x\h\()_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 + SSD_START_\w +.rept \h-2 + SSD_\w +.endr + SSD_END_\w + + addv s0, v0.4s + fmov w0, s0 + ret +endfunc +.endm + +function pixel_satd_4x4_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 + ld1 {v1.d}[0], [x2], x3 + ld1 {v0.d}[0], [x0], x1 + ld1 {v3.d}[0], [x2], x3 + ld1 {v2.d}[0], [x0], x1 + ld1 {v1.d}[1], [x2], x3 + ld1 {v0.d}[1], [x0], x1 + ld1 {v3.d}[1], [x2], x3 + ld1 {v2.d}[1], [x0], x1 + + sub v0.8h, v0.8h, v1.8h + sub v1.8h, v2.8h, v3.8h + + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h + + zip1 v0.2d, v2.2d, v3.2d + zip2 v1.2d, v2.2d, v3.2d + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h + + trn1 v0.8h, v2.8h, v3.8h + trn2 v1.8h, v2.8h, v3.8h + SUMSUB_AB v2.8h, v3.8h, v0.8h, v1.8h + + trn1 v0.4s, v2.4s, v3.4s + trn2 v1.4s, v2.4s, v3.4s + abs v0.8h, v0.8h + abs v1.8h, v1.8h + umax v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + fmov w0, s0 + ret +endfunc + +function pixel_satd_4x8_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 + ld1 {v1.d}[0], [x2], x3 + ld1 {v0.d}[0], [x0], x1 + ld1 {v3.d}[0], [x2], x3 + ld1 {v2.d}[0], [x0], x1 + ld1 {v5.d}[0], [x2], x3 + ld1 {v4.d}[0], [x0], x1 + ld1 {v7.d}[0], [x2], x3 + ld1 {v6.d}[0], [x0], x1 + ld1 {v1.d}[1], [x2], x3 + ld1 {v0.d}[1], [x0], x1 + ld1 {v3.d}[1], [x2], x3 + ld1 {v2.d}[1], [x0], x1 + ld1 {v5.d}[1], [x2], x3 + ld1 {v4.d}[1], [x0], x1 + ld1 {v7.d}[1], [x2], x3 + ld1 {v6.d}[1], [x0], x1 + b satd_4x8_8x4_end_neon +endfunc + +function pixel_satd_8x4_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 + ld1 {v1.8h}, [x2], x3 + ld1 {v0.8h}, [x0], x1 + ld1 {v3.8h}, [x2], x3 + ld1 {v2.8h}, [x0], x1 + ld1 {v5.8h}, [x2], x3 + ld1 {v4.8h}, [x0], x1 + ld1 {v7.8h}, [x2], x3 + ld1 {v6.8h}, [x0], x1 +endfunc + +function satd_4x8_8x4_end_neon + sub v0.8h, v0.8h, v1.8h + sub v1.8h, v2.8h, v3.8h + sub v2.8h, v4.8h, v5.8h + sub v3.8h, v6.8h, v7.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + + SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h + SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h + + trn1 v0.8h, v4.8h, v5.8h + trn2 v1.8h, v4.8h, v5.8h + trn1 v2.8h, v6.8h, v7.8h + trn2 v3.8h, v6.8h, v7.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + + trn1 v0.4s, v16.4s, v18.4s + trn2 v1.4s, v16.4s, v18.4s + trn1 v2.4s, v17.4s, v19.4s + trn2 v3.4s, v17.4s, v19.4s + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + umax v0.8h, v0.8h, v1.8h + umax v1.8h, v2.8h, v3.8h + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret +endfunc + +function pixel_satd_4x16_neon, export=1 + mov x4, x30 + lsl x1, x1, #1 + lsl x3, x3, #1 + + ld1 {v1.d}[0], [x2], x3 + ld1 {v0.d}[0], [x0], x1 + ld1 {v3.d}[0], [x2], x3 + ld1 {v2.d}[0], [x0], x1 + ld1 {v5.d}[0], [x2], x3 + ld1 {v4.d}[0], [x0], x1 + ld1 {v7.d}[0], [x2], x3 + ld1 {v6.d}[0], [x0], x1 + ld1 {v1.d}[1], [x2], x3 + ld1 {v0.d}[1], [x0], x1 + ld1 {v3.d}[1], [x2], x3 + ld1 {v2.d}[1], [x0], x1 + ld1 {v5.d}[1], [x2], x3 + ld1 {v4.d}[1], [x0], x1 + ld1 {v7.d}[1], [x2], x3 + ld1 {v6.d}[1], [x0], x1 + sub v16.8h, v0.8h, v1.8h + sub v17.8h, v2.8h, v3.8h + sub v18.8h, v4.8h, v5.8h + sub v19.8h, v6.8h, v7.8h + ld1 {v1.d}[0], [x2], 
x3 + ld1 {v0.d}[0], [x0], x1 + ld1 {v3.d}[0], [x2], x3 + ld1 {v2.d}[0], [x0], x1 + ld1 {v5.d}[0], [x2], x3 + ld1 {v4.d}[0], [x0], x1 + ld1 {v7.d}[0], [x2], x3 + ld1 {v6.d}[0], [x0], x1 + ld1 {v1.d}[1], [x2], x3 + ld1 {v0.d}[1], [x0], x1 + ld1 {v3.d}[1], [x2], x3 + ld1 {v2.d}[1], [x0], x1 + ld1 {v5.d}[1], [x2], x3 + ld1 {v4.d}[1], [x0], x1 + ld1 {v7.d}[1], [x2], x3 + ld1 {v6.d}[1], [x0], x1 + sub v20.8h, v0.8h, v1.8h + sub v21.8h, v2.8h, v3.8h + sub v22.8h, v4.8h, v5.8h + sub v23.8h, v6.8h, v7.8h + + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + + bl satd_8x4v_8x8h_neon + + add v30.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + fmov w0, s0 + ret x4 +endfunc + +.macro load_diff_fly_8x8 + ld1 {v1.8h}, [x2], x3 + ld1 {v0.8h}, [x0], x1 + ld1 {v3.8h}, [x2], x3 + ld1 {v2.8h}, [x0], x1 + sub v16.8h, v0.8h, v1.8h + ld1 {v5.8h}, [x2], x3 + ld1 {v4.8h}, [x0], x1 + sub v17.8h, v2.8h, v3.8h + ld1 {v7.8h}, [x2], x3 + ld1 {v6.8h}, [x0], x1 + sub v18.8h, v4.8h, v5.8h + ld1 {v1.8h}, [x2], x3 + ld1 {v0.8h}, [x0], x1 + sub v19.8h, v6.8h, v7.8h + ld1 {v3.8h}, [x2], x3 + ld1 {v2.8h}, [x0], x1 + sub v20.8h, v0.8h, v1.8h + ld1 {v5.8h}, [x2], x3 + ld1 {v4.8h}, [x0], x1 + sub v21.8h, v2.8h, v3.8h + ld1 {v7.8h}, [x2], x3 + ld1 {v6.8h}, [x0], x1 + + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + + sub v22.8h, v4.8h, v5.8h + sub v23.8h, v6.8h, v7.8h +.endm + +function pixel_satd_8x8_neon, export=1 + mov x4, x30 + + lsl x1, x1, #1 + lsl x3, x3, #1 + + bl satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function pixel_satd_8x16_neon, export=1 + mov x4, x30 + + lsl x1, x1, #1 + lsl x3, x3, #1 + + bl satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v0.8h, v1.8h + + bl satd_8x8_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v31.8h, v0.8h, v1.8h + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function satd_8x8_neon + load_diff_fly_8x8 +endfunc + +// one vertical hadamard pass and two horizontal +function satd_8x4v_8x8h_neon + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h + SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h + + transpose v0.4s, v2.4s, v16.4s, v18.4s + transpose v1.4s, v3.4s, v17.4s, v19.4s + transpose v4.4s, v6.4s, v20.4s, v22.4s + transpose v5.4s, v7.4s, v21.4s, v23.4s + + abs v0.8h, v0.8h + abs v1.8h, v1.8h + abs v2.8h, v2.8h + abs v3.8h, v3.8h + abs v4.8h, v4.8h + abs v5.8h, v5.8h + abs v6.8h, v6.8h + abs v7.8h, v7.8h + + umax v0.8h, v0.8h, v2.8h + umax v1.8h, v1.8h, v3.8h + umax v2.8h, v4.8h, v6.8h + umax v3.8h, v5.8h, v7.8h + + ret +endfunc + +function pixel_ssd_nv12_core_neon, export=1 + sxtw x8, w4 + add x8, x8, #8 + and x8, x8, #~15 + movi v6.2d, #0 + movi v7.2d, #0 + sub x1, x1, x8, lsl #1 + sub x3, x3, x8, lsl #1 + + lsl x1, x1, #1 + lsl x3, x3, #1 + lsl x4, x4, #1 +1: + subs w8, w4, #32 + ld2 {v0.8h, v1.8h}, [x0], #32 + ld2 {v2.8h, v3.8h}, [x2], #32 + ld2 {v24.8h, v25.8h}, [x0], #32 + ld2 {v26.8h, v27.8h}, [x2], #32 + + 
sub v16.8h, v0.8h, v2.8h + sub v17.8h, v1.8h, v3.8h + smull v20.4s, v16.4h, v16.4h + smull v21.4s, v17.4h, v17.4h + sub v18.8h, v24.8h, v26.8h + sub v19.8h, v25.8h, v27.8h + smlal2 v20.4s, v16.8h, v16.8h + smlal2 v21.4s, v17.8h, v17.8h + + b.lt 4f + b.eq 3f +2: + smlal v20.4s, v18.4h, v18.4h + smlal v21.4s, v19.4h, v19.4h + ld2 {v0.8h, v1.8h}, [x0], #32 + ld2 {v2.8h, v3.8h}, [x2], #32 + smlal2 v20.4s, v18.8h, v18.8h + smlal2 v21.4s, v19.8h, v19.8h + + subs w8, w8, #32 + sub v16.8h, v0.8h, v2.8h + sub v17.8h, v1.8h, v3.8h + smlal v20.4s, v16.4h, v16.4h + smlal v21.4s, v17.4h, v17.4h + ld2 {v24.8h,v25.8h}, [x0], #32 + ld2 {v26.8h,v27.8h}, [x2], #32 + smlal2 v20.4s, v16.8h, v16.8h + smlal2 v21.4s, v17.8h, v17.8h + b.lt 4f + + sub v18.8h, v24.8h, v26.8h + sub v19.8h, v25.8h, v27.8h + b.gt 2b + +3: + smlal v20.4s, v18.4h, v18.4h + smlal v21.4s, v19.4h, v19.4h + smlal2 v20.4s, v18.8h, v18.8h + smlal2 v21.4s, v19.8h, v19.8h + 4: + + subs w5, w5, #1 + uaddw v6.2d, v6.2d, v20.2s + uaddw v7.2d, v7.2d, v21.2s + add x0, x0, x1 + add x2, x2, x3 + uaddw2 v6.2d, v6.2d, v20.4s + uaddw2 v7.2d, v7.2d, v21.4s + + b.gt 1b + + addp v6.2d, v6.2d, v7.2d + st1 {v6.d}[0], [x6] + st1 {v6.d}[1], [x7] + + ret +endfunc + +.macro pixel_var_8 h +function pixel_var_8x\h\()_neon, export=1 + lsl x1, x1, #1 + ld1 {v16.8h}, [x0], x1 + ld1 {v17.8h}, [x0], x1 + mov x2, \h - 4 + umull v1.4s, v16.4h, v16.4h + umull2 v30.4s, v16.8h, v16.8h + mov v0.16b, v16.16b + umull v2.4s, v17.4h, v17.4h + umull2 v31.4s, v17.8h, v17.8h + add v0.8h, v0.8h, v17.8h + ld1 {v18.8h}, [x0], x1 + ld1 {v19.8h}, [x0], x1 + +1: subs x2, x2, #4 + add v0.8h, v0.8h, v18.8h + umull v24.4s, v18.4h, v18.4h + umull2 v25.4s, v18.8h, v18.8h + ld1 {v20.8h}, [x0], x1 + add v0.8h, v0.8h, v19.8h + umull v26.4s, v19.4h, v19.4h + umull2 v27.4s, v19.8h, v19.8h + add v1.4s, v1.4s, v24.4s + add v30.4s, v30.4s, v25.4s + ld1 {v21.8h}, [x0], x1 + add v0.8h, v0.8h, v20.8h + umull v28.4s, v20.4h, v20.4h + umull2 v29.4s, v20.8h, v20.8h + add v2.4s, v2.4s, v26.4s + add v31.4s, v31.4s, v27.4s + ld1 {v18.8h}, [x0], x1 + add v0.8h, v0.8h, v21.8h + umull v3.4s, v21.4h, v21.4h + umull2 v4.4s, v21.8h, v21.8h + add v1.4s, v1.4s, v28.4s + add v30.4s, v30.4s, v29.4s + ld1 {v19.8h}, [x0], x1 + add v2.4s, v2.4s, v3.4s + add v31.4s, v31.4s, v4.4s + b.gt 1b + + add v0.8h, v0.8h, v18.8h + umull v24.4s, v18.4h, v18.4h + umull2 v25.4s, v18.8h, v18.8h + add v0.8h, v0.8h, v19.8h + umull v26.4s, v19.4h, v19.4h + umull2 v27.4s, v19.8h, v19.8h + add v1.4s, v1.4s, v24.4s + add v30.4s, v30.4s, v25.4s + add v2.4s, v2.4s, v26.4s + add v31.4s, v31.4s, v27.4s + + b var_end +endfunc +.endm + +function pixel_var_16x16_neon, export=1 + lsl x1, x1, #1 + ld1 {v16.8h, v17.8h}, [x0], x1 + ld1 {v18.8h, v19.8h}, [x0], x1 + mov x2, #14 + + umull v1.4s, v16.4h, v16.4h + umull2 v30.4s, v16.8h, v16.8h + add v0.8h, v16.8h, v17.8h + umull v2.4s, v17.4h, v17.4h + umull2 v31.4s, v17.8h, v17.8h + +1: subs x2, x2, #2 + ld1 {v20.8h, v21.8h}, [x0], x1 + + add v0.8h, v0.8h, v18.8h + umlal v1.4s, v18.4h, v18.4h + umlal2 v30.4s, v18.8h, v18.8h + umlal v2.4s, v19.4h, v19.4h + umlal2 v31.4s, v19.8h, v19.8h + add v0.8h, v0.8h, v19.8h + ld1 {v18.8h, v19.8h}, [x0], x1 + add v0.8h, v0.8h, v20.8h + umlal v1.4s, v20.4h, v20.4h + umlal2 v30.4s, v20.8h, v20.8h + umlal v2.4s, v21.4h, v21.4h + umlal2 v31.4s, v21.8h, v21.8h + add v0.8h, v0.8h, v21.8h + + b.gt 1b + + add v0.8h, v0.8h, v18.8h + umlal v1.4s, v18.4h, v18.4h + umlal2 v30.4s, v18.8h, v18.8h + umlal v2.4s, v19.4h, v19.4h + umlal2 v31.4s, v19.8h, v19.8h + add v0.8h, v0.8h, v19.8h + 
+endfunc + +function var_end + add v1.4s, v1.4s, v2.4s + add v30.4s, v30.4s, v31.4s + add v1.4s, v1.4s, v30.4s + uaddlv s0, v0.8h + uaddlv d1, v1.4s + mov w0, v0.s[0] + mov x1, v1.d[0] + orr x0, x0, x1, lsl #32 + ret +endfunc + +.macro pixel_var2_8 h +function pixel_var2_8x\h\()_neon, export=1 + mov x3, #32 + ld1 {v16.8h}, [x0], #16 + ld1 {v18.8h}, [x1], x3 + ld1 {v17.8h}, [x0], #16 + ld1 {v19.8h}, [x1], x3 + mov x5, \h - 2 + sub v0.8h, v16.8h, v18.8h + sub v1.8h, v17.8h, v19.8h + ld1 {v16.8h}, [x0], #16 + ld1 {v18.8h}, [x1], x3 + smull v2.4s, v0.4h, v0.4h + smull2 v3.4s, v0.8h, v0.8h + smull v4.4s, v1.4h, v1.4h + smull2 v5.4s, v1.8h, v1.8h + + sub v6.8h, v16.8h, v18.8h + +1: subs x5, x5, #1 + ld1 {v17.8h}, [x0], #16 + ld1 {v19.8h}, [x1], x3 + smlal v2.4s, v6.4h, v6.4h + smlal2 v3.4s, v6.8h, v6.8h + sub v7.8h, v17.8h, v19.8h + add v0.8h, v0.8h, v6.8h + ld1 {v16.8h}, [x0], #16 + ld1 {v18.8h}, [x1], x3 + smlal v4.4s, v7.4h, v7.4h + smlal2 v5.4s, v7.8h, v7.8h + sub v6.8h, v16.8h, v18.8h + add v1.8h, v1.8h, v7.8h + b.gt 1b + + ld1 {v17.8h}, [x0], #16 + ld1 {v19.8h}, [x1], x3 + smlal v2.4s, v6.4h, v6.4h + smlal2 v3.4s, v6.8h, v6.8h + sub v7.8h, v17.8h, v19.8h + add v0.8h, v0.8h, v6.8h + smlal v4.4s, v7.4h, v7.4h + add v1.8h, v1.8h, v7.8h + smlal2 v5.4s, v7.8h, v7.8h + + saddlv s0, v0.8h + saddlv s1, v1.8h + add v2.4s, v2.4s, v3.4s + add v4.4s, v4.4s, v5.4s + mov w0, v0.s[0] + mov w1, v1.s[0] + addv s2, v2.4s + addv s4, v4.4s + mul w0, w0, w0 + mul w1, w1, w1 + mov w3, v2.s[0] + mov w4, v4.s[0] + sub w0, w3, w0, lsr # 6 + (\h >> 4) + sub w1, w4, w1, lsr # 6 + (\h >> 4) + str w3, [x2] + add w0, w0, w1 + str w4, [x2, #4] + + ret +endfunc +.endm + +function pixel_satd_16x8_neon, export=1 + mov x4, x30 + + lsl x1, x1, #1 + lsl x3, x3, #1 + + bl satd_16x4_neon + add v30.8h, v0.8h, v1.8h + add v31.8h, v2.8h, v3.8h + + bl satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + add v30.8h, v30.8h, v0.8h + add v31.8h, v31.8h, v1.8h + + add v0.8h, v30.8h, v31.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + ret x4 +endfunc + +function pixel_satd_16x16_neon, export=1 + mov x4, x30 + + lsl x1, x1, #1 + lsl x3, x3, #1 + + bl satd_16x4_neon + + uaddl v30.4s, v0.4h, v1.4h + uaddl v31.4s, v2.4h, v3.4h + uaddl2 v28.4s, v0.8h, v1.8h + uaddl2 v29.4s, v2.8h, v3.8h + add v30.4s, v30.4s, v28.4s + add v31.4s, v31.4s, v29.4s + + bl satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + + uaddw v30.4s, v30.4s, v0.4h + uaddw2 v30.4s, v30.4s, v0.8h + uaddw v31.4s, v31.4s, v1.4h + uaddw2 v31.4s, v31.4s, v1.8h + + bl satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + + uaddw v30.4s, v30.4s, v0.4h + uaddw2 v30.4s, v30.4s, v0.8h + uaddw v31.4s, v31.4s, v1.4h + uaddw2 v31.4s, v31.4s, v1.8h + + bl satd_16x4_neon + add v0.8h, v0.8h, v1.8h + add v1.8h, v2.8h, v3.8h + uaddw v30.4s, v30.4s, v0.4h + uaddw2 v30.4s, v30.4s, v0.8h + uaddw v31.4s, v31.4s, v1.4h + uaddw2 v31.4s, v31.4s, v1.8h + + add v0.4s, v30.4s, v31.4s + addv s0, v0.4s + mov w0, v0.s[0] + ret x4 +endfunc + +function satd_16x4_neon + ld1 {v0.8h, v1.8h}, [x2], x3 + ld1 {v2.8h, v3.8h}, [x0], x1 + + sub v16.8h, v2.8h, v0.8h + sub v20.8h, v3.8h, v1.8h + + ld1 {v4.8h, v5.8h}, [x2], x3 + ld1 {v6.8h, v7.8h}, [x0], x1 + + sub v17.8h, v6.8h, v4.8h + sub v21.8h, v7.8h, v5.8h + + ld1 {v0.8h, v1.8h}, [x2], x3 + ld1 {v2.8h, v3.8h}, [x0], x1 + + sub v18.8h, v2.8h, v0.8h + sub v22.8h, v3.8h, v1.8h + + ld1 {v4.8h, v5.8h}, [x2], x3 + ld1 {v6.8h, v7.8h}, [x0], x1 + + sub v19.8h, v6.8h, v4.8h + sub v23.8h, v7.8h, v5.8h + + SUMSUB_AB v0.8h, v1.8h, 
v16.8h, v17.8h + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + + b satd_8x4v_8x8h_neon +endfunc + +function pixel_sa8d_8x8_neon, export=1 + mov x4, x30 + lsl x1, x1, #1 + lsl x3, x3, #1 + bl pixel_sa8d_8x8_neon + add v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + mov w0, v0.s[0] + add w0, w0, #1 + lsr w0, w0, #1 + ret x4 +endfunc + +function pixel_sa8d_16x16_neon, export=1 + mov x4, x30 + lsl x1, x1, #1 + lsl x3, x3, #1 + bl pixel_sa8d_8x8_neon + uaddlp v30.4s, v0.8h + uaddlp v31.4s, v1.8h + bl pixel_sa8d_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + sub x0, x0, x1, lsl #4 + sub x2, x2, x3, lsl #4 + add x0, x0, #16 + add x2, x2, #16 + bl pixel_sa8d_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + bl pixel_sa8d_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + add v0.4s, v30.4s, v31.4s + addv s0, v0.4s + mov w0, v0.s[0] + add w0, w0, #1 + lsr w0, w0, #1 + ret x4 +endfunc + +.macro sa8d_satd_8x8 satd= +function pixel_sa8d_\satd\()8x8_neon + load_diff_fly_8x8 + + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + + HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h +.ifc \satd, satd_ + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h + SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h + SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h + SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h + + transpose v4.4s, v6.4s, v24.4s, v26.4s + transpose v5.4s, v7.4s, v25.4s, v27.4s + transpose v24.4s, v26.4s, v0.4s, v2.4s + transpose v25.4s, v27.4s, v1.4s, v3.4s + + abs v0.8h, v4.8h + abs v1.8h, v5.8h + abs v2.8h, v6.8h + abs v3.8h, v7.8h + abs v4.8h, v24.8h + abs v5.8h, v25.8h + abs v6.8h, v26.8h + abs v7.8h, v27.8h + + umax v0.8h, v0.8h, v2.8h + umax v1.8h, v1.8h, v3.8h + umax v2.8h, v4.8h, v6.8h + umax v3.8h, v5.8h, v7.8h + + add v26.8h, v0.8h, v1.8h + add v27.8h, v2.8h, v3.8h +.endif + + SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h + SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h + SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h + SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h + + transpose v20.8h, v21.8h, v16.8h, v17.8h + transpose v4.8h, v5.8h, v0.8h, v1.8h + transpose v22.8h, v23.8h, v18.8h, v19.8h + transpose v6.8h, v7.8h, v2.8h, v3.8h + + SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h + SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h + SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h + SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h + + transpose v20.4s, v22.4s, v2.4s, v0.4s + transpose v21.4s, v23.4s, v3.4s, v1.4s + transpose v16.4s, v18.4s, v24.4s, v4.4s + transpose v17.4s, v19.4s, v25.4s, v5.4s + + SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h + SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h + SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h + SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h + + transpose v16.2d, v20.2d, v0.2d, v4.2d + transpose v17.2d, v21.2d, v1.2d, v5.2d + transpose v18.2d, v22.2d, v2.2d, v6.2d + transpose v19.2d, v23.2d, v3.2d, v7.2d + + abs v16.8h, v16.8h + abs v20.8h, v20.8h + abs v17.8h, v17.8h + abs v21.8h, v21.8h + abs v18.8h, v18.8h + abs v22.8h, v22.8h + abs v19.8h, v19.8h + abs v23.8h, v23.8h + + umax v16.8h, v16.8h, v20.8h + umax v17.8h, v17.8h, v21.8h + umax v18.8h, v18.8h, v22.8h + umax v19.8h, v19.8h, v23.8h + + add v0.8h, v16.8h, v17.8h + add v1.8h, v18.8h, v19.8h + + ret +endfunc +.endm + +function pixel_sa8d_satd_16x16_neon, export=1 + mov x4, x30 + lsl x1, x1, #1 + lsl x3, x3, #1 + bl pixel_sa8d_satd_8x8_neon + uaddlp v30.4s, v0.8h + uaddlp v31.4s, v1.8h + uaddlp v28.4s, 
v26.8h + uaddlp v29.4s, v27.8h + bl pixel_sa8d_satd_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + uadalp v28.4s, v26.8h + uadalp v29.4s, v27.8h + sub x0, x0, x1, lsl #4 + sub x2, x2, x3, lsl #4 + add x0, x0, #16 + add x2, x2, #16 + bl pixel_sa8d_satd_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + uadalp v28.4s, v26.8h + uadalp v29.4s, v27.8h + bl pixel_sa8d_satd_8x8_neon + uadalp v30.4s, v0.8h + uadalp v31.4s, v1.8h + uadalp v28.4s, v26.8h + uadalp v29.4s, v27.8h + add v0.4s, v30.4s, v31.4s // sa8d + add v1.4s, v28.4s, v29.4s // satd + addv s0, v0.4s + addv s1, v1.4s + urshr v0.4s, v0.4s, #1 + fmov w0, s0 + fmov w1, s1 + add x0, x0, x1, lsl #32 + ret x4 +endfunc + +.macro HADAMARD_AC w h +function pixel_hadamard_ac_\w\()x\h\()_neon, export=1 + movrel x5, mask_ac_4_8 + mov x4, x30 + lsl x1, x1, #1 + ld1 {v30.8h,v31.8h}, [x5] + movi v28.16b, #0 + movi v29.16b, #0 + + bl hadamard_ac_8x8_neon +.if \h > 8 + bl hadamard_ac_8x8_neon +.endif +.if \w > 8 + sub x0, x0, x1, lsl #3 + add x0, x0, 16 + bl hadamard_ac_8x8_neon +.endif +.if \w * \h == 256 + sub x0, x0, x1, lsl #4 + bl hadamard_ac_8x8_neon +.endif + + addv s1, v29.4s + addv s0, v28.4s + mov w1, v1.s[0] + mov w0, v0.s[0] + lsr w1, w1, #2 + lsr w0, w0, #1 + orr x0, x0, x1, lsl #32 + ret x4 +endfunc +.endm + +// v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8 +function hadamard_ac_8x8_neon + ld1 {v16.8h}, [x0], x1 + ld1 {v17.8h}, [x0], x1 + ld1 {v18.8h}, [x0], x1 + ld1 {v19.8h}, [x0], x1 + SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h + ld1 {v20.8h}, [x0], x1 + ld1 {v21.8h}, [x0], x1 + SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h + ld1 {v22.8h}, [x0], x1 + ld1 {v23.8h}, [x0], x1 + SUMSUB_AB v4.8h, v5.8h, v20.8h, v21.8h + SUMSUB_AB v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h + SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h + + transpose v0.8h, v1.8h, v16.8h, v17.8h + transpose v2.8h, v3.8h, v18.8h, v19.8h + transpose v4.8h, v5.8h, v20.8h, v21.8h + transpose v6.8h, v7.8h, v22.8h, v23.8h + + SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h + SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h + SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h + SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h + + transpose v0.4s, v2.4s, v16.4s, v18.4s + transpose v1.4s, v3.4s, v17.4s, v19.4s + transpose v4.4s, v6.4s, v20.4s, v22.4s + transpose v5.4s, v7.4s, v21.4s, v23.4s + + SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h + SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h + SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h + + abs v0.8h, v16.8h + abs v4.8h, v20.8h + abs v1.8h, v17.8h + abs v5.8h, v21.8h + abs v2.8h, v18.8h + abs v6.8h, v22.8h + abs v3.8h, v19.8h + abs v7.8h, v23.8h + + add v0.8h, v0.8h, v4.8h + add v1.8h, v1.8h, v5.8h + and v0.16b, v0.16b, v30.16b + add v2.8h, v2.8h, v6.8h + add v3.8h, v3.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v1.8h, v1.8h, v3.8h + uadalp v28.4s, v0.8h + uadalp v28.4s, v1.8h + + SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h + SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h + SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h + SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h + + transpose v16.2d, v17.2d, v6.2d, v7.2d + transpose v18.2d, v19.2d, v4.2d, v5.2d + transpose v20.2d, v21.2d, v2.2d, v3.2d + + abs v16.8h, v16.8h + abs v17.8h, v17.8h + abs v18.8h, v18.8h + abs v19.8h, v19.8h + abs v20.8h, v20.8h + abs v21.8h, v21.8h + + transpose v7.2d, v6.2d, v1.2d, v0.2d + + umax v3.8h, v16.8h, v17.8h + umax v2.8h, v18.8h, v19.8h + umax v1.8h, v20.8h, v21.8h + + SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h + + add v2.8h, v2.8h, 
v3.8h + add v2.8h, v2.8h, v1.8h + and v4.16b, v4.16b, v31.16b + add v2.8h, v2.8h, v2.8h + abs v5.8h, v5.8h + abs v4.8h, v4.8h + add v2.8h, v2.8h, v5.8h + add v2.8h, v2.8h, v4.8h + uadalp v29.4s, v2.8h + ret +endfunc + +function pixel_ssim_4x4x2_core_neon, export=1 + lsl x1, x1, #1 + lsl x3, x3, #1 + + ld1 {v0.8h}, [x0], x1 + ld1 {v2.8h}, [x2], x3 + ld1 {v28.8h}, [x0], x1 + ld1 {v29.8h}, [x2], x3 + + umull v16.4s, v0.4h, v0.4h + umull2 v17.4s, v0.8h, v0.8h + umull v18.4s, v0.4h, v2.4h + umull2 v19.4s, v0.8h, v2.8h + umlal v16.4s, v2.4h, v2.4h + umlal2 v17.4s, v2.8h, v2.8h + + ld1 {v26.8h}, [x0], x1 + ld1 {v27.8h}, [x2], x3 + + umlal v16.4s, v28.4h, v28.4h + umlal2 v17.4s, v28.8h, v28.8h + umlal v18.4s, v28.4h, v29.4h + umlal2 v19.4s, v28.8h, v29.8h + umlal v16.4s, v29.4h, v29.4h + umlal2 v17.4s, v29.8h, v29.8h + + add v0.8h, v0.8h, v28.8h + add v1.8h, v2.8h, v29.8h + + umlal v16.4s, v26.4h, v26.4h + umlal2 v17.4s, v26.8h, v26.8h + umlal v18.4s, v26.4h, v27.4h + umlal2 v19.4s, v26.8h, v27.8h + umlal v16.4s, v27.4h, v27.4h + umlal2 v17.4s, v27.8h, v27.8h + + ld1 {v28.8h}, [x0], x1 + ld1 {v29.8h}, [x2], x3 + + add v0.8h, v0.8h, v26.8h + add v1.8h, v1.8h, v27.8h + + umlal v16.4s, v28.4h, v28.4h + umlal2 v17.4s, v28.8h, v28.8h + umlal v18.4s, v28.4h, v29.4h + umlal2 v19.4s, v28.8h, v29.8h + umlal v16.4s, v29.4h, v29.4h + umlal2 v17.4s, v29.8h, v29.8h + + add v0.8h, v0.8h, v28.8h + add v1.8h, v1.8h, v29.8h + + addp v16.4s, v16.4s, v17.4s + addp v17.4s, v18.4s, v19.4s + + uaddlp v0.4s, v0.8h + uaddlp v1.4s, v1.8h + + addp v0.4s, v0.4s, v0.4s + addp v1.4s, v1.4s, v1.4s + addp v2.4s, v16.4s, v16.4s + addp v3.4s, v17.4s, v17.4s + + st4 {v0.2s, v1.2s, v2.2s, v3.2s}, [x4] + ret +endfunc + +function pixel_ssim_end4_neon, export=1 + mov x5, #4 + ld1 {v16.4s, v17.4s}, [x0], #32 + ld1 {v18.4s, v19.4s}, [x1], #32 + subs x2, x5, w2, uxtw + // These values must be stored in float, since with 10 bit depth edge cases + // may overflow. The hexadecimal values are IEEE-754 representation of the + // floating point numbers. 
+ ldr w3, =0x45d14e49 // ssim_c1 = .01*.01*1023*1023*64 + ldr w4, =0x4a67ca32 // ssim_c2 = .03*.03*1023*1023*64*63 + add v0.4s, v16.4s, v18.4s + add v1.4s, v17.4s, v19.4s + add v0.4s, v0.4s, v1.4s + ld1 {v20.4s, v21.4s}, [x0], #32 + ld1 {v22.4s, v23.4s}, [x1], #32 + add v2.4s, v20.4s, v22.4s + add v3.4s, v21.4s, v23.4s + add v1.4s, v1.4s, v2.4s + ld1 {v16.4s}, [x0], #16 + ld1 {v18.4s}, [x1], #16 + add v16.4s, v16.4s, v18.4s + add v2.4s, v2.4s, v3.4s + add v3.4s, v3.4s, v16.4s + + dup v30.4s, w3 + dup v31.4s, w4 + + transpose v4.4s, v5.4s, v0.4s, v1.4s + transpose v6.4s, v7.4s, v2.4s, v3.4s + transpose v0.2d, v2.2d, v4.2d, v6.2d + transpose v1.2d, v3.2d, v5.2d, v7.2d + + // Conversion to floating point number must occur earlier than in 8 bit case + // because of the range overflow + scvtf v0.4s, v0.4s + scvtf v2.4s, v2.4s + scvtf v1.4s, v1.4s + scvtf v3.4s, v3.4s + + fmul v16.4s, v0.4s, v1.4s // s1*s2 + fmul v0.4s, v0.4s, v0.4s + fmla v0.4s, v1.4s, v1.4s // s1*s1 + s2*s2 + + // IEEE-754 hexadecimal representation of multipliers + ldr w3, =0x42800000 // 64 + ldr w4, =0x43000000 // 128 + dup v28.4s, w3 + dup v29.4s, w4 + + fmul v2.4s, v2.4s, v28.4s + fmul v3.4s, v3.4s, v29.4s + + fadd v1.4s, v16.4s, v16.4s + + fsub v2.4s, v2.4s, v0.4s // vars + fsub v3.4s, v3.4s, v1.4s // covar*2 + fadd v0.4s, v0.4s, v30.4s + fadd v2.4s, v2.4s, v31.4s + fadd v1.4s, v1.4s, v30.4s + fadd v3.4s, v3.4s, v31.4s + + fmul v0.4s, v0.4s, v2.4s + fmul v1.4s, v1.4s, v3.4s + + fdiv v0.4s, v1.4s, v0.4s + + b.eq 1f + movrel x3, mask + add x3, x3, x2, lsl #2 + ld1 {v29.4s}, [x3] + and v0.16b, v0.16b, v29.16b +1: + faddp v0.4s, v0.4s, v0.4s + faddp s0, v0.2s + ret +endfunc + +#endif /* BIT_DEPTH == 8 */ + +SAD_FUNC 4, 4 +SAD_FUNC 4, 8 +SAD_FUNC 4, 16 +SAD_FUNC 8, 4 +SAD_FUNC 8, 8 +SAD_FUNC 8, 16 +SAD_FUNC 16, 8 +SAD_FUNC 16, 16 + +SAD_X_FUNC 3, 4, 4 +SAD_X_FUNC 3, 4, 8 +SAD_X_FUNC 3, 8, 4 +SAD_X_FUNC 3, 8, 8 +SAD_X_FUNC 3, 8, 16 +SAD_X_FUNC 3, 16, 8 +SAD_X_FUNC 3, 16, 16 + +SAD_X_FUNC 4, 4, 4 +SAD_X_FUNC 4, 4, 8 +SAD_X_FUNC 4, 8, 4 +SAD_X_FUNC 4, 8, 8 +SAD_X_FUNC 4, 8, 16 +SAD_X_FUNC 4, 16, 8 +SAD_X_FUNC 4, 16, 16 + +SSD_FUNC 4, 4 +SSD_FUNC 4, 8 +SSD_FUNC 4, 16 +SSD_FUNC 8, 4 +SSD_FUNC 8, 8 +SSD_FUNC 8, 16 +SSD_FUNC 16, 8 +SSD_FUNC 16, 16 + +pixel_var_8 8 +pixel_var_8 16 + +pixel_var2_8 8 +pixel_var2_8 16 + +sa8d_satd_8x8 +sa8d_satd_8x8 satd_ + +HADAMARD_AC 8, 8 +HADAMARD_AC 8, 16 +HADAMARD_AC 16, 8 +HADAMARD_AC 16, 16 diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h index e1bcbcb5a..fac6c7445 100644 --- a/common/aarch64/pixel.h +++ b/common/aarch64/pixel.h @@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: aarch64 pixel metrics ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau @@ -65,6 +65,11 @@ #define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon) #define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon) #define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon) +#define x264_pixel_ssd_4x16_sve x264_template(pixel_ssd_4x16_sve) +#define x264_pixel_ssd_4x4_sve x264_template(pixel_ssd_4x4_sve) +#define x264_pixel_ssd_4x8_sve x264_template(pixel_ssd_4x8_sve) +#define x264_pixel_ssd_8x4_sve x264_template(pixel_ssd_8x4_sve) +#define x264_pixel_ssd_8x8_sve x264_template(pixel_ssd_8x8_sve) #define DECL_PIXELS( ret, name, suffix, args ) \ ret x264_pixel_##name##_16x16_##suffix args;\ ret 
x264_pixel_##name##_16x8_##suffix args;\ @@ -73,10 +78,18 @@ ret x264_pixel_##name##_8x4_##suffix args;\ ret x264_pixel_##name##_4x16_##suffix args;\ ret x264_pixel_##name##_4x8_##suffix args;\ - ret x264_pixel_##name##_4x4_##suffix args;\ + ret x264_pixel_##name##_4x4_##suffix args; +#define DECL_PIXELS_SSD_SVE( ret, args ) \ + ret x264_pixel_ssd_8x8_sve args;\ + ret x264_pixel_ssd_8x4_sve args;\ + ret x264_pixel_ssd_4x16_sve args;\ + ret x264_pixel_ssd_4x8_sve args;\ + ret x264_pixel_ssd_4x4_sve args; #define DECL_X1( name, suffix ) \ DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) ) +#define DECL_X1_SSD_SVE( ) \ + DECL_PIXELS_SSD_SVE( int, ( pixel *, intptr_t, pixel *, intptr_t ) ) #define DECL_X4( name, suffix ) \ DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\ @@ -86,49 +99,66 @@ DECL_X1( sad, neon ) DECL_X4( sad, neon ) DECL_X1( satd, neon ) DECL_X1( ssd, neon ) +DECL_X1_SSD_SVE( ) #define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon) -void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * ); +void x264_pixel_ssd_nv12_core_neon( pixel *, intptr_t, pixel *, intptr_t, int, int, uint64_t *, uint64_t * ); #define x264_pixel_vsad_neon x264_template(pixel_vsad_neon) -int x264_pixel_vsad_neon( uint8_t *, intptr_t, int ); +int x264_pixel_vsad_neon( pixel *, intptr_t, int ); #define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon) -int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sa8d_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t ); #define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon) -int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); +int x264_pixel_sa8d_16x16_neon( pixel *, intptr_t, pixel *, intptr_t ); #define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon) -uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); +uint64_t x264_pixel_sa8d_satd_16x16_neon( pixel *, intptr_t, pixel *, intptr_t ); +#define x264_pixel_sa8d_8x8_sve x264_template(pixel_sa8d_8x8_sve) +int x264_pixel_sa8d_8x8_sve ( pixel *, intptr_t, pixel *, intptr_t ); #define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon) -uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_var_8x8_neon ( pixel *, intptr_t ); #define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon) -uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_var_8x16_neon ( pixel *, intptr_t ); #define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon) -uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); +uint64_t x264_pixel_var_16x16_neon( pixel *, intptr_t ); #define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon) -int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * ); +int x264_pixel_var2_8x8_neon ( pixel *, pixel *, int * ); #define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon) -int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * ); +int x264_pixel_var2_8x16_neon( pixel *, pixel *, int * ); +#define x264_pixel_var_8x8_sve x264_template(pixel_var_8x8_sve) +uint64_t x264_pixel_var_8x8_sve ( pixel *, intptr_t ); +#define x264_pixel_var_8x16_sve x264_template(pixel_var_8x16_sve) +uint64_t x264_pixel_var_8x16_sve ( pixel *, intptr_t ); + #define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon) -uint64_t 
x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_8x8_neon ( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon) -uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_8x16_neon ( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon) -uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_16x8_neon ( pixel *, intptr_t ); #define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon) -uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t ); +uint64_t x264_pixel_hadamard_ac_16x16_neon( pixel *, intptr_t ); +#define x264_pixel_hadamard_ac_8x8_sve x264_template(pixel_hadamard_ac_8x8_sve) +uint64_t x264_pixel_hadamard_ac_8x8_sve ( pixel *, intptr_t ); +#define x264_pixel_hadamard_ac_8x16_sve x264_template(pixel_hadamard_ac_8x16_sve) +uint64_t x264_pixel_hadamard_ac_8x16_sve ( pixel *, intptr_t ); +#define x264_pixel_hadamard_ac_16x8_sve x264_template(pixel_hadamard_ac_16x8_sve) +uint64_t x264_pixel_hadamard_ac_16x8_sve ( pixel *, intptr_t ); +#define x264_pixel_hadamard_ac_16x16_sve x264_template(pixel_hadamard_ac_16x16_sve) +uint64_t x264_pixel_hadamard_ac_16x16_sve( pixel *, intptr_t ); + #define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon) -void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t, - const uint8_t *, intptr_t, +void x264_pixel_ssim_4x4x2_core_neon( const pixel *, intptr_t, + const pixel *, intptr_t, int sums[2][4] ); #define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon) float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width ); #define x264_pixel_asd8_neon x264_template(pixel_asd8_neon) -int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +int x264_pixel_asd8_neon( pixel *, intptr_t, pixel *, intptr_t, int ); #endif diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S index 1444711ba..5a052e713 100644 --- a/common/aarch64/predict-a.S +++ b/common/aarch64/predict-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * predict.S: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Mans Rullgard @@ -36,29 +36,29 @@ endconst .macro ldcol.8 vd, xn, xm, n=8, hi=0 .if \n == 8 || \hi == 0 - ld1 {\vd\().b}[0], [\xn], \xm - ld1 {\vd\().b}[1], [\xn], \xm - ld1 {\vd\().b}[2], [\xn], \xm - ld1 {\vd\().b}[3], [\xn], \xm + ld1 {\vd\().b}[0], [\xn], \xm + ld1 {\vd\().b}[1], [\xn], \xm + ld1 {\vd\().b}[2], [\xn], \xm + ld1 {\vd\().b}[3], [\xn], \xm .endif .if \n == 8 || \hi == 1 - ld1 {\vd\().b}[4], [\xn], \xm - ld1 {\vd\().b}[5], [\xn], \xm - ld1 {\vd\().b}[6], [\xn], \xm - ld1 {\vd\().b}[7], [\xn], \xm + ld1 {\vd\().b}[4], [\xn], \xm + ld1 {\vd\().b}[5], [\xn], \xm + ld1 {\vd\().b}[6], [\xn], \xm + ld1 {\vd\().b}[7], [\xn], \xm .endif .endm .macro ldcol.16 vd, xn, xm ldcol.8 \vd, \xn, \xm - ld1 {\vd\().b}[ 8], [\xn], \xm - ld1 {\vd\().b}[ 9], [\xn], \xm - ld1 {\vd\().b}[10], [\xn], \xm - ld1 {\vd\().b}[11], [\xn], \xm - ld1 {\vd\().b}[12], [\xn], \xm - ld1 {\vd\().b}[13], [\xn], \xm - ld1 {\vd\().b}[14], [\xn], \xm - ld1 {\vd\().b}[15], [\xn], \xm + ld1 {\vd\().b}[ 8], [\xn], \xm + ld1 {\vd\().b}[ 9], [\xn], \xm + ld1 
{\vd\().b}[10], [\xn], \xm + ld1 {\vd\().b}[11], [\xn], \xm + ld1 {\vd\().b}[12], [\xn], \xm + ld1 {\vd\().b}[13], [\xn], \xm + ld1 {\vd\().b}[14], [\xn], \xm + ld1 {\vd\().b}[15], [\xn], \xm .endm @@ -127,13 +127,13 @@ endfunc function predict_4x4_ddr_neon, export=1 sub x1, x0, #FDEC_STRIDE+1 mov x7, #FDEC_STRIDE - ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1 - ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1 - ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1 + ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1 + ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1 + ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1 ext v0.8b, v1.8b, v0.8b, #7 - ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1 + ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1 ext v0.8b, v2.8b, v0.8b, #7 // a - ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1 + ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1 ext v1.8b, v3.8b, v0.8b, #7 // b ext v2.8b, v4.8b, v1.8b, #7 // c uaddl v0.8h, v0.8b, v1.8b @@ -155,7 +155,7 @@ endfunc function predict_4x4_ddl_neon, export=1 sub x0, x0, #FDEC_STRIDE mov x7, #FDEC_STRIDE - ld1 {v0.8b}, [x0], x7 + ld1 {v0.8b}, [x0], x7 dup v3.8b, v0.b[7] ext v1.8b, v0.8b, v0.8b, #1 ext v2.8b, v0.8b, v3.8b, #2 @@ -173,8 +173,8 @@ endfunc function predict_8x8_dc_neon, export=1 mov x7, #FDEC_STRIDE - ld1 {v0.16b}, [x1], #16 - ld1 {v1.8b}, [x1] + ld1 {v0.16b}, [x1], #16 + ld1 {v1.8b}, [x1] ext v0.16b, v0.16b, v0.16b, #7 uaddlv h1, v1.8b uaddlv h0, v0.8b @@ -182,39 +182,39 @@ function predict_8x8_dc_neon, export=1 dup v0.8h, v0.h[0] rshrn v0.8b, v0.8h, #4 .rept 8 - st1 {v0.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 .endr ret endfunc function predict_8x8_h_neon, export=1 mov x7, #FDEC_STRIDE - ld1 {v16.16b}, [x1] + ld1 {v16.16b}, [x1] dup v0.8b, v16.b[14] dup v1.8b, v16.b[13] - st1 {v0.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 dup v2.8b, v16.b[12] - st1 {v1.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 dup v3.8b, v16.b[11] - st1 {v2.8b}, [x0], x7 + st1 {v2.8b}, [x0], x7 dup v4.8b, v16.b[10] - st1 {v3.8b}, [x0], x7 + st1 {v3.8b}, [x0], x7 dup v5.8b, v16.b[9] - st1 {v4.8b}, [x0], x7 + st1 {v4.8b}, [x0], x7 dup v6.8b, v16.b[8] - st1 {v5.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 dup v7.8b, v16.b[7] - st1 {v6.8b}, [x0], x7 - st1 {v7.8b}, [x0], x7 + st1 {v6.8b}, [x0], x7 + st1 {v7.8b}, [x0], x7 ret endfunc function predict_8x8_v_neon, export=1 add x1, x1, #16 mov x7, #FDEC_STRIDE - ld1 {v0.8b}, [x1] + ld1 {v0.8b}, [x1] .rept 8 - st1 {v0.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 .endr ret endfunc @@ -222,7 +222,7 @@ endfunc function predict_8x8_ddl_neon, export=1 add x1, x1, #16 mov x7, #FDEC_STRIDE - ld1 {v0.16b}, [x1] + ld1 {v0.16b}, [x1] movi v3.16b, #0 dup v2.16b, v0.b[15] ext v4.16b, v3.16b, v0.16b, #15 @@ -231,25 +231,25 @@ function predict_8x8_ddl_neon, export=1 urhadd v0.16b, v0.16b, v4.16b ext v1.16b, v0.16b, v0.16b, #1 ext v2.16b, v0.16b, v0.16b, #2 - st1 {v1.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 ext v3.16b, v0.16b, v0.16b, #3 - st1 {v2.8b}, [x0], x7 + st1 {v2.8b}, [x0], x7 ext v4.16b, v0.16b, v0.16b, #4 - st1 {v3.8b}, [x0], x7 + st1 {v3.8b}, [x0], x7 ext v5.16b, v0.16b, v0.16b, #5 - st1 {v4.8b}, [x0], x7 + st1 {v4.8b}, [x0], x7 ext v6.16b, v0.16b, v0.16b, #6 - st1 {v5.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 ext v7.16b, v0.16b, v0.16b, #7 - st1 {v6.8b}, [x0], x7 + st1 {v6.8b}, [x0], x7 ext v0.16b, v0.16b, v0.16b, #8 - st1 {v7.8b}, [x0], x7 - st1 {v0.8b}, [x0], x7 + st1 {v7.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 ret endfunc function predict_8x8_ddr_neon, export=1 - ld1 {v0.16b,v1.16b}, [x1] + ld1 {v0.16b,v1.16b}, [x1] ext v2.16b, v0.16b, v1.16b, #7 ext v4.16b, v0.16b, v1.16b, #9 ext v3.16b, 
v0.16b, v1.16b, #8 @@ -261,20 +261,20 @@ function predict_8x8_ddr_neon, export=1 mov x7, #-1*FDEC_STRIDE ext v6.16b, v7.16b, v7.16b, #1 - st1 {v7.8b}, [x0], x7 + st1 {v7.8b}, [x0], x7 ext v5.16b, v7.16b, v7.16b, #2 - st1 {v6.8b}, [x0], x7 + st1 {v6.8b}, [x0], x7 ext v4.16b, v7.16b, v7.16b, #3 - st1 {v5.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 ext v3.16b, v7.16b, v7.16b, #4 - st1 {v4.8b}, [x0], x7 + st1 {v4.8b}, [x0], x7 ext v2.16b, v7.16b, v7.16b, #5 - st1 {v3.8b}, [x0], x7 + st1 {v3.8b}, [x0], x7 ext v1.16b, v7.16b, v7.16b, #6 - st1 {v2.8b}, [x0], x7 + st1 {v2.8b}, [x0], x7 ext v0.16b, v7.16b, v7.16b, #7 - st1 {v1.8b}, [x0], x7 - st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 ret endfunc @@ -282,7 +282,7 @@ function predict_8x8_vl_neon, export=1 add x1, x1, #16 mov x7, #FDEC_STRIDE - ld1 {v0.16b}, [x1] + ld1 {v0.16b}, [x1] ext v1.16b, v1.16b, v0.16b, #15 ext v2.16b, v0.16b, v2.16b, #1 @@ -291,28 +291,28 @@ function predict_8x8_vl_neon, export=1 urhadd v0.16b, v0.16b, v1.16b - ext v4.16b, v0.16b, v0.16b, #1 - st1 {v3.8b}, [x0], x7 - ext v5.16b, v3.16b, v3.16b, #1 - st1 {v4.8b}, [x0], x7 - ext v6.16b, v0.16b, v0.16b, #2 - st1 {v5.8b}, [x0], x7 - ext v7.16b, v3.16b, v3.16b, #2 - st1 {v6.8b}, [x0], x7 - ext v4.16b, v0.16b, v0.16b, #3 - st1 {v7.8b}, [x0], x7 - ext v5.16b, v3.16b, v3.16b, #3 - st1 {v4.8b}, [x0], x7 - ext v6.16b, v0.16b, v0.16b, #4 - st1 {v5.8b}, [x0], x7 - st1 {v6.8b}, [x0], x7 + ext v4.16b, v0.16b, v0.16b, #1 + st1 {v3.8b}, [x0], x7 + ext v5.16b, v3.16b, v3.16b, #1 + st1 {v4.8b}, [x0], x7 + ext v6.16b, v0.16b, v0.16b, #2 + st1 {v5.8b}, [x0], x7 + ext v7.16b, v3.16b, v3.16b, #2 + st1 {v6.8b}, [x0], x7 + ext v4.16b, v0.16b, v0.16b, #3 + st1 {v7.8b}, [x0], x7 + ext v5.16b, v3.16b, v3.16b, #3 + st1 {v4.8b}, [x0], x7 + ext v6.16b, v0.16b, v0.16b, #4 + st1 {v5.8b}, [x0], x7 + st1 {v6.8b}, [x0], x7 ret endfunc function predict_8x8_vr_neon, export=1 add x1, x1, #8 mov x7, #FDEC_STRIDE - ld1 {v2.16b}, [x1] + ld1 {v2.16b}, [x1] ext v1.16b, v2.16b, v2.16b, #14 ext v0.16b, v2.16b, v2.16b, #15 @@ -326,20 +326,20 @@ function predict_8x8_vr_neon, export=1 uzp2 v3.8b, v0.8b, v0.8b ext v0.16b, v0.16b, v0.16b, #8 - st1 {v1.8b}, [x0], x7 - st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 ext v4.8b, v3.8b, v1.8b, #7 ext v5.8b, v2.8b, v0.8b, #7 - st1 {v4.8b}, [x0], x7 - st1 {v5.8b}, [x0], x7 + st1 {v4.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 ext v6.8b, v3.8b, v1.8b, #6 ext v7.8b, v2.8b, v0.8b, #6 - st1 {v6.8b}, [x0], x7 - st1 {v7.8b}, [x0], x7 + st1 {v6.8b}, [x0], x7 + st1 {v7.8b}, [x0], x7 ext v1.8b, v3.8b, v1.8b, #5 ext v0.8b, v2.8b, v0.8b, #5 - st1 {v1.8b}, [x0], x7 - st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 ret endfunc @@ -347,7 +347,7 @@ function predict_8x8_hd_neon, export=1 add x1, x1, #7 mov x7, #FDEC_STRIDE - ld1 {v1.16b}, [x1] + ld1 {v1.16b}, [x1] ext v3.16b, v1.16b, v1.16b, #1 ext v2.16b, v1.16b, v1.16b, #2 @@ -362,18 +362,18 @@ function predict_8x8_hd_neon, export=1 ext v0.8b, v17.8b, v7.8b, #6 ext v1.8b, v17.8b, v7.8b, #4 - st1 {v0.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 ext v2.8b, v17.8b, v7.8b, #2 - st1 {v1.8b}, [x0], x7 - st1 {v2.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 + st1 {v2.8b}, [x0], x7 ext v3.8b, v16.8b, v17.8b, #6 - st1 {v17.8b}, [x0], x7 + st1 {v17.8b}, [x0], x7 ext v4.8b, v16.8b, v17.8b, #4 - st1 {v3.8b}, [x0], x7 + st1 {v3.8b}, [x0], x7 ext v5.8b, v16.8b, v17.8b, #2 - st1 {v4.8b}, [x0], x7 - st1 {v5.8b}, [x0], x7 - st1 {v16.8b}, [x0], x7 + st1 {v4.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 + st1 {v16.8b}, [x0], x7 
ret endfunc @@ -381,7 +381,7 @@ endfunc function predict_8x8_hu_neon, export=1 add x1, x1, #7 mov x7, #FDEC_STRIDE - ld1 {v7.8b}, [x1] + ld1 {v7.8b}, [x1] dup v6.8b, v7.b[0] rev64 v7.8b, v7.8b @@ -400,18 +400,18 @@ function predict_8x8_hu_neon, export=1 ext v0.8b, v16.8b, v17.8b, #2 ext v1.8b, v16.8b, v17.8b, #4 ext v2.8b, v16.8b, v17.8b, #6 - st1 {v16.8b}, [x0], x7 - st1 {v0.8b}, [x0], x7 - st1 {v1.8b}, [x0], x7 - st1 {v2.8b}, [x0], x7 + st1 {v16.8b}, [x0], x7 + st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 + st1 {v2.8b}, [x0], x7 ext v4.8b, v17.8b, v18.8b, #2 ext v5.8b, v17.8b, v18.8b, #4 ext v6.8b, v17.8b, v18.8b, #6 - st1 {v17.8b}, [x0], x7 - st1 {v4.8b}, [x0], x7 - st1 {v5.8b}, [x0], x7 - st1 {v6.8b}, [x0] + st1 {v17.8b}, [x0], x7 + st1 {v4.8b}, [x0], x7 + st1 {v5.8b}, [x0], x7 + st1 {v6.8b}, [x0] ret endfunc @@ -419,7 +419,7 @@ endfunc function predict_8x8c_dc_top_neon, export=1 sub x2, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE - ld1 {v0.8b}, [x2] + ld1 {v0.8b}, [x2] uaddlp v0.4h, v0.8b addp v0.4h, v0.4h, v0.4h rshrn v0.8b, v0.8h, #2 @@ -469,7 +469,7 @@ function predict_8x8c_dc_neon, export=1 add w6, w6, w7 add w10, w10, w12, lsl #16 add w4, w4, w6, lsl #16 - ld1 {v0.8b}, [x2] + ld1 {v0.8b}, [x2] add x10, x10, x4, lsl #32 uaddlp v0.4h, v0.8b // s0, s1 mov v1.d[0], x10 // s2, s3 @@ -487,14 +487,14 @@ pred8x8c_dc_end: add x2, x0, #2 * FDEC_STRIDE add x4, x0, #4 * FDEC_STRIDE add x5, x0, #6 * FDEC_STRIDE - st1 {v0.8b}, [x0], x1 - st1 {v0.8b}, [x2], x1 - st1 {v0.8b}, [x0] - st1 {v0.8b}, [x2] - st1 {v1.8b}, [x4], x1 - st1 {v1.8b}, [x5], x1 - st1 {v1.8b}, [x4] - st1 {v1.8b}, [x5] + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x2], x1 + st1 {v0.8b}, [x0] + st1 {v0.8b}, [x2] + st1 {v1.8b}, [x4], x1 + st1 {v1.8b}, [x5], x1 + st1 {v1.8b}, [x4] + st1 {v1.8b}, [x5] ret endfunc @@ -502,10 +502,10 @@ function predict_8x8c_h_neon, export=1 sub x1, x0, #1 mov x7, #FDEC_STRIDE .rept 4 - ld1r {v0.8b}, [x1], x7 - ld1r {v1.8b}, [x1], x7 - st1 {v0.8b}, [x0], x7 - st1 {v1.8b}, [x0], x7 + ld1r {v0.8b}, [x1], x7 + ld1r {v1.8b}, [x1], x7 + st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x0], x7 .endr ret endfunc @@ -523,8 +523,8 @@ function predict_8x8c_p_neon, export=1 mov x1, #FDEC_STRIDE add x2, x3, #4 sub x3, x3, #1 - ld1 {v0.s}[0], [x3] - ld1 {v2.s}[0], [x2], x1 + ld1 {v0.s}[0], [x3] + ld1 {v2.s}[0], [x2], x1 ldcol.8 v0, x3, x1, 4, hi=1 add x3, x3, x1 ldcol.8 v3, x3, x1, 4 @@ -533,10 +533,10 @@ function predict_8x8c_p_neon, export=1 uaddl v4.8h, v2.8b, v3.8b rev32 v0.8b, v0.8b trn1 v2.2s, v2.2s, v3.2s - ld1 {v7.8h}, [x4] + ld1 {v7.8h}, [x4] usubl v2.8h, v2.8b, v0.8b mul v2.8h, v2.8h, v7.8h - ld1 {v0.8h}, [x5] + ld1 {v0.8h}, [x5] saddlp v2.4s, v2.8h addp v2.4s, v2.4s, v2.4s shl v3.2s, v2.2s, #4 @@ -561,7 +561,7 @@ function predict_8x8c_p_neon, export=1 subs x3, x3, #1 sqshrun v0.8b, v1.8h, #5 add v1.8h, v1.8h, v2.8h - st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 b.ne 1b ret endfunc @@ -587,14 +587,14 @@ function predict_8x16c_h_neon, export=1 mov x7, #2 * FDEC_STRIDE add x1, x0, #FDEC_STRIDE .rept 4 - ld1r {v0.8b}, [x2], x7 - ld1r {v1.8b}, [x3], x7 - ld1r {v2.8b}, [x2], x7 - ld1r {v3.8b}, [x3], x7 - st1 {v0.8b}, [x0], x7 - st1 {v1.8b}, [x1], x7 - st1 {v2.8b}, [x0], x7 - st1 {v3.8b}, [x1], x7 + ld1r {v0.8b}, [x2], x7 + ld1r {v1.8b}, [x3], x7 + ld1r {v2.8b}, [x2], x7 + ld1r {v3.8b}, [x3], x7 + st1 {v0.8b}, [x0], x7 + st1 {v1.8b}, [x1], x7 + st1 {v2.8b}, [x0], x7 + st1 {v3.8b}, [x1], x7 .endr ret endfunc @@ -602,24 +602,24 @@ endfunc function predict_8x16c_v_neon, export=1 sub x1, x0, #FDEC_STRIDE mov x2, #2 * FDEC_STRIDE - 
ld1 {v0.8b}, [x1], x2 + ld1 {v0.8b}, [x1], x2 .rept 8 - st1 {v0.8b}, [x0], x2 - st1 {v0.8b}, [x1], x2 + st1 {v0.8b}, [x0], x2 + st1 {v0.8b}, [x1], x2 .endr ret endfunc function predict_8x16c_p_neon, export=1 movrel x4, p16weight - ld1 {v17.8h}, [x4] + ld1 {v17.8h}, [x4] sub x3, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE add x2, x3, #4 sub x3, x3, #1 - ld1 {v0.8b}, [x3] - ld1 {v2.8b}, [x2], x1 + ld1 {v0.8b}, [x3] + ld1 {v2.8b}, [x2], x1 ldcol.8 v1, x3, x1 add x3, x3, x1 ldcol.8 v3, x3, x1 @@ -670,9 +670,9 @@ function predict_8x16c_p_neon, export=1 sqrshrun v4.8b, v1.8h, #5 add v1.8h, v1.8h, v2.8h sqrshrun v5.8b, v1.8h, #5 - st1 {v4.8b}, [x0], x1 + st1 {v4.8b}, [x0], x1 add v1.8h, v1.8h, v2.8h - st1 {v5.8b}, [x0], x1 + st1 {v5.8b}, [x0], x1 b.ne 1b ret endfunc @@ -681,7 +681,7 @@ function predict_8x16c_dc_neon, export=1 mov x1, #FDEC_STRIDE sub x10, x0, #FDEC_STRIDE loadsum4 w2, w3, w4, w5, x0, 0 - ld1 {v6.8b}, [x10] + ld1 {v6.8b}, [x10] loadsum4 w6, w7, w8, w9, x0, 4 uaddlp v6.4h, v6.8b dup v22.8h, w2 // s2 @@ -714,10 +714,10 @@ function predict_8x16c_dc_neon, export=1 add x12, x0, #8 * FDEC_STRIDE add x13, x0, #12 * FDEC_STRIDE .rept 4 - st1 {v0.8b}, [x0], x1 - st1 {v1.8b}, [x11], x1 - st1 {v2.8b}, [x12], x1 - st1 {v3.8b}, [x13], x1 + st1 {v0.8b}, [x0], x1 + st1 {v1.8b}, [x11], x1 + st1 {v2.8b}, [x12], x1 + st1 {v3.8b}, [x13], x1 .endr ret endfunc @@ -760,17 +760,17 @@ function predict_8x16c_dc_left_neon, export=1 add w2, w2, w3 rshrn v2.8b, v2.8h, #2 add w4, w4, w5 - st1 {v0.8b}, [x0], x1 - st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 add w2, w2, w4 - st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 dup v3.8h, w2 - st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 rshrn v3.8b, v3.8h, #2 .irp idx, 1, 2, 3 .rept 4 - st1 {v\idx\().8b}, [x0], x1 + st1 {v\idx\().8b}, [x0], x1 .endr .endr ret @@ -779,7 +779,7 @@ endfunc function predict_8x16c_dc_top_neon, export=1 sub x2, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE - ld1 {v0.8b}, [x2] + ld1 {v0.8b}, [x2] uaddlp v0.4h, v0.8b addp v0.4h, v0.4h, v0.4h rshrn v4.8b, v0.8h, #2 @@ -787,7 +787,7 @@ function predict_8x16c_dc_top_neon, export=1 dup v1.8b, v4.b[1] ext v0.8b, v0.8b, v1.8b, #4 .rept 16 - st1 {v0.8b}, [x0], x1 + st1 {v0.8b}, [x0], x1 .endr ret endfunc @@ -796,7 +796,7 @@ endfunc function predict_16x16_dc_top_neon, export=1 sub x2, x0, #FDEC_STRIDE mov x1, #FDEC_STRIDE - ld1 {v0.16b}, [x2] + ld1 {v0.16b}, [x2] uaddlv h0, v0.16b rshrn v0.8b, v0.8h, #4 dup v0.16b, v0.b[0] @@ -817,7 +817,7 @@ function predict_16x16_dc_neon, export=1 sub x3, x0, #FDEC_STRIDE sub x2, x0, #1 mov x1, #FDEC_STRIDE - ld1 {v0.16b}, [x3] + ld1 {v0.16b}, [x3] ldcol.16 v1, x2, x1 uaddlv h0, v0.16b uaddlv h1, v1.16b @@ -826,7 +826,7 @@ function predict_16x16_dc_neon, export=1 dup v0.16b, v0.b[0] pred16x16_dc_end: .rept 16 - st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x0], x1 .endr ret endfunc @@ -835,10 +835,10 @@ function predict_16x16_h_neon, export=1 sub x1, x0, #1 mov x7, #FDEC_STRIDE .rept 8 - ld1r {v0.16b}, [x1], x7 - ld1r {v1.16b}, [x1], x7 - st1 {v0.16b}, [x0], x7 - st1 {v1.16b}, [x0], x7 + ld1r {v0.16b}, [x1], x7 + ld1r {v1.16b}, [x1], x7 + st1 {v0.16b}, [x0], x7 + st1 {v1.16b}, [x0], x7 .endr ret endfunc @@ -846,9 +846,9 @@ endfunc function predict_16x16_v_neon, export=1 sub x0, x0, #FDEC_STRIDE mov x7, #FDEC_STRIDE - ld1 {v0.16b}, [x0], x7 + ld1 {v0.16b}, [x0], x7 .rept 16 - st1 {v0.16b}, [x0], x7 + st1 {v0.16b}, [x0], x7 .endr ret endfunc @@ -858,8 +858,8 @@ function predict_16x16_p_neon, export=1 mov x1, #FDEC_STRIDE add x2, x3, #8 sub x3, x3, #1 - ld1 
{v0.8b}, [x3] - ld1 {v2.8b}, [x2], x1 + ld1 {v0.8b}, [x3] + ld1 {v2.8b}, [x2], x1 ldcol.8 v1, x3, x1 add x3, x3, x1 ldcol.8 v3, x3, x1 @@ -867,7 +867,7 @@ function predict_16x16_p_neon, export=1 rev64 v1.8b, v1.8b movrel x4, p16weight uaddl v4.8h, v2.8b, v3.8b - ld1 {v7.8h}, [x4] + ld1 {v7.8h}, [x4] usubl v2.8h, v2.8b, v0.8b usubl v3.8h, v3.8b, v1.8b mul v2.8h, v2.8h, v7.8h @@ -902,7 +902,7 @@ function predict_16x16_p_neon, export=1 add v1.8h, v1.8h, v2.8h sqshrun2 v0.16b, v3.8h, #5 add v3.8h, v3.8h, v2.8h - st1 {v0.16b}, [x0], x1 + st1 {v0.16b}, [x0], x1 b.ne 1b ret endfunc diff --git a/common/aarch64/predict-c.c b/common/aarch64/predict-c.c index 8d5460ff4..d8d4bc3ce 100644 --- a/common/aarch64/predict-c.c +++ b/common/aarch64/predict-c.c @@ -1,7 +1,7 @@ /***************************************************************************** * predict.c: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau diff --git a/common/aarch64/predict.h b/common/aarch64/predict.h index 40f85dde1..f4c43e734 100644 --- a/common/aarch64/predict.h +++ b/common/aarch64/predict.h @@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: aarch64 intra prediction ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S index eaac703dc..006e0eadd 100644 --- a/common/aarch64/quant-a.S +++ b/common/aarch64/quant-a.S @@ -1,7 +1,7 @@ /**************************************************************************** * quant.S: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau @@ -27,6 +27,306 @@ #include "asm.S" +// This is a common function for both 8 and 10 bit depth, since these two differ +// at data loading only. The distinction is based on the depth parameters that +//are passed to the macro. 
+.macro decimate_score_1x size depth +function decimate_score\size\()_neon, export=1 + +.if BIT_DEPTH == 8 + ld1 {v0.8h,v1.8h}, [x0] + movrel x5, X264(decimate_table4) + movi v3.16b, #0x01 + sqxtn v0.8b, v0.8h + sqxtn2 v0.16b, v1.8h +.else // BIT_DEPTH == 8 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + movrel x5, X264(decimate_table4) + sqxtn v20.4h, v0.4s + sqxtn2 v20.8h, v1.4s + sqxtn v21.4h, v2.4s + sqxtn2 v21.8h, v3.4s + sqxtn v0.8b, v20.8h + sqxtn2 v0.16b, v21.8h +.endif // BIT_DEPTH == 8 + + movi v3.16b, #0x01 + abs v2.16b, v0.16b + cmeq v1.16b, v0.16b, #0 + cmhi v2.16b, v2.16b, v3.16b + shrn v1.8b, v1.8h, #4 + shrn v2.8b, v2.8h, #4 + fmov x2, d2 + fmov x1, d1 + cbnz x2, 9f + mvn x1, x1 + mov w0, #0 + cbz x1, 0f +.ifc \size, 15 + lsr x1, x1, #1 +.endif + rbit x1, x1 +1: + clz x3, x1 + lsr x6, x3, #2 + lsl x1, x1, x3 + ldrb w7, [x5, x6] + lsl x1, x1, #4 + add w0, w0, w7 + cbnz x1, 1b + ret +9: + mov w0, #9 +0: + ret +endfunc +.endm + +const mask64, align=6 + .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 + .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 +endconst + +.macro decimate_score64 depth +function decimate_score64_neon, export=1 +.if BIT_DEPTH == 8 + ld1 {v0.8h, v1.8h}, [x0], #32 + ld1 {v2.8h, v3.8h}, [x0], #32 + ld1 {v4.8h, v5.8h}, [x0], #32 + ld1 {v6.8h, v7.8h}, [x0] + sqxtn v16.8b, v1.8h + sqxtn2 v16.16b, v0.8h + sqxtn v17.8b, v3.8h + sqxtn2 v17.16b, v2.8h + sqxtn v18.8b, v5.8h + sqxtn2 v18.16b, v4.8h + sqxtn v19.8b, v7.8h + sqxtn2 v19.16b, v6.8h +.else // BIT_DEPTH == 8 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x0], #64 + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0] + + sqxtn v28.4h, v0.4s + sqxtn2 v28.8h, v1.4s + sqxtn v0.4h, v2.4s + sqxtn2 v0.8h, v3.4s + sqxtn v2.4h, v6.4s + sqxtn2 v2.8h, v7.4s + sqxtn v3.4h, v4.4s + sqxtn2 v3.8h, v5.4s + sqxtn v4.4h, v22.4s + sqxtn2 v4.8h, v23.4s + sqxtn v5.4h, v20.4s + sqxtn2 v5.8h, v21.4s + sqxtn v6.4h, v26.4s + sqxtn2 v6.8h, v27.4s + sqxtn v7.4h, v24.4s + sqxtn2 v7.8h, v25.4s + + sqxtn v16.8b, v0.8h + sqxtn2 v16.16b, v28.8h + sqxtn v17.8b, v2.8h + sqxtn2 v17.16b, v3.8h + sqxtn v18.8b, v4.8h + sqxtn2 v18.16b, v5.8h + sqxtn v19.8b, v6.8h + sqxtn2 v19.16b, v7.8h +.endif // BIT_DEPTH == 8 + + movrel x6, mask64 + movi v31.16b, #0x01 + abs v4.16b, v16.16b + abs v5.16b, v17.16b + abs v6.16b, v18.16b + abs v7.16b, v19.16b + ld1 {v30.16b}, [x6] + cmeq v0.16b, v16.16b, #0 + cmeq v1.16b, v17.16b, #0 + cmeq v2.16b, v18.16b, #0 + cmeq v3.16b, v19.16b, #0 + umax v4.16b, v4.16b, v5.16b + umax v6.16b, v6.16b, v7.16b + and v0.16b, v0.16b, v30.16b + and v1.16b, v1.16b, v30.16b + and v2.16b, v2.16b, v30.16b + and v3.16b, v3.16b, v30.16b + umax v4.16b, v4.16b, v6.16b + addp v0.16b, v1.16b, v0.16b + addp v2.16b, v3.16b, v2.16b + cmhi v4.16b, v4.16b, v31.16b + addp v0.16b, v2.16b, v0.16b + shrn v4.8b, v4.8h, #4 + addp v0.16b, v0.16b, v0.16b + fmov x2, d4 + fmov x1, d0 + cbnz x2, 9f + mvn x1, x1 + mov w0, #0 + cbz x1, 0f + movrel x5, X264(decimate_table8) +1: + clz x3, x1 + lsl x1, x1, x3 + ldrb w7, [x5, x3] + lsl x1, x1, #1 + add w0, w0, w7 + cbnz x1, 1b + ret +9: + mov w0, #9 +0: + ret +endfunc +.endm + +.macro COEFF_LAST_1x size, sub_factor +function coeff_last\size\()_neon, export=1 +.if \size == 15 + sub x0, x0, \sub_factor +.endif + +.if BIT_DEPTH == 8 + ld1 {v0.8h, v1.8h}, [x0] + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h +.else // BIT_DEPTH == 8 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + uqxtn v0.4h, v0.4s + uqxtn2 v0.8h, v1.4s + uqxtn v1.4h, v2.4s + 
uqxtn2 v1.8h, v3.4s + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h +.endif // BIT_DEPTH == 8 + + cmtst v0.16b, v0.16b, v0.16b + shrn v0.8b, v0.8h, #4 + fmov x1, d0 + mov w3, #\size - 1 + clz x2, x1 + sub w0, w3, w2, lsr #2 + ret +endfunc +.endm + +.macro COEFF_LAST64 +function coeff_last64_neon, export=1 +.if BIT_DEPTH == 8 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], 64 + movi v31.8h, #8 + movi v30.8h, #1 + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], 64 + uqxtn v1.8b, v2.8h + uqxtn2 v1.16b, v3.8h + uqxtn v2.8b, v4.8h + uqxtn2 v2.16b, v5.8h + uqxtn v3.8b, v6.8h + uqxtn2 v3.16b, v7.8h +.else // BIT_DEPTH == 8 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 + movi v31.8h, #8 + movi v30.8h, #1 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x0], #64 + uqxtn v0.4h, v0.4s + uqxtn2 v0.8h, v1.4s + uqxtn v1.4h, v2.4s + uqxtn2 v1.8h, v3.4s + uqxtn v2.4h, v4.4s + uqxtn2 v2.8h, v5.4s + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + uqxtn v3.4h, v6.4s + uqxtn2 v3.8h, v7.4s + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + uqxtn v1.8b, v2.8h + uqxtn2 v1.16b, v3.8h + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + uqxtn v16.4h, v16.4s + uqxtn2 v16.8h, v17.4s + uqxtn v17.4h, v18.4s + uqxtn2 v17.8h, v19.4s + uqxtn v18.4h, v20.4s + uqxtn2 v18.8h, v21.4s + uqxtn v19.4h, v22.4s + uqxtn2 v19.8h, v23.4s + uqxtn v2.8b, v16.8h + uqxtn2 v2.16b, v17.8h + uqxtn v3.8b, v18.8h + uqxtn2 v3.16b, v19.8h +.endif // BIT_DEPTH == 8 + + cmtst v0.16b, v0.16b, v0.16b + cmtst v1.16b, v1.16b, v1.16b + cmtst v2.16b, v2.16b, v2.16b + cmtst v3.16b, v3.16b, v3.16b + + shrn v0.8b, v0.8h, #4 + shrn2 v0.16b, v1.8h, #4 + shrn v1.8b, v2.8h, #4 + shrn2 v1.16b, v3.8h, #4 + + clz v0.4s, v0.4s + clz v1.4s, v1.4s + + shrn v0.4h, v0.4s, #2 + shrn2 v0.8h, v1.4s, #2 + + sub v0.8h, v31.8h, v0.8h + sshl v0.8h, v30.8h, v0.8h + shrn v0.8b, v0.8h, #1 + + fmov x2, d0 + mov w3, #63 + clz x2, x2 + sub w0, w3, w2 + ret +endfunc +.endm + +.macro coeff_level_run_start size, mask + add x6, x1, #\mask // runlevel->mask + mov w7, #0 + mov w8, #0 + mov w9, #1 + mov w4, #\size - 1 +.endm + +.macro coeff_level_run shift, depth + clz x3, x2 + subs w4, w4, w3, lsr #\shift + str w4, [x1], #4 +1: +.ifc \depth, 8 + ldrh w5, [x0, x4, lsl #1] + strh w5, [x6], #2 +.else + lsl w5, w4, #2 + ldr w5, [x0, x5] + str w5, [x6], #4 +.endif + + add w7, w7, #1 + lsl w10, w9, w4 + orr w8, w8, w10 + b.le 2f + add w3, w3, #1 << \shift + sub w4, w4, #1 + and x3, x3, #~((1 << \shift) - 1) + lsl x2, x2, x3 + clz x3, x2 + subs w4, w4, w3, lsr #\shift + b.ge 1b +2: + str w8, [x1] + mov w0, w7 +.endm + +.if BIT_DEPTH == 8 + .macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask add v18.8h, v18.8h, \bias0 add v19.8h, v19.8h, \bias1 @@ -45,7 +345,7 @@ sub v18.8h, v18.8h, v16.8h sub v19.8h, v19.8h, v17.8h orr \mask, v18.16b, v19.16b - st1 {v18.8h,v19.8h}, [x0], #32 + st1 {v18.8h,v19.8h}, [x0], #32 .endm .macro QUANT_END d @@ -58,7 +358,7 @@ // quant_2x2_dc( int16_t dct[4], int mf, int bias ) function quant_2x2_dc_neon, export=1 - ld1 {v0.4h}, [x0] + ld1 {v0.4h}, [x0] dup v2.4h, w2 dup v1.4h, w1 abs v3.4h, v0.4h @@ -68,13 +368,13 @@ function quant_2x2_dc_neon, export=1 shrn v3.4h, v3.4s, #16 eor v3.8b, v3.8b, v0.8b sub v3.4h, v3.4h, v0.4h - st1 {v3.4h}, [x0] + st1 {v3.4h}, [x0] QUANT_END d3 endfunc // quant_4x4_dc( int16_t dct[16], int mf, int bias ) function quant_4x4_dc_neon, export=1 - ld1 {v16.8h,v17.8h}, [x0] + ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h dup v0.8h, w2 @@ -86,11 +386,11 @@ endfunc // quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t 
bias[16] ) function quant_4x4_neon, export=1 - ld1 {v16.8h,v17.8h}, [x0] + ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h - ld1 {v0.8h,v1.8h}, [x2] - ld1 {v2.8h,v3.8h}, [x1] + ld1 {v0.8h,v1.8h}, [x2] + ld1 {v2.8h,v3.8h}, [x1] QUANT_TWO v0.8h, v1.8h, v2, v3, v0.16b uqxtn v0.8b, v0.8h QUANT_END d0 @@ -98,21 +398,21 @@ endfunc // quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ) function quant_4x4x4_neon, export=1 - ld1 {v16.8h,v17.8h}, [x0] + ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h - ld1 {v0.8h,v1.8h}, [x2] - ld1 {v2.8h,v3.8h}, [x1] + ld1 {v0.8h,v1.8h}, [x2] + ld1 {v2.8h,v3.8h}, [x1] QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b - ld1 {v16.8h,v17.8h}, [x0] + ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b - ld1 {v16.8h,v17.8h}, [x0] + ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h QUANT_TWO v0.8h, v1.8h, v2, v3, v6.16b - ld1 {v16.8h,v17.8h}, [x0] + ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h QUANT_TWO v0.8h, v1.8h, v2, v3, v7.16b @@ -141,18 +441,18 @@ endfunc // quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ) function quant_8x8_neon, export=1 - ld1 {v16.8h,v17.8h}, [x0] + ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h - ld1 {v0.8h,v1.8h}, [x2], #32 - ld1 {v2.8h,v3.8h}, [x1], #32 + ld1 {v0.8h,v1.8h}, [x2], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b .rept 3 - ld1 {v16.8h,v17.8h}, [x0] + ld1 {v16.8h,v17.8h}, [x0] abs v18.8h, v16.8h abs v19.8h, v17.8h - ld1 {v0.8h,v1.8h}, [x2], #32 - ld1 {v2.8h,v3.8h}, [x1], #32 + ld1 {v0.8h,v1.8h}, [x2], #32 + ld1 {v2.8h,v3.8h}, [x1], #32 QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b orr v4.16b, v4.16b, v5.16b .endr @@ -189,20 +489,20 @@ dequant_\size\()_lshift_loop: .ifc \size, 8x8 subs w2, w2, #1 .endif - ld1 {v16.4s}, [x1], #16 - ld1 {v17.4s}, [x1], #16 + ld1 {v16.4s}, [x1], #16 + ld1 {v17.4s}, [x1], #16 sqxtn v2.4h, v16.4s - ld1 {v18.4s}, [x1], #16 + ld1 {v18.4s}, [x1], #16 sqxtn2 v2.8h, v17.4s - ld1 {v19.4s}, [x1], #16 + ld1 {v19.4s}, [x1], #16 sqxtn v3.4h, v18.4s - ld1 {v0.8h,v1.8h}, [x0] + ld1 {v0.8h,v1.8h}, [x0] sqxtn2 v3.8h, v19.4s mul v0.8h, v0.8h, v2.8h mul v1.8h, v1.8h, v3.8h sshl v0.8h, v0.8h, v31.8h sshl v1.8h, v1.8h, v31.8h - st1 {v0.8h,v1.8h}, [x0], #32 + st1 {v0.8h,v1.8h}, [x0], #32 .ifc \size, 8x8 b.gt dequant_\size\()_lshift_loop .endif @@ -210,43 +510,35 @@ dequant_\size\()_lshift_loop: dequant_\size\()_rshift: dup v31.4s, w3 - neg w3, w3 - mov w5, #1 - sub w3, w3, #1 - lsl w5, w5, w3 .ifc \size, 8x8 dequant_\size\()_rshift_loop: subs w2, w2, #1 .endif - ld1 {v16.4s}, [x1], #16 - ld1 {v17.4s}, [x1], #16 + ld1 {v16.4s}, [x1], #16 + ld1 {v17.4s}, [x1], #16 sqxtn v2.4h, v16.4s - ld1 {v18.4s}, [x1], #16 - dup v16.4s, w5 + ld1 {v18.4s}, [x1], #16 sqxtn2 v2.8h, v17.4s - ld1 {v19.4s}, [x1], #16 - dup v17.4s, w5 + ld1 {v19.4s}, [x1], #16 sqxtn v3.4h, v18.4s - ld1 {v0.8h,v1.8h}, [x0] - dup v18.4s, w5 + ld1 {v0.8h,v1.8h}, [x0] sqxtn2 v3.8h, v19.4s - dup v19.4s, w5 - smlal v16.4s, v0.4h, v2.4h - smlal2 v17.4s, v0.8h, v2.8h - smlal v18.4s, v1.4h, v3.4h - smlal2 v19.4s, v1.8h, v3.8h - sshl v16.4s, v16.4s, v31.4s - sshl v17.4s, v17.4s, v31.4s - sshl v18.4s, v18.4s, v31.4s - sshl v19.4s, v19.4s, v31.4s + smull v16.4s, v0.4h, v2.4h + smull2 v17.4s, v0.8h, v2.8h + smull v18.4s, v1.4h, v3.4h + smull2 v19.4s, v1.8h, v3.8h + srshl v16.4s, v16.4s, v31.4s + srshl v17.4s, v17.4s, v31.4s + srshl v18.4s, v18.4s, v31.4s + srshl v19.4s, v19.4s, v31.4s sqxtn v0.4h, 
v16.4s sqxtn2 v0.8h, v17.4s sqxtn v1.4h, v18.4s sqxtn2 v1.8h, v19.4s - st1 {v0.8h,v1.8h}, [x0], #32 + st1 {v0.8h,v1.8h}, [x0], #32 .ifc \size, 8x8 b.gt dequant_\size\()_rshift_loop .endif @@ -264,147 +556,41 @@ function dequant_4x4_dc_neon, export=1 lsl w1, w1, w3 dup v2.8h, w1 - ld1 {v0.8h,v1.8h}, [x0] + ld1 {v0.8h,v1.8h}, [x0] mul v0.8h, v0.8h, v2.8h mul v1.8h, v1.8h, v2.8h - st1 {v0.8h,v1.8h}, [x0] + st1 {v0.8h,v1.8h}, [x0] ret dequant_4x4_dc_rshift: dup v4.8h, w1 dup v3.4s, w3 - neg w3, w3 - mov w5, #1 - sub w3, w3, #1 - lsl w5, w5, w3 - - dup v16.4s, w5 - dup v17.4s, w5 - ld1 {v0.8h,v1.8h}, [x0] - dup v18.4s, w5 - dup v19.4s, w5 - - smlal v16.4s, v0.4h, v4.4h - smlal2 v17.4s, v0.8h, v4.8h - smlal v18.4s, v1.4h, v4.4h - smlal2 v19.4s, v1.8h, v4.8h - sshl v16.4s, v16.4s, v3.4s - sshl v17.4s, v17.4s, v3.4s - sshl v18.4s, v18.4s, v3.4s - sshl v19.4s, v19.4s, v3.4s + + ld1 {v0.8h,v1.8h}, [x0] + + smull v16.4s, v0.4h, v4.4h + smull2 v17.4s, v0.8h, v4.8h + smull v18.4s, v1.4h, v4.4h + smull2 v19.4s, v1.8h, v4.8h + srshl v16.4s, v16.4s, v3.4s + srshl v17.4s, v17.4s, v3.4s + srshl v18.4s, v18.4s, v3.4s + srshl v19.4s, v19.4s, v3.4s sqxtn v0.4h, v16.4s sqxtn2 v0.8h, v17.4s sqxtn v1.4h, v18.4s sqxtn2 v1.8h, v19.4s - st1 {v0.8h,v1.8h}, [x0] + st1 {v0.8h,v1.8h}, [x0] ret endfunc -.macro decimate_score_1x size -function decimate_score\size\()_neon, export=1 - ld1 {v0.8h,v1.8h}, [x0] - movrel x5, X264(decimate_table4) - movi v3.16b, #0x01 - sqxtn v0.8b, v0.8h - sqxtn2 v0.16b, v1.8h - abs v2.16b, v0.16b - cmeq v1.16b, v0.16b, #0 - cmhi v2.16b, v2.16b, v3.16b - shrn v1.8b, v1.8h, #4 - shrn v2.8b, v2.8h, #4 - fmov x2, d2 - fmov x1, d1 - cbnz x2, 9f - mvn x1, x1 - mov w0, #0 - cbz x1, 0f -.ifc \size, 15 - lsr x1, x1, #1 -.endif - rbit x1, x1 -1: - clz x3, x1 - lsr x6, x3, #2 - lsl x1, x1, x3 - ldrb w7, [x5, x6] - lsl x1, x1, #4 - add w0, w0, w7 - cbnz x1, 1b - ret -9: - mov w0, #9 -0: - ret -endfunc -.endm decimate_score_1x 15 decimate_score_1x 16 -const mask64, align=6 - .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 - .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 -endconst - -function decimate_score64_neon, export=1 - ld1 {v0.8h,v1.8h}, [x0], #32 - ld1 {v2.8h,v3.8h}, [x0], #32 - ld1 {v4.8h,v5.8h}, [x0], #32 - ld1 {v6.8h,v7.8h}, [x0] - movrel x6, mask64 - movi v31.16b, #0x01 - sqxtn v16.8b, v1.8h - sqxtn2 v16.16b, v0.8h - sqxtn v17.8b, v3.8h - sqxtn2 v17.16b, v2.8h - sqxtn v18.8b, v5.8h - sqxtn2 v18.16b, v4.8h - sqxtn v19.8b, v7.8h - sqxtn2 v19.16b, v6.8h - abs v4.16b, v16.16b - abs v5.16b, v17.16b - abs v6.16b, v18.16b - abs v7.16b, v19.16b - ld1 {v30.16b}, [x6] - cmeq v0.16b, v16.16b, #0 - cmeq v1.16b, v17.16b, #0 - cmeq v2.16b, v18.16b, #0 - cmeq v3.16b, v19.16b, #0 - umax v4.16b, v4.16b, v5.16b - umax v6.16b, v6.16b, v7.16b - and v0.16b, v0.16b, v30.16b - and v1.16b, v1.16b, v30.16b - and v2.16b, v2.16b, v30.16b - and v3.16b, v3.16b, v30.16b - umax v4.16b, v4.16b, v6.16b - addp v0.16b, v1.16b, v0.16b - addp v2.16b, v3.16b, v2.16b - cmhi v4.16b, v4.16b, v31.16b - addp v0.16b, v2.16b, v0.16b - shrn v4.8b, v4.8h, #4 - addp v0.16b, v0.16b, v0.16b - fmov x2, d4 - fmov x1, d0 - cbnz x2, 9f - mvn x1, x1 - mov w0, #0 - cbz x1, 0f - movrel x5, X264(decimate_table8) -1: - clz x3, x1 - lsl x1, x1, x3 - ldrb w7, [x5, x3] - lsl x1, x1, #1 - add w0, w0, w7 - cbnz x1, 1b - ret -9: - mov w0, #9 -0: - ret -endfunc +decimate_score64 // int coeff_last( int16_t *l ) function coeff_last4_aarch64, export=1 @@ -429,106 +615,17 @@ function coeff_last8_aarch64, export=1 ret endfunc -.macro COEFF_LAST_1x size 
-function coeff_last\size\()_neon, export=1 -.if \size == 15 - sub x0, x0, #2 -.endif - ld1 {v0.8h,v1.8h}, [x0] - uqxtn v0.8b, v0.8h - uqxtn2 v0.16b, v1.8h - cmtst v0.16b, v0.16b, v0.16b - shrn v0.8b, v0.8h, #4 - fmov x1, d0 - mov w3, #\size - 1 - clz x2, x1 - sub w0, w3, w2, lsr #2 - ret -endfunc -.endm +COEFF_LAST_1x 15, #2 +COEFF_LAST_1x 16, #2 -COEFF_LAST_1x 15 -COEFF_LAST_1x 16 - -function coeff_last64_neon, export=1 - ld1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], 64 - movi v31.8h, #8 - movi v30.8h, #1 - uqxtn v0.8b, v0.8h - uqxtn2 v0.16b, v1.8h - ld1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], 64 - uqxtn v1.8b, v2.8h - uqxtn2 v1.16b, v3.8h - uqxtn v2.8b, v4.8h - uqxtn2 v2.16b, v5.8h - uqxtn v3.8b, v6.8h - uqxtn2 v3.16b, v7.8h - - cmtst v0.16b, v0.16b, v0.16b - cmtst v1.16b, v1.16b, v1.16b - cmtst v2.16b, v2.16b, v2.16b - cmtst v3.16b, v3.16b, v3.16b - - shrn v0.8b, v0.8h, #4 - shrn2 v0.16b, v1.8h, #4 - shrn v1.8b, v2.8h, #4 - shrn2 v1.16b, v3.8h, #4 - - clz v0.4s, v0.4s - clz v1.4s, v1.4s - - shrn v0.4h, v0.4s, #2 - shrn2 v0.8h, v1.4s, #2 - - sub v0.8h, v31.8h, v0.8h - sshl v0.8h, v30.8h, v0.8h - shrn v0.8b, v0.8h, #1 - - fmov x2, d0 - mov w3, #63 - clz x2, x2 - sub w0, w3, w2 - ret -endfunc - -.macro coeff_level_run_start size - add x6, x1, #23 // runlevel->mask - mov w7, #0 - mov w8, #0 - mov w9, #1 - and x6, x6, #~15 - mov w4, #\size - 1 -.endm - -.macro coeff_level_run shift - clz x3, x2 - subs w4, w4, w3, lsr #\shift - str w4, [x1], #4 -1: - ldrh w5, [x0, x4, lsl #1] - strh w5, [x6], #2 - add w7, w7, #1 - lsl w10, w9, w4 - orr w8, w8, w10 - b.le 2f - add w3, w3, #1 << \shift - sub w4, w4, #1 - and x3, x3, #~((1 << \shift) - 1) - lsl x2, x2, x3 - clz x3, x2 - subs w4, w4, w3, lsr #\shift - b.ge 1b -2: - str w8, [x1] - mov w0, w7 -.endm +COEFF_LAST64 function coeff_level_run4_aarch64, export=1 ldr x2, [x0] - coeff_level_run_start 4 - - coeff_level_run 4 + coeff_level_run_start 4, 23 + and x6, x6, #~15 + coeff_level_run 4, 8 ret endfunc @@ -554,9 +651,10 @@ function coeff_level_run\size\()_neon, export=1 add x0, x0, #2 .endif - coeff_level_run_start \size + coeff_level_run_start \size, 23 + and x6, x6, #~15 - coeff_level_run (4 - (\size + 1) / 8) + coeff_level_run (4 - (\size + 1) / 8), 8 ret endfunc @@ -590,3 +688,482 @@ function denoise_dct_neon, export=1 b.gt 1b ret endfunc + +.else // BIT_DEPTH == 8 + +.macro QUANT_TWO mask + add v20.4s, v20.4s, v0.4s + add v21.4s, v21.4s, v1.4s + add v22.4s, v22.4s, v2.4s + add v23.4s, v23.4s, v3.4s + + mul v24.4s, v20.4s, v4.4s + mul v25.4s, v21.4s, v5.4s + mul v26.4s, v22.4s, v6.4s + mul v27.4s, v23.4s, v7.4s + + sshr v16.4s, v16.4s, #31 + sshr v17.4s, v17.4s, #31 + sshr v18.4s, v18.4s, #31 + sshr v19.4s, v19.4s, #31 + + sshr v20.4s, v24.4s, #16 + sshr v21.4s, v25.4s, #16 + sshr v22.4s, v26.4s, #16 + sshr v23.4s, v27.4s, #16 + + eor v20.16b, v20.16b, v16.16b + eor v21.16b, v21.16b, v17.16b + eor v22.16b, v22.16b, v18.16b + eor v23.16b, v23.16b, v19.16b + + sub v20.4s, v20.4s, v16.4s + sub v21.4s, v21.4s, v17.4s + sub v22.4s, v22.4s, v18.4s + sub v23.4s, v23.4s, v19.4s + + orr \mask, v20.16b, v21.16b + orr v16.16b, v22.16b, v23.16b + orr \mask, \mask, v16.16b + + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 +.endm + + +.macro QUANT_END d + // Use parameter d as a register number and extract upper and lower halves. 
+ fmov x2, d\d + fmov x3, v\d\().d[1] + orr x2, x2, x3 + mov w0, #0 + tst x2, x2 + cinc w0, w0, ne + ret +.endm + +// quant_2x2_dc( dctcoef dct[4], int mf, int bias ) +function quant_2x2_dc_neon, export=1 + ld1 {v0.4s}, [x0] + dup v2.4s, w2 + dup v1.4s, w1 + abs v3.4s, v0.4s + add v3.4s, v3.4s, v2.4s + mul v3.4s, v3.4s, v1.4s + sshr v0.4s, v0.4s, #31 + sshr v3.4s, v3.4s, #16 + eor v3.16b, v3.16b, v0.16b + sub v0.4s, v3.4s, v0.4s + st1 {v0.4s}, [x0] + QUANT_END 0 +endfunc + +// quant_4x4_dc( dctcoef dct[16], int mf, int bias ) +function quant_4x4_dc_neon, export=1 + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + + abs v20.4s, v16.4s + abs v21.4s, v17.4s + abs v22.4s, v18.4s + abs v23.4s, v19.4s + + dup v0.4s, w2 + dup v1.4s, w2 + dup v2.4s, w2 + dup v3.4s, w2 + dup v4.4s, w1 + dup v5.4s, w1 + dup v6.4s, w1 + dup v7.4s, w1 + + QUANT_TWO v0.16b + QUANT_END 0 +endfunc + +// quant_4x4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ) +function quant_4x4_neon, export=1 + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + + abs v20.4s, v16.4s + abs v21.4s, v17.4s + abs v22.4s, v18.4s + abs v23.4s, v19.4s + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2] + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1] + + QUANT_TWO v0.16b + QUANT_END 0 +endfunc + +// quant_4x4x4( dctcoef dct[4][16], uint32_t mf[16], uint32_t bias[16] ) +function quant_4x4x4_neon, export=1 + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2] + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1] + + abs v20.4s, v16.4s + abs v21.4s, v17.4s + abs v22.4s, v18.4s + abs v23.4s, v19.4s + + QUANT_TWO v28.16b + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + abs v20.4s, v16.4s + abs v21.4s, v17.4s + abs v22.4s, v18.4s + abs v23.4s, v19.4s + QUANT_TWO v29.16b + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + abs v20.4s, v16.4s + abs v21.4s, v17.4s + abs v22.4s, v18.4s + abs v23.4s, v19.4s + QUANT_TWO v30.16b + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + abs v20.4s, v16.4s + abs v21.4s, v17.4s + abs v22.4s, v18.4s + abs v23.4s, v19.4s + QUANT_TWO v31.16b + + uqxtn v28.4h, v28.4s + uqxtn v29.4h, v29.4s + uqxtn v30.4h, v30.4s + uqxtn v31.4h, v31.4s + + fmov x7, d28 + fmov x6, d29 + fmov x10, d30 + fmov x12, d31 + + mov w0, #0 + tst x12, x12 + cinc w0, w0, ne + lsl w0, w0, #1 + tst x10, x10 + cinc w0, w0, ne + lsl w0, w0, #1 + tst x6, x6 + cinc w0, w0, ne + lsl w0, w0, #1 + tst x7, x7 + cinc w0, w0, ne + ret +endfunc + +// quant_8x8( dctcoef dct[64], uint32_t mf[64], uint32_t bias[64] ) +function quant_8x8_neon, export=1 + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + abs v20.4s, v16.4s + abs v21.4s, v17.4s + abs v22.4s, v18.4s + abs v23.4s, v19.4s + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + + QUANT_TWO v28.16b + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + abs v20.4s, v16.4s + abs v21.4s, v17.4s + abs v22.4s, v18.4s + abs v23.4s, v19.4s + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + + QUANT_TWO v29.16b + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + abs v20.4s, v16.4s + abs v21.4s, v17.4s + abs v22.4s, v18.4s + abs v23.4s, v19.4s + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + + QUANT_TWO v30.16b + + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + abs v20.4s, v16.4s + abs v21.4s, v17.4s + abs v22.4s, v18.4s + abs v23.4s, v19.4s + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + + QUANT_TWO v31.16b + + orr v0.16b, v28.16b, v29.16b + orr v0.16b, v0.16b, 
v30.16b + orr v0.16b, v0.16b, v31.16b + + QUANT_END 0 +endfunc + +.macro DEQUANT_START mf_size offset dc=no + mov w3, #0x2b + mul w3, w3, w2 + lsr w3, w3, #8 // i_qbits = i_qp / 6 + add w5, w3, w3, lsl #1 + sub w2, w2, w5, lsl #1 // i_mf = i_qp % 6 + lsl w2, w2, #\mf_size +.ifc \dc,no + add x1, x1, w2, sxtw // dequant_mf[i_mf] +.else + ldr x1, [x1, w2, sxtw] // dequant_mf[i_mf][0][0] +.endif + subs w3, w3, #\offset // 6 for 8x8 +.endm + +// dequant_4x4( int32_t dct[16], int dequant_mf[6][16], int i_qp ) +.macro DEQUANT size bits +function dequant_\size\()_neon, export=1 + DEQUANT_START \bits+2, \bits +.ifc \size, 8x8 + mov w2, #4 +.endif + b.lt dequant_\size\()_rshift + + dup v31.4s, w3 +dequant_\size\()_lshift_loop: +.ifc \size, 8x8 + subs w2, w2, #1 +.endif + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + + mul v0.4s, v0.4s, v16.4s + mul v1.4s, v1.4s, v17.4s + mul v2.4s, v2.4s, v18.4s + mul v3.4s, v3.4s, v19.4s + + sshl v0.4s, v0.4s, v31.4s + sshl v1.4s, v1.4s, v31.4s + sshl v2.4s, v2.4s, v31.4s + sshl v3.4s, v3.4s, v31.4s + + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], #64 +.ifc \size, 8x8 + b.gt dequant_\size\()_lshift_loop +.endif + ret + +dequant_\size\()_rshift: + dup v31.4s, w3 + +.ifc \size, 8x8 +dequant_\size\()_rshift_loop: + subs w2, w2, #1 +.endif + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + + mul v20.4s, v0.4s, v16.4s + mul v21.4s, v1.4s, v17.4s + mul v22.4s, v2.4s, v18.4s + mul v23.4s, v3.4s, v19.4s + + srshl v16.4s, v20.4s, v31.4s + srshl v17.4s, v21.4s, v31.4s + srshl v18.4s, v22.4s, v31.4s + srshl v19.4s, v23.4s, v31.4s + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 +.ifc \size, 8x8 + b.gt dequant_\size\()_rshift_loop +.endif + ret +endfunc +.endm + +DEQUANT 4x4, 4 +DEQUANT 8x8, 6 + +// dequant_4x4_dc( int32_t dct[16], int dequant_mf[6][16], int i_qp ) +function dequant_4x4_dc_neon, export=1 + DEQUANT_START 6, 6, yes + b.lt dequant_4x4_dc_rshift + + lsl w1, w1, w3 + dup v31.4s, w1 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + + mul v0.4s, v0.4s, v31.4s + mul v1.4s, v1.4s, v31.4s + mul v2.4s, v2.4s, v31.4s + mul v3.4s, v3.4s, v31.4s + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + ret + +dequant_4x4_dc_rshift: + dup v31.4s, w1 + dup v30.4s, w3 + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + + mul v16.4s, v0.4s, v31.4s + mul v17.4s, v1.4s, v31.4s + mul v18.4s, v2.4s, v31.4s + mul v19.4s, v3.4s, v31.4s + + srshl v16.4s, v16.4s, v30.4s + srshl v17.4s, v17.4s, v30.4s + srshl v18.4s, v18.4s, v30.4s + srshl v19.4s, v19.4s, v30.4s + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0] + ret +endfunc + +decimate_score_1x 15 +decimate_score_1x 16 + +decimate_score64 + +// int coeff_last( int32_t *l ) +function coeff_last4_neon, export=1 + ld1 {v0.4s}, [x0] + uqxtn v0.4h, v0.4s + uqxtn v0.8b, v0.8h + mov w4, #3 + cmtst v0.16b, v0.16b, v0.16b + fmov w1, s0 + clz w2, w1 + sub w0, w4, w2, lsr #3 + ret +endfunc + +function coeff_last8_neon, export=1 + ld1 {v0.4s, v1.4s}, [x0] + uqxtn v0.4h, v0.4s + uqxtn2 v0.8h, v1.4s + uqxtn v0.8b, v0.8h + mov w4, #7 + cmtst v0.16b, v0.16b, v0.16b + fmov x1, d0 + clz x2, x1 + sub x0, x4, x2, lsr #3 + ret +endfunc + +COEFF_LAST_1x 15, #4 +COEFF_LAST_1x 16, #4 + +COEFF_LAST64 + +function coeff_level_run4_neon, export=1 + ldr x2, [x0] + ld1 {v0.4s}, [x0] + uqxtn v0.4h, v0.4s + uqxtn v0.8b, v0.8h + fmov x2, d0 + + coeff_level_run_start 8, 16 + + coeff_level_run 3, 10 + + ret +endfunc + +.macro X264_COEFF_LEVEL_RUN size +function coeff_level_run\size\()_neon, export=1 +.if \size == 15 
+ sub x0, x0, #4 +.endif +.if \size < 15 + ld1 {v0.4s, v1.4s}, [x0] + uqxtn v0.4h, v0.4s + uqxtn2 v0.8h, v1.4s + uqxtn v0.8b, v0.8h + cmtst v0.8b, v0.8b, v0.8b +.else + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + uqxtn v0.4h, v0.4s + uqxtn2 v0.8h, v1.4s + uqxtn v1.4h, v2.4s + uqxtn2 v1.8h, v3.4s + uqxtn v0.8b, v0.8h + uqxtn2 v0.16b, v1.8h + cmtst v0.16b, v0.16b, v0.16b + shrn v0.8b, v0.8h, #4 +.endif + fmov x2, d0 +.if \size == 15 + add x0, x0, #4 +.endif + + coeff_level_run_start \size, 16 + + coeff_level_run (4 - (\size + 1) / 8), 10 + + ret +endfunc +.endm + +X264_COEFF_LEVEL_RUN 8 +X264_COEFF_LEVEL_RUN 15 +X264_COEFF_LEVEL_RUN 16 + +function denoise_dct_neon, export=1 +1: subs w3, w3, #16 + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0] + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1] + + abs v16.4s, v0.4s + abs v17.4s, v1.4s + abs v18.4s, v2.4s + abs v19.4s, v3.4s + + cmlt v24.4s, v0.4s, #0 + cmlt v25.4s, v1.4s, #0 + cmlt v26.4s, v2.4s, #0 + cmlt v27.4s, v3.4s, #0 + + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x2], #64 + + add v4.4s, v4.4s, v16.4s + add v5.4s, v5.4s, v17.4s + sub v28.4s, v16.4s, v20.4s + sub v29.4s, v17.4s, v21.4s + sub v30.4s, v18.4s, v22.4s + sub v31.4s, v19.4s, v23.4s + add v6.4s, v6.4s, v18.4s + add v7.4s, v7.4s, v19.4s + + cmlt v20.4s, v28.4s, #0 + cmlt v21.4s, v29.4s, #0 + cmlt v22.4s, v30.4s, #0 + cmlt v23.4s, v31.4s, #0 + + movi v0.4s, #0 + + bsl v20.16b, v0.16b, v28.16b + bsl v21.16b, v0.16b, v29.16b + bsl v22.16b, v0.16b, v30.16b + bsl v23.16b, v0.16b, v31.16b + + neg v0.4s, v20.4s + neg v1.4s, v21.4s + neg v2.4s, v22.4s + neg v3.4s, v23.4s + + bsl v24.16b, v0.16b, v20.16b + bsl v25.16b, v1.16b, v21.16b + bsl v26.16b, v2.16b, v22.16b + bsl v27.16b, v3.16b, v23.16b + + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 + b.gt 1b + ret +endfunc + +.endif diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h index cddfa0cf2..a2b324fed 100644 --- a/common/aarch64/quant.h +++ b/common/aarch64/quant.h @@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: David Conrad * Janne Grunau @@ -31,49 +31,63 @@ int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias ); #define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon) -int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias ); +int x264_quant_2x2_dc_neon( dctcoef dct[4], int mf, int bias ); #define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon) -int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias ); +int x264_quant_4x4_dc_neon( dctcoef dct[16], int mf, int bias ); #define x264_quant_4x4_neon x264_template(quant_4x4_neon) -int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] ); +int x264_quant_4x4_neon( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon) -int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] ); +int x264_quant_4x4x4_neon( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ); #define x264_quant_8x8_neon x264_template(quant_8x8_neon) -int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] ); +int x264_quant_8x8_neon( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] ); #define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon) -void 
x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_4x4_dc_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_4x4_neon x264_template(dequant_4x4_neon) -void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_4x4_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); #define x264_dequant_8x8_neon x264_template(dequant_8x8_neon) -void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_8x8_neon( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); #define x264_decimate_score15_neon x264_template(decimate_score15_neon) -int x264_decimate_score15_neon( int16_t * ); +int x264_decimate_score15_neon( dctcoef * ); #define x264_decimate_score16_neon x264_template(decimate_score16_neon) -int x264_decimate_score16_neon( int16_t * ); +int x264_decimate_score16_neon( dctcoef * ); #define x264_decimate_score64_neon x264_template(decimate_score64_neon) -int x264_decimate_score64_neon( int16_t * ); +int x264_decimate_score64_neon( dctcoef * ); +// BIT DEPTH = 8 #define x264_coeff_last4_aarch64 x264_template(coeff_last4_aarch64) -int x264_coeff_last4_aarch64( int16_t * ); +int x264_coeff_last4_aarch64( dctcoef * ); #define x264_coeff_last8_aarch64 x264_template(coeff_last8_aarch64) -int x264_coeff_last8_aarch64( int16_t * ); +int x264_coeff_last8_aarch64( dctcoef * ); + +// BIT DEPTH = 10 +#define x264_coeff_last4_neon x264_template(coeff_last4_neon) +int x264_coeff_last4_neon( dctcoef * ); +#define x264_coeff_last8_neon x264_template(coeff_last8_neon) +int x264_coeff_last8_neon( dctcoef * ); + #define x264_coeff_last15_neon x264_template(coeff_last15_neon) -int x264_coeff_last15_neon( int16_t * ); +int x264_coeff_last15_neon( dctcoef * ); #define x264_coeff_last16_neon x264_template(coeff_last16_neon) -int x264_coeff_last16_neon( int16_t * ); +int x264_coeff_last16_neon( dctcoef * ); #define x264_coeff_last64_neon x264_template(coeff_last64_neon) -int x264_coeff_last64_neon( int16_t * ); +int x264_coeff_last64_neon( dctcoef * ); +// BIT_DEPTH = 8 #define x264_coeff_level_run4_aarch64 x264_template(coeff_level_run4_aarch64) -int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run4_aarch64( dctcoef *, x264_run_level_t * ); + +// BIT_DEPTH = 10 +#define x264_coeff_level_run4_neon x264_template(coeff_level_run4_neon) +int x264_coeff_level_run4_neon( dctcoef *, x264_run_level_t * ); + #define x264_coeff_level_run8_neon x264_template(coeff_level_run8_neon) -int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run8_neon( dctcoef *, x264_run_level_t * ); #define x264_coeff_level_run15_neon x264_template(coeff_level_run15_neon) -int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run15_neon( dctcoef *, x264_run_level_t * ); #define x264_coeff_level_run16_neon x264_template(coeff_level_run16_neon) -int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * ); +int x264_coeff_level_run16_neon( dctcoef *, x264_run_level_t * ); #define x264_denoise_dct_neon x264_template(denoise_dct_neon) void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int ); diff --git a/common/arm/asm.S b/common/arm/asm.S index 833f011ee..88e8ba600 100644 --- a/common/arm/asm.S +++ b/common/arm/asm.S @@ -1,7 +1,7 @@ /***************************************************************************** * asm.S: arm utility macros 
***************************************************************************** - * Copyright (C) 2008-2023 x264 project + * Copyright (C) 2008-2024 x264 project * * Authors: Mans Rullgard * David Conrad diff --git a/common/arm/bitstream-a.S b/common/arm/bitstream-a.S index 1a428356d..35157c4b1 100644 --- a/common/arm/bitstream-a.S +++ b/common/arm/bitstream-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * bitstream-a.S: arm bitstream functions ***************************************************************************** - * Copyright (C) 2014-2023 x264 project + * Copyright (C) 2014-2024 x264 project * * Authors: Janne Grunau * diff --git a/common/arm/bitstream.h b/common/arm/bitstream.h index b46a4ccdd..41a32bc56 100644 --- a/common/arm/bitstream.h +++ b/common/arm/bitstream.h @@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.h: arm bitstream functions ***************************************************************************** - * Copyright (C) 2017-2023 x264 project + * Copyright (C) 2017-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/common/arm/cpu-a.S b/common/arm/cpu-a.S index c35d16847..88af84ab1 100644 --- a/common/arm/cpu-a.S +++ b/common/arm/cpu-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * cpu-a.S: arm cpu detection ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S index d8ba0458c..a35a8b858 100644 --- a/common/arm/dct-a.S +++ b/common/arm/dct-a.S @@ -1,7 +1,7 @@ /**************************************************************************** * dct-a.S: arm transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Martin Storsjo @@ -221,9 +221,9 @@ endfunc vadd.s16 q15, q15, q11 SUMSUB_AB q8, q12, q9, q12 - SUMSUB_SHR 2, q9, q15, q10, q15, q0, q1 - SUMSUB_SHR 1, q10, q14, q2, q14, q0, q1 - SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1 + SUMSUB_SHR 2, q9, q15, q10, q15, q0, q1 + SUMSUB_SHR 1, q10, q14, q2, q14, q0, q1 + SUMSUB_SHR2 2, q11, q13, q3, q13, q0, q1 .endm function sub8x8_dct8_neon @@ -254,7 +254,7 @@ function sub8x8_dct8_neon vld1.64 {d31}, [r2,:64], ip vsubl.u8 q15, d30, d31 - DCT8_1D row + DCT8_1D row vswp d17, d24 // 8, 12 vswp d21, d28 // 10,14 vtrn.32 q8, q10 @@ -269,7 +269,7 @@ function sub8x8_dct8_neon vtrn.16 q12, q13 vtrn.16 q8, q9 vtrn.16 q14, q15 - DCT8_1D col + DCT8_1D col vst1.64 {d16-d19}, [r0,:128]! vst1.64 {d20-d23}, [r0,:128]! @@ -411,7 +411,7 @@ endfunc .else vswp d19, d26 .endif - SUMSUB_SHR 1, q2, q3, q10, q14, q8, q12 // a6/a4 + SUMSUB_SHR 1, q2, q3, q10, q14, q8, q12 // a6/a4 .ifc \type, col vswp d23, d30 .endif @@ -420,8 +420,8 @@ endfunc SUMSUB_AB q14, q15, q15, q9 SUMSUB_15 q15, q14, q13, q11, q12, q9 // a5/a3 - SUMSUB_SHR 2, q13, q14, q14, q15, q11, q9 // b3/b5 - SUMSUB_SHR2 2, q12, q15, q8, q10, q11, q9 // b1/b7 + SUMSUB_SHR 2, q13, q14, q14, q15, q11, q9 // b3/b5 + SUMSUB_SHR2 2, q12, q15, q8, q10, q11, q9 // b1/b7 SUMSUB_AB q10, q2, q0, q2 // b0/b6 SUMSUB_AB q11, q3, q1, q3 // b2/b4 @@ -441,7 +441,7 @@ function add8x8_idct8_neon vld1.64 {d20-d23}, [r1,:128]! vld1.64 {d24-d27}, [r1,:128]! 
- IDCT8_1D row + IDCT8_1D row vtrn.16 q10, q11 vtrn.16 q12, q13 vtrn.16 q14, q15 @@ -450,7 +450,7 @@ function add8x8_idct8_neon vtrn.32 q12, q14 vtrn.32 q13, q15 vswp d17, d24 - IDCT8_1D col + IDCT8_1D col vld1.64 {d0}, [r0,:64], r2 vrshr.s16 q8, q8, #6 diff --git a/common/arm/dct.h b/common/arm/dct.h index 758840c61..e26fabfe8 100644 --- a/common/arm/dct.h +++ b/common/arm/dct.h @@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: arm transform and zigzag ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S index 54e55f144..b013e8d98 100644 --- a/common/arm/deblock-a.S +++ b/common/arm/deblock-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * deblock.S: arm deblocking ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: Mans Rullgard * Martin Storsjo @@ -216,7 +216,7 @@ endfunc beq 9f sub sp, sp, #32 - vst1.8 {q12-q13}, [sp,:128] + vst1.8 {q12-q13}, [sp,:128] vshll.u8 q4, d18, #1 @ 2*p1 vshll.u8 q5, d19, #1 @@ -252,7 +252,7 @@ endfunc vdup.8 q4, r3 @ beta vabd.u8 q5, q10, q8 @ abs(p2 - p0) - vld1.8 {q6-q7}, [sp,:128] @ if_1, if_2 + vld1.8 {q6-q7}, [sp,:128] @ if_1, if_2 vclt.u8 q5, q5, q4 @ < beta if_3 vand q7, q7, q5 @ if_2 && if_3 @@ -309,7 +309,7 @@ endfunc vdup.8 q4, r3 @ beta vabd.u8 q5, q2, q0 @ abs(q2 - q0) - vld1.8 {q6-q7}, [sp,:128]! @ if_1, if_2 + vld1.8 {q6-q7}, [sp,:128]! @ if_1, if_2 vclt.u8 q5, q5, q4 @ < beta if_4 vand q7, q7, q5 @ if_2 && if_4 diff --git a/common/arm/deblock.h b/common/arm/deblock.h index 73adf55d5..cc135cf5c 100644 --- a/common/arm/deblock.h +++ b/common/arm/deblock.h @@ -1,7 +1,7 @@ /***************************************************************************** * deblock.h: arm deblocking ***************************************************************************** - * Copyright (C) 2017-2023 x264 project + * Copyright (C) 2017-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S index edef85059..8eca86d1a 100644 --- a/common/arm/mc-a.S +++ b/common/arm/mc-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * mc.S: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Mans Rullgard @@ -916,8 +916,8 @@ mc_chroma_w\width: beq 4f - vld1.64 {d4}, [r3], r4 - vld1.64 {d6}, [r3], r4 + vld1.64 {d4}, [r3], r4 + vld1.64 {d6}, [r3], r4 3: // vertical interpolation loop @@ -982,8 +982,8 @@ mc_chroma_w\width: pop {r4-r8, pc} .endm - CHROMA_MC 2, 16 - CHROMA_MC 4, 32 + CHROMA_MC 2, 16 + CHROMA_MC 4, 32 mc_chroma_w8: CHROMA_MC_START d4, d7, d8, d11 diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c index 391264087..68f4be81c 100644 --- a/common/arm/mc-c.c +++ b/common/arm/mc-c.c @@ -1,7 +1,7 @@ /***************************************************************************** * mc-c.c: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau diff 
--git a/common/arm/mc.h b/common/arm/mc.h index d1c4f223c..002b30a88 100644 --- a/common/arm/mc.h +++ b/common/arm/mc.h @@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: arm motion compensation ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S index a8df8c403..74be7de6f 100644 --- a/common/arm/pixel-a.S +++ b/common/arm/pixel-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * pixel.S: arm pixel metrics ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau @@ -392,17 +392,17 @@ SAD_X_FUNC 4, 16, 16 function pixel_vsad_neon subs r2, r2, #2 - vld1.8 {q0}, [r0], r1 - vld1.8 {q1}, [r0], r1 + vld1.8 {q0}, [r0], r1 + vld1.8 {q1}, [r0], r1 vabdl.u8 q2, d0, d2 vabdl.u8 q3, d1, d3 ble 2f 1: subs r2, r2, #2 - vld1.8 {q0}, [r0], r1 + vld1.8 {q0}, [r0], r1 vabal.u8 q2, d2, d0 vabal.u8 q3, d3, d1 - vld1.8 {q1}, [r0], r1 + vld1.8 {q1}, [r0], r1 blt 2f vabal.u8 q2, d0, d2 vabal.u8 q3, d1, d3 @@ -417,20 +417,20 @@ endfunc function pixel_asd8_neon ldr r12, [sp, #0] sub r12, r12, #2 - vld1.8 {d0}, [r0], r1 - vld1.8 {d1}, [r2], r3 - vld1.8 {d2}, [r0], r1 - vld1.8 {d3}, [r2], r3 + vld1.8 {d0}, [r0], r1 + vld1.8 {d1}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d3}, [r2], r3 vsubl.u8 q8, d0, d1 1: subs r12, r12, #2 - vld1.8 {d4}, [r0], r1 - vld1.8 {d5}, [r2], r3 + vld1.8 {d4}, [r0], r1 + vld1.8 {d5}, [r2], r3 vsubl.u8 q9, d2, d3 vsubl.u8 q10, d4, d5 vadd.s16 q8, q9 - vld1.8 {d2}, [r0], r1 - vld1.8 {d3}, [r2], r3 + vld1.8 {d2}, [r0], r1 + vld1.8 {d3}, [r2], r3 vadd.s16 q8, q10 bgt 1b vsubl.u8 q9, d2, d3 @@ -545,7 +545,7 @@ SSD_FUNC 16, 8 SSD_FUNC 16, 16 function pixel_ssd_nv12_core_neon - push {r4-r5} + push {r4-r5} ldrd r4, r5, [sp, #8] add r12, r4, #8 bic r12, r12, #15 @@ -555,10 +555,10 @@ function pixel_ssd_nv12_core_neon sub r3, r3, r12, lsl #1 1: subs r12, r4, #16 - vld2.8 {d0,d1}, [r0]! - vld2.8 {d2,d3}, [r2]! - vld2.8 {d4,d5}, [r0]! - vld2.8 {d6,d7}, [r2]! + vld2.8 {d0,d1}, [r0]! + vld2.8 {d2,d3}, [r2]! + vld2.8 {d4,d5}, [r0]! + vld2.8 {d6,d7}, [r2]! vsubl.u8 q10, d0, d2 vsubl.u8 q11, d1, d3 @@ -574,8 +574,8 @@ function pixel_ssd_nv12_core_neon 2: vmlal.s16 q14, d24, d24 vmlal.s16 q15, d26, d26 - vld2.8 {d0,d1}, [r0]! - vld2.8 {d2,d3}, [r2]! + vld2.8 {d0,d1}, [r0]! + vld2.8 {d2,d3}, [r2]! vmlal.s16 q14, d25, d25 vmlal.s16 q15, d27, d27 @@ -584,8 +584,8 @@ function pixel_ssd_nv12_core_neon vsubl.u8 q11, d1, d3 vmlal.s16 q14, d20, d20 vmlal.s16 q15, d22, d22 - vld2.8 {d4,d5}, [r0]! - vld2.8 {d6,d7}, [r2]! + vld2.8 {d4,d5}, [r0]! + vld2.8 {d6,d7}, [r2]! 
vmlal.s16 q14, d21, d21 vmlal.s16 q15, d23, d23 blt 4f @@ -611,10 +611,10 @@ function pixel_ssd_nv12_core_neon vadd.u64 d16, d16, d17 vadd.u64 d18, d18, d19 ldrd r4, r5, [sp, #16] - vst1.64 {d16}, [r4] - vst1.64 {d18}, [r5] + vst1.64 {d16}, [r4] + vst1.64 {d18}, [r5] - pop {r4-r5} + pop {r4-r5} bx lr endfunc @@ -942,7 +942,7 @@ endfunc function pixel_satd_8x8_neon mov ip, lr - bl satd_8x8_neon + bl satd_8x8_neon vadd.u16 q0, q12, q13 vadd.u16 q1, q14, q15 @@ -957,11 +957,11 @@ function pixel_satd_8x16_neon vpush {d8-d11} mov ip, lr - bl satd_8x8_neon + bl satd_8x8_neon vadd.u16 q4, q12, q13 vadd.u16 q5, q14, q15 - bl satd_8x8_neon + bl satd_8x8_neon vadd.u16 q4, q4, q12 vadd.u16 q5, q5, q13 vadd.u16 q4, q4, q14 @@ -1444,9 +1444,9 @@ function pixel_ssim_4x4x2_core_neon vld1.64 {d28}, [r0], r1 vmull.u8 q15, d2, d2 - SSIM_ITER 1, q8, q9, q14, q2, q3, q15, d28, d29, d26 - SSIM_ITER 2, q10,q11,q13, q8, q9, q14, d26, d27, d28 - SSIM_ITER 3, q8, q9, q15, q10,q11,q13, d28, d29 + SSIM_ITER 1, q8, q9, q14, q2, q3, q15, d28, d29, d26 + SSIM_ITER 2, q10,q11,q13, q8, q9, q14, d26, d27, d28 + SSIM_ITER 3, q8, q9, q15, q10,q11,q13, d28, d29 vpadal.u16 q2, q8 vpaddl.u16 q0, q0 diff --git a/common/arm/pixel.h b/common/arm/pixel.h index 3065027a9..50da9c5de 100644 --- a/common/arm/pixel.h +++ b/common/arm/pixel.h @@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: arm pixel metrics ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S index 421bae702..eed73077f 100644 --- a/common/arm/predict-a.S +++ b/common/arm/predict-a.S @@ -1,7 +1,7 @@ /***************************************************************************** * predict.S: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Mans Rullgard @@ -223,11 +223,11 @@ function predict_8x8_dc_neon add r2, r2, r3 lsr r2, r2, #4 - vdup.8 d0, r2 + vdup.8 d0, r2 .rept 8 vst1.64 {d0}, [r0,:64], ip .endr - pop {r4-r5,pc} + pop {r4-r5,pc} endfunc function predict_8x8_h_neon diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c index 00902f502..88b6cd61e 100644 --- a/common/arm/predict-c.c +++ b/common/arm/predict-c.c @@ -1,7 +1,7 @@ /***************************************************************************** * predict.c: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * diff --git a/common/arm/predict.h b/common/arm/predict.h index 451fcdeea..1722c70dc 100644 --- a/common/arm/predict.h +++ b/common/arm/predict.h @@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: arm intra prediction ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S index 42111f21b..92dd6ab0c 100644 --- a/common/arm/quant-a.S +++ b/common/arm/quant-a.S @@ -1,7 +1,7 @@ /**************************************************************************** * quant.S: arm quantization and level-run 
***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: David Conrad * Janne Grunau @@ -136,9 +136,9 @@ function quant_4x4x4_neon vabs.s16 q9, q15 QUANT_TWO q0, q1, d4, d5, d6, d7, q7 vorr d8, d8, d9 - vorr d10, d10, d11 - vorr d12, d12, d13 - vorr d14, d14, d15 + vorr d10, d10, d11 + vorr d12, d12, d13 + vorr d14, d14, d15 vmov r0, r1, d8 vmov r2, r3, d10 orrs r0, r1 diff --git a/common/arm/quant.h b/common/arm/quant.h index b41b52809..3124c2cee 100644 --- a/common/arm/quant.h +++ b/common/arm/quant.h @@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: arm quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: David Conrad * diff --git a/common/base.c b/common/base.c index d302fed12..faf2050e0 100644 --- a/common/base.c +++ b/common/base.c @@ -1,7 +1,7 @@ /***************************************************************************** * base.c: misc common functions (bit depth independent) ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/common/base.h b/common/base.h index fa765519c..d122a41e0 100644 --- a/common/base.h +++ b/common/base.h @@ -1,7 +1,7 @@ /***************************************************************************** * base.h: misc common functions (bit depth independent) ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/common/bitstream.c b/common/bitstream.c index 9bf2b61c7..702e3e3aa 100644 --- a/common/bitstream.c +++ b/common/bitstream.c @@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.c: bitstream writing ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Fiona Glaser diff --git a/common/bitstream.h b/common/bitstream.h index a772eb241..2116f71b3 100644 --- a/common/bitstream.h +++ b/common/bitstream.h @@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.h: bitstream writing ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Fiona Glaser diff --git a/common/cabac.c b/common/cabac.c index 743afee78..ff34bc91c 100644 --- a/common/cabac.c +++ b/common/cabac.c @@ -1,7 +1,7 @@ /***************************************************************************** * cabac.c: arithmetic coder ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/common/cabac.h b/common/cabac.h index 5b757560c..4e7e86bc9 100644 --- a/common/cabac.h +++ b/common/cabac.h @@ -1,7 +1,7 @@ /***************************************************************************** * cabac.h: arithmetic coder 
*****************************************************************************
- * Copyright (C) 2003-2023 x264 project
+ * Copyright (C) 2003-2024 x264 project
*
* Authors: Loren Merritt
* Laurent Aimar
diff --git a/common/common.c b/common/common.c
index de3014613..d4ca657c5 100644
--- a/common/common.c
+++ b/common/common.c
@@ -1,7 +1,7 @@
/*****************************************************************************
* common.c: misc common functions
*****************************************************************************
- * Copyright (C) 2003-2023 x264 project
+ * Copyright (C) 2003-2024 x264 project
*
* Authors: Loren Merritt
* Laurent Aimar
diff --git a/common/common.h b/common/common.h
index 975e1d4f6..4918cc059 100644
--- a/common/common.h
+++ b/common/common.h
@@ -1,7 +1,7 @@
/*****************************************************************************
* common.h: misc common functions
*****************************************************************************
- * Copyright (C) 2003-2023 x264 project
+ * Copyright (C) 2003-2024 x264 project
*
* Authors: Laurent Aimar
* Loren Merritt
diff --git a/common/cpu.c b/common/cpu.c
index 4f08f0a13..fb7c07d53 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -1,7 +1,7 @@
/*****************************************************************************
* cpu.c: cpu detection
*****************************************************************************
- * Copyright (C) 2003-2023 x264 project
+ * Copyright (C) 2003-2024 x264 project
*
* Authors: Loren Merritt
* Laurent Aimar
@@ -96,8 +96,13 @@ const x264_cpu_name_t x264_cpu_names[] =
#elif ARCH_AARCH64
{"ARMv8", X264_CPU_ARMV8},
{"NEON", X264_CPU_NEON},
+ {"SVE", X264_CPU_SVE},
+ {"SVE2", X264_CPU_SVE2},
#elif ARCH_MIPS
{"MSA", X264_CPU_MSA},
+#elif ARCH_LOONGARCH
+ {"LSX", X264_CPU_LSX},
+ {"LASX", X264_CPU_LASX},
#endif
{"", 0},
};
@@ -305,7 +310,7 @@ uint32_t x264_cpu_detect( void )
#elif HAVE_ALTIVEC
-#if SYS_MACOSX || SYS_OPENBSD || SYS_FREEBSD
+#if SYS_MACOSX || SYS_OPENBSD || SYS_FREEBSD || SYS_NETBSD
uint32_t x264_cpu_detect( void )
{
@@ -320,6 +325,8 @@ uint32_t x264_cpu_detect( void )
size_t length = sizeof( has_altivec );
#if SYS_MACOSX || SYS_OPENBSD
int error = sysctl( selectors, 2, &has_altivec, &length, NULL, 0 );
+#elif SYS_NETBSD
+ int error = sysctlbyname( "machdep.altivec", &has_altivec, &length, NULL, 0 );
#else
int error = sysctlbyname( "hw.altivec", &has_altivec, &length, NULL, 0 );
#endif
@@ -358,6 +365,14 @@ uint32_t x264_cpu_detect( void )
return X264_CPU_ALTIVEC;
#endif
}
+
+#else
+
+uint32_t x264_cpu_detect( void )
+{
+ return 0;
+}
+
#endif
#elif HAVE_ARMV6
@@ -405,13 +420,49 @@ uint32_t x264_cpu_detect( void )
#elif HAVE_AARCH64
+#ifdef __linux__
+#include <sys/auxv.h>
+
+#define HWCAP_AARCH64_SVE (1 << 22)
+#define HWCAP2_AARCH64_SVE2 (1 << 1)
+
+static uint32_t detect_flags( void )
+{
+ uint32_t flags = 0;
+
+ unsigned long hwcap = getauxval( AT_HWCAP );
+ unsigned long hwcap2 = getauxval( AT_HWCAP2 );
+ if ( hwcap & HWCAP_AARCH64_SVE )
+ flags |= X264_CPU_SVE;
+ if ( hwcap2 & HWCAP2_AARCH64_SVE2 )
+ flags |= X264_CPU_SVE2;
+
+ return flags;
+}
+#endif
+
uint32_t x264_cpu_detect( void )
{
+ uint32_t flags = X264_CPU_ARMV8;
#if HAVE_NEON
- return X264_CPU_ARMV8 | X264_CPU_NEON;
-#else
- return X264_CPU_ARMV8;
+ flags |= X264_CPU_NEON;
+#endif
+
+ // If these features are enabled unconditionally in the compiler, we can
+ // assume that they are available.
+#ifdef __ARM_FEATURE_SVE
+ flags |= X264_CPU_SVE;
+#endif
+#ifdef __ARM_FEATURE_SVE2
+ flags |= X264_CPU_SVE2;
#endif
+
+ // Where possible, try to do runtime detection as well.
+#ifdef __linux__
+ flags |= detect_flags();
+#endif
+
+ return flags;
}
#elif HAVE_MSA
@@ -421,6 +472,25 @@ uint32_t x264_cpu_detect( void )
return X264_CPU_MSA;
}
+#elif HAVE_LSX
+#include <sys/auxv.h>
+
+#define LA_HWCAP_LSX ( 1U << 4 )
+#define LA_HWCAP_LASX ( 1U << 5 )
+
+uint32_t x264_cpu_detect( void )
+{
+ uint32_t flags = 0;
+ uint32_t hwcap = (uint32_t)getauxval( AT_HWCAP );
+
+ if( hwcap & LA_HWCAP_LSX )
+ flags |= X264_CPU_LSX;
+ if( hwcap & LA_HWCAP_LASX )
+ flags |= X264_CPU_LASX;
+
+ return flags;
+}
+
#else
uint32_t x264_cpu_detect( void )
diff --git a/common/cpu.h b/common/cpu.h
index be3182ed5..0d29b251e 100644
--- a/common/cpu.h
+++ b/common/cpu.h
@@ -1,7 +1,7 @@
/*****************************************************************************
* cpu.h: cpu detection
*****************************************************************************
- * Copyright (C) 2004-2023 x264 project
+ * Copyright (C) 2004-2024 x264 project
*
* Authors: Loren Merritt
*
diff --git a/common/dct.c b/common/dct.c
index ff3f48c40..e03c7a905 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -1,7 +1,7 @@
/*****************************************************************************
* dct.c: transform and zigzag
*****************************************************************************
- * Copyright (C) 2003-2023 x264 project
+ * Copyright (C) 2003-2024 x264 project
*
* Authors: Loren Merritt
* Laurent Aimar
@@ -41,7 +41,9 @@
#if HAVE_MSA
# include "mips/dct.h"
#endif
-
+#if HAVE_LSX
+# include "loongarch/dct.h"
+#endif
static void dct4x4dc( dctcoef d[16] )
{
dctcoef tmp[16];
@@ -705,6 +707,18 @@ void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
}
+#if HAVE_SVE
+ if ( cpu&X264_CPU_SVE )
+ {
+ dctf->sub4x4_dct = x264_sub4x4_dct_sve;
+ }
+#endif
+#if HAVE_SVE2
+ if ( cpu&X264_CPU_SVE2 )
+ {
+ dctf->add4x4_idct = x264_add4x4_idct_sve2;
+ }
+#endif
#endif
#if HAVE_MSA
@@ -727,6 +741,38 @@ void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf )
}
#endif
+#if HAVE_LSX
+ if( cpu&X264_CPU_LSX )
+ {
+ dctf->sub4x4_dct = x264_sub4x4_dct_lsx;
+ dctf->add4x4_idct = x264_add4x4_idct_lsx;
+ dctf->dct4x4dc = x264_dct4x4dc_lsx;
+ dctf->idct4x4dc = x264_idct4x4dc_lsx;
+ dctf->sub8x8_dct8 = x264_sub8x8_dct8_lsx;
+ dctf->sub8x8_dct = x264_sub8x8_dct_lsx;
+ dctf->add8x8_idct = x264_add8x8_idct_lsx;
+ dctf->add8x8_idct8 = x264_add8x8_idct8_lsx;
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lsx;
+ dctf->add16x16_idct = x264_add16x16_idct_lsx;
+ dctf->sub16x16_dct = x264_sub16x16_dct_lsx;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lsx;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_lsx;
+ }
+ if( cpu&X264_CPU_LASX )
+ {
+ dctf->sub8x8_dct = x264_sub8x8_dct_lasx;
+ dctf->sub16x16_dct = x264_sub16x16_dct_lasx;
+ dctf->add8x8_idct = x264_add8x8_idct_lasx;
+ dctf->add8x8_idct8 = x264_add8x8_idct8_lasx;
+ dctf->add16x16_idct = x264_add16x16_idct_lasx;
+ dctf->sub16x16_dct8 = x264_sub16x16_dct8_lasx;
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_lasx;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_lasx;
+ dctf->dct4x4dc = x264_dct4x4dc_lasx;
+ dctf->idct4x4dc = x264_idct4x4dc_lasx;
+ }
+#endif
+
#endif // HIGH_BIT_DEPTH
}
@@ -1071,6 +1117,12 @@ void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x26
pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon; } +#if HAVE_SVE + if( cpu&X264_CPU_SVE ) + { + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sve; + } +#endif #endif // HAVE_AARCH64 #if HAVE_ALTIVEC @@ -1087,5 +1139,12 @@ void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x26 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa; } #endif + +#if HAVE_LSX + if( cpu&X264_CPU_LASX ) + { + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_lasx; + } +#endif #endif // !HIGH_BIT_DEPTH } diff --git a/common/dct.h b/common/dct.h index ff6888b0e..324034d95 100644 --- a/common/dct.h +++ b/common/dct.h @@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: transform and zigzag ***************************************************************************** - * Copyright (C) 2004-2023 x264 project + * Copyright (C) 2004-2024 x264 project * * Authors: Loren Merritt * diff --git a/common/deblock.c b/common/deblock.c index c17ade26e..d5418e2d0 100644 --- a/common/deblock.c +++ b/common/deblock.c @@ -1,7 +1,7 @@ /***************************************************************************** * deblock.c: deblocking ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt @@ -680,6 +680,9 @@ void x264_macroblock_deblock( x264_t *h ) #if HAVE_MSA #include "mips/deblock.h" #endif +#if HAVE_LSX +#include "loongarch/deblock.h" +#endif void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff ) { @@ -800,6 +803,12 @@ void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon; pf->deblock_strength = x264_deblock_strength_neon; } +#if HAVE_SVE + if ( cpu&X264_CPU_SVE ) + { + pf->deblock_chroma[1] = x264_deblock_v_chroma_sve; + } +#endif #endif #if HAVE_MSA @@ -816,6 +825,24 @@ void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff ) pf->deblock_strength = x264_deblock_strength_msa; } #endif + +#if HAVE_LSX + if( cpu&X264_CPU_LSX ) + { + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lsx; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lsx; + pf->deblock_strength = x264_deblock_strength_lsx; + } + if( cpu&X264_CPU_LASX ) + { + pf->deblock_luma[1] = x264_deblock_v_luma_lasx; + pf->deblock_luma[0] = x264_deblock_h_luma_lasx; + pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_lasx; + pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_lasx; + pf->deblock_strength = x264_deblock_strength_lasx; + } +#endif + #endif // !HIGH_BIT_DEPTH /* These functions are equivalent, so don't duplicate them. 
*/ diff --git a/common/frame.c b/common/frame.c index e282c8f54..6bc1b3d72 100644 --- a/common/frame.c +++ b/common/frame.c @@ -1,7 +1,7 @@ /***************************************************************************** * frame.c: frame handling ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/common/frame.h b/common/frame.h index dd0a6d2da..f5393f021 100644 --- a/common/frame.h +++ b/common/frame.h @@ -1,7 +1,7 @@ /***************************************************************************** * frame.h: frame handling ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/common/loongarch/dct-a.S b/common/loongarch/dct-a.S new file mode 100644 index 000000000..86780e4e9 --- /dev/null +++ b/common/loongarch/dct-a.S @@ -0,0 +1,2016 @@ +/***************************************************************************** + * dct-a.S: LoongArch transform and zigzag + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Peng Zhou + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ +#include "loongson_asm.S" +#include "loongson_util.S" + +const hsub_mul +.rept 16 +.byte 1, -1 +.endr +endconst + +const last64_shuf +.int 0, 4, 1, 5, 2, 6, 3, 7 +endconst + +const zigzag_scan4 +.short 0, 4, 1, 2, 5, 8, 12, 9, 6, 3, 7, 10, 13, 14, 11, 15 +endconst + +.macro LOAD_DIFF8x4_LASX s1, s2, s3, s4, s5, s6, s7, s8, s9, s10 + fld.d $f\s1, a1, FENC_STRIDE * \s7 + fld.d $f\s2, a1, FENC_STRIDE * \s8 + fld.d $f\s5, a1, FENC_STRIDE * \s9 + fld.d $f\s6, a1, FENC_STRIDE * \s10 + xvinsve0.d $xr\s1, $xr\s5, 2 + xvinsve0.d $xr\s2, $xr\s6, 2 + fld.d $f\s3, a2, FDEC_STRIDE * \s7 + fld.d $f\s4, a2, FDEC_STRIDE * \s8 + fld.d $f\s5, a2, FDEC_STRIDE * \s9 + fld.d $f\s6, a2, FDEC_STRIDE * \s10 + xvinsve0.d $xr\s3, $xr\s5, 2 + xvinsve0.d $xr\s4, $xr\s6, 2 + xvilvl.b $xr\s1, xr8, $xr\s1 + xvilvl.b $xr\s2, xr8, $xr\s2 + xvilvl.b $xr\s3, xr8, $xr\s3 + xvilvl.b $xr\s4, xr8, $xr\s4 + xvsub.h $xr\s1, $xr\s1, $xr\s3 + xvsub.h $xr\s2, $xr\s2, $xr\s4 +.endm + +.macro DCT4_1D_LASX s0, s1, s2, s3, s4 + xvadd.h \s4, \s3, \s0 + xvsub.h \s0, \s0, \s3 + xvadd.h \s3, \s2, \s1 + xvsub.h \s1, \s1, \s2 + xvadd.h \s2, \s3, \s4 + xvsub.h \s4, \s4, \s3 + xvsub.h \s3, \s0, \s1 + xvsub.h \s3, \s3, \s1 + xvadd.h \s0, \s0, \s0 + xvadd.h \s0, \s0, \s1 +.endm + +.macro LSX_SUMSUB_H sum, sub, a, b + vadd.h \sum, \a, \b + vsub.h \sub, \a, \b +.endm + +.macro DCT4_1D_LSX s0, s1, s2, s3, s4, s5, s6, s7 + LSX_SUMSUB_H \s1, \s6, \s5, \s6 + LSX_SUMSUB_H \s3, \s7, \s4, \s7 + vadd.h \s0, \s3, \s1 + vadd.h \s4, \s7, \s7 + vadd.h \s5, \s6, \s6 + vsub.h \s2, \s3, \s1 + vadd.h \s1, \s4, \s6 + vsub.h \s3, \s7, \s5 +.endm + +.macro SUB8x8_DCT_CORE_LASX + LOAD_DIFF8x4_LASX 0, 1, 2, 3, 4, 5, 0, 1, 4, 5 + LOAD_DIFF8x4_LASX 2, 3, 4, 5, 6, 7, 2, 3, 6, 7 + DCT4_1D_LASX xr0, xr1, xr2, xr3, xr4 + LASX_TRANSPOSE2x4x4_H xr0, xr2, xr3, xr4, xr0, xr1, \ + xr2, xr3, xr10, xr12, xr13 + + DCT4_1D_LASX xr2, xr0, xr3, xr1, xr4 + xvilvh.d xr0, xr2, xr3 /* 6, 2 */ + xvilvl.d xr3, xr2, xr3 /* 4, 0 */ + xvilvh.d xr2, xr1, xr4 /* 7, 3 */ + xvilvl.d xr4, xr1, xr4 /* 5, 1 */ + xvor.v xr1, xr3, xr3 + xvpermi.q xr3, xr4, 0x02 /* 1, 0 */ + xvor.v xr5, xr0, xr0 + xvpermi.q xr0, xr2, 0x02 /* 3, 2 */ + xvpermi.q xr1, xr4, 0x13 /* 4, 5 */ + xvpermi.q xr5, xr2, 0x13 /* 7, 6 */ + xvst xr3, a0, 0 + xvst xr0, a0, 16 * 2 + xvst xr1, a0, 16 * 4 + xvst xr5, a0, 16 * 6 +.endm + +.macro SUB8x8_DCT_CORE_LSX + fld.d f0, a1, FENC_STRIDE * 0 + fld.d f1, a1, FENC_STRIDE * 1 + fld.d f4, a1, FENC_STRIDE * 4 + fld.d f5, a1, FENC_STRIDE * 5 + fld.d f2, a2, FDEC_STRIDE * 0 + fld.d f3, a2, FDEC_STRIDE * 1 + fld.d f6, a2, FDEC_STRIDE * 4 + fld.d f7, a2, FDEC_STRIDE * 5 + + vilvl.b vr0, vr8, vr0 + vilvl.b vr1, vr8, vr1 + vilvl.b vr4, vr8, vr4 + vilvl.b vr5, vr8, vr5 + vilvl.b vr2, vr8, vr2 + vilvl.b vr3, vr8, vr3 + vilvl.b vr6, vr8, vr6 + vilvl.b vr7, vr8, vr7 + vsub.h vr0, vr0, vr2 + vsub.h vr4, vr4, vr6 + vsub.h vr1, vr1, vr3 + vsub.h vr5, vr5, vr7 + + fld.d f2, a1, FENC_STRIDE * 2 + fld.d f3, a1, FENC_STRIDE * 3 + fld.d f6, a1, FENC_STRIDE * 6 + fld.d f7, a1, FENC_STRIDE * 7 + fld.d f9, a2, FDEC_STRIDE * 2 + fld.d f11, a2, FDEC_STRIDE * 3 + fld.d f10, a2, FDEC_STRIDE * 6 + fld.d f12, a2, FDEC_STRIDE * 7 + + vilvl.b vr2, vr8, vr2 + vilvl.b vr3, vr8, vr3 + vilvl.b vr6, vr8, vr6 + vilvl.b vr7, vr8, vr7 + vilvl.b vr9, vr8, vr9 + vilvl.b vr11, vr8, vr11 + vilvl.b vr10, vr8, vr10 + vilvl.b vr12, vr8, vr12 + vsub.h vr2, vr2, vr9 + vsub.h vr6, vr6, vr10 + vsub.h vr3, vr3, vr11 + vsub.h vr7, vr7, vr12 + + vadd.h vr9, 
vr3, vr0 + vadd.h vr10, vr7, vr4 + vsub.h vr0, vr0, vr3 + vsub.h vr4, vr4, vr7 + vadd.h vr3, vr2, vr1 + vadd.h vr7, vr6, vr5 + vsub.h vr1, vr1, vr2 + vsub.h vr5, vr5, vr6 + + vadd.h vr2, vr3, vr9 + vadd.h vr6, vr7, vr10 + vsub.h vr9, vr9, vr3 + vsub.h vr10, vr10, vr7 + + vsub.h vr3, vr0, vr1 + vsub.h vr7, vr4, vr5 + vsub.h vr3, vr3, vr1 + vsub.h vr7, vr7, vr5 + vadd.h vr0, vr0, vr0 + vadd.h vr4, vr4, vr4 + vadd.h vr0, vr0, vr1 + vadd.h vr4, vr4, vr5 + + vilvh.h vr11, vr0, vr2 + vilvh.h vr12, vr4, vr6 + vilvl.h vr13, vr0, vr2 + vilvl.h vr14, vr4, vr6 + vilvh.h vr15, vr3, vr9 + vilvh.h vr16, vr7, vr10 + vilvl.h vr17, vr3, vr9 + vilvl.h vr18, vr7, vr10 + + vilvh.w vr19, vr17, vr13 + vilvh.w vr20, vr18, vr14 + vilvl.w vr13, vr17, vr13 + vilvl.w vr14, vr18, vr14 + vilvh.w vr17, vr15, vr11 + vilvh.w vr18, vr16, vr12 + vilvl.w vr11, vr15, vr11 + vilvl.w vr12, vr16, vr12 + + vilvh.d vr0, vr11, vr13 + vilvh.d vr4, vr12, vr14 + vilvl.d vr2, vr11, vr13 + vilvl.d vr6, vr12, vr14 + vilvh.d vr1, vr17, vr19 + vilvh.d vr5, vr18, vr20 + vilvl.d vr3, vr17, vr19 + vilvl.d vr7, vr18, vr20 + + vadd.h vr9, vr1, vr2 + vadd.h vr10, vr5, vr6 + vsub.h vr2, vr2, vr1 + vsub.h vr6, vr6, vr5 + vadd.h vr1, vr3, vr0 + vadd.h vr5, vr7, vr4 + vsub.h vr0, vr0, vr3 + vsub.h vr4, vr4, vr7 + + vadd.h vr3, vr1, vr9 + vadd.h vr7, vr5, vr10 + vsub.h vr9, vr9, vr1 + vsub.h vr10, vr10, vr5 + + vsub.h vr1, vr2, vr0 + vsub.h vr5, vr6, vr4 + vsub.h vr1, vr1, vr0 + vsub.h vr5, vr5, vr4 + vadd.h vr2, vr2, vr2 + vadd.h vr6, vr6, vr6 + vadd.h vr2, vr2, vr0 + vadd.h vr6, vr6, vr4 + + vilvh.d vr0, vr2, vr3 + vilvh.d vr4, vr6, vr7 + vilvl.d vr3, vr2, vr3 + vilvl.d vr7, vr6, vr7 + vilvh.d vr2, vr1, vr9 + vilvh.d vr6, vr5, vr10 + vilvl.d vr9, vr1, vr9 + vilvl.d vr10, vr5, vr10 + + vor.v vr1, vr3, vr3 + vor.v vr5, vr7, vr7 + vor.v vr12, vr4, vr4 + + vst vr3, a0, 0 + vst vr9, a0, 16 + vst vr0, a0, 32 + vst vr2, a0, 48 + vst vr5, a0, 64 + vst vr10, a0, 80 + vst vr12, a0, 96 + vst vr6, a0, 112 +.endm + +/* void subwxh_dct( dctcoef*, pixel*, pixel* ) */ +function_x264 sub4x4_dct_lsx + fld.s f0, a1, 0 + fld.s f4, a2, 0 + fld.s f1, a1, FENC_STRIDE + fld.s f5, a2, FDEC_STRIDE + + vsllwil.hu.bu vr0, vr0, 0 + vsllwil.hu.bu vr1, vr1, 0 + vsllwil.hu.bu vr4, vr4, 0 + vsllwil.hu.bu vr5, vr5, 0 + fld.s f2, a1, FENC_STRIDE * 2 + fld.s f6, a2, FDEC_STRIDE * 2 + fld.s f3, a1, FENC_STRIDE * 3 + fld.s f7, a2, FDEC_STRIDE * 3 + vsllwil.hu.bu vr2, vr2, 0 + vsllwil.hu.bu vr3, vr3, 0 + vsllwil.hu.bu vr6, vr6, 0 + vsllwil.hu.bu vr7, vr7, 0 + vsub.h vr0, vr0, vr4 + vsub.h vr1, vr1, vr5 + vsub.h vr2, vr2, vr6 + vsub.h vr3, vr3, vr7 + + DCT4_1D_LSX vr4, vr5, vr6, vr7, vr0, vr1, vr2, vr3 + LSX_TRANSPOSE4x4_H vr4, vr5, vr6, vr7, vr4, vr5, vr6, vr7, vr0, vr1 + DCT4_1D_LSX vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + vshuf4i.d vr0, vr1, 0x8 + vshuf4i.d vr2, vr3, 0x8 + vst vr0, a0, 0 + vst vr2, a0, 16 +endfunc_x264 + +function_x264 sub8x8_dct_lasx + xvxor.v xr8, xr8, xr8 + SUB8x8_DCT_CORE_LASX +endfunc_x264 + +function_x264 sub8x8_dct_lsx + vxor.v vr8, vr8, vr8 + SUB8x8_DCT_CORE_LSX +endfunc_x264 + +function_x264 sub16x16_dct_lasx + xvxor.v xr8, xr8, xr8 + SUB8x8_DCT_CORE_LASX + addi.d a0, a0, 32 * 4 + addi.d a1, a1, 8 + addi.d a2, a2, 8 + SUB8x8_DCT_CORE_LASX + addi.d a0, a0, 32 * 4 + addi.d a1, a1, 8*FENC_STRIDE - 8 + addi.d a2, a2, 8*FDEC_STRIDE - 8 + SUB8x8_DCT_CORE_LASX + addi.d a0, a0, 32 * 4 + addi.d a1, a1, 8 + addi.d a2, a2, 8 + SUB8x8_DCT_CORE_LASX +endfunc_x264 + +function_x264 sub16x16_dct_lsx + vxor.v vr8, vr8, vr8 + SUB8x8_DCT_CORE_LSX + addi.d a0, a0, 32 * 4 + 
addi.d a1, a1, 8 + addi.d a2, a2, 8 + SUB8x8_DCT_CORE_LSX + addi.d a0, a0, 32 * 4 + addi.d a1, a1, 8*FENC_STRIDE - 8 + addi.d a2, a2, 8*FDEC_STRIDE - 8 + SUB8x8_DCT_CORE_LSX + addi.d a0, a0, 32 * 4 + addi.d a1, a1, 8 + addi.d a2, a2, 8 + SUB8x8_DCT_CORE_LSX +endfunc_x264 + +/* + * void add4x4_idct( pixel *p_dst, dctcoef dct[16] ) + */ +function_x264 add4x4_idct_lsx + vxor.v vr0, vr1, vr1 + + fld.d f1, a1, 0 + fld.d f2, a1, 8 + fld.d f3, a1, 16 + fld.d f4, a1, 24 + + vsrai.h vr5, vr2, 1 + vsrai.h vr6, vr4, 1 + + vilvl.h vr1, vr1, vr3 + vilvl.h vr15, vr2, vr6 + vilvl.h vr16, vr5, vr4 + + vhaddw.w.h vr7, vr1, vr1 + vhsubw.w.h vr8, vr1, vr1 + vhaddw.w.h vr9, vr15, vr15 + vhsubw.w.h vr10, vr16, vr16 + + vadd.w vr1, vr7, vr9 + vadd.w vr2, vr8, vr10 + vsub.w vr3, vr8, vr10 + vsub.w vr4, vr7, vr9 + + vpickev.h vr1, vr1, vr1 + vpickev.h vr2, vr2, vr2 + vpickev.h vr3, vr3, vr3 + vpickev.h vr4, vr4, vr4 + + LSX_TRANSPOSE4x4_H vr1, vr2, vr3, vr4, vr1, vr2, vr3, vr4, vr5, vr6 + vsrai.h vr5, vr2, 1 + vsrai.h vr6, vr4, 1 + + vilvl.h vr1, vr1, vr3 + vilvl.h vr15, vr2, vr6 + vilvl.h vr16, vr5, vr4 + + vhaddw.w.h vr7, vr1, vr1 + vhsubw.w.h vr8, vr1, vr1 + vhaddw.w.h vr9, vr15, vr15 + vhsubw.w.h vr10, vr16, vr16 + + vadd.w vr1, vr7, vr9 + vadd.w vr2, vr8, vr10 + vsub.w vr3, vr8, vr10 + vsub.w vr4, vr7, vr9 + + vssrarni.h.w vr2, vr1, 6 + vssrarni.h.w vr4, vr3, 6 + + fld.s f1, a0, 0 + fld.s f5, a0, FDEC_STRIDE + fld.s f3, a0, FDEC_STRIDE * 2 + fld.s f6, a0, FDEC_STRIDE * 3 + + vilvl.b vr1, vr0, vr1 + vilvl.b vr5, vr0, vr5 + vilvl.b vr3, vr0, vr3 + vilvl.b vr6, vr0, vr6 + + vilvl.d vr1, vr5, vr1 + vilvl.d vr3, vr6, vr3 + vadd.h vr7, vr1, vr2 + vadd.h vr8, vr3, vr4 + + vssrarni.bu.h vr8, vr7, 0 + + vstelm.w vr8, a0, 0, 0 + vstelm.w vr8, a0, FDEC_STRIDE, 1 + vstelm.w vr8, a0, FDEC_STRIDE * 2, 2 + vstelm.w vr8, a0, FDEC_STRIDE * 3, 3 +endfunc_x264 + +.macro LASX_SUMSUB_W sum, diff, in0, in1 + xvadd.w \sum, \in0, \in1 + xvsub.w \diff, \in0, \in1 +.endm + +.macro add8x4_idct_core_lasx + fld.d f1, a1, 0 + fld.d f2, a1, 8 + fld.d f3, a1, 16 + fld.d f4, a1, 24 + + fld.d f5, a1, 32 + fld.d f6, a1, 40 + fld.d f7, a1, 48 + fld.d f8, a1, 56 + + xvinsve0.d xr1, xr5, 1 + xvinsve0.d xr2, xr6, 1 + xvinsve0.d xr3, xr7, 1 + xvinsve0.d xr4, xr8, 1 + + xvsrai.h xr8, xr2, 1 + xvsrai.h xr9, xr4, 1 + + vext2xv.w.h xr1, xr1 + vext2xv.w.h xr5, xr2 + vext2xv.w.h xr6, xr3 + vext2xv.w.h xr7, xr4 + vext2xv.w.h xr8, xr8 + vext2xv.w.h xr9, xr9 + + LASX_SUMSUB_W xr10, xr11, xr1, xr6 + xvadd.w xr12, xr5, xr9 + xvsub.w xr13, xr8, xr7 + + LASX_SUMSUB_W xr6, xr9, xr10, xr12 + LASX_SUMSUB_W xr7, xr8, xr11, xr13 + + xvpickev.h xr10, xr6, xr6 + xvpickev.h xr11, xr7, xr7 + xvpickev.h xr12, xr8, xr8 + xvpickev.h xr13, xr9, xr9 + + LASX_TRANSPOSE4x8_H xr10, xr11, xr12, xr13, xr10, xr11, xr12, xr13, \ + xr4, xr5 + + xvsllwil.w.h xr10, xr10, 0 + xvsllwil.w.h xr11, xr11, 0 + xvsllwil.w.h xr12, xr12, 0 + xvsllwil.w.h xr13, xr13, 0 + xvsrai.w xr14, xr11, 1 + xvsrai.w xr15, xr13, 1 + + LASX_SUMSUB_W xr4, xr5, xr10, xr12 + xvadd.w xr6, xr11, xr15 + xvsub.w xr7, xr14, xr13 + + LASX_SUMSUB_W xr10, xr13, xr4, xr6 + LASX_SUMSUB_W xr11, xr12, xr5, xr7 + + xvssrarni.h.w xr11, xr10, 6 + xvssrarni.h.w xr13, xr12, 6 + + fld.s f1, a0, 0 + fld.s f2, a0, FDEC_STRIDE + fld.s f3, a0, FDEC_STRIDE * 2 + fld.s f4, a0, FDEC_STRIDE * 3 + + fld.s f5, a0, 4 + fld.s f6, a0, FDEC_STRIDE + 4 + fld.s f7, a0, FDEC_STRIDE * 2 + 4 + fld.s f8, a0, FDEC_STRIDE * 3 + 4 + + xvinsve0.w xr1, xr2, 1 + xvinsve0.w xr3, xr4, 1 + xvinsve0.w xr5, xr6, 1 + xvinsve0.w xr7, xr8, 1 + + xvinsve0.d xr1, 
xr5, 2 + xvinsve0.d xr3, xr7, 2 + + xvilvl.b xr1, xr0, xr1 + xvilvl.b xr3, xr0, xr3 + + xvadd.h xr1, xr1, xr11 + xvadd.h xr3, xr3, xr13 + + xvssrarni.bu.h xr3, xr1, 0 + + xvstelm.w xr3, a0, 0, 0 + xvstelm.w xr3, a0, FDEC_STRIDE, 1 + xvstelm.w xr3, a0, FDEC_STRIDE * 2, 2 + xvstelm.w xr3, a0, FDEC_STRIDE * 3, 3 + + xvstelm.w xr3, a0, 4, 4 + xvstelm.w xr3, a0, FDEC_STRIDE + 4, 5 + xvstelm.w xr3, a0, FDEC_STRIDE * 2 + 4, 6 + xvstelm.w xr3, a0, FDEC_STRIDE * 3 + 4, 7 +.endm + +.macro LSX_SUMSUB_W sum0, sum1, diff0, diff1, in0, in1, in2, in3 + vadd.w \sum0, \in0, \in2 + vadd.w \sum1, \in1, \in3 + vsub.w \diff0, \in0, \in2 + vsub.w \diff1, \in1, \in3 +.endm + +.macro add8x4_idct_core_lsx + fld.d f1, a1, 0 + fld.d f2, a1, 8 + fld.d f3, a1, 16 + fld.d f4, a1, 24 + fld.d f5, a1, 32 + fld.d f6, a1, 40 + fld.d f7, a1, 48 + fld.d f8, a1, 56 + + vpermi.w vr9, vr6, 0x04 + vpermi.w vr9, vr2, 0x44 + vpermi.w vr10, vr8, 0x04 + vpermi.w vr10, vr4, 0x44 + + vsrai.h vr9, vr9, 1 + vsrai.h vr10, vr10, 1 + + vsllwil.w.h vr1, vr1, 0 + vsllwil.w.h vr5, vr5, 0 + vsllwil.w.h vr2, vr2, 0 + vsllwil.w.h vr6, vr6, 0 + vsllwil.w.h vr3, vr3, 0 + vsllwil.w.h vr7, vr7, 0 + vsllwil.w.h vr4, vr4, 0 + vsllwil.w.h vr8, vr8, 0 + vexth.w.h vr11, vr9 + vsllwil.w.h vr9, vr9, 0 + vexth.w.h vr12, vr10 + vsllwil.w.h vr10, vr10, 0 + + LSX_SUMSUB_W vr13, vr14, vr15, vr16, vr1, vr5, vr3, vr7 + vadd.w vr17, vr2, vr10 + vadd.w vr18, vr6, vr12 + vsub.w vr19, vr9, vr4 + vsub.w vr20, vr11, vr8 + + LSX_SUMSUB_W vr3, vr7, vr10, vr12, vr13, vr14, vr17, vr18 + LSX_SUMSUB_W vr4, vr8, vr9, vr11, vr15, vr16, vr19, vr20 + + vpickev.h vr13, vr3, vr3 + vpickev.h vr14, vr7, vr7 + vpickev.h vr15, vr4, vr4 + vpickev.h vr16, vr8, vr8 + vpickev.h vr17, vr9, vr9 + vpickev.h vr18, vr11, vr11 + vpickev.h vr19, vr10, vr10 + vpickev.h vr20, vr12, vr12 + + LSX_TRANSPOSE4x4_H vr13, vr15, vr17, vr19, vr13, vr15, vr17, vr19, vr1, vr3 + LSX_TRANSPOSE4x4_H vr14, vr16, vr18, vr20, vr14, vr16, vr18, vr20, vr2, vr4 + + vsllwil.w.h vr13, vr13, 0 + vsllwil.w.h vr14, vr14, 0 + vsllwil.w.h vr15, vr15, 0 + vsllwil.w.h vr16, vr16, 0 + vsllwil.w.h vr17, vr17, 0 + vsllwil.w.h vr18, vr18, 0 + vsllwil.w.h vr19, vr19, 0 + vsllwil.w.h vr20, vr20, 0 + + vsrai.w vr1, vr15, 1 + vsrai.w vr2, vr16, 1 + vsrai.w vr3, vr19, 1 + vsrai.w vr4, vr20, 1 + + LSX_SUMSUB_W vr5, vr6, vr21, vr22, vr13, vr14, vr17, vr18 + vadd.w vr8, vr15, vr3 + vadd.w vr9, vr16, vr4 + vsub.w vr10, vr1, vr19 + vsub.w vr11, vr2, vr20 + + LSX_SUMSUB_W vr13, vr14, vr19, vr20, vr5, vr6, vr8, vr9 + LSX_SUMSUB_W vr15, vr16, vr17, vr18, vr21, vr22, vr10, vr11 + + vssrarni.h.w vr15, vr13, 6 + vssrarni.h.w vr16, vr14, 6 + vssrarni.h.w vr19, vr17, 6 + vssrarni.h.w vr20, vr18, 6 + + fld.s f1, a0, 0 + fld.s f2, a0, FDEC_STRIDE + fld.s f3, a0, FDEC_STRIDE * 2 + fld.s f4, a0, FDEC_STRIDE * 3 + fld.s f5, a0, 4 + fld.s f6, a0, FDEC_STRIDE + 4 + fld.s f7, a0, FDEC_STRIDE * 2 + 4 + fld.s f8, a0, FDEC_STRIDE * 3 + 4 + + vpickve2gr.w t0, vr2, 0 + vinsgr2vr.w vr1, t0, 1 + vpickve2gr.w t0, vr4, 0 + vinsgr2vr.w vr3, t0, 1 + vpickve2gr.w t0, vr6, 0 + vinsgr2vr.w vr5, t0, 1 + vpickve2gr.w t0, vr8, 0 + vinsgr2vr.w vr7, t0, 1 + + vilvl.b vr1, vr0, vr1 + vilvl.b vr5, vr0, vr5 + vilvl.b vr3, vr0, vr3 + vilvl.b vr7, vr0, vr7 + + vadd.h vr1, vr1, vr15 + vadd.h vr5, vr5, vr16 + vadd.h vr3, vr3, vr19 + vadd.h vr7, vr7, vr20 + + vssrarni.bu.h vr3, vr1, 0 + vssrarni.bu.h vr7, vr5, 0 + + vstelm.w vr3, a0, 0, 0 + vstelm.w vr3, a0, FDEC_STRIDE, 1 + vstelm.w vr3, a0, FDEC_STRIDE * 2, 2 + vstelm.w vr3, a0, FDEC_STRIDE * 3, 3 + + vstelm.w vr7, a0, 4, 0 + 
vstelm.w vr7, a0, FDEC_STRIDE + 4, 1 + vstelm.w vr7, a0, FDEC_STRIDE * 2 + 4, 2 + vstelm.w vr7, a0, FDEC_STRIDE * 3 + 4, 3 +.endm + +/* + * void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] ) + * + */ +function_x264 add8x8_idct_lasx + xvxor.v xr0, xr1, xr1 + add8x4_idct_core_lasx + + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 64 + add8x4_idct_core_lasx +endfunc_x264 + +.macro add8x8_idct_core_lsx + add8x4_idct_core_lsx + + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 64 + add8x4_idct_core_lsx +.endm + +function_x264 add8x8_idct_lsx + vxor.v vr0, vr1, vr1 + add8x8_idct_core_lsx +endfunc_x264 +/* + * void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] ) + */ +function_x264 add16x16_idct_lasx + move t4, a0 + move t5, a1 + + xvxor.v xr0, xr1, xr1 + add8x4_idct_core_lasx + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 64 + add8x4_idct_core_lasx + + addi.d a0, t4, 8 + addi.d a1, t5, 128 + add8x4_idct_core_lasx + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 64 + add8x4_idct_core_lasx + + addi.d t6, t4, FDEC_STRIDE * 8 + move a0, t6 + addi.d a1, t5, 256 + add8x4_idct_core_lasx + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 64 + add8x4_idct_core_lasx + + addi.d a0, t6, 8 + addi.d a1, t5, 384 + add8x4_idct_core_lasx + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 64 + add8x4_idct_core_lasx +endfunc_x264 + +function_x264 add16x16_idct_lsx + move t4, a0 + move t5, a1 + + vxor.v vr0, vr1, vr1 + add8x8_idct_core_lsx + + addi.d a0, t4, 8 + addi.d a1, t5, 128 + add8x8_idct_core_lsx + + addi.d t6, t4, FDEC_STRIDE * 8 + move a0, t6 + addi.d a1, t5, 256 + add8x8_idct_core_lsx + + addi.d a0, t6, 8 + addi.d a1, t5, 384 + add8x8_idct_core_lsx +endfunc_x264 + +/* + * void add8x8_idct8( pixel *dst, dctcoef dct[64] ) + */ +function_x264 add8x8_idct8_lasx + xvxor.v xr20, xr1, xr1 + + // dct[0] += 32 + ld.h t0, a1, 0 + addi.w t0, t0, 32 + st.h t0, a1, 0 + + vld vr0, a1, 0 + vld vr2, a1, 32 + vld vr4, a1, 64 + vld vr6, a1, 96 + + vsrai.h vr8, vr2, 1 + vsrai.h vr10, vr6, 1 + + vext2xv.w.h xr0, xr0 + vext2xv.w.h xr2, xr2 + vext2xv.w.h xr4, xr4 + vext2xv.w.h xr6, xr6 + vext2xv.w.h xr8, xr8 + vext2xv.w.h xr10, xr10 + + LASX_SUMSUB_W xr11, xr12, xr0, xr4 + xvsub.w xr13, xr8, xr6 + xvadd.w xr14, xr10, xr2 + + LASX_SUMSUB_W xr15, xr18, xr11, xr14 + LASX_SUMSUB_W xr16, xr17, xr12, xr13 + + vld vr0, a1, 16 + vld vr2, a1, 48 + vld vr4, a1, 80 + vld vr6, a1, 112 + + vsrai.h vr1, vr0, 1 + vsrai.h vr3, vr2, 1 + vsrai.h vr5, vr4, 1 + vsrai.h vr7, vr6, 1 + + vext2xv.w.h xr0, xr0 + vext2xv.w.h xr2, xr2 + vext2xv.w.h xr4, xr4 + vext2xv.w.h xr6, xr6 + vext2xv.w.h xr1, xr1 + vext2xv.w.h xr3, xr3 + vext2xv.w.h xr5, xr5 + vext2xv.w.h xr7, xr7 + + LASX_SUMSUB_W xr9, xr10, xr4, xr2 + LASX_SUMSUB_W xr11, xr12, xr6, xr0 + + xvsub.w xr10, xr10, xr6 + xvsub.w xr10, xr10, xr7 + xvsub.w xr11, xr11, xr2 + xvsub.w xr11, xr11, xr3 + xvadd.w xr12, xr12, xr4 + xvadd.w xr12, xr12, xr5 + xvadd.w xr9, xr9, xr0 + xvadd.w xr9, xr9, xr1 + + xvsrai.w xr1, xr10, 2 + xvsrai.w xr2, xr11, 2 + xvsrai.w xr3, xr12, 2 + xvsrai.w xr4, xr9, 2 + + xvadd.w xr5, xr4, xr10 + xvadd.w xr6, xr3, xr11 + xvsub.w xr7, xr2, xr12 + xvsub.w xr8, xr9, xr1 + + LASX_SUMSUB_W xr1, xr14, xr15, xr8 + LASX_SUMSUB_W xr2, xr13, xr16, xr7 + LASX_SUMSUB_W xr3, xr12, xr17, xr6 + LASX_SUMSUB_W xr4, xr11, xr18, xr5 + + LASX_TRANSPOSE8x8_W xr1, xr2, xr3, xr4, xr11, xr12, xr13, xr14, \ + xr5, xr6, xr7, xr8, xr15, xr16, xr17, xr18, \ + xr9, xr10, xr21, xr22 + + xvsrai.h xr9, xr7, 1 + xvsrai.h xr10, xr17, 1 + + xvaddwev.w.h xr1, xr5, xr15 + xvsubwev.w.h xr2, xr5, xr15 + 
xvsubwev.w.h xr3, xr9, xr17 + xvaddwev.w.h xr4, xr10, xr7 + + LASX_SUMSUB_W xr11, xr14, xr1, xr4 + LASX_SUMSUB_W xr12, xr13, xr2, xr3 + + xvsrai.h xr1, xr6, 1 + xvsrai.h xr2, xr8, 1 + xvsrai.h xr3, xr16, 1 + xvsrai.h xr4, xr18, 1 + + xvaddwev.w.h xr5, xr16, xr8 + xvsubwev.w.h xr10, xr16, xr8 + xvaddwev.w.h xr7, xr18, xr6 + xvsubwev.w.h xr9, xr18, xr6 + + xvaddwev.w.h xr4, xr18, xr4 + xvsub.w xr10, xr10, xr4 + xvaddwev.w.h xr2, xr8, xr2 + xvsub.w xr7, xr7, xr2 + xvaddwev.w.h xr3, xr16, xr3 + xvadd.w xr9, xr9, xr3 + xvaddwev.w.h xr1, xr6, xr1 + xvadd.w xr5, xr5, xr1 + + xvsrai.w xr1, xr10, 2 + xvsrai.w xr2, xr7, 2 + xvsrai.w xr3, xr9, 2 + xvsrai.w xr4, xr5, 2 + + xvadd.w xr15, xr4, xr10 + xvadd.w xr16, xr7, xr3 + xvsub.w xr17, xr2, xr9 + xvsub.w xr18, xr5, xr1 + + LASX_SUMSUB_W xr1, xr8, xr11, xr18 + LASX_SUMSUB_W xr2, xr7, xr12, xr17 + LASX_SUMSUB_W xr3, xr6, xr13, xr16 + LASX_SUMSUB_W xr4, xr5, xr14, xr15 + + xvsrai.w xr11, xr1, 6 + xvsrai.w xr12, xr2, 6 + xvsrai.w xr13, xr3, 6 + xvsrai.w xr14, xr4, 6 + xvsrai.w xr15, xr5, 6 + xvsrai.w xr16, xr6, 6 + xvsrai.w xr17, xr7, 6 + xvsrai.w xr18, xr8, 6 + + fld.d f1, a0, 0 + fld.d f2, a0, FDEC_STRIDE + fld.d f3, a0, FDEC_STRIDE * 2 + fld.d f4, a0, FDEC_STRIDE * 3 + + fld.d f5, a0, FDEC_STRIDE * 4 + fld.d f6, a0, FDEC_STRIDE * 5 + fld.d f7, a0, FDEC_STRIDE * 6 + fld.d f8, a0, FDEC_STRIDE * 7 + + vext2xv.wu.bu xr1, xr1 + vext2xv.wu.bu xr2, xr2 + vext2xv.wu.bu xr3, xr3 + vext2xv.wu.bu xr4, xr4 + vext2xv.wu.bu xr5, xr5 + vext2xv.wu.bu xr6, xr6 + vext2xv.wu.bu xr7, xr7 + vext2xv.wu.bu xr8, xr8 + + xvadd.w xr1, xr1, xr11 + xvadd.w xr2, xr2, xr12 + xvadd.w xr3, xr3, xr13 + xvadd.w xr4, xr4, xr14 + xvadd.w xr5, xr5, xr15 + xvadd.w xr6, xr6, xr16 + xvadd.w xr7, xr7, xr17 + xvadd.w xr8, xr8, xr18 + + xvssrarni.hu.w xr2, xr1, 0 + xvssrarni.hu.w xr4, xr3, 0 + xvssrarni.hu.w xr6, xr5, 0 + xvssrarni.hu.w xr8, xr7, 0 + + xvpermi.d xr12, xr2, 0xd8 + xvpermi.d xr14, xr4, 0xd8 + xvpermi.d xr16, xr6, 0xd8 + xvpermi.d xr18, xr8, 0xd8 + + xvssrlni.bu.h xr14, xr12, 0 + xvssrlni.bu.h xr18, xr16, 0 + + xvstelm.d xr14, a0, 0, 0 + xvstelm.d xr14, a0, FDEC_STRIDE, 2 + xvstelm.d xr14, a0, FDEC_STRIDE * 2, 1 + xvstelm.d xr14, a0, FDEC_STRIDE * 3, 3 + + xvstelm.d xr18, a0, FDEC_STRIDE * 4, 0 + xvstelm.d xr18, a0, FDEC_STRIDE * 5, 2 + xvstelm.d xr18, a0, FDEC_STRIDE * 6, 1 + xvstelm.d xr18, a0, FDEC_STRIDE * 7, 3 +endfunc_x264 + +function_x264 add8x8_idct8_lsx + ld.h t0, a1, 0 + addi.w t0, t0, 32 + st.h t0, a1, 0 + + vld vr0, a1, 0 + vld vr2, a1, 32 + vld vr4, a1, 64 + vld vr6, a1, 96 + + vsrai.h vr8, vr2, 1 + vsrai.h vr10, vr6, 1 + + vexth.w.h vr1, vr0 + vsllwil.w.h vr0, vr0, 0 + vexth.w.h vr3, vr2 + vsllwil.w.h vr2, vr2, 0 + vexth.w.h vr5, vr4 + vsllwil.w.h vr4, vr4, 0 + vexth.w.h vr7, vr6 + vsllwil.w.h vr6, vr6, 0 + vexth.w.h vr9, vr8 + vsllwil.w.h vr8, vr8, 0 + vexth.w.h vr11, vr10 + vsllwil.w.h vr10, vr10, 0 + + LSX_SUMSUB_W vr12, vr13, vr14, vr15, vr0, vr1, vr4, vr5 + vsub.w vr16, vr8, vr6 + vsub.w vr17, vr9, vr7 + vadd.w vr18, vr10, vr2 + vadd.w vr19, vr11, vr3 + + LSX_SUMSUB_W vr20, vr21, vr18, vr19, vr12, vr13, vr18, vr19 + LSX_SUMSUB_W vr22, vr23, vr16, vr17, vr14, vr15, vr16, vr17 + + vld vr0, a1, 16 + vld vr2, a1, 48 + vld vr4, a1, 80 + vld vr6, a1, 112 + + vsrai.h vr1, vr0, 1 + vsrai.h vr3, vr2, 1 + vsrai.h vr5, vr4, 1 + vsrai.h vr7, vr6, 1 + + vexth.w.h vr8, vr0 + vsllwil.w.h vr0, vr0, 0 + vexth.w.h vr10, vr2 + vsllwil.w.h vr2, vr2, 0 + vexth.w.h vr12, vr4 + vsllwil.w.h vr4, vr4, 0 + vexth.w.h vr14, vr6 + vsllwil.w.h vr6, vr6, 0 + vexth.w.h vr9, vr1 + vsllwil.w.h 
vr1, vr1, 0 + vexth.w.h vr11, vr3 + vsllwil.w.h vr3, vr3, 0 + vexth.w.h vr13, vr5 + vsllwil.w.h vr5, vr5, 0 + vexth.w.h vr15, vr7 + vsllwil.w.h vr7, vr7, 0 + + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + LSX_SUMSUB_W vr24, vr25, vr26, vr27, vr4, vr12, vr2, vr10 + LSX_SUMSUB_W vr28, vr29, vr30, vr31, vr6, vr14, vr0, vr8 + + vsub.w vr26, vr26, vr6 + vsub.w vr27, vr27, vr14 + vsub.w vr26, vr26, vr7 + vsub.w vr27, vr27, vr15 + vsub.w vr28, vr28, vr2 + vsub.w vr29, vr29, vr10 + vsub.w vr28, vr28, vr3 + vsub.w vr29, vr29, vr11 + vadd.w vr30, vr30, vr4 + vadd.w vr31, vr31, vr12 + vadd.w vr30, vr30, vr5 + vadd.w vr31, vr31, vr13 + vadd.w vr24, vr24, vr0 + vadd.w vr25, vr25, vr8 + vadd.w vr24, vr24, vr1 + vadd.w vr25, vr25, vr9 + + vsrai.w vr1, vr26, 2 + vsrai.w vr9, vr27, 2 + vsrai.w vr2, vr28, 2 + vsrai.w vr10, vr29, 2 + vsrai.w vr3, vr30, 2 + vsrai.w vr11, vr31, 2 + vsrai.w vr4, vr24, 2 + vsrai.w vr12, vr25, 2 + + vadd.w vr5, vr4, vr26 + vadd.w vr13, vr12, vr27 + vadd.w vr6, vr3, vr28 + vadd.w vr14, vr11, vr29 + vsub.w vr7, vr2, vr30 + vsub.w vr15, vr10, vr31 + vsub.w vr0, vr24, vr1 + vsub.w vr8, vr25, vr9 + + LSX_SUMSUB_W vr1, vr9, vr30, vr31, vr20, vr21, vr0, vr8 + LSX_SUMSUB_W vr2, vr10, vr28, vr29, vr22, vr23, vr7, vr15 + LSX_SUMSUB_W vr3, vr11, vr26, vr27, vr16, vr17, vr6, vr14 + LSX_SUMSUB_W vr4, vr12, vr24, vr25, vr18, vr19, vr5, vr13 + + LSX_TRANSPOSE4x4_W vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr20, vr22 + LSX_TRANSPOSE4x4_W vr9, vr10, vr11, vr12, vr20, vr22, vr16, vr18, vr1, vr2 + LSX_TRANSPOSE4x4_W vr24, vr26, vr28, vr30, vr13, vr14, vr15, vr8, vr21, vr23 + LSX_TRANSPOSE4x4_W vr25, vr27, vr29, vr31, vr21, vr23, vr17, vr19, vr24, vr26 + + vsrai.h vr3, vr7, 1 + vsrai.h vr11, vr15, 1 + vsrai.h vr4, vr16, 1 + vsrai.h vr12, vr17, 1 + + vaddwev.w.h vr1, vr5, vr20 + vaddwev.w.h vr9, vr13, vr21 + vsubwev.w.h vr2, vr5, vr20 + vsubwev.w.h vr10, vr13, vr21 + vsubwev.w.h vr3, vr3, vr16 + vsubwev.w.h vr11, vr11, vr17 + vaddwev.w.h vr4, vr4, vr7 + vaddwev.w.h vr12, vr12, vr15 + + LSX_SUMSUB_W vr24, vr25, vr30, vr31, vr1, vr9, vr4, vr12 + LSX_SUMSUB_W vr26, vr27, vr28, vr29, vr2, vr10, vr3, vr11 + + vsrai.h vr1, vr6, 1 + vsrai.h vr9, vr14, 1 + vsrai.h vr2, vr0, 1 + vsrai.h vr10, vr8, 1 + vsrai.h vr3, vr22, 1 + vsrai.h vr11, vr23, 1 + vsrai.h vr4, vr18, 1 + vsrai.h vr12, vr19, 1 + + vaddwev.w.h vr5, vr22, vr0 + vaddwev.w.h vr13, vr23, vr8 + vsubwev.w.h vr20, vr22, vr0 + vsubwev.w.h vr21, vr23, vr8 + vaddwev.w.h vr7, vr18, vr6 + vaddwev.w.h vr15, vr19, vr14 + vsubwev.w.h vr16, vr18, vr6 + vsubwev.w.h vr17, vr19, vr14 + + vaddwev.w.h vr4, vr18, vr4 + vaddwev.w.h vr12, vr19, vr12 + vsub.w vr20, vr20, vr4 + vsub.w vr21, vr21, vr12 + vaddwev.w.h vr2, vr0, vr2 + vaddwev.w.h vr10, vr8, vr10 + vsub.w vr7, vr7, vr2 + vsub.w vr15, vr15, vr10 + vaddwev.w.h vr3, vr22, vr3 + vaddwev.w.h vr11, vr23, vr11 + vadd.w vr16, vr16, vr3 + vadd.w vr17, vr17, vr11 + vaddwev.w.h vr1, vr6, vr1 + vaddwev.w.h vr9, vr14, vr9 + vadd.w vr5, vr5, vr1 + vadd.w vr13, vr13, vr9 + + vsrai.w vr1, vr20, 2 + vsrai.w vr9, vr21, 2 + vsrai.w vr2, vr7, 2 + vsrai.w vr10, vr15, 2 + vsrai.w vr3, vr16, 2 + vsrai.w vr11, vr17, 2 + vsrai.w vr4, vr5, 2 + vsrai.w vr12, vr13, 2 + + vadd.w vr20, vr4, vr20 + vadd.w vr21, vr12, vr21 + vadd.w vr22, vr7, vr3 + vadd.w vr23, vr15, vr11 + vsub.w vr16, vr2, vr16 + vsub.w vr17, vr10, vr17 + vsub.w vr18, vr5, vr1 + vsub.w vr19, vr13, vr9 + + LSX_SUMSUB_W vr1, vr9, vr0, vr8, 
vr24, vr25, vr18, vr19 + LSX_SUMSUB_W vr2, vr10, vr7, vr15, vr26, vr27, vr16, vr17 + LSX_SUMSUB_W vr3, vr11, vr6, vr14, vr28, vr29, vr22, vr23 + LSX_SUMSUB_W vr4, vr12, vr5, vr13, vr30, vr31, vr20, vr21 + + vsrai.w vr24, vr1, 6 + vsrai.w vr25, vr9, 6 + vsrai.w vr26, vr2, 6 + vsrai.w vr27, vr10, 6 + vsrai.w vr28, vr3, 6 + vsrai.w vr29, vr11, 6 + vsrai.w vr30, vr4, 6 + vsrai.w vr31, vr12, 6 + vsrai.w vr20, vr5, 6 + vsrai.w vr21, vr13, 6 + vsrai.w vr22, vr6, 6 + vsrai.w vr23, vr14, 6 + vsrai.w vr16, vr7, 6 + vsrai.w vr17, vr15, 6 + vsrai.w vr18, vr0, 6 + vsrai.w vr19, vr8, 6 + + fld.d f1, a0, 0 + fld.d f2, a0, FDEC_STRIDE + fld.d f3, a0, FDEC_STRIDE * 2 + fld.d f4, a0, FDEC_STRIDE * 3 + + fld.d f5, a0, FDEC_STRIDE * 4 + fld.d f6, a0, FDEC_STRIDE * 5 + fld.d f7, a0, FDEC_STRIDE * 6 + fld.d f8, a0, FDEC_STRIDE * 7 + + vsllwil.hu.bu vr1, vr1, 0 + vexth.wu.hu vr9, vr1 + vsllwil.wu.hu vr1, vr1, 0 + + vsllwil.hu.bu vr2, vr2, 0 + vexth.wu.hu vr10, vr2 + vsllwil.wu.hu vr2, vr2, 0 + + vsllwil.hu.bu vr3, vr3, 0 + vexth.wu.hu vr11, vr3 + vsllwil.wu.hu vr3, vr3, 0 + + vsllwil.hu.bu vr4, vr4, 0 + vexth.wu.hu vr12, vr4 + vsllwil.wu.hu vr4, vr4, 0 + + vsllwil.hu.bu vr5, vr5, 0 + vexth.wu.hu vr13, vr5 + vsllwil.wu.hu vr5, vr5, 0 + + vsllwil.hu.bu vr6, vr6, 0 + vexth.wu.hu vr14, vr6 + vsllwil.wu.hu vr6, vr6, 0 + + vsllwil.hu.bu vr7, vr7, 0 + vexth.wu.hu vr15, vr7 + vsllwil.wu.hu vr7, vr7, 0 + + vsllwil.hu.bu vr8, vr8, 0 + vexth.wu.hu vr0, vr8 + vsllwil.wu.hu vr8, vr8, 0 + + vadd.w vr1, vr1, vr24 + vadd.w vr9, vr9, vr25 + vadd.w vr2, vr2, vr26 + vadd.w vr10, vr10, vr27 + vadd.w vr3, vr3, vr28 + vadd.w vr11, vr11, vr29 + vadd.w vr4, vr4, vr30 + vadd.w vr12, vr12, vr31 + vadd.w vr5, vr5, vr20 + vadd.w vr13, vr13, vr21 + vadd.w vr6, vr6, vr22 + vadd.w vr14, vr14, vr23 + vadd.w vr7, vr7, vr16 + vadd.w vr15, vr15, vr17 + vadd.w vr8, vr8, vr18 + vadd.w vr0, vr0, vr19 + + vssrarni.hu.w vr2, vr1, 0 + vssrarni.hu.w vr10, vr9, 0 + vssrarni.hu.w vr4, vr3, 0 + vssrarni.hu.w vr12, vr11, 0 + vssrarni.hu.w vr6, vr5, 0 + vssrarni.hu.w vr14, vr13, 0 + vssrarni.hu.w vr8, vr7, 0 + vssrarni.hu.w vr0, vr15, 0 + + vpermi.w vr20, vr10, 0x0E + vpermi.w vr10, vr2, 0x44 + vpermi.w vr20, vr2, 0x4E + + vpermi.w vr21, vr12, 0x0E + vpermi.w vr12, vr4, 0x44 + vpermi.w vr21, vr4, 0x4E + + vpermi.w vr22, vr14, 0x0E + vpermi.w vr14, vr6, 0x44 + vpermi.w vr22, vr6, 0x4E + + vpermi.w vr23, vr0, 0x0E + vpermi.w vr0, vr8, 0x44 + vpermi.w vr23, vr8, 0x4E + + vssrlni.bu.h vr12, vr10, 0 + vssrlni.bu.h vr21, vr20, 0 + vssrlni.bu.h vr0, vr14, 0 + vssrlni.bu.h vr23, vr22, 0 + + vstelm.d vr12, a0, 0, 0 + vstelm.d vr21, a0, FDEC_STRIDE, 0 + vstelm.d vr12, a0, FDEC_STRIDE * 2, 1 + vstelm.d vr21, a0, FDEC_STRIDE * 3, 1 + + vstelm.d vr0, a0, FDEC_STRIDE * 4, 0 + vstelm.d vr23, a0, FDEC_STRIDE * 5, 0 + vstelm.d vr0, a0, FDEC_STRIDE * 6, 1 + vstelm.d vr23, a0, FDEC_STRIDE * 7, 1 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc_x264 + +.macro add8x4_idct_dc_lasx + xvldrepl.h xr11, a1, 0 + xvldrepl.h xr12, a1, 2 + xvilvl.d xr12, xr12, xr11 + xvsrari.h xr12, xr12, 6 + + fld.d f0, a0, 0 + fld.d f1, a0, FDEC_STRIDE + fld.d f2, a0, FDEC_STRIDE * 2 + fld.d f3, a0, FDEC_STRIDE * 3 + + xvinsve0.d xr0, xr1, 1 + xvinsve0.d xr2, xr3, 1 + + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr2, xr2 + + xvadd.h xr0, xr0, xr12 + xvadd.h xr2, xr2, xr12 + xvssrarni.bu.h xr2, xr0, 0 + + xvstelm.d xr2, a0, 0, 0 + xvstelm.d xr2, a0, FDEC_STRIDE, 2 + 
xvstelm.d xr2, a0, FDEC_STRIDE * 2, 1 + xvstelm.d xr2, a0, FDEC_STRIDE * 3, 3 +.endm + +.macro add8x4_idct_dc_lsx + vldrepl.h vr11, a1, 0 + vldrepl.h vr12, a1, 2 + vilvl.d vr12, vr12, vr11 + vsrari.h vr12, vr12, 6 + + fld.d f0, a0, 0 + fld.d f1, a0, FDEC_STRIDE + fld.d f2, a0, FDEC_STRIDE * 2 + fld.d f3, a0, FDEC_STRIDE * 3 + + vsllwil.hu.bu vr0, vr0, 0 + vsllwil.hu.bu vr1, vr1, 0 + vsllwil.hu.bu vr2, vr2, 0 + vsllwil.hu.bu vr3, vr3, 0 + + vadd.h vr0, vr0, vr12 + vadd.h vr1, vr1, vr12 + vadd.h vr2, vr2, vr12 + vadd.h vr3, vr3, vr12 + vssrarni.bu.h vr2, vr0, 0 + vssrarni.bu.h vr3, vr1, 0 + + vstelm.d vr2, a0, 0, 0 + vstelm.d vr3, a0, FDEC_STRIDE, 0 + vstelm.d vr2, a0, FDEC_STRIDE * 2, 1 + vstelm.d vr3, a0, FDEC_STRIDE * 3, 1 +.endm +/* + * void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] ) + */ +function_x264 add8x8_idct_dc_lasx + add8x4_idct_dc_lasx + + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 4 + add8x4_idct_dc_lasx +endfunc_x264 + +function_x264 add8x8_idct_dc_lsx + add8x4_idct_dc_lsx + + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 4 + add8x4_idct_dc_lsx +endfunc_x264 + +.macro add_16x16_idct_dc_core_lasx a0, a1 + vldrepl.h vr11, \a1, 0 + vldrepl.h vr12, \a1, 2 + vldrepl.h vr13, \a1, 4 + vldrepl.h vr14, \a1, 6 + + xvinsve0.d xr11, xr12, 1 + xvinsve0.d xr11, xr13, 2 + xvinsve0.d xr11, xr14, 3 + + xvsrari.h xr11, xr11, 6 + + vld vr0, \a0, 0 + vld vr1, \a0, FDEC_STRIDE + vld vr2, \a0, FDEC_STRIDE * 2 + vld vr3, \a0, FDEC_STRIDE * 3 + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + xvadd.h xr0, xr0, xr11 + xvadd.h xr1, xr1, xr11 + xvadd.h xr2, xr2, xr11 + xvadd.h xr3, xr3, xr11 + xvssrarni.bu.h xr1, xr0, 0 + xvssrarni.bu.h xr3, xr2, 0 + xvpermi.d xr4, xr1, 0xD8 + xvpermi.d xr5, xr1, 0x8D + xvpermi.d xr6, xr3, 0xD8 + xvpermi.d xr7, xr3, 0x8D + vst vr4, \a0, 0 + vst vr5, \a0, FDEC_STRIDE + vst vr6, \a0, FDEC_STRIDE * 2 + vst vr7, \a0, FDEC_STRIDE * 3 +.endm + +/* + * void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] ) + */ +function_x264 add16x16_idct_dc_lasx + add_16x16_idct_dc_core_lasx a0, a1 + + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 8 + add_16x16_idct_dc_core_lasx a0, a1 + + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 8 + add_16x16_idct_dc_core_lasx a0, a1 + + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 8 + add_16x16_idct_dc_core_lasx a0, a1 +endfunc_x264 + +.macro add_16x16_idct_dc_core_lsx a0, a1 + vldrepl.h vr11, \a1, 0 + vldrepl.h vr12, \a1, 2 + vldrepl.h vr13, \a1, 4 + vldrepl.h vr14, \a1, 6 + + vpermi.w vr12, vr11, 0x44 + vpermi.w vr14, vr13, 0x44 + vsrari.h vr12, vr12, 6 + vsrari.h vr14, vr14, 6 + + vld vr0, \a0, 0 + vld vr1, \a0, FDEC_STRIDE + vld vr2, \a0, FDEC_STRIDE * 2 + vld vr3, \a0, FDEC_STRIDE * 3 + + vexth.hu.bu vr5, vr0 + vsllwil.hu.bu vr0, vr0, 0 + vexth.hu.bu vr6, vr1 + vsllwil.hu.bu vr1, vr1, 0 + vexth.hu.bu vr7, vr2 + vsllwil.hu.bu vr2, vr2, 0 + vexth.hu.bu vr8, vr3 + vsllwil.hu.bu vr3, vr3, 0 + + vadd.h vr0, vr0, vr12 + vadd.h vr5, vr5, vr14 + vadd.h vr1, vr1, vr12 + vadd.h vr6, vr6, vr14 + vadd.h vr2, vr2, vr12 + vadd.h vr7, vr7, vr14 + vadd.h vr3, vr3, vr12 + vadd.h vr8, vr8, vr14 + + vssrarni.bu.h vr1, vr0, 0 + vssrarni.bu.h vr6, vr5, 0 + vssrarni.bu.h vr3, vr2, 0 + vssrarni.bu.h vr8, vr7, 0 + + vpermi.w vr9, vr6, 0x0E + vpermi.w vr6, vr1, 0x44 + vpermi.w vr9, vr1, 0x4E + vpermi.w vr10, vr8, 0x0E + vpermi.w vr8, vr3, 0x44 + vpermi.w vr10, vr3, 0x4E + + vst vr6, \a0, 0 + vst vr9, \a0, FDEC_STRIDE + vst vr8, \a0, FDEC_STRIDE * 2 + vst vr10, \a0, FDEC_STRIDE * 3 +.endm + 
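+/*
+ * LSX variant of add16x16_idct_dc: the 16x16 block is processed in four
+ * 16x4 passes of add_16x16_idct_dc_core_lsx, advancing p_dst by
+ * FDEC_STRIDE * 4 and dct by four coefficients per pass.  For reference,
+ * the scalar behaviour is roughly the following sketch (assuming the
+ * usual x264_clip_pixel() clamp to the pixel range):
+ *
+ *   for( int y = 0; y < 16; y++ )
+ *       for( int x = 0; x < 16; x++ )
+ *           p_dst[x + y*FDEC_STRIDE] =
+ *               x264_clip_pixel( p_dst[x + y*FDEC_STRIDE] + ((dct[(y/4)*4 + x/4] + 32) >> 6) );
+ */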
+function_x264 add16x16_idct_dc_lsx + add_16x16_idct_dc_core_lsx a0, a1 + + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 8 + add_16x16_idct_dc_core_lsx a0, a1 + + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 8 + add_16x16_idct_dc_core_lsx a0, a1 + + addi.d a0, a0, FDEC_STRIDE * 4 + addi.d a1, a1, 8 + add_16x16_idct_dc_core_lsx a0, a1 +endfunc_x264 + +/* + * void idct4x4dc( dctcoef d[16] ) + */ +function_x264 idct4x4dc_lasx + la.local t0, last64_shuf + xvld xr0, a0, 0 + xvld xr20, t0, 0 + xvshuf4i.b xr1, xr0, 0x4E + xvhaddw.w.h xr2, xr0, xr0 + xvhsubw.w.h xr3, xr1, xr1 + xvshuf4i.h xr2, xr2, 0x4E + xvshuf4i.h xr3, xr3, 0x4E + xvhaddw.d.w xr4, xr2, xr2 + xvhsubw.d.w xr5, xr2, xr2 + xvhsubw.d.w xr6, xr3, xr3 + xvhaddw.d.w xr7, xr3, xr3 + xvpickev.w xr8, xr5, xr4 + xvpickev.w xr9, xr7, xr6 + xvpickev.h xr10, xr9, xr8 + xvperm.w xr10, xr10, xr20 + xvshuf4i.b xr11, xr10, 0x4E + xvhaddw.w.h xr12, xr10, xr10 + xvhsubw.w.h xr13, xr11, xr11 + xvshuf4i.h xr12, xr12, 0x4E + xvshuf4i.h xr13, xr13, 0x4E + xvhaddw.d.w xr14, xr12, xr12 + xvhsubw.d.w xr15, xr12, xr12 + xvhsubw.d.w xr16, xr13, xr13 + xvhaddw.d.w xr17, xr13, xr13 + xvpackev.w xr18, xr15, xr14 + xvpackev.w xr19, xr17, xr16 + xvilvl.d xr0, xr19, xr18 + xvilvh.d xr1, xr19, xr18 + xvpickev.h xr2, xr1, xr0 + xvst xr2, a0, 0 +endfunc_x264 + +function_x264 idct4x4dc_lsx + vld vr0, a0, 0 + vld vr20, a0, 16 + + vshuf4i.b vr1, vr0, 0x4E + vshuf4i.b vr11, vr20, 0x4E + vhaddw.w.h vr2, vr0, vr0 + vhaddw.w.h vr12, vr20, vr20 + vhsubw.w.h vr3, vr1, vr1 + vhsubw.w.h vr13, vr11, vr11 + vshuf4i.h vr2, vr2, 0x4E + vshuf4i.h vr12, vr12, 0x4E + vshuf4i.h vr3, vr3, 0x4E + vshuf4i.h vr13, vr13, 0x4E + + vhaddw.d.w vr4, vr2, vr2 + vhaddw.d.w vr14, vr12, vr12 + vhsubw.d.w vr5, vr2, vr2 + vhsubw.d.w vr15, vr12, vr12 + vhsubw.d.w vr6, vr3, vr3 + vhsubw.d.w vr16, vr13, vr13 + vhaddw.d.w vr7, vr3, vr3 + vhaddw.d.w vr17, vr13, vr13 + + vpickev.w vr8, vr5, vr4 + vpickev.w vr18, vr15, vr14 + vpickev.w vr9, vr7, vr6 + vpickev.w vr19, vr17, vr16 + vpickev.h vr10, vr9, vr8 + vpickev.h vr21, vr19, vr18 + + vpermi.w vr22, vr21, 0x0E + vpermi.w vr21, vr10, 0x44 + vpermi.w vr22, vr10, 0x4E + vpermi.w vr21, vr21, 0xD8 + vpermi.w vr22, vr22, 0xD8 + + vshuf4i.b vr11, vr21, 0x4E + vshuf4i.b vr12, vr22, 0x4E + vhaddw.w.h vr21, vr21, vr21 + vhaddw.w.h vr22, vr22, vr22 + vhsubw.w.h vr11, vr11, vr11 + vhsubw.w.h vr12, vr12, vr12 + vshuf4i.h vr21, vr21, 0x4E + vshuf4i.h vr22, vr22, 0x4E + vshuf4i.h vr11, vr11, 0x4E + vshuf4i.h vr12, vr12, 0x4E + + vhaddw.d.w vr13, vr21, vr21 + vhaddw.d.w vr14, vr22, vr22 + vhsubw.d.w vr15, vr21, vr21 + vhsubw.d.w vr16, vr22, vr22 + vhsubw.d.w vr17, vr11, vr11 + vhsubw.d.w vr18, vr12, vr12 + vhaddw.d.w vr19, vr11, vr11 + vhaddw.d.w vr20, vr12, vr12 + + vpackev.w vr7, vr15, vr13 + vpackev.w vr8, vr16, vr14 + vpackev.w vr9, vr19, vr17 + vpackev.w vr10, vr20, vr18 + vilvl.d vr0, vr9, vr7 + vilvl.d vr4, vr10, vr8 + vilvh.d vr1, vr9, vr7 + vilvh.d vr5, vr10, vr8 + + vpickev.h vr2, vr1, vr0 + vpickev.h vr3, vr5, vr4 + vst vr2, a0, 0 + vst vr3, a0, 16 +endfunc_x264 + +/* + * void dct4x4dc( dctcoef d[16] ) + */ +function_x264 dct4x4dc_lasx + la.local t0, last64_shuf + xvld xr0, a0, 0 + xvld xr20, t0, 0 + xvshuf4i.b xr1, xr0, 0x4E + xvhaddw.w.h xr2, xr0, xr0 + xvhsubw.w.h xr3, xr1, xr1 + xvshuf4i.h xr2, xr2, 0x4E + xvshuf4i.h xr3, xr3, 0x4E + xvhaddw.d.w xr4, xr2, xr2 + xvhsubw.d.w xr5, xr2, xr2 + xvhsubw.d.w xr6, xr3, xr3 + xvhaddw.d.w xr7, xr3, xr3 + xvpickev.w xr8, xr5, xr4 + xvpickev.w xr9, xr7, xr6 + xvpickev.h xr10, xr9, xr8 + xvperm.w xr10, xr10, xr20 
+ xvshuf4i.b xr11, xr10, 0x4E + xvhaddw.w.h xr12, xr10, xr10 + xvhsubw.w.h xr13, xr11, xr11 + xvshuf4i.h xr12, xr12, 0x4E + xvshuf4i.h xr13, xr13, 0x4E + xvhaddw.d.w xr14, xr12, xr12 + xvhsubw.d.w xr15, xr12, xr12 + xvhsubw.d.w xr16, xr13, xr13 + xvhaddw.d.w xr17, xr13, xr13 + xvpackev.w xr18, xr15, xr14 + xvpackev.w xr19, xr17, xr16 + xvsrari.w xr18, xr18, 1 + xvsrari.w xr19, xr19, 1 + xvilvl.d xr0, xr19, xr18 + xvilvh.d xr1, xr19, xr18 + xvpickev.h xr2, xr1, xr0 + xvst xr2, a0, 0 +endfunc_x264 + +function_x264 dct4x4dc_lsx + vld vr0, a0, 0 + vld vr20, a0, 16 + + vshuf4i.b vr1, vr0, 0x4E + vshuf4i.b vr11, vr20, 0x4E + vhaddw.w.h vr2, vr0, vr0 + vhaddw.w.h vr12, vr20, vr20 + vhsubw.w.h vr3, vr1, vr1 + vhsubw.w.h vr13, vr11, vr11 + vshuf4i.h vr2, vr2, 0x4E + vshuf4i.h vr12, vr12, 0x4E + vshuf4i.h vr3, vr3, 0x4E + vshuf4i.h vr13, vr13, 0x4E + + vhaddw.d.w vr4, vr2, vr2 + vhaddw.d.w vr14, vr12, vr12 + vhsubw.d.w vr5, vr2, vr2 + vhsubw.d.w vr15, vr12, vr12 + vhsubw.d.w vr6, vr3, vr3 + vhsubw.d.w vr16, vr13, vr13 + vhaddw.d.w vr7, vr3, vr3 + vhaddw.d.w vr17, vr13, vr13 + + vpickev.w vr8, vr5, vr4 + vpickev.w vr18, vr15, vr14 + vpickev.w vr9, vr7, vr6 + vpickev.w vr19, vr17, vr16 + vpickev.h vr10, vr9, vr8 + vpickev.h vr21, vr19, vr18 + + vpermi.w vr22, vr21, 0x0E + vpermi.w vr21, vr10, 0x44 + vpermi.w vr22, vr10, 0x4E + vpermi.w vr21, vr21, 0xD8 + vpermi.w vr22, vr22, 0xD8 + + vshuf4i.b vr11, vr21, 0x4E + vshuf4i.b vr12, vr22, 0x4E + vhaddw.w.h vr21, vr21, vr21 + vhaddw.w.h vr22, vr22, vr22 + vhsubw.w.h vr11, vr11, vr11 + vhsubw.w.h vr12, vr12, vr12 + vshuf4i.h vr21, vr21, 0x4E + vshuf4i.h vr22, vr22, 0x4E + vshuf4i.h vr11, vr11, 0x4E + vshuf4i.h vr12, vr12, 0x4E + + vhaddw.d.w vr13, vr21, vr21 + vhaddw.d.w vr14, vr22, vr22 + vhsubw.d.w vr15, vr21, vr21 + vhsubw.d.w vr16, vr22, vr22 + vhsubw.d.w vr17, vr11, vr11 + vhsubw.d.w vr18, vr12, vr12 + vhaddw.d.w vr19, vr11, vr11 + vhaddw.d.w vr20, vr12, vr12 + + vpackev.w vr7, vr15, vr13 + vpackev.w vr8, vr16, vr14 + vpackev.w vr9, vr19, vr17 + vpackev.w vr10, vr20, vr18 + + vsrari.w vr7, vr7, 1 + vsrari.w vr8, vr8, 1 + vsrari.w vr9, vr9, 1 + vsrari.w vr10, vr10, 1 + + vilvl.d vr0, vr9, vr7 + vilvl.d vr4, vr10, vr8 + vilvh.d vr1, vr9, vr7 + vilvh.d vr10, vr10, vr8 + vpickev.h vr2, vr1, vr0 + vpickev.h vr3, vr10, vr4 + vst vr2, a0, 0 + vst vr3, a0, 16 +endfunc_x264 + +.macro LSX_LOAD_PIX_2 data1, data2 + vld vr0, a1, 0 + vld vr1, a1, FENC_STRIDE + vld vr2, a2, 0 + vld vr3, a2, FDEC_STRIDE + + vilvl.b vr0, vr8, vr0 + vilvl.b vr1, vr8, vr1 + vilvl.b vr2, vr8, vr2 + vilvl.b vr3, vr8, vr3 + + vsub.h \data1, vr0, vr2 + vsub.h \data2, vr1, vr3 + addi.d a1, a1, FENC_STRIDE * 2 + addi.d a2, a2, FDEC_STRIDE * 2 +.endm + +.macro LSX_DCT8_1D + LSX_SUMSUB_H vr0, vr8, vr12, vr19 + LSX_SUMSUB_H vr1, vr9, vr13, vr18 + LSX_SUMSUB_H vr2, vr10, vr14, vr17 + LSX_SUMSUB_H vr3, vr11, vr15, vr16 + + LSX_SUMSUB_H vr4, vr6, vr0, vr3 + LSX_SUMSUB_H vr5, vr7, vr1, vr2 + + vsrai.h vr20, vr8, 1 + vadd.h vr20, vr20, vr9 + vadd.h vr20, vr20, vr10 + vadd.h vr0, vr20, vr8 + + vsrai.h vr20, vr10, 1 + vsub.h vr21, vr8, vr11 + vsub.h vr21, vr21, vr10 + vsub.h vr1, vr21, vr20 + + vsrai.h vr20, vr9, 1 + vadd.h vr21, vr8, vr11 + vsub.h vr21, vr21, vr9 + vsub.h vr2, vr21, vr20 + + vsrai.h vr20, vr11, 1 + vsub.h vr21, vr9, vr10 + vadd.h vr21, vr21, vr11 + vadd.h vr3, vr21, vr20 + + vadd.h vr12, vr4, vr5 + vsrai.h vr20, vr3, 2 + vadd.h vr13, vr0, vr20 + vsrai.h vr20, vr7, 1 + vadd.h vr14, vr6, vr20 + vsrai.h vr20, vr2, 2 + vadd.h vr15, vr1, vr20 + + vsub.h vr16, vr4, vr5 + vsrai.h vr20, vr1, 2 
+ vsub.h vr17, vr2, vr20 + vsrai.h vr20, vr6, 1 + vsub.h vr18, vr20, vr7 + vsrai.h vr20, vr0, 2 + vsub.h vr19, vr20, vr3 +.endm + +/* + * void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 ) + */ +function_x264 sub8x8_dct8_lsx + vxor.v vr8, vr0, vr0 + + // vr12 ... vr19 + LSX_LOAD_PIX_2 vr12, vr13 + LSX_LOAD_PIX_2 vr14, vr15 + LSX_LOAD_PIX_2 vr16, vr17 + LSX_LOAD_PIX_2 vr18, vr19 + + LSX_DCT8_1D + LSX_TRANSPOSE8x8_H vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ + vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + LSX_DCT8_1D + + vst vr12, a0, 0 + vst vr13, a0, 16 + vst vr14, a0, 32 + vst vr15, a0, 48 + vst vr16, a0, 64 + vst vr17, a0, 80 + vst vr18, a0, 96 + vst vr19, a0, 112 +endfunc_x264 + +.macro LASX_LOAD_PIX_2 data1, data2 + xvld xr0, a1, 0 + xvld xr1, a1, FENC_STRIDE + xvld xr2, a2, 0 + xvld xr3, a2, FDEC_STRIDE + + xvpermi.d xr0, xr0, 0x50 + xvpermi.d xr1, xr1, 0x50 + xvpermi.d xr2, xr2, 0x50 + xvpermi.d xr3, xr3, 0x50 + + xvxor.v xr4, xr0, xr0 + xvilvl.b xr0, xr4, xr0 + xvilvl.b xr1, xr4, xr1 + xvilvl.b xr2, xr4, xr2 + xvilvl.b xr3, xr4, xr3 + + xvsub.h \data1, xr0, xr2 + xvsub.h \data2, xr1, xr3 + addi.d a1, a1, FENC_STRIDE * 2 + addi.d a2, a2, FDEC_STRIDE * 2 +.endm + +.macro LASX_SUMSUB_H sum, diff, a, b + xvadd.h \sum, \a, \b + xvsub.h \diff, \a, \b +.endm + +.macro LASX_DCT8_1D + LASX_SUMSUB_H xr0, xr8, xr12, xr19 + LASX_SUMSUB_H xr1, xr9, xr13, xr18 + LASX_SUMSUB_H xr2, xr10, xr14, xr17 + LASX_SUMSUB_H xr3, xr11, xr15, xr16 + + LASX_SUMSUB_H xr4, xr6, xr0, xr3 + LASX_SUMSUB_H xr5, xr7, xr1, xr2 + + xvsrai.h xr20, xr8, 1 + xvadd.h xr20, xr20, xr9 + xvadd.h xr20, xr20, xr10 + xvadd.h xr0, xr20, xr8 + + xvsrai.h xr20, xr10, 1 + xvsub.h xr21, xr8, xr11 + xvsub.h xr21, xr21, xr10 + xvsub.h xr1, xr21, xr20 + + xvsrai.h xr20, xr9, 1 + xvadd.h xr21, xr8, xr11 + xvsub.h xr21, xr21, xr9 + xvsub.h xr2, xr21, xr20 + + xvsrai.h xr20, xr11, 1 + xvsub.h xr21, xr9, xr10 + xvadd.h xr21, xr21, xr11 + xvadd.h xr3, xr21, xr20 + + xvadd.h xr12, xr4, xr5 + xvsrai.h xr20, xr3, 2 + xvadd.h xr13, xr0, xr20 + xvsrai.h xr20, xr7, 1 + xvadd.h xr14, xr6, xr20 + xvsrai.h xr20, xr2, 2 + xvadd.h xr15, xr1, xr20 + + xvsub.h xr16, xr4, xr5 + xvsrai.h xr20, xr1, 2 + xvsub.h xr17, xr2, xr20 + xvsrai.h xr20, xr6, 1 + xvsub.h xr18, xr20, xr7 + xvsrai.h xr20, xr0, 2 + xvsub.h xr19, xr20, xr3 +.endm + +.macro SUB16x8_DCT8_LASX + LASX_LOAD_PIX_2 xr12, xr13 + LASX_LOAD_PIX_2 xr14, xr15 + LASX_LOAD_PIX_2 xr16, xr17 + LASX_LOAD_PIX_2 xr18, xr19 + + LASX_DCT8_1D + LASX_TRANSPOSE8x8_H xr12, xr13, xr14, xr15, xr16, xr17, xr18, xr19, \ + xr12, xr13, xr14, xr15, xr16, xr17, xr18, xr19, \ + xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7 + LASX_DCT8_1D + + xmov xr0, xr13 + xvpermi.q xr13, xr12, 0x20 + xvst xr13, a0, 0 + xmov xr1, xr15 + xvpermi.q xr15, xr14, 0x20 + xvst xr15, a0, 32 + xmov xr2, xr17 + xvpermi.q xr17, xr16, 0x20 + xvst xr17, a0, 64 + xmov xr3, xr19 + xvpermi.q xr19, xr18, 0x20 + xvst xr19, a0, 96 + + xvpermi.q xr12, xr0, 0x13 + xvpermi.q xr14, xr1, 0x13 + xvpermi.q xr16, xr2, 0x13 + xvpermi.q xr18, xr3, 0x13 + + xvst xr12, a0, 128 + xvst xr14, a0, 160 + xvst xr16, a0, 192 + xvst xr18, a0, 224 +.endm + +/* + * void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 ) + */ +function_x264 sub16x16_dct8_lasx + move t1, a1 + move t3, a2 + SUB16x8_DCT8_LASX + + addi.d a0, a0, 256 + addi.d a1, t1, FENC_STRIDE * 8 + addi.d a2, t3, FDEC_STRIDE * 8 + SUB16x8_DCT8_LASX +endfunc_x264 + + +.macro LSX_LOAD_PIX_22 data1, data2, data3, data4 + vld vr0, a1, 0 + vld vr4, 
a1, 16 + vld vr1, a1, FENC_STRIDE + vld vr5, a1, FENC_STRIDE + 16 + vld vr2, a2, 0 + vld vr6, a2, 16 + vld vr3, a2, FDEC_STRIDE + vld vr7, a2, FDEC_STRIDE + 16 + + vpermi.w vr8, vr0, 0x0E + vpermi.w vr0, vr0, 0x44 + vpermi.w vr8, vr8, 0x44 + vpermi.w vr9, vr1, 0x0E + vpermi.w vr1, vr1, 0x44 + vpermi.w vr9, vr9, 0x44 + vpermi.w vr10, vr2, 0x0E + vpermi.w vr2, vr2, 0x44 + vpermi.w vr10, vr10, 0x44 + vpermi.w vr11, vr3, 0x0E + vpermi.w vr3, vr3, 0x44 + vpermi.w vr11, vr11, 0x44 + + vxor.v vr30, vr0, vr0 + vxor.v vr31, vr8, vr8 + + vilvl.b vr0, vr30, vr0 + vilvl.b vr8, vr31, vr8 + vilvl.b vr1, vr30, vr1 + vilvl.b vr9, vr31, vr9 + vilvl.b vr2, vr30, vr2 + vilvl.b vr10, vr31, vr10 + vilvl.b vr3, vr30, vr3 + vilvl.b vr11, vr31, vr11 + + vsub.h \data1, vr0, vr2 + vsub.h \data3, vr8, vr10 + vsub.h \data2, vr1, vr3 + vsub.h \data4, vr9, vr11 + addi.d a1, a1, FENC_STRIDE * 2 + addi.d a2, a2, FDEC_STRIDE * 2 +.endm + +.macro SUB16x8_DCT8_LSX + LSX_LOAD_PIX_22 vr12, vr13, vr22, vr23 + LSX_LOAD_PIX_22 vr14, vr15, vr24, vr25 + LSX_LOAD_PIX_22 vr16, vr17, vr26, vr27 + LSX_LOAD_PIX_22 vr18, vr19, vr28, vr29 + + LSX_DCT8_1D + LSX_TRANSPOSE8x8_H vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ + vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + LSX_DCT8_1D + + vst vr12, a0, 0 + vst vr13, a0, 16 + vst vr14, a0, 32 + vst vr15, a0, 48 + vst vr16, a0, 64 + vst vr17, a0, 80 + vst vr18, a0, 96 + vst vr19, a0, 112 + + vmov vr12, vr22 + vmov vr13, vr23 + vmov vr14, vr24 + vmov vr15, vr25 + vmov vr16, vr26 + vmov vr17, vr27 + vmov vr18, vr28 + vmov vr19, vr29 + + LSX_DCT8_1D + LSX_TRANSPOSE8x8_H vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ + vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \ + vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7 + LSX_DCT8_1D + + vst vr12, a0, 128 + vst vr13, a0, 144 + vst vr14, a0, 160 + vst vr15, a0, 176 + vst vr16, a0, 192 + vst vr17, a0, 208 + vst vr18, a0, 224 + vst vr19, a0, 240 +.endm + +function_x264 sub16x16_dct8_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + move t1, a1 + move t3, a2 + SUB16x8_DCT8_LSX + + addi.d a0, a0, 256 + addi.d a1, t1, FENC_STRIDE * 8 + addi.d a2, t3, FDEC_STRIDE * 8 + SUB16x8_DCT8_LSX + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc_x264 + +/* + * void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] ) + */ +function_x264 zigzag_scan_4x4_frame_lasx + xvld xr1, a1, 0 + xvor.v xr2, xr1, xr1 + xvpermi.q xr2, xr2, 0x13 + xvpermi.q xr1, xr1, 0x02 + la.local t0, zigzag_scan4 + xvld xr3, t0, 0 + xvshuf.h xr3, xr2, xr1 + xvst xr3, a0, 0 +endfunc_x264 + +function_x264 zigzag_scan_4x4_frame_lsx + vld vr1, a1, 0 + vld vr2, a1, 16 + vor.v vr3, vr1, vr1 + vor.v vr4, vr2, vr2 + la.local t0, zigzag_scan4 + vld vr5, t0, 0 + vld vr6, t0, 16 + vshuf.h vr5, vr4, vr1 + vshuf.h vr6, vr4, vr1 + vst vr5, a0, 0 + vst vr6, a0, 16 +endfunc_x264 diff --git a/common/loongarch/dct.h b/common/loongarch/dct.h new file mode 100644 index 000000000..fad725eca --- /dev/null +++ b/common/loongarch/dct.h @@ -0,0 +1,95 @@ +/***************************************************************************** + * dct.h: loongarch transform and zigzag + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * 
Authors: Peng Zhou + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_LOONGARCH_DCT_H +#define X264_LOONGARCH_DCT_H + +#define x264_sub8x8_dct_lasx x264_template(sub8x8_dct_lasx) +void x264_sub8x8_dct_lasx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref ); +#define x264_sub16x16_dct_lasx x264_template(sub16x16_dct_lasx) +void x264_sub16x16_dct_lasx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref ); + +#define x264_sub8x8_dct8_lsx x264_template(sub8x8_dct8_lsx) +void x264_sub8x8_dct8_lsx( int16_t pi_dct[64], uint8_t *p_pix1, uint8_t *p_pix2 ); +#define x264_sub16x16_dct8_lasx x264_template(sub16x16_dct8_lasx) +void x264_sub16x16_dct8_lasx( int16_t pi_dct[4][64], uint8_t *p_pix1, + uint8_t *p_pix2 ); + +#define x264_add4x4_idct_lsx x264_template(add4x4_idct_lsx) +void x264_add4x4_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16] ); +#define x264_add8x8_idct_lasx x264_template(add8x8_idct_lasx) +void x264_add8x8_idct_lasx( uint8_t *p_dst, int16_t pi_dct[4][16] ); +#define x264_add16x16_idct_lasx x264_template(add16x16_idct_lasx) +void x264_add16x16_idct_lasx( uint8_t *p_dst, int16_t pi_dct[16][16] ); +#define x264_add8x8_idct8_lasx x264_template(add8x8_idct8_lasx) +void x264_add8x8_idct8_lasx( uint8_t *p_dst, int16_t pi_dct[64] ); +#define x264_add8x8_idct_dc_lasx x264_template(add8x8_idct_dc_lasx) +void x264_add8x8_idct_dc_lasx( uint8_t *p_dst, int16_t dct[4] ); +#define x264_add16x16_idct_dc_lasx x264_template(add16x16_idct_dc_lasx) +void x264_add16x16_idct_dc_lasx( uint8_t *p_dst, int16_t dct[16] ); + +#define x264_idct4x4dc_lasx x264_template(idct4x4dc_lasx) +void x264_idct4x4dc_lasx( int16_t d[16] ); +#define x264_dct4x4dc_lasx x264_template(dct4x4dc_lasx) +void x264_dct4x4dc_lasx( int16_t d[16] ); + +#define x264_zigzag_scan_4x4_frame_lasx x264_template(zigzag_scan_4x4_frame_lasx) +void x264_zigzag_scan_4x4_frame_lasx( int16_t level[16], int16_t dct[16] ); + +#define x264_sub4x4_dct_lsx x264_template(sub4x4_dct_lsx) +void x264_sub4x4_dct_lsx( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref ); +#define x264_sub8x8_dct_lsx x264_template(sub8x8_dct_lsx) +void x264_sub8x8_dct_lsx( int16_t p_dst[4][16], uint8_t *p_src, uint8_t *p_ref ); +#define x264_sub16x16_dct_lsx x264_template(sub16x16_dct_lsx) +void x264_sub16x16_dct_lsx( int16_t p_dst[16][16], uint8_t *p_src, uint8_t *p_ref ); + +#define x264_sub8x8_dct8_lsx x264_template(sub8x8_dct8_lsx) +void x264_sub8x8_dct8_lsx( int16_t pi_dct[64], uint8_t *p_pix1, uint8_t *p_pix2 ); +#define x264_sub16x16_dct8_lsx x264_template(sub16x16_dct8_lsx) +void x264_sub16x16_dct8_lsx( int16_t pi_dct[4][64], uint8_t *p_pix1, + uint8_t *p_pix2 ); + 
+#define x264_add4x4_idct_lsx x264_template(add4x4_idct_lsx) +void x264_add4x4_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16] ); +#define x264_add8x8_idct_lsx x264_template(add8x8_idct_lsx) +void x264_add8x8_idct_lsx( uint8_t *p_dst, int16_t pi_dct[4][16] ); +#define x264_add16x16_idct_lsx x264_template(add16x16_idct_lsx) +void x264_add16x16_idct_lsx( uint8_t *p_dst, int16_t pi_dct[16][16] ); +#define x264_add8x8_idct8_lsx x264_template(add8x8_idct8_lsx) +void x264_add8x8_idct8_lsx( uint8_t *p_dst, int16_t pi_dct[64] ); +#define x264_add8x8_idct_dc_lsx x264_template(add8x8_idct_dc_lsx) +void x264_add8x8_idct_dc_lsx( uint8_t *p_dst, int16_t dct[4] ); +#define x264_add16x16_idct_dc_lsx x264_template(add16x16_idct_dc_lsx) +void x264_add16x16_idct_dc_lsx( uint8_t *p_dst, int16_t dct[16] ); + +#define x264_idct4x4dc_lsx x264_template(idct4x4dc_lsx) +void x264_idct4x4dc_lsx( int16_t d[16] ); +#define x264_dct4x4dc_lsx x264_template(dct4x4dc_lsx) +void x264_dct4x4dc_lsx( int16_t d[16] ); + +#define x264_zigzag_scan_4x4_frame_lsx x264_template(zigzag_scan_4x4_frame_lsx) +void x264_zigzag_scan_4x4_frame_lsx( int16_t level[16], int16_t dct[16] ); + +#endif diff --git a/common/loongarch/deblock-a.S b/common/loongarch/deblock-a.S new file mode 100644 index 000000000..d13ea8304 --- /dev/null +++ b/common/loongarch/deblock-a.S @@ -0,0 +1,1618 @@ +/***************************************************************************** + * deblock-a.S: loongarch deblock functions + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Hao Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "loongson_asm.S" +#include "loongson_util.S" + +#if !HIGH_BIT_DEPTH + +const shuf_loc_locn +.byte 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, 4, 12, 20, 28 +.byte 16, 24, 0, 8, 17, 25, 1, 9, 18, 26, 2, 10, 19, 27, 3, 11 +endconst + +const shuf_locn +.byte 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 +endconst + +/*Transpose 16 * 6 block with byte elements in vectors*/ +.macro LASX_TRANSPOSE in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15,\ + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,\ + out0, out1, out2, out3, out4, out5 + xvilvl.b \tmp0, \in1, \in0 + xvilvl.b \tmp1, \in3, \in2 + xvilvl.b \tmp2, \in5, \in4 + xvilvl.b \tmp3, \in7, \in6 + xvilvl.b \tmp4, \in9, \in8 + xvilvl.b \tmp5, \in11, \in10 + xvilvl.b \tmp6, \in13, \in12 + xvilvl.b \tmp7, \in15, \in14 + xvpermi.d \tmp0, \tmp0, 0xD8 + xvpermi.d \tmp1, \tmp1, 0xD8 + xvpermi.d \tmp2, \tmp2, 0xD8 + xvpermi.d \tmp3, \tmp3, 0xD8 + xvpermi.d \tmp4, \tmp4, 0xD8 + xvpermi.d \tmp5, \tmp5, 0xD8 + xvpermi.d \tmp6, \tmp6, 0xD8 + xvpermi.d \tmp7, \tmp7, 0xD8 + xvilvl.h \out0, \tmp1, \tmp0 + xvilvl.h \out1, \tmp3, \tmp2 + xvilvl.h \out2, \tmp5, \tmp4 + xvilvl.h \out3, \tmp7, \tmp6 + xvilvl.w \tmp0, \out1, \out0 + xvilvh.w \tmp1, \out1, \out0 + xvilvl.w \tmp2, \out3, \out2 + xvilvh.w \tmp3, \out3, \out2 + xvilvl.d \out0, \tmp2, \tmp0 + xvilvh.d \out1, \tmp2, \tmp0 + xvilvl.d \out2, \tmp3, \tmp1 + xvilvh.d \out3, \tmp3, \tmp1 + xvpermi.d \out4, \out0, 0x4E + xvpermi.d \out5, \out1, 0x4E +.endm + +/* + * void deblock_h_luma_lasx(Pixel *pix, intptr_t stride, int alpha, + * int beta, int8_t *tc0) + */ +function_x264 deblock_h_luma_lasx + slli.d t0, a1, 1 + slli.d t2, a1, 2 + + xvldrepl.w xr1, a4, 0 + add.d t1, t0, a1 + xvreplgr2vr.b xr2, a3 + xvilvl.b xr1, xr1, xr1 + + // Store registers to the stack + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + // Load data from pix + addi.d t4, a0, -3 + FLDD_LOADX_4 t4, a1, t0, t1, f10, f11, f12, f13 + add.d t5, t4, t2 + FLDD_LOADX_4 t5, a1, t0, t1, f14, f15, f16, f17 + add.d t5, t5, t2 + FLDD_LOADX_4 t5, a1, t0, t1, f20, f21, f22, f23 + add.d t6, t5, t2 + FLDD_LOADX_4 t6, a1, t0, t1, f24, f25, f26, f27 + + LASX_TRANSPOSE xr10, xr11, xr12, xr13, xr14, xr15, xr16, xr17, \ + xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27, \ + xr8, xr9, xr18, xr19, xr28, xr29, xr30, xr31, \ + xr10, xr11, xr12, xr13, xr14, xr15 + + xvilvl.h xr1, xr1, xr1 + vext2xv.hu.bu xr20, xr10 + vext2xv.hu.bu xr21, xr11 + vext2xv.hu.bu xr22, xr12 + vext2xv.hu.bu xr23, xr13 + vext2xv.hu.bu xr24, xr14 + vext2xv.hu.bu xr25, xr15 + vext2xv.h.b xr3, xr1 + + xvadd.h xr26, xr22, xr23 + xvsrari.h xr26, xr26, 1 + xvneg.h xr4, xr3 + xvadd.h xr27, xr20, xr26 + xvadd.h xr28, xr25, xr26 + xvsub.h xr29, xr23, xr22 + xvsrai.h xr27, xr27, 1 + xvsrai.h xr28, xr28, 1 + xvslli.h xr29, xr29, 2 + xvsub.h xr30, xr21, xr24 + xvsub.h xr27, xr27, xr21 + xvsub.h xr28, xr28, xr24 + xvadd.h xr29, xr29, xr30 + xvclip.h xr27, xr27, xr4, xr3 + xvclip.h xr28, xr28, xr4, xr3 + + xvpickev.b xr16, xr25, xr20 + xvpickev.b xr17, xr23, xr22 + xvabsd.bu xr5, xr16, xr17 + xvaddi.hu xr6, xr3, 1 + xvslt.bu xr5, xr5, xr2 + xvilvl.b xr30, xr5, xr5 + xvilvh.b xr31, xr5, xr5 + xvbitsel.v xr3, xr3, xr6, xr30 + + xvsrari.h xr29, xr29, 3 + xvaddi.hu xr6, xr3, 1 + xvbitsel.v xr3, xr3, xr6, xr31 + xvneg.h xr4, xr3 + + 
xvclip.h xr29, xr29, xr4, xr3 + xvadd.h xr30, xr21, xr27 + xvadd.h xr18, xr24, xr28 + xvadd.h xr19, xr22, xr29 + xvsub.h xr26, xr23, xr29 + xvssrarni.bu.h xr26, xr19, 0 + + xvpickev.b xr25, xr18, xr30 + xvpickev.b xr27, xr24, xr21 + xvpickev.b xr28, xr23, xr22 + xvpickev.b xr18, xr22, xr21 + + xvabsd.bu xr19, xr18, xr17 + xvreplgr2vr.b xr30, a2 + xvilvl.d xr31, xr30, xr2 + xvabsd.bu xr20, xr14, xr13 + xvslt.bu xr19, xr19, xr31 + xvslt.bu xr20, xr20, xr2 + + xvbitsel.v xr25, xr27, xr25, xr5 + xvpermi.d xr20, xr20, 0x50 + xvand.v xr21, xr20, xr19 + xvpermi.d xr7, xr21, 0xB1 + xvand.v xr21, xr21, xr7 + xvbitsel.v xr25, xr27, xr25, xr21 + xvpermi.d xr1, xr1, 0x50 + xvbitsel.v xr26, xr28, xr26, xr21 + xvslti.b xr30, xr1, 0 + xvbitsel.v xr25, xr25, xr27, xr30 + xvbitsel.v xr26, xr26, xr28, xr30 + + xvilvl.b xr10, xr26, xr25 + xvilvh.b xr20, xr25, xr26 + xvilvl.h xr21, xr20, xr10 + xvilvh.h xr22, xr20, xr10 + + // Store data to pix + addi.d t5, a0, -2 + xvstelm.w xr21, t5, 0, 0 + add.d t5, t5, a1 + xvstelm.w xr21, t5, 0, 1 + add.d t5, t5, a1 + xvstelm.w xr21, t5, 0, 2 + add.d t5, t5, a1 + xvstelm.w xr21, t5, 0, 3 + add.d t5, t5, a1 + xvstelm.w xr22, t5, 0, 0 + add.d t5, t5, a1 + xvstelm.w xr22, t5, 0, 1 + add.d t5, t5, a1 + xvstelm.w xr22, t5, 0, 2 + add.d t5, t5, a1 + xvstelm.w xr22, t5, 0, 3 + add.d t5, t5, a1 + xvstelm.w xr21, t5, 0, 4 + add.d t5, t5, a1 + xvstelm.w xr21, t5, 0, 5 + add.d t5, t5, a1 + xvstelm.w xr21, t5, 0, 6 + add.d t5, t5, a1 + xvstelm.w xr21, t5, 0, 7 + add.d t5, t5, a1 + xvstelm.w xr22, t5, 0, 4 + add.d t5, t5, a1 + xvstelm.w xr22, t5, 0, 5 + add.d t5, t5, a1 + xvstelm.w xr22, t5, 0, 6 + add.d t5, t5, a1 + xvstelm.w xr22, t5, 0, 7 + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc_x264 + +/* + * void deblock_v_luma_lasx(Pixel *pix, intptr_t stride, + * int alpha, int beta, int8_t *tc0) + */ +function_x264 deblock_v_luma_lasx + slli.d t0, a1, 1 + + // Load data from tc0 + xvldrepl.w xr1, a4, 0 + add.d t1, t0, a1 + xvreplgr2vr.b xr2, a3 + xvilvl.b xr1, xr1, xr1 + + // Load data from pix + sub.d t5, a0, t1 + vld vr10, t5, 0 + vldx vr11, t5, a1 + vldx vr12, t5, t0 + vld vr13, a0, 0 + vldx vr14, a0, a1 + vldx vr15, a0, t0 + + // Store registers to the stack + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + xvilvl.h xr1, xr1, xr1 + vext2xv.hu.bu xr20, xr10 + vext2xv.hu.bu xr21, xr11 + vext2xv.hu.bu xr22, xr12 + vext2xv.hu.bu xr23, xr13 + vext2xv.hu.bu xr24, xr14 + vext2xv.hu.bu xr25, xr15 + vext2xv.h.b xr3, xr1 + + xvadd.h xr26, xr22, xr23 + xvsrari.h xr26, xr26, 1 + xvneg.h xr4, xr3 + xvadd.h xr27, xr20, xr26 + xvadd.h xr28, xr25, xr26 + xvsub.h xr29, xr23, xr22 + xvsrai.h xr27, xr27, 1 + xvsrai.h xr28, xr28, 1 + xvslli.h xr29, xr29, 2 + xvsub.h xr30, xr21, xr24 + xvsub.h xr27, xr27, xr21 + xvsub.h xr28, xr28, xr24 + xvadd.h xr29, xr29, xr30 + xvclip.h xr27, xr27, xr4, xr3 + xvclip.h xr28, xr28, xr4, xr3 + + xvpickev.b xr16, xr25, xr20 + xvpickev.b xr17, xr23, xr22 + xvabsd.bu xr5, xr16, xr17 + xvaddi.hu xr6, xr3, 1 + xvslt.bu xr5, xr5, xr2 + xvilvl.b xr30, xr5, xr5 + xvilvh.b xr31, xr5, xr5 + xvbitsel.v xr3, xr3, xr6, xr30 + + xvsrari.h xr29, xr29, 3 + xvaddi.hu xr6, xr3, 1 + xvbitsel.v xr3, xr3, xr6, xr31 + xvneg.h xr4, xr3 + + xvclip.h xr29, xr29, xr4, xr3 + xvadd.h xr30, xr21, xr27 + xvadd.h xr18, xr24, xr28 + 
xvadd.h xr19, xr22, xr29 + xvsub.h xr26, xr23, xr29 + xvssrarni.bu.h xr26, xr19, 0 + + xvpickev.b xr25, xr18, xr30 + xvpickev.b xr27, xr24, xr21 + xvpickev.b xr28, xr23, xr22 + xvpickev.b xr18, xr22, xr21 + + xvabsd.bu xr19, xr18, xr17 + xvreplgr2vr.b xr30, a2 + xvilvl.d xr31, xr30, xr2 + xvabsd.bu xr20, xr14, xr13 + xvslt.bu xr19, xr19, xr31 + xvslt.bu xr20, xr20, xr2 + + xvbitsel.v xr25, xr27, xr25, xr5 + xvpermi.d xr20, xr20, 0x50 + xvand.v xr21, xr20, xr19 + xvpermi.d xr7, xr21, 0xB1 + xvand.v xr21, xr21, xr7 + xvbitsel.v xr25, xr27, xr25, xr21 + xvpermi.d xr1, xr1, 0x50 + xvbitsel.v xr26, xr28, xr26, xr21 + xvslti.b xr30, xr1, 0 + xvbitsel.v xr25, xr25, xr27, xr30 + xvbitsel.v xr26, xr26, xr28, xr30 + + sub.d t5, a0, t0 + xvpermi.d xr0, xr25, 0xd8 + xvpermi.d xr1, xr26, 0xd8 + xvpermi.d xr2, xr26, 0x8D + xvpermi.d xr3, xr25, 0x8D + + // Store data to pix + vst vr0, t5, 0 + vstx vr1, t5, a1 + vst vr2, a0, 0 + vstx vr3, a0, a1 + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc_x264 + +/* + * void deblock_v_luma_intra_lasx(Pixel *pix, intptr_t stride, + * int alpha, int beta) + */ +function_x264 deblock_v_luma_intra_lasx + slli.d t0, a1, 1 + slli.d t2, a1, 2 + add.d t1, t0, a1 + + // Load data from pix + sub.d t5, a0, t2 + vld vr9, t5, 0 + vldx vr10, t5, a1 + vldx vr11, t5, t0 + vldx vr12, t5, t1 + vld vr13, a0, 0 + vldx vr14, a0, a1 + vldx vr15, a0, t0 + vldx vr16, a0, t1 + + // Store registers to the stack + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + xvreplgr2vr.b xr1, a2 + xvreplgr2vr.b xr2, a3 + + vext2xv.hu.bu xr19, xr9 + vext2xv.hu.bu xr20, xr10 + vext2xv.hu.bu xr21, xr11 + vext2xv.hu.bu xr22, xr12 + vext2xv.hu.bu xr23, xr13 + vext2xv.hu.bu xr24, xr14 + vext2xv.hu.bu xr25, xr15 + vext2xv.hu.bu xr26, xr16 + + xvadd.h xr27, xr21, xr22 + xvadd.h xr29, xr19, xr20 + xvadd.h xr3, xr27, xr23 + xvadd.h xr6, xr27, xr24 + xvadd.h xr4, xr3, xr20 + + xvslli.h xr29, xr29, 1 + xvadd.h xr5, xr6, xr4 + xvadd.h xr6, xr6, xr21 + xvadd.h xr5, xr5, xr23 + xvadd.h xr7, xr29, xr4 + + xvsrari.h xr3, xr4, 2 + xvsrari.h xr6, xr6, 2 + xvsrari.h xr4, xr5, 3 + xvadd.h xr27, xr24, xr23 + xvadd.h xr28, xr26, xr25 + xvsrari.h xr5, xr7, 3 + + xvadd.h xr29, xr22, xr27 + xvslli.h xr28, xr28, 1 + xvadd.h xr7, xr29, xr25 + xvadd.h xr17, xr27, xr21 + xvadd.h xr8, xr7, xr28 + xvadd.h xr18, xr17, xr7 + xvadd.h xr17, xr17, xr24 + xvadd.h xr18, xr18, xr22 + + xvsrari.h xr7, xr7, 2 + xvsrari.h xr8, xr8, 3 + xvsrari.h xr18, xr18, 3 + xvsrari.h xr17, xr17, 2 + + xvpickev.b xr27, xr25, xr20 + xvpickev.b xr28, xr24, xr21 + xvpickev.b xr29, xr23, xr22 + + xvpickev.b xr9, xr8, xr5 + xvpickev.b xr16, xr7, xr3 + xvabsd.bu xr30, xr27, xr29 + xvpickev.b xr19, xr18, xr4 + xvpickev.b xr26, xr17, xr6 + + xvslt.bu xr31, xr30, xr2 + xvabsd.bu xr20, xr12, xr13 + xvabsd.bu xr21, xr11, xr12 + xvabsd.bu xr22, xr14, xr13 + xvsrli.b xr0, xr1, 2 + xvbitsel.v xr19, xr26, xr19, xr31 + xvbitsel.v xr9, xr27, xr9, xr31 + xvbitsel.v xr16, xr28, xr16, xr31 + xvaddi.bu xr0, xr0, 2 + xvpermi.d xr20, xr20, 0x50 + xvpermi.d xr21, xr21, 0x50 + xvpermi.d xr22, xr22, 0x50 + xvslt.bu xr10, xr20, xr0 + xvslt.bu xr11, xr20, xr1 + xvslt.bu xr12, xr21, xr2 + xvslt.bu xr13, xr22, xr2 + xvand.v xr30, xr11, xr12 + xvand.v xr30, xr30, xr13 + xvbitsel.v xr9, xr27, xr9, xr10 + xvbitsel.v xr16, xr28, 
xr16, xr10 + xvbitsel.v xr19, xr26, xr19, xr10 + xvbitsel.v xr9, xr27, xr9, xr30 + xvbitsel.v xr16, xr28, xr16, xr30 + xvbitsel.v xr19, xr29, xr19, xr30 + xvpermi.d xr1, xr9, 0xD8 + xvpermi.d xr2, xr16, 0xD8 + xvpermi.d xr3, xr19, 0xD8 + xvpermi.d xr4, xr19, 0x8D + xvpermi.d xr5, xr16, 0x8D + xvpermi.d xr6, xr9, 0x8D + + // Store data to pix + vstx vr1, t5, a1 + vstx vr2, t5, t0 + vstx vr3, t5, t1 + vst vr4, a0, 0 + vstx vr5, a0, a1 + vstx vr6, a0, t0 + + // Restore register values + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc_x264 + +/* + * void deblock_h_luma_intra_lasx(Pixel *pix, intptr_t stride, + * int alpha, int beta) + */ +function_x264 deblock_h_luma_intra_lasx + slli.d t0, a1, 1 + slli.d t2, a1, 2 + addi.d t5, a0, -4 + add.d t1, t0, a1 + + // Store registers to the stack + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + // Load data from pix + FLDD_LOADX_4 t5, a1, t0, t1, f10, f11, f12, f13 + add.d t5, t5, t2 + FLDD_LOADX_4 t5, a1, t0, t1, f14, f15, f16, f17 + add.d t5, t5, t2 + FLDD_LOADX_4 t5, a1, t0, t1, f20, f21, f22, f23 + add.d t5, t5, t2 + FLDD_LOADX_4 t5, a1, t0, t1, f24, f25, f26, f27 + + LASX_TRANSPOSE16X8_B xr10, xr11, xr12, xr13, xr14, xr15, xr16, xr17, \ + xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27, \ + xr9, xr10, xr11, xr12, xr13, xr14, xr15, xr16, \ + xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7 + + xvreplgr2vr.b xr1, a2 + xvreplgr2vr.b xr2, a3 + vext2xv.hu.bu xr19, xr9 + vext2xv.hu.bu xr20, xr10 + vext2xv.hu.bu xr21, xr11 + vext2xv.hu.bu xr22, xr12 + vext2xv.hu.bu xr23, xr13 + vext2xv.hu.bu xr24, xr14 + vext2xv.hu.bu xr25, xr15 + vext2xv.hu.bu xr26, xr16 + + xvadd.h xr27, xr21, xr22 + xvadd.h xr29, xr19, xr20 + xvadd.h xr3, xr27, xr23 + xvadd.h xr6, xr27, xr24 + xvadd.h xr4, xr3, xr20 + + xvslli.h xr29, xr29, 1 + xvadd.h xr5, xr6, xr4 + xvadd.h xr6, xr6, xr21 + xvadd.h xr5, xr5, xr23 + xvadd.h xr7, xr29, xr4 + + xvsrari.h xr3, xr4, 2 + xvsrari.h xr6, xr6, 2 + xvsrari.h xr4, xr5, 3 + xvadd.h xr27, xr24, xr23 + xvadd.h xr28, xr26, xr25 + xvsrari.h xr5, xr7, 3 + + xvadd.h xr29, xr22, xr27 + xvslli.h xr28, xr28, 1 + xvadd.h xr7, xr29, xr25 + xvadd.h xr17, xr27, xr21 + xvadd.h xr8, xr7, xr28 + xvadd.h xr18, xr17, xr7 + xvadd.h xr17, xr17, xr24 + xvadd.h xr18, xr18, xr22 + + xvsrari.h xr7, xr7, 2 + xvsrari.h xr8, xr8, 3 + xvsrari.h xr18, xr18, 3 + xvsrari.h xr17, xr17, 2 + + xvpickev.b xr27, xr25, xr20 + xvpickev.b xr28, xr24, xr21 + xvpickev.b xr29, xr23, xr22 + + xvpickev.b xr9, xr8, xr5 + xvpickev.b xr16, xr7, xr3 + xvabsd.bu xr30, xr27, xr29 + xvpickev.b xr19, xr18, xr4 + xvpickev.b xr26, xr17, xr6 + + xvslt.bu xr31, xr30, xr2 + xvabsd.bu xr20, xr12, xr13 + xvabsd.bu xr21, xr11, xr12 + xvabsd.bu xr22, xr14, xr13 + xvsrli.b xr0, xr1, 2 + xvbitsel.v xr19, xr26, xr19, xr31 + xvbitsel.v xr9, xr27, xr9, xr31 + xvbitsel.v xr16, xr28, xr16, xr31 + xvaddi.bu xr0, xr0, 2 + xvpermi.d xr20, xr20, 0x50 + xvpermi.d xr21, xr21, 0x50 + xvpermi.d xr22, xr22, 0x50 + xvslt.bu xr10, xr20, xr0 + xvslt.bu xr11, xr20, xr1 + xvslt.bu xr12, xr21, xr2 + xvslt.bu xr13, xr22, xr2 + xvand.v xr30, xr11, xr12 + xvand.v xr30, xr30, xr13 + xvbitsel.v xr9, xr27, xr9, xr10 + xvbitsel.v xr16, xr28, xr16, xr10 + xvbitsel.v xr19, xr26, xr19, xr10 + + xvbitsel.v xr9, xr27, xr9, xr30 + xvbitsel.v xr16, xr28, xr16, xr30 + xvbitsel.v xr19, 
xr29, xr19, xr30 + + xvilvl.b xr0, xr16, xr9 + xvpermi.d xr18, xr19, 0xB1 + xvilvh.b xr1, xr9, xr16 + xvilvl.b xr2, xr18, xr19 + addi.d t5, a0, -3 + xvilvl.h xr3, xr2, xr0 + xvilvh.h xr4, xr2, xr0 + + // Store data to pix + xvstelm.w xr3, t5, 0, 0 + xvstelm.h xr1, t5, 4, 0 + add.d t5, t5, a1 + xvstelm.w xr3, t5, 0, 1 + xvstelm.h xr1, t5, 4, 1 + add.d t5, t5, a1 + xvstelm.w xr3, t5, 0, 2 + xvstelm.h xr1, t5, 4, 2 + add.d t5, t5, a1 + xvstelm.w xr3, t5, 0, 3 + xvstelm.h xr1, t5, 4, 3 + add.d t5, t5, a1 + xvstelm.w xr4, t5, 0, 0 + xvstelm.h xr1, t5, 4, 4 + add.d t5, t5, a1 + xvstelm.w xr4, t5, 0, 1 + xvstelm.h xr1, t5, 4, 5 + add.d t5, t5, a1 + xvstelm.w xr4, t5, 0, 2 + xvstelm.h xr1, t5, 4, 6 + add.d t5, t5, a1 + xvstelm.w xr4, t5, 0, 3 + xvstelm.h xr1, t5, 4, 7 + add.d t5, t5, a1 + xvstelm.w xr3, t5, 0, 4 + xvstelm.h xr1, t5, 4, 8 + add.d t5, t5, a1 + xvstelm.w xr3, t5, 0, 5 + xvstelm.h xr1, t5, 4, 9 + add.d t5, t5, a1 + xvstelm.w xr3, t5, 0, 6 + xvstelm.h xr1, t5, 4, 10 + add.d t5, t5, a1 + xvstelm.w xr3, t5, 0, 7 + xvstelm.h xr1, t5, 4, 11 + add.d t5, t5, a1 + xvstelm.w xr4, t5, 0, 4 + xvstelm.h xr1, t5, 4, 12 + add.d t5, t5, a1 + xvstelm.w xr4, t5, 0, 5 + xvstelm.h xr1, t5, 4, 13 + add.d t5, t5, a1 + xvstelm.w xr4, t5, 0, 6 + xvstelm.h xr1, t5, 4, 14 + add.d t5, t5, a1 + xvstelm.w xr4, t5, 0, 7 + xvstelm.h xr1, t5, 4, 15 + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc_x264 + +/* + * void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + * int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + * int mvy_limit, int bframe ) + */ +function_x264 deblock_strength_lasx + // dir = 0 s1 = 8 s2 = 1 + vldi vr18, 2 + vldi vr19, 1 + addi.d t0, zero, 4 + xvreplgr2vr.h xr20, t0 + xvreplgr2vr.h xr21, a4 + + xvld xr0, a0, 11 + xvpermi.q xr1, xr0, 0x01 + la.local t0, shuf_loc_locn + xvld xr23, t0, 0 + xvshuf.b xr4, xr1, xr0, xr23 + xvpermi.q xr5, xr4, 0x01 + vor.v vr6, vr4, vr5 + vseqi.b vr6, vr6, 0 + vmov vr15, vr6 + vxor.v vr8, vr8, vr8 + vbitsel.v vr8, vr18, vr8, vr6 + + xvld xr0, a1, 11 + xvpermi.q xr1, xr0, 0x01 + xvshuf.b xr4, xr1, xr0, xr23 + xvpermi.q xr5, xr4, 0x01 + vseq.b vr4, vr4, vr5 + vseqi.b vr4, vr4, 0 + + vld vr0, a2, 44 + vld vr1, a2, 76 + vld vr5, a2, 108 + vld vr6, a2, 140 + vilvl.h vr9, vr1, vr0 + vilvl.h vr10, vr6, vr5 + vilvl.w vr11, vr10, vr9 + vilvh.w vr12, vr10, vr9 + vilvh.h vr9, vr1, vr0 + vilvh.h vr10, vr6, vr5 + vilvl.w vr13, vr10, vr9 + vilvh.w vr14, vr10, vr9 + + vilvl.d vr0, vr13, vr12 + ld.h t0, a2, 60 + ld.h t1, a2, 92 + ld.h t2, a2, 124 + ld.h t3, a2, 156 + vmov vr6, vr14 + vinsgr2vr.h vr6, t0, 4 + vinsgr2vr.h vr6, t1, 5 + vinsgr2vr.h vr6, t2, 6 + vinsgr2vr.h vr6, t3, 7 + vilvl.d vr1, vr12, vr11 + vilvl.d vr5, vr14, vr13 + xvpermi.q xr0, xr6, 0x02 // mv[0][loc][0] + xvpermi.q xr5, xr1, 0x20 // mv[0][locn][0] + xvabsd.h xr5, xr0, xr5 + xvsle.h xr5, xr20, xr5 + + vilvh.d vr0, vr13, vr12 + ld.h t0, a2, 62 + ld.h t1, a2, 94 + ld.h t2, a2, 126 + ld.h t3, a2, 158 + vbsrl.v vr7, vr14, 8 + vinsgr2vr.h vr7, t0, 4 + vinsgr2vr.h vr7, t1, 5 + vinsgr2vr.h vr7, t2, 6 + vinsgr2vr.h vr7, t3, 7 + vilvh.d vr1, vr12, vr11 + vilvh.d vr6, vr14, vr13 + xvpermi.q xr0, xr7, 0x02 // mv[0][loc][1] + xvpermi.q xr6, xr1, 0x20 // mv[0][locn][1] + xvabsd.h xr6, xr0, xr6 + xvsle.h xr6, xr21, xr6 + xvor.v xr5, xr5, xr6 + xvpickev.b xr5, xr5, xr5 + xvpermi.d xr5, xr5, 0xd8 + vor.v vr17, vr4, vr5 + + beqz a5, 
.bframe_iszero_0 + // bframe != 0 + xvld xr0, a1, 51 + xvpermi.q xr1, xr0, 0x01 + xvshuf.b xr4, xr1, xr0, xr23 + xvpermi.q xr5, xr4, 0x01 + vseq.b vr4, vr4, vr5 + vseqi.b vr4, vr4, 0 + + vld vr0, a2, 204 + vld vr1, a2, 236 + vld vr5, a2, 268 + vld vr6, a2, 300 + vilvl.h vr9, vr1, vr0 + vilvl.h vr10, vr6, vr5 + vilvl.w vr11, vr10, vr9 + vilvh.w vr12, vr10, vr9 + vilvh.h vr9, vr1, vr0 + vilvh.h vr10, vr6, vr5 + vilvl.w vr13, vr10, vr9 + vilvh.w vr14, vr10, vr9 + + vilvl.d vr0, vr13, vr12 + ld.h t0, a2, 220 + ld.h t1, a2, 252 + ld.h t2, a2, 284 + ld.h t3, a2, 316 + vmov vr6, vr14 + vinsgr2vr.h vr6, t0, 4 + vinsgr2vr.h vr6, t1, 5 + vinsgr2vr.h vr6, t2, 6 + vinsgr2vr.h vr6, t3, 7 + vilvl.d vr1, vr12, vr11 + vilvl.d vr5, vr14, vr13 + xvpermi.q xr0, xr6, 0x02 // mv[1][loc][0] + xvpermi.q xr5, xr1, 0x20 // mv[1][locn][0] + xvabsd.h xr5, xr0, xr5 + xvsle.h xr5, xr20, xr5 + + vilvh.d vr0, vr13, vr12 + ld.h t0, a2, 222 + ld.h t1, a2, 254 + ld.h t2, a2, 286 + ld.h t3, a2, 318 + vbsrl.v vr7, vr14, 8 + vinsgr2vr.h vr7, t0, 4 + vinsgr2vr.h vr7, t1, 5 + vinsgr2vr.h vr7, t2, 6 + vinsgr2vr.h vr7, t3, 7 + vilvh.d vr1, vr12, vr11 + vilvh.d vr6, vr14, vr13 + xvpermi.q xr0, xr7, 0x02 // mv[1][loc][1] + xvpermi.q xr6, xr1, 0x20 // mv[1][locn][1] + xvabsd.h xr6, xr0, xr6 + xvsle.h xr6, xr21, xr6 + xvor.v xr5, xr5, xr6 + xvpickev.b xr5, xr5, xr5 + xvpermi.d xr5, xr5, 0xd8 + vor.v vr5, vr5, vr4 + vor.v vr17, vr5, vr17 + +.bframe_iszero_0: + vxor.v vr22, vr22, vr22 + vbitsel.v vr22, vr22, vr19, vr17 + vbitsel.v vr22, vr8, vr22, vr15 + vst vr22, a3, 0 + + // dir = 1 s1 = 1 s2 = 8 + vld vr0, a0, 4 + vld vr1, a0, 20 + ld.wu t0, a0, 36 + vpickev.w vr2, vr1, vr0 + vbsrl.v vr3, vr2, 4 + vinsgr2vr.w vr3, t0, 3 + vor.v vr2, vr3, vr2 + vseqi.b vr2, vr2, 0 + vmov vr15, vr2 + vxor.v vr3, vr3, vr3 + vbitsel.v vr3, vr18, vr3, vr2 + + vld vr0, a1, 4 + vld vr1, a1, 20 + ld.w t0, a1, 36 + vpickev.w vr2, vr1, vr0 + vbsrl.v vr4, vr2, 4 + vinsgr2vr.w vr4, t0, 3 + vseq.b vr2, vr4, vr2 + vseqi.b vr2, vr2, 0 + + vld vr0, a2, 16 + vld vr1, a2, 48 + vld vr12, a2, 80 + vld vr13, a2, 112 + vld vr4, a2, 144 + vpickev.h vr5, vr1, vr0 + vpickev.h vr14, vr13, vr12 + xvpermi.q xr5, xr14, 0x02 // mv[0][locn][0] + vpickev.h vr7, vr4, vr4 + xvpermi.d xr6, xr5, 0x39 + xvinsve0.d xr6, xr7, 3 // mv[0][loc][0] + xvabsd.h xr5, xr6, xr5 + xvsle.h xr5, xr20, xr5 + + vpickod.h vr6, vr1, vr0 + vpickod.h vr14, vr13, vr12 + xvpermi.q xr6, xr14, 0x02 // mv[0][locn][1] + vpickod.h vr7, vr4, vr4 + xvpermi.d xr8, xr6, 0x39 + xvinsve0.d xr8, xr7, 3 // mv[0][loc][1] + xvabsd.h xr6, xr8, xr6 + xvsle.h xr6, xr21, xr6 + + xvor.v xr5, xr6, xr5 + xvpickev.b xr6, xr5, xr5 + xvpermi.d xr6, xr6, 0xd8 + vor.v vr2, vr6, vr2 + + beqz a5, .bframe_iszero_1 + // bframe != 0 ref[1] + vld vr0, a1, 44 + vld vr1, a1, 60 + ld.w t0, a1, 76 + vpickev.w vr0, vr1, vr0 + vbsrl.v vr1, vr0, 4 + vinsgr2vr.w vr1, t0, 3 + vseq.b vr11, vr1, vr0 + vseqi.b vr11, vr11, 0 + + vld vr0, a2, 176 + vld vr1, a2, 208 + vld vr12, a2, 240 + vld vr13, a2, 272 + vld vr4, a2, 304 + vpickev.h vr5, vr1, vr0 + vpickev.h vr14, vr13, vr12 + xvpermi.q xr5, xr14, 0x02 // mv[1][locn][0] + vpickev.h vr7, vr4, vr4 + xvpermi.d xr6, xr5, 0x39 + xvinsve0.d xr6, xr7, 3 // mv[1][loc][0] + xvabsd.h xr5, xr6, xr5 + xvsle.h xr5, xr20, xr5 + + vpickod.h vr6, vr1, vr0 + vpickod.h vr14, vr13, vr12 + xvpermi.q xr6, xr14, 0x02 // mv[1][locn][1] + vpickod.h vr7, vr4, vr4 + xvpermi.d xr8, xr6, 0x39 + xvinsve0.d xr8, xr7, 3 // mv[1][loc][1] + xvabsd.h xr6, xr8, xr6 + xvsle.h xr6, xr21, xr6 + + xvor.v xr5, xr6, xr5 + xvpickev.b xr6, 
xr5, xr5 + xvpermi.d xr6, xr6, 0xd8 + vor.v vr6, vr6, vr11 + vor.v vr2, vr6, vr2 + +.bframe_iszero_1: + vxor.v vr22, vr22, vr22 + vbitsel.v vr22, vr22, vr19, vr2 + vbitsel.v vr22, vr3, vr22, vr15 + vst vr22, a3, 32 +endfunc_x264 + +/* + * void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + * int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + * int mvy_limit, int bframe ) + */ +function_x264 deblock_strength_lsx + // dir = 0 s1 = 8 s2 = 1 + vldi vr18, 2 + vldi vr19, 1 + addi.d t0, zero, 4 + vreplgr2vr.h vr20, t0 + vreplgr2vr.h vr21, a4 + + vld vr0, a0, 11 + vld vr1, a0, 27 + la.local t0, shuf_loc_locn + la.local t1, shuf_locn + vld vr2, t0, 0 + vld vr3, t1, 0 + vshuf.b vr4, vr1, vr0, vr2 + vshuf.b vr5, vr1, vr0, vr3 + vor.v vr6, vr4, vr5 + vseqi.b vr6, vr6, 0 + vmov vr15, vr6 + vxor.v vr8, vr8, vr8 + vbitsel.v vr8, vr18, vr8, vr6 + + vld vr0, a1, 11 + vld vr1, a1, 27 + vshuf.b vr4, vr1, vr0, vr2 + vshuf.b vr5, vr1, vr0, vr3 + vseq.b vr4, vr4, vr5 + vseqi.b vr4, vr4, 0 + + vld vr0, a2, 44 + vld vr1, a2, 76 + vld vr5, a2, 108 + vld vr6, a2, 140 + vilvl.h vr9, vr1, vr0 + vilvl.h vr10, vr6, vr5 + vilvl.w vr11, vr10, vr9 + vilvh.w vr12, vr10, vr9 + vilvh.h vr9, vr1, vr0 + vilvh.h vr10, vr6, vr5 + vilvl.w vr13, vr10, vr9 + vilvh.w vr14, vr10, vr9 + + vilvl.d vr0, vr13, vr12 + ld.h t0, a2, 60 + ld.h t1, a2, 92 + ld.h t2, a2, 124 + ld.h t3, a2, 156 + vmov vr6, vr14 + vinsgr2vr.h vr6, t0, 4 + vinsgr2vr.h vr6, t1, 5 + vinsgr2vr.h vr6, t2, 6 + vinsgr2vr.h vr6, t3, 7 + vilvl.d vr1, vr12, vr11 + vilvl.d vr5, vr14, vr13 + vabsd.h vr9, vr0, vr1 + vabsd.h vr5, vr6, vr5 + vsle.h vr9, vr20, vr9 + vsle.h vr5, vr20, vr5 + + vilvh.d vr0, vr13, vr12 + ld.h t0, a2, 62 + ld.h t1, a2, 94 + ld.h t2, a2, 126 + ld.h t3, a2, 158 + vbsrl.v vr7, vr14, 8 + vinsgr2vr.h vr7, t0, 4 + vinsgr2vr.h vr7, t1, 5 + vinsgr2vr.h vr7, t2, 6 + vinsgr2vr.h vr7, t3, 7 + vilvh.d vr1, vr12, vr11 + vilvh.d vr6, vr14, vr13 + vabsd.h vr0, vr0, vr1 + vabsd.h vr6, vr7, vr6 + vsle.h vr0, vr21, vr0 + vsle.h vr6, vr21, vr6 + + vor.v vr9, vr9, vr0 + vor.v vr5, vr5, vr6 + vpickev.b vr5, vr5, vr9 + vor.v vr17, vr4, vr5 + + beqz a5, .bframeiszero_0_lsx + // bframe != 0 + vld vr0, a1, 51 + vld vr1, a1, 67 + vshuf.b vr4, vr1, vr0, vr2 + vshuf.b vr5, vr1, vr0, vr3 + vseq.b vr4, vr4, vr5 + vseqi.b vr4, vr4, 0 + + vld vr0, a2, 204 + vld vr1, a2, 236 + vld vr5, a2, 268 + vld vr6, a2, 300 + vilvl.h vr9, vr1, vr0 + vilvl.h vr10, vr6, vr5 + vilvl.w vr11, vr10, vr9 + vilvh.w vr12, vr10, vr9 + vilvh.h vr9, vr1, vr0 + vilvh.h vr10, vr6, vr5 + vilvl.w vr13, vr10, vr9 + vilvh.w vr14, vr10, vr9 + + vilvl.d vr0, vr13, vr12 + ld.h t0, a2, 220 + ld.h t1, a2, 252 + ld.h t2, a2, 284 + ld.h t3, a2, 316 + vmov vr6, vr14 + vinsgr2vr.h vr6, t0, 4 + vinsgr2vr.h vr6, t1, 5 + vinsgr2vr.h vr6, t2, 6 + vinsgr2vr.h vr6, t3, 7 + vilvl.d vr1, vr12, vr11 + vilvl.d vr5, vr14, vr13 + vabsd.h vr9, vr0, vr1 + vabsd.h vr5, vr6, vr5 + vsle.h vr9, vr20, vr9 + vsle.h vr5, vr20, vr5 + + vilvh.d vr0, vr13, vr12 + ld.h t0, a2, 222 + ld.h t1, a2, 254 + ld.h t2, a2, 286 + ld.h t3, a2, 318 + vbsrl.v vr7, vr14, 8 + vinsgr2vr.h vr7, t0, 4 + vinsgr2vr.h vr7, t1, 5 + vinsgr2vr.h vr7, t2, 6 + vinsgr2vr.h vr7, t3, 7 + vilvh.d vr1, vr12, vr11 + vilvh.d vr6, vr14, vr13 + vabsd.h vr0, vr0, vr1 + vabsd.h vr6, vr7, vr6 + vsle.h vr0, vr21, vr0 + vsle.h vr6, vr21, vr6 + + vor.v vr9, vr9, vr0 + vor.v vr5, vr5, vr6 + vpickev.b vr5, vr5, vr9 + vor.v vr5, vr5, vr4 + vor.v vr17, vr5, vr17 + +.bframeiszero_0_lsx: + vxor.v vr22, vr22, vr22 + vbitsel.v vr22, vr22, 
vr19, vr17 + vbitsel.v vr22, vr8, vr22, vr15 + vst vr22, a3, 0 + + // dir = 1 s1 = 1 s2 = 8 + vld vr0, a0, 4 + vld vr1, a0, 20 + ld.wu t0, a0, 36 + vpickev.w vr2, vr1, vr0 + vbsrl.v vr3, vr2, 4 + vinsgr2vr.w vr3, t0, 3 + vor.v vr2, vr3, vr2 + vseqi.b vr2, vr2, 0 + vmov vr15, vr2 + vxor.v vr3, vr3, vr3 + vbitsel.v vr3, vr18, vr3, vr2 + + vld vr0, a1, 4 + vld vr1, a1, 20 + ld.w t0, a1, 36 + vpickev.w vr2, vr1, vr0 + vbsrl.v vr4, vr2, 4 + vinsgr2vr.w vr4, t0, 3 + vseq.b vr2, vr4, vr2 + vseqi.b vr2, vr2, 0 + + vld vr0, a2, 16 + vld vr1, a2, 48 + vld vr12, a2, 80 + vld vr13, a2, 112 + vld vr4, a2, 144 + vpickev.h vr5, vr1, vr0 + vpickev.h vr14, vr13, vr12 + vpickev.h vr7, vr4, vr4 + vbsrl.v vr6, vr5, 8 + vilvl.d vr6, vr14, vr6 + vilvh.d vr9, vr7, vr14 + vabsd.h vr5, vr6, vr5 + vabsd.h vr9, vr9, vr14 + vsle.h vr5, vr20, vr5 + vsle.h vr9, vr20, vr9 + + vpickod.h vr6, vr1, vr0 + vpickod.h vr14, vr13, vr12 + vpickod.h vr7, vr4, vr4 + vbsrl.v vr8, vr6, 8 + vilvl.d vr8, vr14, vr8 + vilvh.d vr7, vr7, vr14 + vabsd.h vr8, vr8, vr6 + vabsd.h vr7, vr7, vr14 + vsle.h vr8, vr21, vr8 + vsle.h vr6, vr21, vr7 + + vor.v vr5, vr5, vr8 + vor.v vr6, vr9, vr6 + vpickev.b vr6, vr6, vr5 + vor.v vr2, vr6, vr2 + + beqz a5, .bframeiszero_1_lsx + // bframe != 0 ref[1] + vld vr0, a1, 44 + vld vr1, a1, 60 + ld.w t0, a1, 76 + vpickev.w vr0, vr1, vr0 + vbsrl.v vr1, vr0, 4 + vinsgr2vr.w vr1, t0, 3 + vseq.b vr11, vr1, vr0 + vseqi.b vr11, vr11, 0 + + vld vr0, a2, 176 + vld vr1, a2, 208 + vld vr12, a2, 240 + vld vr13, a2, 272 + vld vr4, a2, 304 + vpickev.h vr5, vr1, vr0 + vpickev.h vr14, vr13, vr12 + vpickev.h vr7, vr4, vr4 + vbsrl.v vr6, vr5, 8 + vilvl.d vr6, vr14, vr6 + vilvh.d vr9, vr7, vr14 + vabsd.h vr5, vr6, vr5 + vabsd.h vr9, vr9, vr14 + vsle.h vr5, vr20, vr5 + vsle.h vr9, vr20, vr9 + + vpickod.h vr6, vr1, vr0 + vpickod.h vr14, vr13, vr12 + vpickod.h vr7, vr4, vr4 + vbsrl.v vr8, vr6, 8 + vilvl.d vr8, vr14, vr8 + vilvh.d vr7, vr7, vr14 + vabsd.h vr8, vr8, vr6 + vabsd.h vr6, vr7, vr14 + vsle.h vr8, vr21, vr8 + vsle.h vr6, vr21, vr6 + + vor.v vr5, vr5, vr8 + vor.v vr7, vr9, vr6 + vpickev.b vr6, vr7, vr5 + vor.v vr6, vr6, vr11 + vor.v vr2, vr6, vr2 + +.bframeiszero_1_lsx: + vxor.v vr22, vr22, vr22 + vbitsel.v vr22, vr22, vr19, vr2 + vbitsel.v vr22, vr3, vr22, vr15 + vst vr22, a3, 32 +endfunc_x264 + +/* + * void deblock_v_luma_intra_lsx( pixel *pix, intptr_t stride, int alpha, int beta ) + */ +function_x264 deblock_v_luma_intra_lsx + slli.d t0, a1, 1 + add.d t1, t0, a1 + slli.d t2, a1, 2 + + // Store registers to the stack + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + // Load data from pix + sub.d t3, a0, t2 // t3 = a0 - 4 * stride + vld vr3, t3, 0 // p3 + vldx vr2, t3, a1 // p2 + vldx vr1, t3, t0 // p1 + vldx vr0, t3, t1 // p0 + vld vr10, a0, 0 // q0 + vldx vr11, a0, a1 // q1 + vldx vr12, a0, t0 // q2 + vldx vr13, a0, t1 // q3 + + vsllwil.hu.bu vr7, vr3, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr14, vr10, 0 + vsllwil.hu.bu vr15, vr11, 0 + vsllwil.hu.bu vr16, vr12, 0 + vsllwil.hu.bu vr17, vr13, 0 + + /* p0', p1', p2' */ + vadd.h vr8, vr5, vr4 + vadd.h vr9, vr8, vr14 + vadd.h vr19, vr7, vr6 + vadd.h vr18, vr6, vr9 // pix[-2*xstride] + vslli.h vr19, vr19, 1 + vadd.h vr20, vr9, vr18 + vadd.h vr19, vr19, vr18 // pix[-3*xstride] + vadd.h vr20, vr20, vr15 // pix[-1*xstride] + + /* p0' */ + vadd.h vr8, vr8, vr15 + vadd.h vr21, vr8, vr5 // 
pix[-1*xstride] + + // /* q0', q1', q2' */ + vadd.h vr8, vr15, vr14 + vadd.h vr9, vr8, vr4 + vadd.h vr23, vr17, vr16 + vadd.h vr22, vr9, vr16 // pix[1*xstride] + vslli.h vr23, vr23, 1 + vadd.h vr24, vr9, vr22 + vadd.h vr23, vr23, vr22 // pix[2*xstride] + vadd.h vr24, vr24, vr5 // pix[0*xstride] + + /* q0' */ + vadd.h vr8, vr8, vr5 + vadd.h vr25, vr8, vr15 // pix[0*xstride] + + vexth.hu.bu vr7, vr3 + vexth.hu.bu vr6, vr2 + vexth.hu.bu vr5, vr1 + vexth.hu.bu vr4, vr0 + vexth.hu.bu vr14, vr10 + vexth.hu.bu vr15, vr11 + vexth.hu.bu vr16, vr12 + vexth.hu.bu vr17, vr13 + + /* p0', p1', p2' */ + vadd.h vr8, vr5, vr4 + vadd.h vr9, vr8, vr14 + vadd.h vr27, vr6, vr9 // pix[-2*xstride] + vadd.h vr28, vr7, vr6 + vslli.h vr28, vr28, 1 + vadd.h vr29, vr9, vr27 + vadd.h vr28, vr28, vr27 // pix[-3*xstride] + vadd.h vr29, vr29, vr15 // pix[-1*xstride] + + /* p0' */ + vadd.h vr8, vr8, vr15 + vadd.h vr30, vr8, vr5 // pix[-1*xstride] + + /* q0', q1', q2' */ + vadd.h vr8, vr15, vr14 + vadd.h vr9, vr8, vr4 + vadd.h vr3, vr17, vr16 + vadd.h vr31, vr9, vr16 // pix[1*xstride] + vslli.h vr3, vr3, 1 + vadd.h vr13, vr9, vr31 + vadd.h vr3, vr3, vr31 // pix[2*xstride] + vadd.h vr13, vr13, vr5 // pix[0*xstride] + + /* q0' */ + vadd.h vr8, vr8, vr5 + vadd.h vr9, vr8, vr15 // pix[0*xstride] + + vsrarni.b.h vr28, vr19, 3 // pix[-3*xstride] + vsrarni.b.h vr27, vr18, 2 // pix[-2*xstride] + vsrarni.b.h vr29, vr20, 3 // pix[-1*xstride] + vsrarni.b.h vr30, vr21, 2 // pix[-1*xstride] p0' + vsrarni.b.h vr13, vr24, 3 // pix[ 0*xstride] + vsrarni.b.h vr31, vr22, 2 // pix[ 1*xstride] + vsrarni.b.h vr3, vr23, 3 // pix[ 2*xstride] + vsrarni.b.h vr9, vr25, 2 // pix[ 0*xstride] q0' + + vreplgr2vr.b vr18, a2 // alpha + vreplgr2vr.b vr19, a3 // beta + + vabsd.bu vr26, vr0, vr10 + vabsd.bu vr8, vr1, vr0 + vabsd.bu vr16, vr11, vr10 + vslt.bu vr20, vr26, vr18 + vslt.bu vr21, vr8, vr19 + vslt.bu vr22, vr16, vr19 + vand.v vr20, vr20, vr21 + vand.v vr20, vr20, vr22 // if_1 + + vsrli.b vr18, vr18, 2 + vaddi.bu vr18, vr18, 2 + + vslt.bu vr26, vr26, vr18 // if_2 + + vabsd.bu vr23, vr2, vr0 + vslt.bu vr23, vr23, vr19 // if_3 + + vand.v vr16, vr23, vr26 // if_2 && if_3 + vnor.v vr24, vr16, vr16 // !(if_2 && if_3) + vand.v vr24, vr24, vr20 // if_1 && !(if_2 && if_3) + vand.v vr16, vr16, vr20 // if_1 && if_2 && if_3 + + vbitsel.v vr4, vr2, vr28, vr16 // pix[-3*xstride] + vbitsel.v vr5, vr1, vr27, vr16 // pix[-2*xstride] + vbitsel.v vr6, vr0, vr30, vr24 + vbitsel.v vr6, vr6, vr29, vr16 // pix[-1*xstride] + + vabsd.bu vr7, vr12, vr10 + vslt.bu vr7, vr7, vr19 // if_4 + + vand.v vr17, vr7, vr26 // if_2 && if_4 + vnor.v vr14, vr17, vr17 // !(if_2 && if_4) + vand.v vr14, vr14, vr20 // if_1 && !(if_2 && if_4) + vand.v vr17, vr17, vr20 // if_1 && if_2 && if_4 + + vbitsel.v vr15, vr10, vr9, vr14 + vbitsel.v vr15, vr15, vr13, vr17 // pix[ 0*xstride] + vbitsel.v vr9, vr11, vr31, vr17 // pix[ 1*xstride] + vbitsel.v vr13, vr12, vr3, vr17 // pix[ 2*xstride] + + vstx vr4, t3, a1 + vstx vr5, t3, t0 + vstx vr6, t3, t1 + vst vr15, a0, 0 + vstx vr9, a0, a1 + vstx vr13, a0, t0 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc_x264 + +/* + * void deblock_h_luma_intra_c( pixel *pix, intptr_t stride, int alpha, int beta ) + */ +function_x264 deblock_h_luma_intra_lsx + slli.d t0, a1, 1 + slli.d t2, a1, 2 + addi.d t5, a0, -4 + add.d t1, t0, a1 + + // Store registers to the stack + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + 
fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + // Load data from pix + FLDD_LOADX_4 t5, a1, t0, t1, f10, f11, f12, f13 + add.d t5, t5, t2 + FLDD_LOADX_4 t5, a1, t0, t1, f14, f15, f16, f17 + add.d t5, t5, t2 + FLDD_LOADX_4 t5, a1, t0, t1, f20, f21, f22, f23 + add.d t5, t5, t2 + FLDD_LOADX_4 t5, a1, t0, t1, f24, f25, f26, f27 + + vilvl.b vr11, vr11, vr10 + vilvl.b vr13, vr13, vr12 + vilvl.b vr15, vr15, vr14 + vilvl.b vr17, vr17, vr16 + vilvl.h vr0, vr13, vr11 + vilvl.h vr1, vr17, vr15 + vilvh.h vr2, vr13, vr11 + vilvh.h vr3, vr17, vr15 + vilvl.w vr4, vr1, vr0 + vilvl.w vr6, vr3, vr2 + vilvh.w vr5, vr1, vr0 + vilvh.w vr7, vr3, vr2 + + vilvl.b vr11, vr21, vr20 + vilvl.b vr13, vr23, vr22 + vilvl.b vr15, vr25, vr24 + vilvl.b vr17, vr27, vr26 + vilvl.h vr0, vr13, vr11 + vilvl.h vr1, vr17, vr15 + vilvh.h vr2, vr13, vr11 + vilvh.h vr3, vr17, vr15 + vilvl.w vr24, vr1, vr0 + vilvl.w vr26, vr3, vr2 + vilvh.w vr25, vr1, vr0 + vilvh.w vr27, vr3, vr2 + + vilvl.d vr3, vr24, vr4 // p3 + vilvh.d vr2, vr24, vr4 // p2 + vilvl.d vr1, vr25, vr5 // p1 + vilvh.d vr0, vr25, vr5 // p0 + vilvl.d vr10, vr26, vr6 // q0 + vilvh.d vr11, vr26, vr6 // q1 + vilvl.d vr12, vr27, vr7 // q2 + vilvh.d vr13, vr27, vr7 // q3 + + vsllwil.hu.bu vr7, vr3, 0 + vsllwil.hu.bu vr6, vr2, 0 + vsllwil.hu.bu vr5, vr1, 0 + vsllwil.hu.bu vr4, vr0, 0 + vsllwil.hu.bu vr14, vr10, 0 + vsllwil.hu.bu vr15, vr11, 0 + vsllwil.hu.bu vr16, vr12, 0 + vsllwil.hu.bu vr17, vr13, 0 + + /* p0', p1', p2' */ + vadd.h vr8, vr5, vr4 + vadd.h vr9, vr8, vr14 + vadd.h vr19, vr7, vr6 + vadd.h vr18, vr6, vr9 // pix[-2*xstride] + vslli.h vr19, vr19, 1 + vadd.h vr20, vr9, vr18 + vadd.h vr19, vr19, vr18 // pix[-3*xstride] + vadd.h vr20, vr20, vr15 // pix[-1*xstride] + + /* p0' */ + vadd.h vr8, vr8, vr15 + vadd.h vr21, vr8, vr5 // pix[-1*xstride] + + /* q0', q1', q2' */ + vadd.h vr8, vr15, vr14 + vadd.h vr9, vr8, vr4 + vadd.h vr23, vr17, vr16 + vadd.h vr22, vr9, vr16 // pix[1*xstride] + vslli.h vr23, vr23, 1 + vadd.h vr24, vr9, vr22 + vadd.h vr23, vr23, vr22 // pix[2*xstride] + vadd.h vr24, vr24, vr5 // pix[0*xstride] + + /* q0' */ + vadd.h vr8, vr8, vr5 + vadd.h vr25, vr8, vr15 // pix[0*xstride] + + vexth.hu.bu vr7, vr3 + vexth.hu.bu vr6, vr2 + vexth.hu.bu vr5, vr1 + vexth.hu.bu vr4, vr0 + vexth.hu.bu vr14, vr10 + vexth.hu.bu vr15, vr11 + vexth.hu.bu vr16, vr12 + vexth.hu.bu vr17, vr13 + + /* p0', p1', p2' */ + vadd.h vr8, vr5, vr4 + vadd.h vr9, vr8, vr14 + vadd.h vr27, vr6, vr9 // pix[-2*xstride] + vadd.h vr28, vr7, vr6 + vslli.h vr28, vr28, 1 + vadd.h vr29, vr9, vr27 + vadd.h vr28, vr28, vr27 // pix[-3*xstride] + vadd.h vr29, vr29, vr15 // pix[-1*xstride] + + /* p0' */ + vadd.h vr8, vr8, vr15 + vadd.h vr30, vr8, vr5 // pix[-1*xstride] + + /* q0', q1', q2' */ + vadd.h vr8, vr15, vr14 + vadd.h vr9, vr8, vr4 + vadd.h vr3, vr17, vr16 + vadd.h vr31, vr9, vr16 // pix[1*xstride] + vslli.h vr3, vr3, 1 + vadd.h vr13, vr9, vr31 + vadd.h vr3, vr3, vr31 // pix[2*xstride] + vadd.h vr13, vr13, vr5 // pix[0*xstride] + + /* q0' */ + vadd.h vr8, vr8, vr5 + vadd.h vr9, vr8, vr15 // pix[0*xstride] + + vsrarni.b.h vr28, vr19, 3 // pix[-3*xstride] + vsrarni.b.h vr27, vr18, 2 // pix[-2*xstride] + vsrarni.b.h vr29, vr20, 3 // pix[-1*xstride] + vsrarni.b.h vr30, vr21, 2 // pix[-1*xstride] p0' + vsrarni.b.h vr13, vr24, 3 // pix[ 0*xstride] + vsrarni.b.h vr31, vr22, 2 // pix[ 1*xstride] + vsrarni.b.h vr3, vr23, 3 // pix[ 2*xstride] + vsrarni.b.h vr9, vr25, 2 // pix[ 0*xstride] q0' + + vreplgr2vr.b vr18, a2 // alpha + 
vreplgr2vr.b vr19, a3 // beta + + vabsd.bu vr26, vr0, vr10 + vabsd.bu vr8, vr1, vr0 + vabsd.bu vr16, vr11, vr10 + vslt.bu vr20, vr26, vr18 + vslt.bu vr21, vr8, vr19 + vslt.bu vr22, vr16, vr19 + vand.v vr20, vr20, vr21 + vand.v vr20, vr20, vr22 // if_1 + + vsrli.b vr18, vr18, 2 + vaddi.bu vr18, vr18, 2 + + vslt.bu vr26, vr26, vr18 // if_2 + + vabsd.bu vr23, vr2, vr0 + vslt.bu vr23, vr23, vr19 // if_3 + + vand.v vr16, vr23, vr26 // if_2 && if_3 + vnor.v vr24, vr16, vr16 // !(if_2 && if_3) + vand.v vr24, vr24, vr20 // if_1 && !(if_2 && if_3) + vand.v vr16, vr16, vr20 // if_1 && if_2 && if_3 + vbitsel.v vr4, vr2, vr28, vr16 // pix[-3*xstride] + vbitsel.v vr5, vr1, vr27, vr16 // pix[-2*xstride] + vbitsel.v vr6, vr0, vr30, vr24 + vbitsel.v vr6, vr6, vr29, vr16 // pix[-1*xstride] + + vabsd.bu vr7, vr12, vr10 + vslt.bu vr7, vr7, vr19 // if_4 + + vand.v vr17, vr7, vr26 // if_2 && if_4 + vnor.v vr14, vr17, vr17 // !(if_2 && if_4) + vand.v vr14, vr14, vr20 // if_1 && !(if_2 && if_4) + vand.v vr17, vr17, vr20 // if_1 && if_2 && if_4 + vbitsel.v vr15, vr10, vr9, vr14 + vbitsel.v vr15, vr15, vr13, vr17 // pix[ 0*xstride] + vbitsel.v vr9, vr11, vr31, vr17 // pix[ 1*xstride] + vbitsel.v vr13, vr12, vr3, vr17 // pix[ 2*xstride] + + vilvl.b vr16, vr5, vr4 + vilvl.b vr17, vr15, vr6 + vilvl.b vr18, vr13, vr9 + vilvh.b vr19, vr5, vr4 + vilvh.b vr20, vr15, vr6 + vilvh.b vr21, vr13, vr9 + vilvl.h vr0, vr17, vr16 + vilvh.h vr1, vr17, vr16 + vilvl.h vr2, vr20, vr19 + vilvh.h vr3, vr20, vr19 + + addi.d t6, a0, -3 // t6 = a0 -3 + vstelm.w vr0, t6, 0, 0 + vstelm.h vr18, t6, 4, 0 + add.d t6, t6, a1 + vstelm.w vr0, t6, 0, 1 + vstelm.h vr18, t6, 4, 1 + add.d t6, t6, a1 + vstelm.w vr0, t6, 0, 2 + vstelm.h vr18, t6, 4, 2 + add.d t6, t6, a1 + vstelm.w vr0, t6, 0, 3 + vstelm.h vr18, t6, 4, 3 + add.d t6, t6, a1 + + vstelm.w vr1, t6, 0, 0 + vstelm.h vr18, t6, 4, 4 + add.d t6, t6, a1 + vstelm.w vr1, t6, 0, 1 + vstelm.h vr18, t6, 4, 5 + add.d t6, t6, a1 + vstelm.w vr1, t6, 0, 2 + vstelm.h vr18, t6, 4, 6 + add.d t6, t6, a1 + vstelm.w vr1, t6, 0, 3 + vstelm.h vr18, t6, 4, 7 + add.d t6, t6, a1 + + vstelm.w vr2, t6, 0, 0 + vstelm.h vr21, t6, 4, 0 + add.d t6, t6, a1 + vstelm.w vr2, t6, 0, 1 + vstelm.h vr21, t6, 4, 1 + add.d t6, t6, a1 + vstelm.w vr2, t6, 0, 2 + vstelm.h vr21, t6, 4, 2 + add.d t6, t6, a1 + vstelm.w vr2, t6, 0, 3 + vstelm.h vr21, t6, 4, 3 + add.d t6, t6, a1 + + vstelm.w vr3, t6, 0, 0 + vstelm.h vr21, t6, 4, 4 + add.d t6, t6, a1 + vstelm.w vr3, t6, 0, 1 + vstelm.h vr21, t6, 4, 5 + add.d t6, t6, a1 + vstelm.w vr3, t6, 0, 2 + vstelm.h vr21, t6, 4, 6 + add.d t6, t6, a1 + vstelm.w vr3, t6, 0, 3 + vstelm.h vr21, t6, 4, 7 + + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc_x264 +#endif /* !HIGH_BIT_DEPTH */ diff --git a/common/loongarch/deblock.h b/common/loongarch/deblock.h new file mode 100644 index 000000000..452c5a3b0 --- /dev/null +++ b/common/loongarch/deblock.h @@ -0,0 +1,54 @@ +/***************************************************************************** + * deblock.h: loongarch deblock + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Hao Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later 
version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_LOONGARCH_DEBLOCK_H +#define X264_LOONGARCH_DEBLOCK_H + +#if !HIGH_BIT_DEPTH +#define x264_deblock_v_luma_lasx x264_template(deblock_v_luma_lasx) +void x264_deblock_v_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +#define x264_deblock_h_luma_lasx x264_template(deblock_h_luma_lasx) +void x264_deblock_h_luma_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); + +#define x264_deblock_v_luma_intra_lsx x264_template(deblock_v_luma_intra_lsx) +void x264_deblock_v_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta ); +#define x264_deblock_h_luma_intra_lsx x264_template(deblock_h_luma_intra_lsx) +void x264_deblock_h_luma_intra_lsx( uint8_t *pix, intptr_t stride, int alpha, int beta ); + +#define x264_deblock_v_luma_intra_lasx x264_template(deblock_v_luma_intra_lasx) +void x264_deblock_v_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta ); +#define x264_deblock_h_luma_intra_lasx x264_template(deblock_h_luma_intra_lasx) +void x264_deblock_h_luma_intra_lasx( uint8_t *pix, intptr_t stride, int alpha, int beta ); +#define x264_deblock_strength_lsx x264_template(deblock_strength_lsx) +void x264_deblock_strength_lsx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +#define x264_deblock_strength_lasx x264_template(deblock_strength_lasx) +void x264_deblock_strength_lasx( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +#endif + +#endif diff --git a/common/loongarch/loongson_asm.S b/common/loongarch/loongson_asm.S new file mode 100644 index 000000000..356d857d3 --- /dev/null +++ b/common/loongarch/loongson_asm.S @@ -0,0 +1,770 @@ +/********************************************************************* + * Copyright (c) 2022-2024 Loongson Technology Corporation Limited + * Contributed by Xiwei Gu + * Shiyou Yin + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ *********************************************************************/ + +/* + * This file is a LoongArch assembly helper file and is available under the + * ISC license. It provides a large number of macros and aliases to simplify + * writing assembly code, especially for LSX and LASX optimizations. + * + * Anyone can modify it or add new features for their own purposes. + * Contributing a patch will be appreciated as it might be useful for + * others as well. Send patches to the Loongson contributors mentioned above. + * + * MAJOR version: Usage changes, incompatible with previous version. + * MINOR version: Add new macros/functions, or bug fixes. + * MICRO version: Comment changes or implementation changes. + */ + +#define LML_VERSION_MAJOR 0 +#define LML_VERSION_MINOR 4 +#define LML_VERSION_MICRO 0 + +#define ASM_PREF +#define DEFAULT_ALIGN 5 + +/* + *============================================================================ + * Macros for a specific project; set them as needed. + * The following LoongML macros are provided for reference. + *============================================================================ + */ + +.macro function name, align=DEFAULT_ALIGN +.macro endfunc + jirl $r0, $r1, 0x0 + .size ASM_PREF\name, . - ASM_PREF\name + .purgem endfunc +.endm +.text ; +.align \align ; +.globl ASM_PREF\name ; +.type ASM_PREF\name, @function ; +ASM_PREF\name: ; +.endm + +.macro const name, align=DEFAULT_ALIGN + .macro endconst + .size \name, . - \name + .purgem endconst + .endm +.section .rodata +.align \align +\name: +.endm + +/* + *============================================================================ + * LoongArch register aliases + *============================================================================ + */ + +#define a0 $a0 +#define a1 $a1 +#define a2 $a2 +#define a3 $a3 +#define a4 $a4 +#define a5 $a5 +#define a6 $a6 +#define a7 $a7 + +#define t0 $t0 +#define t1 $t1 +#define t2 $t2 +#define t3 $t3 +#define t4 $t4 +#define t5 $t5 +#define t6 $t6 +#define t7 $t7 +#define t8 $t8 + +#define s0 $s0 +#define s1 $s1 +#define s2 $s2 +#define s3 $s3 +#define s4 $s4 +#define s5 $s5 +#define s6 $s6 +#define s7 $s7 +#define s8 $s8 + +#define zero $zero +#define sp $sp +#define ra $ra + +#define fa0 $fa0 +#define fa1 $fa1 +#define fa2 $fa2 +#define fa3 $fa3 +#define fa4 $fa4 +#define fa5 $fa5 +#define fa6 $fa6 +#define fa7 $fa7 +#define ft0 $ft0 +#define ft1 $ft1 +#define ft2 $ft2 +#define ft3 $ft3 +#define ft4 $ft4 +#define ft5 $ft5 +#define ft6 $ft6 +#define ft7 $ft7 +#define ft8 $ft8 +#define ft9 $ft9 +#define ft10 $ft10 +#define ft11 $ft11 +#define ft12 $ft12 +#define ft13 $ft13 +#define ft14 $ft14 +#define ft15 $ft15 +#define fs0 $fs0 +#define fs1 $fs1 +#define fs2 $fs2 +#define fs3 $fs3 +#define fs4 $fs4 +#define fs5 $fs5 +#define fs6 $fs6 +#define fs7 $fs7 + +#define f0 $f0 +#define f1 $f1 +#define f2 $f2 +#define f3 $f3 +#define f4 $f4 +#define f5 $f5 +#define f6 $f6 +#define f7 $f7 +#define f8 $f8 +#define f9 $f9 +#define f10 $f10 +#define f11 $f11 +#define f12 $f12 +#define f13 $f13 +#define f14 $f14 +#define f15 $f15 +#define f16 $f16 +#define f17 $f17 +#define f18 $f18 +#define f19 $f19 +#define f20 $f20 +#define f21 $f21 +#define f22 $f22 +#define f23 $f23 +#define f24 $f24 +#define f25 $f25 +#define f26 $f26 +#define f27 $f27 +#define f28 $f28 +#define f29 $f29 +#define f30 $f30 +#define f31 $f31 + +#define vr0 $vr0 +#define vr1 $vr1 +#define vr2 $vr2 +#define vr3 $vr3 +#define vr4 $vr4 +#define vr5 $vr5 +#define vr6 $vr6 +#define vr7 $vr7 +#define vr8 $vr8 +#define vr9 $vr9 +#define
vr10 $vr10 +#define vr11 $vr11 +#define vr12 $vr12 +#define vr13 $vr13 +#define vr14 $vr14 +#define vr15 $vr15 +#define vr16 $vr16 +#define vr17 $vr17 +#define vr18 $vr18 +#define vr19 $vr19 +#define vr20 $vr20 +#define vr21 $vr21 +#define vr22 $vr22 +#define vr23 $vr23 +#define vr24 $vr24 +#define vr25 $vr25 +#define vr26 $vr26 +#define vr27 $vr27 +#define vr28 $vr28 +#define vr29 $vr29 +#define vr30 $vr30 +#define vr31 $vr31 + +#define xr0 $xr0 +#define xr1 $xr1 +#define xr2 $xr2 +#define xr3 $xr3 +#define xr4 $xr4 +#define xr5 $xr5 +#define xr6 $xr6 +#define xr7 $xr7 +#define xr8 $xr8 +#define xr9 $xr9 +#define xr10 $xr10 +#define xr11 $xr11 +#define xr12 $xr12 +#define xr13 $xr13 +#define xr14 $xr14 +#define xr15 $xr15 +#define xr16 $xr16 +#define xr17 $xr17 +#define xr18 $xr18 +#define xr19 $xr19 +#define xr20 $xr20 +#define xr21 $xr21 +#define xr22 $xr22 +#define xr23 $xr23 +#define xr24 $xr24 +#define xr25 $xr25 +#define xr26 $xr26 +#define xr27 $xr27 +#define xr28 $xr28 +#define xr29 $xr29 +#define xr30 $xr30 +#define xr31 $xr31 + +/* + *============================================================================ + * LSX/LASX synthesize instructions + *============================================================================ + */ + +/* + * Description : Dot product of byte vector elements + * Arguments : Inputs - vj, vk + * Outputs - vd + * Return Type - halfword + */ +.macro vdp2.h.bu vd, vj, vk + vmulwev.h.bu \vd, \vj, \vk + vmaddwod.h.bu \vd, \vj, \vk +.endm + +.macro vdp2.h.bu.b vd, vj, vk + vmulwev.h.bu.b \vd, \vj, \vk + vmaddwod.h.bu.b \vd, \vj, \vk +.endm + +.macro vdp2.w.h vd, vj, vk + vmulwev.w.h \vd, \vj, \vk + vmaddwod.w.h \vd, \vj, \vk +.endm + +.macro xvdp2.h.bu xd, xj, xk + xvmulwev.h.bu \xd, \xj, \xk + xvmaddwod.h.bu \xd, \xj, \xk +.endm + +.macro xvdp2.h.bu.b xd, xj, xk + xvmulwev.h.bu.b \xd, \xj, \xk + xvmaddwod.h.bu.b \xd, \xj, \xk +.endm + +.macro xvdp2.w.h xd, xj, xk + xvmulwev.w.h \xd, \xj, \xk + xvmaddwod.w.h \xd, \xj, \xk +.endm + +/* + * Description : Dot product & addition of halfword vector elements + * Arguments : Inputs - vj, vk + * Outputs - vd + * Return Type - twice size of input + */ +.macro vdp2add.h.bu vd, vj, vk + vmaddwev.h.bu \vd, \vj, \vk + vmaddwod.h.bu \vd, \vj, \vk +.endm + +.macro vdp2add.h.bu.b vd, vj, vk + vmaddwev.h.bu.b \vd, \vj, \vk + vmaddwod.h.bu.b \vd, \vj, \vk +.endm + +.macro vdp2add.w.h vd, vj, vk + vmaddwev.w.h \vd, \vj, \vk + vmaddwod.w.h \vd, \vj, \vk +.endm + +.macro xvdp2add.h.bu.b xd, xj, xk + xvmaddwev.h.bu.b \xd, \xj, \xk + xvmaddwod.h.bu.b \xd, \xj, \xk +.endm + +.macro xvdp2add.w.h xd, xj, xk + xvmaddwev.w.h \xd, \xj, \xk + xvmaddwod.w.h \xd, \xj, \xk +.endm + +/* + * Description : Range element vj[i] to vk[i] ~ vj[i] + * clip: vj > vk ? vj : vk && vj < va ? vj : va + */ +.macro vclip.h vd, vj, vk, va + vmax.h \vd, \vj, \vk + vmin.h \vd, \vd, \va +.endm + +.macro vclip.w vd, vj, vk, va + vmax.w \vd, \vj, \vk + vmin.w \vd, \vd, \va +.endm + +.macro xvclip.h xd, xj, xk, xa + xvmax.h \xd, \xj, \xk + xvmin.h \xd, \xd, \xa +.endm + +.macro xvclip.w xd, xj, xk, xa + xvmax.w \xd, \xj, \xk + xvmin.w \xd, \xd, \xa +.endm + +/* + * Description : Range element vj[i] to 0 ~ 255 + * clip255: vj < 255 ? vj : 255 && vj > 0 ? 
vj : 0 + */ +.macro vclip255.h vd, vj + vmaxi.h \vd, \vj, 0 + vsat.hu \vd, \vd, 7 +.endm + +.macro vclip255.w vd, vj + vmaxi.w \vd, \vj, 0 + vsat.wu \vd, \vd, 7 +.endm + +.macro xvclip255.h xd, xj + xvmaxi.h \xd, \xj, 0 + xvsat.hu \xd, \xd, 7 +.endm + +.macro xvclip255.w xd, xj + xvmaxi.w \xd, \xj, 0 + xvsat.wu \xd, \xd, 7 +.endm + +/* + * Description : Store elements of vector + * vd : Data vector to be stroed + * rk : Address of data storage + * ra : Offset of address + * si : Index of data in vd + */ +.macro vstelmx.b vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.b \vd, \rk, 0, \si +.endm + +.macro vstelmx.h vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.h \vd, \rk, 0, \si +.endm + +.macro vstelmx.w vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.w \vd, \rk, 0, \si +.endm + +.macro vstelmx.d vd, rk, ra, si + add.d \rk, \rk, \ra + vstelm.d \vd, \rk, 0, \si +.endm + +.macro vmov xd, xj + vor.v \xd, \xj, \xj +.endm + +.macro xmov xd, xj + xvor.v \xd, \xj, \xj +.endm + +.macro xvstelmx.d xd, rk, ra, si + add.d \rk, \rk, \ra + xvstelm.d \xd, \rk, 0, \si +.endm + +/* + *============================================================================ + * LSX/LASX custom macros + *============================================================================ + */ + +/* + * Load 4 float, double, V128, v256 elements with stride. + */ +.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + fld.s \out0, \src, 0 + fldx.s \out1, \src, \stride + fldx.s \out2, \src, \stride2 + fldx.s \out3, \src, \stride3 +.endm + +.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + fld.d \out0, \src, 0 + fldx.d \out1, \src, \stride + fldx.d \out2, \src, \stride2 + fldx.d \out3, \src, \stride3 +.endm + +.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + vld \out0, \src, 0 + vldx \out1, \src, \stride + vldx \out2, \src, \stride2 + vldx \out3, \src, \stride3 +.endm + +.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 + xvld \out0, \src, 0 + xvldx \out1, \src, \stride + xvldx \out2, \src, \stride2 + xvldx \out3, \src, \stride3 +.endm + +/* + * Description : Transpose 4x4 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + vilvl.h \tmp0, \in1, \in0 + vilvl.h \tmp1, \in3, \in2 + vilvl.w \out0, \tmp1, \tmp0 + vilvh.w \out2, \tmp1, \tmp0 + vilvh.d \out1, \out0, \out0 + vilvh.d \out3, \out0, \out2 +.endm + +/* + * Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : + * Example : + * 1, 2, 3, 4 1, 5, 9,13 + * 5, 6, 7, 8 to 2, 6,10,14 + * 9,10,11,12 =====> 3, 7,11,15 + * 13,14,15,16 4, 8,12,16 + */ +.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + + vilvl.w \tmp0, \in1, \in0 + vilvh.w \out1, \in1, \in0 + vilvl.w \tmp1, \in3, \in2 + vilvh.w \out3, \in3, \in2 + + vilvl.d \out0, \tmp1, \tmp0 + vilvl.d \out2, \out3, \out1 + vilvh.d \out3, \out3, \out1 + vilvh.d \out1, \tmp1, \tmp0 +.endm + +/* + * Description : Transpose 8x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + */ +.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ + out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \ + tmp3, tmp4, tmp5, 
tmp6, tmp7 + vilvl.h \tmp0, \in6, \in4 + vilvl.h \tmp1, \in7, \in5 + vilvl.h \tmp2, \in2, \in0 + vilvl.h \tmp3, \in3, \in1 + + vilvl.h \tmp4, \tmp1, \tmp0 + vilvh.h \tmp5, \tmp1, \tmp0 + vilvl.h \tmp6, \tmp3, \tmp2 + vilvh.h \tmp7, \tmp3, \tmp2 + + vilvh.h \tmp0, \in6, \in4 + vilvh.h \tmp1, \in7, \in5 + vilvh.h \tmp2, \in2, \in0 + vilvh.h \tmp3, \in3, \in1 + + vpickev.d \out0, \tmp4, \tmp6 + vpickod.d \out1, \tmp4, \tmp6 + vpickev.d \out2, \tmp5, \tmp7 + vpickod.d \out3, \tmp5, \tmp7 + + vilvl.h \tmp4, \tmp1, \tmp0 + vilvh.h \tmp5, \tmp1, \tmp0 + vilvl.h \tmp6, \tmp3, \tmp2 + vilvh.h \tmp7, \tmp3, \tmp2 + + vpickev.d \out4, \tmp4, \tmp6 + vpickod.d \out5, \tmp4, \tmp6 + vpickev.d \out6, \tmp5, \tmp7 + vpickod.d \out7, \tmp5, \tmp7 +.endm + +/* + * Description : Transpose 16x8 block with byte elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + */ +.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7,\ + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 + xvilvl.b \tmp0, \in2, \in0 + xvilvl.b \tmp1, \in3, \in1 + xvilvl.b \tmp2, \in6, \in4 + xvilvl.b \tmp3, \in7, \in5 + xvilvl.b \tmp4, \in10, \in8 + xvilvl.b \tmp5, \in11, \in9 + xvilvl.b \tmp6, \in14, \in12 + xvilvl.b \tmp7, \in15, \in13 + xvilvl.b \out0, \tmp1, \tmp0 + xvilvh.b \out1, \tmp1, \tmp0 + xvilvl.b \out2, \tmp3, \tmp2 + xvilvh.b \out3, \tmp3, \tmp2 + xvilvl.b \out4, \tmp5, \tmp4 + xvilvh.b \out5, \tmp5, \tmp4 + xvilvl.b \out6, \tmp7, \tmp6 + xvilvh.b \out7, \tmp7, \tmp6 + xvilvl.w \tmp0, \out2, \out0 + xvilvh.w \tmp2, \out2, \out0 + xvilvl.w \tmp4, \out3, \out1 + xvilvh.w \tmp6, \out3, \out1 + xvilvl.w \tmp1, \out6, \out4 + xvilvh.w \tmp3, \out6, \out4 + xvilvl.w \tmp5, \out7, \out5 + xvilvh.w \tmp7, \out7, \out5 + xvilvl.d \out0, \tmp1, \tmp0 + xvilvh.d \out1, \tmp1, \tmp0 + xvilvl.d \out2, \tmp3, \tmp2 + xvilvh.d \out3, \tmp3, \tmp2 + xvilvl.d \out4, \tmp5, \tmp4 + xvilvh.d \out5, \tmp5, \tmp4 + xvilvl.d \out6, \tmp7, \tmp6 + xvilvh.d \out7, \tmp7, \tmp6 +.endm + +/* + * Description : Transpose 4x4 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + xvilvl.h \tmp0, \in1, \in0 + xvilvl.h \tmp1, \in3, \in2 + xvilvl.w \out0, \tmp1, \tmp0 + xvilvh.w \out2, \tmp1, \tmp0 + xvilvh.d \out1, \out0, \out0 + xvilvh.d \out3, \out0, \out2 +.endm + +/* + * Description : Transpose 4x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + xvilvl.h \tmp0, \in2, \in0 + xvilvl.h \tmp1, \in3, \in1 + xvilvl.h \out2, \tmp1, \tmp0 + xvilvh.h \out3, \tmp1, \tmp0 + + xvilvl.d \out0, \out2, \out2 + xvilvh.d \out1, \out2, \out2 + xvilvl.d \out2, \out3, \out3 + xvilvh.d \out3, \out3, \out3 +.endm + +/* + * Description : Transpose 8x8 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, out7 + */ +.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7, \ + tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 + xvilvl.h \tmp0, \in6, \in4 + xvilvl.h \tmp1, \in7, \in5 + 
xvilvl.h \tmp2, \in2, \in0 + xvilvl.h \tmp3, \in3, \in1 + + xvilvl.h \tmp4, \tmp1, \tmp0 + xvilvh.h \tmp5, \tmp1, \tmp0 + xvilvl.h \tmp6, \tmp3, \tmp2 + xvilvh.h \tmp7, \tmp3, \tmp2 + + xvilvh.h \tmp0, \in6, \in4 + xvilvh.h \tmp1, \in7, \in5 + xvilvh.h \tmp2, \in2, \in0 + xvilvh.h \tmp3, \in3, \in1 + + xvpickev.d \out0, \tmp4, \tmp6 + xvpickod.d \out1, \tmp4, \tmp6 + xvpickev.d \out2, \tmp5, \tmp7 + xvpickod.d \out3, \tmp5, \tmp7 + + xvilvl.h \tmp4, \tmp1, \tmp0 + xvilvh.h \tmp5, \tmp1, \tmp0 + xvilvl.h \tmp6, \tmp3, \tmp2 + xvilvh.h \tmp7, \tmp3, \tmp2 + + xvpickev.d \out4, \tmp4, \tmp6 + xvpickod.d \out5, \tmp4, \tmp6 + xvpickev.d \out6, \tmp5, \tmp7 + xvpickod.d \out7, \tmp5, \tmp7 +.endm + +/* + * Description : Transpose 2x4x4 block with half-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + */ +.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1, tmp2 + xvilvh.h \tmp1, \in0, \in1 + xvilvl.h \out1, \in0, \in1 + xvilvh.h \tmp0, \in2, \in3 + xvilvl.h \out3, \in2, \in3 + + xvilvh.w \tmp2, \out3, \out1 + xvilvl.w \out3, \out3, \out1 + + xvilvl.w \out2, \tmp0, \tmp1 + xvilvh.w \tmp1, \tmp0, \tmp1 + + xvilvh.d \out0, \out2, \out3 + xvilvl.d \out2, \out2, \out3 + xvilvh.d \out1, \tmp1, \tmp2 + xvilvl.d \out3, \tmp1, \tmp2 +.endm + +/* + * Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : + * Example : + * 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13 + * 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14 + * 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15 + * 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16 + */ +.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + + xvilvl.w \tmp0, \in1, \in0 + xvilvh.w \out1, \in1, \in0 + xvilvl.w \tmp1, \in3, \in2 + xvilvh.w \out3, \in3, \in2 + + xvilvl.d \out0, \tmp1, \tmp0 + xvilvl.d \out2, \out3, \out1 + xvilvh.d \out3, \out3, \out1 + xvilvh.d \out1, \tmp1, \tmp0 +.endm + +/* + * Description : Transpose 8x8 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 + * Outputs - out0, out1, out2, out3, out4, out5, out6, + * _out7 + * Example : LASX_TRANSPOSE8x8_W + * in0 : 1,2,3,4,5,6,7,8 + * in1 : 2,2,3,4,5,6,7,8 + * in2 : 3,2,3,4,5,6,7,8 + * in3 : 4,2,3,4,5,6,7,8 + * in4 : 5,2,3,4,5,6,7,8 + * in5 : 6,2,3,4,5,6,7,8 + * in6 : 7,2,3,4,5,6,7,8 + * in7 : 8,2,3,4,5,6,7,8 + * + * out0 : 1,2,3,4,5,6,7,8 + * out1 : 2,2,2,2,2,2,2,2 + * out2 : 3,3,3,3,3,3,3,3 + * out3 : 4,4,4,4,4,4,4,4 + * out4 : 5,5,5,5,5,5,5,5 + * out5 : 6,6,6,6,6,6,6,6 + * out6 : 7,7,7,7,7,7,7,7 + * out7 : 8,8,8,8,8,8,8,8 + */ +.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\ + out0, out1, out2, out3, out4, out5, out6, out7,\ + tmp0, tmp1, tmp2, tmp3 + xvilvl.w \tmp0, \in2, \in0 + xvilvl.w \tmp1, \in3, \in1 + xvilvh.w \tmp2, \in2, \in0 + xvilvh.w \tmp3, \in3, \in1 + xvilvl.w \out0, \tmp1, \tmp0 + xvilvh.w \out1, \tmp1, \tmp0 + xvilvl.w \out2, \tmp3, \tmp2 + xvilvh.w \out3, \tmp3, \tmp2 + + xvilvl.w \tmp0, \in6, \in4 + xvilvl.w \tmp1, \in7, \in5 + xvilvh.w \tmp2, \in6, \in4 + xvilvh.w \tmp3, \in7, \in5 + xvilvl.w \out4, \tmp1, \tmp0 + xvilvh.w \out5, \tmp1, \tmp0 + xvilvl.w \out6, \tmp3, \tmp2 + xvilvh.w \out7, \tmp3, \tmp2 + + xmov \tmp0, \out0 + xmov \tmp1, \out1 + xmov \tmp2, \out2 + xmov \tmp3, \out3 + xvpermi.q \out0, \out4, 0x02 + xvpermi.q \out1, \out5, 0x02 + xvpermi.q \out2, \out6, 0x02 + xvpermi.q 
\out3, \out7, 0x02 + xvpermi.q \out4, \tmp0, 0x31 + xvpermi.q \out5, \tmp1, 0x31 + xvpermi.q \out6, \tmp2, 0x31 + xvpermi.q \out7, \tmp3, 0x31 +.endm + +/* + * Description : Transpose 4x4 block with double-word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Example : LASX_TRANSPOSE4x4_D + * in0 : 1,2,3,4 + * in1 : 1,2,3,4 + * in2 : 1,2,3,4 + * in3 : 1,2,3,4 + * + * out0 : 1,1,1,1 + * out1 : 2,2,2,2 + * out2 : 3,3,3,3 + * out3 : 4,4,4,4 + */ +.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ + tmp0, tmp1 + xvilvl.d \tmp0, \in1, \in0 + xvilvh.d \out1, \in1, \in0 + xvilvh.d \tmp1, \in3, \in2 + xvilvl.d \out2, \in3, \in2 + + xvor.v \out0, \tmp0, \tmp0 + xvor.v \out3, \tmp1, \tmp1 + + xvpermi.q \out0, \out2, 0x02 + xvpermi.q \out2, \tmp0, 0x31 + xvpermi.q \out3, \out1, 0x31 + xvpermi.q \out1, \tmp1, 0x02 +.endm diff --git a/common/loongarch/loongson_util.S b/common/loongarch/loongson_util.S new file mode 100644 index 000000000..683a1a8a6 --- /dev/null +++ b/common/loongarch/loongson_util.S @@ -0,0 +1,47 @@ +/***************************************************************************** + * loongson_util.S: loongson utility macros + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Shiyou Yin + * Xiwei Gu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) + +/* Set prefix as needed. */ +#define ASM_REF JOIN(JOIN(x264_, BIT_DEPTH), _) + +#define FENC_STRIDE 16 +#define FDEC_STRIDE 32 + +.macro function_x264 name, align=DEFAULT_ALIGN +.macro endfunc_x264 + jirl $r0, $r1, 0x0 + .size ASM_REF\name, . - ASM_REF\name + .purgem endfunc_x264 +.endm +.text ; +.align \align ; +.globl ASM_REF\name ; +.type ASM_REF\name, @function ; +ASM_REF\name: ; +.endm diff --git a/common/loongarch/mc-a.S b/common/loongarch/mc-a.S new file mode 100644 index 000000000..75041f23d --- /dev/null +++ b/common/loongarch/mc-a.S @@ -0,0 +1,2702 @@ +/***************************************************************************** + * mc-a.S: LoongArch motion compensation + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Xiwei Gu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "loongson_asm.S" +#include "loongson_util.S" + +const ch_shuf +.byte 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 +.byte 0, 2, 2, 4, 4, 6, 6, 8, 1, 3, 3, 5, 5, 7, 7, 9 +endconst + +const pw_1024 +.rept 16 +.short 1024 +.endr +endconst + +const filt_mul20 +.rept 32 +.byte 20 +.endr +endconst + +const filt_mul15 +.rept 16 +.byte 1, -5 +.endr +endconst + +const filt_mul51 +.rept 16 +.byte -5, 1 +.endr +endconst + +const hpel_shuf +.rept 2 +.byte 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 +.endr +endconst + +const shuf_12 +.rept 2 +.byte 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 +.endr +endconst + +const shuf_14 +.rept 2 +.byte 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 +.endr +endconst + +const shuf_15 +.rept 2 +.byte 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 +.endr +endconst + +const shuf_1 +.rept 2 +.byte 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 +.endr +endconst + +const shuf_2 +.rept 2 +.byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 +.endr +endconst + +const shuf_3 +.rept 2 +.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 +.endr +endconst + +const shuf_4 +.rept 2 +.byte 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 +.endr +endconst + +const shuf_6 +.rept 2 +.byte 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 +.endr +endconst + +#if !HIGH_BIT_DEPTH + +.macro MC_CHROMA_START + srai.d t0, a5, 3 + srai.d t1, a6, 3 + slli.d t0, t0, 1 + mul.d t1, t1, a4 + add.d t1, t1, t0 + add.d a3, a3, t1 /* src += (m_vy >> 3) * i_src_stride + (m_vx >> 3) * 2 */ +.endm + +/* + * void mc_chroma( uint8_t *p_dst_u, uint8_t *p_dst_v, + * intptr_t i_dst_stride, + * uint8_t *p_src, intptr_t i_src_stride, + * int32_t m_vx, int32_t m_vy, + * int32_t i_width, int32_t i_height ) + */ +function_x264 mc_chroma_lasx + MC_CHROMA_START + andi a5, a5, 0x07 /* m_vx & 0x07 */ + andi a6, a6, 0x07 /* m_vy & 0x07 */ + move t0, a5 + slli.d t0, t0, 8 + sub.d t0, t0, a5 + li.d a5, 8 + addi.d t0, t0, 8 + sub.d a5, a5, a6 + mul.d a6, a6, t0 /* (x * 255 + 8) * y */ + mul.d a5, a5, t0 /* (x * 255 + 8) * (8 - y) */ + xvreplgr2vr.h xr6, a6 /* cD cC ... cD cC */ + xvreplgr2vr.h xr7, a5 /* cB cA ... 
cB cA */ + la.local t0, ch_shuf + xvld xr5, t0, 0 + addi.d t0, a7, -4 + ldptr.w a7, sp, 0 /* a7 = i_height */ + slli.d t1, a4, 1 + blt zero, t0, .L_WIDTH8 +.L_LOOP4: + vld vr0, a3, 0 + vldx vr1, a3, a4 + vldx vr2, a3, t1 + xvpermi.q xr0, xr1, 0x02 + xvpermi.q xr1, xr2, 0x02 + xvshuf.b xr0, xr0, xr0, xr5 + xvshuf.b xr1, xr1, xr1, xr5 + xvdp2.h.bu xr2, xr0, xr7 + xvdp2.h.bu xr3, xr1, xr6 + xvadd.h xr0, xr2, xr3 + xvssrlrni.bu.h xr0, xr0, 6 + xvstelm.w xr0, a0, 0, 0 + xvstelm.w xr0, a1, 0, 1 + add.d a0, a0, a2 + add.d a1, a1, a2 + xvstelm.w xr0, a0, 0, 4 + xvstelm.w xr0, a1, 0, 5 + add.d a0, a0, a2 + add.d a1, a1, a2 + add.d a3, a3, t1 + addi.d a7, a7, -2 + blt zero, a7, .L_LOOP4 + b .ENDFUNC +.L_WIDTH8: + xvld xr0, a3, 0 + xvpermi.d xr0, xr0, 0x94 + xvshuf.b xr0, xr0, xr0, xr5 +.L_LOOP8: + xvldx xr3, a3, a4 + xvpermi.d xr3, xr3, 0x94 + xvshuf.b xr3, xr3, xr3, xr5 + xvdp2.h.bu xr1, xr0, xr7 + xvdp2.h.bu xr2, xr3, xr6 + xvdp2.h.bu xr8, xr3, xr7 + + xvldx xr0, a3, t1 + xvpermi.d xr0, xr0, 0x94 + xvshuf.b xr0, xr0, xr0, xr5 + xvdp2.h.bu xr4, xr0, xr6 + xvadd.h xr1, xr1, xr2 + xvadd.h xr3, xr8, xr4 + + xvssrlrni.bu.h xr3, xr1, 6 + + xvpermi.q xr4, xr3, 0x01 + xvpackev.w xr8, xr4, xr3 + xvpackod.w xr9, xr4, xr3 + vstelm.d vr8, a0, 0, 0 + vstelm.d vr9, a1, 0, 0 + add.d a0, a0, a2 + add.d a1, a1, a2 + vstelm.d vr8, a0, 0, 1 + vstelm.d vr9, a1, 0, 1 + + addi.d a7, a7, -2 + add.d a0, a0, a2 + add.d a1, a1, a2 + add.d a3, a3, t1 + blt zero, a7, .L_LOOP8 +.ENDFUNC: +endfunc_x264 + +.macro PIXEL_AVG_START + slli.d t0, a3, 1 + add.w t1, t0, a3 + slli.d t2, a3, 2 + slli.d t3, a5, 1 + add.w t4, t3, a5 + slli.d t5, a5, 2 + slli.d t6, a1, 1 + add.w t7, t6, a1 + slli.d t8, a1, 2 +.endm + +.macro BIWEIGHT_AVG_START + addi.d t0, zero, 64 + sub.d t0, t0, a6 + xvreplgr2vr.b xr0, a6 + xvreplgr2vr.b xr1, t0 + xvpackev.b xr8, xr1, xr0 + xvxor.v xr9, xr9, xr9 + xvaddi.hu xr9, xr9, 6 +.endm + +.macro BIWEIGHT_AVG_CORE a, b + xvpermi.d \a, \a, 0x50 + xvpermi.d \b, \b, 0x50 + xvilvl.b \a, \b, \a + xvmulwev.h.bu.b \b, \a, xr8 + xvmaddwod.h.bu.b \b, \a, xr8 + xvssrarn.bu.h \b, \b, xr9 + xvpermi.d \b, \b, 0x08 +.endm + +.macro PIXEL_AVG_START_W8 + slli.d t0, a3, 1 + add.w t1, t0, a3 + slli.d t3, a5, 1 + add.w t4, t3, a5 +.endm + +function_x264 pixel_avg_weight_w4_lasx + addi.d t0, zero, 64 + sub.d t0, t0, a6 + vreplgr2vr.b vr0, a6 + vreplgr2vr.b vr1, t0 + vpackev.b vr8, vr1, vr0 +.LOOP_HEIGHT_W4_1: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fld.s f2, a4, 0 + fldx.s f3, a4, a5 + vilvl.w vr0, vr1, vr0 + vilvl.w vr2, vr3, vr2 + vilvl.b vr0, vr2, vr0 + vmulwev.h.bu.b vr1, vr0, vr8 + vmaddwod.h.bu.b vr1, vr0, vr8 + vssrarni.bu.h vr1, vr1, 6 + fst.s f1, a0, 0 + add.d a0, a0, a1 + vstelm.w vr1, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a4, a5, a4, 1 + addi.w a7, a7, -2 + bnez a7, .LOOP_HEIGHT_W4_1 +endfunc_x264 + +function_x264 pixel_avg_w4_lasx +.LOOP_HEIGHT_W4: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fld.s f4, a4, 0 + fldx.s f5, a4, a5 + vilvl.w vr0, vr1, vr0 + vilvl.w vr4, vr5, vr4 + vavgr.bu vr0, vr0, vr4 + fst.s f0, a0, 0 + add.d a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a4, a5, a4, 1 + addi.w a7, a7, -2 + bnez a7, .LOOP_HEIGHT_W4 +endfunc_x264 + +function_x264 pixel_avg_weight_w8_lasx + addi.d t0, zero, 64 + sub.d t0, t0, a6 + xvreplgr2vr.b xr0, a6 + xvreplgr2vr.b xr1, t0 + xvpackev.b xr8, xr1, xr0 + PIXEL_AVG_START_W8 +.LOOP_HEIGHT_W8_1: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fldx.d f2, a2, t0 + fldx.d f3, a2, t1 + fld.d f4, a4, 0 + fldx.d f5, a4, a5 + fldx.d f6, 
a4, t3 + fldx.d f7, a4, t4 + vilvl.b vr0, vr4, vr0 + vilvl.b vr1, vr5, vr1 + vilvl.b vr2, vr6, vr2 + vilvl.b vr3, vr7, vr3 + xvpermi.q xr1, xr0, 0x20 + xvpermi.q xr3, xr2, 0x20 + xvmulwev.h.bu.b xr2, xr1, xr8 + xvmaddwod.h.bu.b xr2, xr1, xr8 + xvmulwev.h.bu.b xr4, xr3, xr8 + xvmaddwod.h.bu.b xr4, xr3, xr8 + xvssrarni.bu.h xr4, xr2, 6 + fst.d f4, a0, 0 + add.d a0, a0, a1 + xvstelm.d xr4, a0, 0, 2 + add.d a0, a0, a1 + xvstelm.d xr4, a0, 0, 1 + add.d a0, a0, a1 + xvstelm.d xr4, a0, 0, 3 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 2 + alsl.d a4, a5, a4, 2 + addi.w a7, a7, -4 + bnez a7, .LOOP_HEIGHT_W8_1 +endfunc_x264 + +function_x264 pixel_avg_w8_lasx + PIXEL_AVG_START_W8 +.LOOP_HEIGHT_W8: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fldx.d f2, a2, t0 + fldx.d f3, a2, t1 + fld.d f4, a4, 0 + fldx.d f5, a4, a5 + fldx.d f6, a4, t3 + fldx.d f7, a4, t4 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr6, vr7, vr6 + vavgr.bu vr0, vr0, vr4 + vavgr.bu vr2, vr2, vr6 + fst.d f0, a0, 0 + add.d a0, a0, a1 + vstelm.d vr0, a0, 0, 1 + fstx.d f2, a0, a1 + alsl.d a0, a1, a0, 1 + vstelm.d vr2, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 2 + alsl.d a4, a5, a4, 2 + addi.w a7, a7, -4 + bnez a7, .LOOP_HEIGHT_W8 +endfunc_x264 + +function_x264 pixel_avg_weight_w16_lasx + BIWEIGHT_AVG_START + PIXEL_AVG_START +.L_HEIGHT_LOOP_T: + LSX_LOADX_4 a2, a3, t0, t1, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a4, a5, t3, t4, vr4, vr5, vr6, vr7 + BIWEIGHT_AVG_CORE xr0, xr4 + BIWEIGHT_AVG_CORE xr1, xr5 + vst vr4, a0, 0 + vstx vr5, a0, a1 + BIWEIGHT_AVG_CORE xr2, xr6 + BIWEIGHT_AVG_CORE xr3, xr7 + vstx vr6, a0, t6 + vstx vr7, a0, t7 + add.d a2, a2, t2 + add.d a4, a4, t5 + add.d a0, a0, t8 + addi.d a7, a7, -4 + bnez a7, .L_HEIGHT_LOOP_T +endfunc_x264 + +function_x264 pixel_avg_w16_lasx + PIXEL_AVG_START +.L_HEIGHT_LOOP: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t0 + vldx vr3, a2, t1 + vld vr4, a4, 0 + vldx vr5, a4, a5 + vldx vr6, a4, t3 + vldx vr7, a4, t4 + vavgr.bu vr0, vr0, vr4 + vavgr.bu vr1, vr1, vr5 + vavgr.bu vr2, vr2, vr6 + vavgr.bu vr3, vr3, vr7 + vst vr0, a0, 0 + vstx vr1, a0, a1 + vstx vr2, a0, t6 + vstx vr3, a0, t7 + add.d a0, a0, t8 + add.d a2, a2, t2 + add.d a4, a4, t5 + + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t0 + vldx vr3, a2, t1 + vld vr4, a4, 0 + vldx vr5, a4, a5 + vldx vr6, a4, t3 + vldx vr7, a4, t4 + vavgr.bu vr0, vr0, vr4 + vavgr.bu vr1, vr1, vr5 + vavgr.bu vr2, vr2, vr6 + vavgr.bu vr3, vr3, vr7 + vst vr0, a0, 0 + vstx vr1, a0, a1 + vstx vr2, a0, t6 + vstx vr3, a0, t7 + add.d a2, a2, t2 + add.d a4, a4, t5 + add.d a0, a0, t8 + addi.d a7, a7, -8 + bnez a7, .L_HEIGHT_LOOP +endfunc_x264 + +.macro FILT_PACK_LASX s1, s2, s3 + xvmulwev.w.h xr16, \s1, \s3 + xvmulwev.w.h xr17, \s2, \s3 + xvsrarni.h.w xr17, xr16, 15 + xvmaxi.h xr17, xr17, 0 + xvsat.hu xr17, xr17, 7 + xvmulwod.w.h xr18, \s1, \s3 + xvmulwod.w.h xr19, \s2, \s3 + xvsrarni.h.w xr19, xr18, 15 + xvmaxi.h xr19, xr19, 0 + xvsat.hu xr19, xr19, 7 + xvpackev.b \s1, xr19, xr17 +.endm + +/* s3: temp, s4: UNUSED, s5: imm */ +.macro DO_FILT_V_LASX s1, s2, s3, s4, s5 + alsl.d t1, a2, a1, 1 /* t1 = a1 + 2 * a2 */ + alsl.d t2, a2, a3, 1 /* t2 = a3 + 2 * a2 */ + xvld xr1, a3, 0 + xvldx xr2, a3, a2 + xvld \s3, t2, 0 + xvld xr3, a1, 0 + xvldx \s1, a1, a2 + xvld \s2, t1, 0 + xvilvh.b xr16, xr2, xr1 + xvilvl.b xr17, xr2, xr1 + xvilvh.b xr18, \s2, \s1 + xvilvl.b xr19, \s2, \s1 + xvilvh.b xr20, \s3, xr3 + xvilvl.b xr21, \s3, xr3 + xvdp2.h.bu.b xr1, xr17, xr12 + xvdp2.h.bu.b xr4, xr16, xr12 + xvdp2.h.bu.b \s1, xr19, xr0 + xvdp2.h.bu.b xr2, 
xr18, xr0 + xvdp2.h.bu.b xr3, xr21, xr14 + xvdp2.h.bu.b \s2, xr20, xr14 + xvadd.h xr1, xr1, \s1 + xvadd.h xr4, xr4, xr2 + xvadd.h xr1, xr1, xr3 + xvadd.h xr4, xr4, \s2 + xmov \s1, xr1 + xmov \s2, xr1 + addi.d a3, a3, 32 + addi.d a1, a1, 32 + xvpermi.q \s1, xr4, 0x2 + xvpermi.q \s2, xr4, 0x13 + FILT_PACK_LASX xr1, xr4, xr15 + addi.d t1, a4, \s5 + xvstx xr1, t0, t1 +.endm + +.macro FILT_H s1, s2, s3 + xvsub.h \s1, \s1, \s2 + xvsrai.h \s1, \s1, 2 + xvsub.h \s1, \s1, \s2 + xvadd.h \s1, \s1, \s3 + xvsrai.h \s1, \s1, 2 + xvadd.h \s1, \s1, \s3 +.endm + +.macro FILT_C s1, s2, s3 + xmov xr3, \s1 + xvpermi.q xr3, \s2, 0x03 + xvshuf.b xr1, \s2, xr3, xr23 + xvshuf.b xr2, \s2, xr3, xr24 + xmov \s1, \s2 + xvpermi.q \s1, \s3, 0x03 + xvshuf.b xr3, \s1, \s2, xr29 + xvshuf.b xr4, \s1, \s2, xr27 + xvadd.h xr3, xr2, xr3 + xmov xr2, \s1 + xmov \s1, \s3 + xvshuf.b \s3, xr2, \s2, xr30 + xvadd.h xr4, xr4, \s2 + xvadd.h \s3, \s3, xr1 + FILT_H \s3, xr3, xr4 +.endm + +.macro DO_FILT_C_LASX s1, s2, s3, s4 + FILT_C \s1, \s2, \s3 + FILT_C \s2, \s1, \s4 + FILT_PACK_LASX \s3, \s4, xr15 + xvpermi.d \s3, \s3, 0xd8 + xvstx \s3, a5, a4 +.endm + +.macro DO_FILT_H_LASX s1, s2, s3 + xmov xr3, \s1 + xvpermi.q xr3, \s2, 0x03 + xvshuf.b xr1, \s2, xr3, xr24 + xvshuf.b xr2, \s2, xr3, xr25 + xmov xr3, \s2 + xvpermi.q xr3, \s3, 0x03 + xvshuf.b xr4, xr3, \s2, xr26 + xvshuf.b xr5, xr3, \s2, xr27 + xvshuf.b xr6, xr3, \s2, xr28 + xmov \s1, \s2 + xvdp2.h.bu.b xr16, xr1, xr12 + xvdp2.h.bu.b xr17, xr2, xr12 + xvdp2.h.bu.b xr18, \s2, xr14 + xvdp2.h.bu.b xr19, xr4, xr14 + xvdp2.h.bu.b xr20, xr5, xr0 + xvdp2.h.bu.b xr21, xr6, xr0 + xvadd.h xr1, xr16, xr18 + xvadd.h xr2, xr17, xr19 + xvadd.h xr1, xr1, xr20 + xvadd.h xr2, xr2, xr21 + FILT_PACK_LASX xr1, xr2, xr15 + xvshuf.b xr1, xr1, xr1, xr22 + xvstx xr1, a0, a4 + xmov \s2, \s3 +.endm + +/* + * void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, + * uint8_t *src, intptr_t stride, int width, int height ) + */ +function_x264 hpel_filter_lasx + addi.d sp, sp, -56 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + + move a7, a3 + addi.d a5, a5, -32 + move t0, a1 + andi a7, a7, 31 + sub.d a3, a3, a7 + add.d a0, a0, a5 + add.d t0, t0, a5 + add.d a7, a7, a5 + add.d a5, a5, a2 + move a2, a4 + sub.d a7, zero, a7 + add.d a1, a3, a2 + sub.d a3, a3, a2 + sub.d a3, a3, a2 + move a4, a7 + la.local t1, filt_mul51 + xvld xr0, t1, 0 + la.local t2, filt_mul15 + xvld xr12, t2, 0 + la.local t3, filt_mul20 + xvld xr14, t3, 0 + la.local t4, pw_1024 + xvld xr15, t4, 0 + la.local t1, hpel_shuf + xvld xr22, t1, 0 + la.local t2, shuf_12 + xvld xr23, t2, 0 + la.local t3, shuf_1 + xvld xr26, t3, 0 + xvaddi.bu xr24, xr23, 2 /* shuf_14 */ + xvaddi.bu xr25, xr23, 3 /* shuf_15 */ + xvaddi.bu xr27, xr26, 1 /* shuf_2 */ + xvaddi.bu xr28, xr26, 2 /* shuf_3 */ + xvaddi.bu xr29, xr26, 3 /* shuf_4 */ + xvaddi.bu xr30, xr26, 5 /* shuf_6 */ + xvxor.v xr9, xr9, xr9 + xvxor.v xr10, xr10, xr10 +.LOOPY: + DO_FILT_V_LASX xr8, xr7, xr13, xr12, 0 +.LOOPX: + DO_FILT_V_LASX xr6, xr5, xr11, xr12, 32 +.LASTX: + xvsrli.h xr15, xr15, 1 + DO_FILT_C_LASX xr9, xr8, xr7, xr6 + xvadd.h xr15, xr15, xr15 + xmov xr7, xr5 + DO_FILT_H_LASX xr10, xr13, xr11 + addi.d a4, a4, 32 + blt a4, zero, .LOOPX + addi.d t1, a4, -32 + blt t1, zero, .LASTX + //setup regs for next y + sub.d a4, a4, a7 + sub.d a4, a4, a2 + sub.d a1, a1, a4 + sub.d a3, a3, a4 + add.d a0, a0, a2 + add.d t0, t0, a2 + add.d a5, a5, a2 + move a4, a7 + addi.d a6, a6, -1 + blt zero, a6, .LOOPY + 
fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + addi.d sp, sp, 56 +endfunc_x264 + +/* + * void pixel_avg_wxh(pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, + * pixel *src2, intptr_t src2_stride, int weight); + */ +.macro PIXEL_AVG w, h +function_x264 pixel_avg_\w\()x\h\()_lasx + addi.d t0, a6, -32 + addi.d a7, zero, \h + bne t0, zero, x264_8_pixel_avg_weight_w\w\()_lasx + b x264_8_pixel_avg_w\w\()_lasx +endfunc_x264 +.endm + +PIXEL_AVG 16, 8 +PIXEL_AVG 8, 16 +PIXEL_AVG 8, 8 +PIXEL_AVG 8, 4 +PIXEL_AVG 4, 16 +PIXEL_AVG 4, 8 +PIXEL_AVG 4, 4 +PIXEL_AVG 4, 2 + +function_x264 mc_weight_w20_noden_lasx + xvldrepl.h xr1, a4, 40 // offset + xvldrepl.b xr0, a4, 36 // scale +.LOOP_WEIGHTW20_NODEN: + xvld xr3, a2, 0 + xvldx xr4, a2, a3 + xvmulwev.h.bu.b xr7, xr3, xr0 + xvmulwev.h.bu.b xr8, xr4, xr0 + xvmulwod.h.bu.b xr3, xr3, xr0 + xvmulwod.h.bu.b xr4, xr4, xr0 + xvadd.h xr7, xr7, xr1 + xvadd.h xr8, xr8, xr1 + xvadd.h xr3, xr3, xr1 + xvadd.h xr4, xr4, xr1 + xvssrarni.bu.h xr8, xr7, 0 + xvssrarni.bu.h xr4, xr3, 0 + xvilvl.b xr3, xr4, xr8 + xvilvh.b xr4, xr4, xr8 + vst vr3, a0, 0 + xvstelm.w xr3, a0, 16, 4 + add.d a0, a0, a1 + vst vr4, a0, 0 + xvstelm.w xr4, a0, 16, 4 + alsl.d a2, a3, a2, 1 + add.d a0, a0, a1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHTW20_NODEN +endfunc_x264 + +function_x264 mc_weight_w16_noden_lasx + xvldrepl.h xr1, a4, 40 // offset + xvldrepl.h xr0, a4, 36 // scale +.LOOP_WEIGHTW16_NODEN: + vld vr3, a2, 0 + vldx vr4, a2, a3 + vext2xv.hu.bu xr3, xr3 + vext2xv.hu.bu xr4, xr4 + xvmul.h xr3, xr3, xr0 + xvmul.h xr4, xr4, xr0 + xvadd.h xr3, xr3, xr1 + xvadd.h xr4, xr4, xr1 + xvssrarni.bu.h xr4, xr3, 0 + xvpermi.d xr3, xr4, 8 + xvpermi.d xr4, xr4, 13 + vst vr3, a0, 0 + vstx vr4, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHTW16_NODEN +endfunc_x264 + +function_x264 mc_weight_w8_noden_lasx + xvldrepl.h xr1, a4, 40 // offset + xvldrepl.h xr0, a4, 36 // scale +.LOOP_WEIGHTW8_NODEN: + fld.d f3, a2, 0 + fldx.d f4, a2, a3 + vilvl.d vr3, vr4, vr3 + vext2xv.hu.bu xr3, xr3 + xvmul.h xr3, xr3, xr0 + xvadd.h xr3, xr3, xr1 + xvssrarni.bu.h xr3, xr3, 0 + xvstelm.d xr3, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr3, a0, 0, 2 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHTW8_NODEN +endfunc_x264 + +function_x264 mc_weight_w4_noden_lasx + xvldrepl.h xr1, a4, 40 // offset + xvldrepl.h xr0, a4, 36 // scale +.LOOP_WEIGHTW4_NODEN: + fld.s f3, a2, 0 + fldx.s f4, a2, a3 + vilvl.w vr3, vr4, vr3 + vext2xv.hu.bu xr3, xr3 + xvmul.h xr3, xr3, xr0 + xvadd.h xr3, xr3, xr1 + xvssrarni.bu.h xr3, xr3, 0 + xvstelm.w xr3, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.w xr3, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHTW4_NODEN +endfunc_x264 + +function_x264 mc_weight_w20_lasx + xvldrepl.h xr1, a4, 40 // offset + xvldrepl.b xr0, a4, 36 // scale + xvldrepl.h xr2, a4, 32 // denom + xvsll.h xr1, xr1, xr2 +.LOOP_WEIGHTW20: + xvld xr3, a2, 0 + xvldx xr4, a2, a3 + xvmulwev.h.bu.b xr7, xr3, xr0 + xvmulwev.h.bu.b xr8, xr4, xr0 + xvmulwod.h.bu.b xr3, xr3, xr0 + xvmulwod.h.bu.b xr4, xr4, xr0 + xvsadd.h xr7, xr7, xr1 + xvsadd.h xr8, xr8, xr1 + xvsadd.h xr3, xr3, xr1 + xvsadd.h xr4, xr4, xr1 + xvssrarn.bu.h xr7, xr7, xr2 + xvssrarn.bu.h xr8, xr8, xr2 + xvssrarn.bu.h xr3, xr3, xr2 + xvssrarn.bu.h xr4, xr4, xr2 + xvilvl.b xr3, xr3, xr7 + xvilvl.b xr4, xr4, xr8 + vst vr3, a0, 0 + xvstelm.w 
xr3, a0, 16, 4 + add.d a0, a0, a1 + vst vr4, a0, 0 + xvstelm.w xr4, a0, 16, 4 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHTW20 +endfunc_x264 + +function_x264 mc_weight_w16_lasx + xvldrepl.h xr1, a4, 40 // offset + xvldrepl.h xr0, a4, 36 // scale + xvldrepl.h xr2, a4, 32 // denom + xvsll.h xr1, xr1, xr2 +.LOOP_WEIGHTW16: + vld vr3, a2, 0 + vldx vr4, a2, a3 + vext2xv.hu.bu xr3, xr3 + vext2xv.hu.bu xr4, xr4 + xvmul.h xr3, xr3, xr0 + xvmul.h xr4, xr4, xr0 + xvsadd.h xr3, xr3, xr1 + xvsadd.h xr4, xr4, xr1 + xvssrarn.bu.h xr3, xr3, xr2 + xvssrarn.bu.h xr4, xr4, xr2 + xvpermi.d xr3, xr3, 8 + xvpermi.d xr4, xr4, 8 + vst vr3, a0, 0 + vstx vr4, a0, a1 + alsl.d a0, a1, a0, 1 + alsl.d a2, a3, a2, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHTW16 +endfunc_x264 + +function_x264 mc_weight_w8_lasx + xvldrepl.h xr1, a4, 40 // offset + xvldrepl.h xr0, a4, 36 // scale + xvldrepl.h xr2, a4, 32 // denom + xvsll.h xr1, xr1, xr2 +.LOOP_WEIGHTW8: + fld.d f3, a2, 0 + fldx.d f4, a2, a3 + vilvl.d vr3, vr4, vr3 + vext2xv.hu.bu xr3, xr3 + xvmul.h xr3, xr3, xr0 + xvsadd.h xr3, xr3, xr1 + xvssrarn.bu.h xr3, xr3, xr2 + xvstelm.d xr3, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.d xr3, a0, 0, 2 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHTW8 +endfunc_x264 + +function_x264 mc_weight_w4_lasx + xvldrepl.h xr1, a4, 40 // offset + xvldrepl.h xr0, a4, 36 // scale + xvldrepl.h xr2, a4, 32 // denom + xvsll.h xr1, xr1, xr2 +.LOOP_WEIGHTW4: + fld.s f3, a2, 0 + fldx.s f4, a2, a3 + vilvl.w vr3, vr4, vr3 + vext2xv.hu.bu xr3, xr3 + xvmul.h xr3, xr3, xr0 + xvsadd.h xr3, xr3, xr1 + xvssrarn.bu.h xr3, xr3, xr2 + xvstelm.w xr3, a0, 0, 0 + add.d a0, a0, a1 + xvstelm.w xr3, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHTW4 +endfunc_x264 + +/* + * void x264_pixel_avg2_w4(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, + * intptr_t i_src_stride, uint8_t *src2, int i_height) + */ +function_x264 pixel_avg2_w4_lasx +.avg2w4_loop_2: + addi.d a5, a5, -2 + fld.s f0, a2, 0 + fld.s f1, a4, 0 + fldx.s f2, a2, a3 + fldx.s f3, a4, a3 + alsl.d a2, a3, a2, 1 + alsl.d a4, a3, a4, 1 + vavgr.bu vr0, vr0, vr1 + vavgr.bu vr1, vr2, vr3 + fst.s f0, a0, 0 + fstx.s f1, a0, a1 + alsl.d a0, a1, a0, 1 + blt zero, a5, .avg2w4_loop_2 +endfunc_x264 + +/* + * void x264_pixel_avg2_w8(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, + * intptr_t i_src_stride, uint8_t *src2, int i_height) + */ +function_x264 pixel_avg2_w8_lasx +.avg2w8_loop_2: + addi.d a5, a5, -2 + fld.d f0, a2, 0 + fld.d f1, a4, 0 + fldx.d f2, a2, a3 + fldx.d f3, a4, a3 + alsl.d a2, a3, a2, 1 + alsl.d a4, a3, a4, 1 + vavgr.bu vr0, vr0, vr1 + vavgr.bu vr1, vr2, vr3 + fst.d f0, a0, 0 + fstx.d f1, a0, a1 + alsl.d a0, a1, a0, 1 + blt zero, a5, .avg2w8_loop_2 +endfunc_x264 + +/* + * void x264_pixel_avg2_w16(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, + * intptr_t i_src_stride, uint8_t *src2, int i_height) + */ +function_x264 pixel_avg2_w16_lasx +.avg2w16_loop_2: + addi.d a5, a5, -2 + vld vr0, a2, 0 + vldx vr1, a2, a3 + vld vr2, a4, 0 + vldx vr3, a4, a3 + alsl.d a2, a3, a2, 1 + alsl.d a4, a3, a4, 1 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vst vr0, a0, 0 + vstx vr1, a0, a1 + alsl.d a0, a1, a0, 1 + blt zero, a5, .avg2w16_loop_2 +endfunc_x264 + +/* + * void x264_pixel_avg2_w20(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, + * intptr_t i_src_stride, uint8_t *src2, int i_height) + */ +function_x264 pixel_avg2_w20_lasx +.avg2w20_loop_2: + addi.d a5, 
a5, -2 + xvld xr0, a2, 0 + xvldx xr1, a2, a3 + xvld xr2, a4, 0 + xvldx xr3, a4, a3 + alsl.d a2, a3, a2, 1 + alsl.d a4, a3, a4, 1 + xvavgr.bu xr0, xr0, xr2 + xvavgr.bu xr1, xr1, xr3 + vst vr0, a0, 0 + xvstelm.w xr0, a0, 16, 4 + add.d a0, a0, a1 + vst vr1, a0, 0 + xvstelm.w xr1, a0, 16, 4 + add.d a0, a0, a1 + blt zero, a5, .avg2w20_loop_2 +endfunc_x264 + +/* + * void mc_copy_width16( uint8_t *p_dst, int32_t i_dst_stride, + * uint8_t *p_src, int32_t i_src_stride, + * int32_t i_height ) + */ +function_x264 mc_copy_w16_lasx + slli.d t0, a3, 1 + add.d t1, t0, a3 + slli.d t2, a1, 1 + add.d t3, t2, a1 +.LOOP_COPYW16: + vld vr1, a2, 0 + vldx vr2, a2, a3 + vldx vr3, a2, t0 + vldx vr4, a2, t1 + + vst vr1, a0, 0 + vstx vr2, a0, a1 + vstx vr3, a0, t2 + vstx vr4, a0, t3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + addi.w a4, a4, -4 + blt zero, a4, .LOOP_COPYW16 +endfunc_x264 + +/* + * void mc_copy_w8( uint8_t *p_dst, intptr_t i_dst_stride, + * uint8_t *p_src, intptr_t i_src_stride, + * int32_t i_height ) + */ +function_x264 mc_copy_w8_lasx + slli.d t0, a3, 1 + add.d t1, t0, a3 + slli.d t2, a1, 1 + add.d t3, t2, a1 +.LOOP_COPYW8: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fldx.d f2, a2, t0 + fldx.d f3, a2, t1 + + fst.d f0, a0, 0 + fstx.d f1, a0, a1 + fstx.d f2, a0, t2 + fstx.d f3, a0, t3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + addi.w a4, a4, -4 + blt zero, a4, .LOOP_COPYW8 +endfunc_x264 + +/* + * void mc_copy_w4( uint8_t *p_dst, intptr_t i_dst_stride, + * uint8_t *p_src, intptr_t i_src_stride, + * int32_t i_height ) + */ +function_x264 mc_copy_w4_lasx + slli.d t0, a3, 1 + add.d t1, t0, a3 + slli.d t2, a1, 1 + add.d t3, t2, a1 +.LOOP_COPYW4: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fldx.s f2, a2, t0 + fldx.s f3, a2, t1 + + fst.s f0, a0, 0 + fstx.s f1, a0, a1 + fstx.s f2, a0, t2 + fstx.s f3, a0, t3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + addi.w a4, a4, -4 + blt zero, a4, .LOOP_COPYW4 +endfunc_x264 + +/* + * void memzero_aligned( void *p_dst, size_t n ) + */ +function_x264 memzero_aligned_lasx + xvxor.v xr1, xr1, xr1 +.memzero_loop: + addi.d a1, a1, -128 +.rept 4 + xvst xr1, a0, 0 + addi.d a0, a0, 32 +.endr + blt zero, a1, .memzero_loop +endfunc_x264 + +/* + * void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, + * pixel *dstv, pixel *dstc, intptr_t src_stride, + * intptr_t dst_stride, int width, int height ) + */ +function_x264 frame_init_lowres_core_lasx + andi t1, a7, 15 + sub.w t0, a7, t1 + slli.d t2, a5, 1 + ldptr.w a7, sp, 0 // use a7 as height variable + +.height_loop: + add.d t4, zero, t0 + addi.d t3, a0, 0 + addi.d t5, a1, 0 + addi.d t6, a2, 0 + addi.d t7, a3, 0 + addi.d t8, a4, 0 +.width16_loop: + xvld xr0, t3, 0 + xvldx xr1, t3, a5 + xvldx xr2, t3, t2 + xvavgr.bu xr3, xr0, xr1 + xvavgr.bu xr4, xr1, xr2 + xvhaddw.hu.bu xr5, xr3, xr3 + xvhaddw.hu.bu xr6, xr4, xr4 + xvssrarni.bu.h xr6, xr5, 1 + xvpermi.d xr7, xr6, 0xd8 + vst vr7, t5, 0 + xvpermi.q xr7, xr7, 0x11 + vst vr7, t7, 0 + + addi.d t3, t3, 1 + xvld xr0, t3, 0 + xvldx xr1, t3, a5 + xvldx xr2, t3, t2 + xvavgr.bu xr3, xr0, xr1 + xvavgr.bu xr4, xr1, xr2 + xvhaddw.hu.bu xr5, xr3, xr3 + xvhaddw.hu.bu xr6, xr4, xr4 + xvssrarni.bu.h xr6, xr5, 1 + xvpermi.d xr7, xr6, 0xd8 + vst vr7, t6, 0 + xvpermi.q xr7, xr7, 0x11 + vst vr7, t8, 0 + addi.d t3, t3, 31 + addi.d t5, t5, 16 + addi.d t6, t6, 16 + addi.d t7, t7, 16 + addi.d t8, t8, 16 + addi.w t4, t4, -16 + blt zero, t4, .width16_loop + + beqz t1, .width16_end + vld vr0, t3, 0 + vldx vr1, t3, a5 + vldx vr2, t3, t2 + vavgr.bu vr3, vr0, vr1 + vavgr.bu vr4, vr1, vr2 + 
vhaddw.hu.bu vr5, vr3, vr3 + vhaddw.hu.bu vr6, vr4, vr4 + vssrarni.bu.h vr6, vr5, 1 + fst.d f6, t5, 0 + vstelm.d vr6, t7, 0, 1 + + addi.d t3, t3, 1 + vld vr0, t3, 0 + vldx vr1, t3, a5 + vldx vr2, t3, t2 + vavgr.bu vr3, vr0, vr1 + vavgr.bu vr4, vr1, vr2 + vhaddw.hu.bu vr5, vr3, vr3 + vhaddw.hu.bu vr6, vr4, vr4 + vssrarni.bu.h vr6, vr5, 1 + fst.d f6, t6, 0 + vstelm.d vr6, t8, 0, 1 + +.width16_end: + add.d a0, a0, t2 + add.d a1, a1, a6 + add.d a2, a2, a6 + add.d a3, a3, a6 + add.d a4, a4, a6 + addi.w a7, a7, -1 + blt zero, a7, .height_loop +endfunc_x264 + +/* + * void mc_chroma(uint8_t *p_dst_u, uint8_t *p_dst_v, + * intptr_t i_dst_stride, + * uint8_t *p_src, intptr_t i_src_stride, + * int32_t m_vx, int32_t m_vy, + * int32_t i_width, int32_t i_height) + */ + +function_x264 mc_chroma_lsx + MC_CHROMA_START + andi a5, a5, 0x07 /* m_vx & 0x07 */ + andi a6, a6, 0x07 /* m_vy & 0x07 */ + li.d t8, 8 + sub.d t1, t8, a5 // 8-d8x + sub.d t2, t8, a6 // 8-d8y + mul.d t3, t1, t2 // CA + mul.d t4, a5, t2 // CB + mul.d t5, t1, a6 // CC + mul.d t6, a5, a6 // CD + vreplgr2vr.b vr0, t3 + vreplgr2vr.b vr1, t4 + vreplgr2vr.b vr2, t5 + vreplgr2vr.b vr3, t6 + + add.d t0, a3, a4 + ldptr.w t1, sp, 0 /* i_height */ + move t3, t0 + addi.d t4, zero, 1 + addi.d t5, zero, 3 + addi.d t6, zero, 7 + bge t6, a7, .ENDLOOP_W8 +.LOOP_W8: + vld vr4, a3, 0 + vld vr5, t0, 0 + vld vr6, a3, 2 + vld vr7, t0, 2 + vmulwev.h.bu vr8, vr4, vr0 + vmulwod.h.bu vr9, vr4, vr0 + vmulwev.h.bu vr10, vr5, vr2 + vmulwod.h.bu vr11, vr5, vr2 + vmaddwev.h.bu vr8, vr6, vr1 + vmaddwod.h.bu vr9, vr6, vr1 + vmaddwev.h.bu vr10, vr7, vr3 + vmaddwod.h.bu vr11, vr7, vr3 + vadd.h vr12, vr8, vr10 + vadd.h vr13, vr9, vr11 + vssrarni.bu.h vr13, vr12, 6 + vstelm.d vr13, a0, 0, 0 + vstelm.d vr13, a1, 0, 1 + + add.d a0, a0, a2 + add.d a1, a1, a2 + addi.d t1, t1, -1 + move a3, t3 + add.d t3, t3, a4 + move t0, t3 + blt zero, t1, .LOOP_W8 + b .ENDLOOP_W2 +.ENDLOOP_W8: + bge t5, a7, .ENDLOOP_W4 +.LOOP_W4: + vld vr4, a3, 0 + vld vr5, t0, 0 + vld vr6, a3, 2 + vld vr7, t0, 2 + vmulwev.h.bu vr8, vr4, vr0 + vmulwod.h.bu vr9, vr4, vr0 + vmulwev.h.bu vr10, vr5, vr2 + vmulwod.h.bu vr11, vr5, vr2 + vmaddwev.h.bu vr8, vr6, vr1 + vmaddwod.h.bu vr9, vr6, vr1 + vmaddwev.h.bu vr10, vr7, vr3 + vmaddwod.h.bu vr11, vr7, vr3 + vadd.h vr12, vr8, vr10 + vadd.h vr13, vr9, vr11 + vssrarni.bu.h vr13, vr12, 6 + vstelm.w vr13, a0, 0, 0 + vstelm.w vr13, a1, 0, 2 + + add.d a0, a0, a2 + add.d a1, a1, a2 + move a3, t3 + add.d t3, t3, a4 + move t0, t3 + addi.d t1, t1, -1 + blt zero, t1, .LOOP_W4 + b .ENDLOOP_W2 +.ENDLOOP_W4: + bge t4, a7, .ENDLOOP_W2 +.LOOP_W2: + vld vr4, a3, 0 + vld vr5, t0, 0 + vld vr6, a3, 2 + vld vr7, t0, 2 + vmulwev.h.bu vr8, vr4, vr0 + vmulwod.h.bu vr9, vr4, vr0 + vmulwev.h.bu vr10, vr5, vr2 + vmulwod.h.bu vr11, vr5, vr2 + vmaddwev.h.bu vr8, vr6, vr1 + vmaddwod.h.bu vr9, vr6, vr1 + vmaddwev.h.bu vr10, vr7, vr3 + vmaddwod.h.bu vr11, vr7, vr3 + vadd.h vr12, vr8, vr10 + vadd.h vr13, vr9, vr11 + vssrarni.bu.h vr13, vr12, 6 + vstelm.h vr13, a0, 0, 0 + vstelm.h vr13, a1, 0, 4 + + add.d a0, a0, a2 + add.d a1, a1, a2 + move a3, t3 + add.d t3, t3, a4 + move t0, t3 + addi.d t1, t1, -1 + blt zero, t1, .LOOP_W2 +.ENDLOOP_W2: +endfunc_x264 + +function_x264 pixel_avg_weight_w4_lsx + addi.d t0, zero, 64 + sub.d t0, t0, a6 + vreplgr2vr.b vr0, a6 + vreplgr2vr.b vr1, t0 + vpackev.b vr8, vr1, vr0 +.LOOP_AVG_WEIGHT_W4: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fld.s f2, a4, 0 + fldx.s f3, a4, a5 + vilvl.w vr0, vr1, vr0 + vilvl.w vr2, vr3, vr2 + vilvl.b vr0, vr2, vr0 + vmulwev.h.bu.b vr1, vr0, vr8 + 
vmaddwod.h.bu.b vr1, vr0, vr8 + vssrarni.bu.h vr1, vr1, 6 + fst.s f1, a0, 0 + add.d a0, a0, a1 + vstelm.w vr1, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a4, a5, a4, 1 + addi.w a7, a7, -2 + bnez a7, .LOOP_AVG_WEIGHT_W4 +endfunc_x264 + +function_x264 pixel_avg_w4_lsx +.LOOP_AVG_W4: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fld.s f4, a4, 0 + fldx.s f5, a4, a5 + vilvl.w vr0, vr1, vr0 + vilvl.w vr4, vr5, vr4 + vavgr.bu vr0, vr0, vr4 + fst.s f0, a0, 0 + add.d a0, a0, a1 + vstelm.w vr0, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a4, a5, a4, 1 + addi.w a7, a7, -2 + bnez a7, .LOOP_AVG_W4 +endfunc_x264 + +function_x264 pixel_avg_weight_w8_lsx + addi.d t0, zero, 64 + sub.d t0, t0, a6 + slli.d t5, a1, 1 + add.d t6, a1, t5 + add.d t7, a1, t6 + vreplgr2vr.b vr0, a6 + vreplgr2vr.b vr1, t0 + vpackev.b vr8, vr1, vr0 + PIXEL_AVG_START_W8 +.LOOP_AVG_HEIGHT_W8: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fldx.d f2, a2, t0 + fldx.d f3, a2, t1 + fld.d f4, a4, 0 + fldx.d f5, a4, a5 + fldx.d f6, a4, t3 + fldx.d f7, a4, t4 + vilvl.b vr0, vr4, vr0 + vilvl.b vr1, vr5, vr1 + vilvl.b vr2, vr6, vr2 + vilvl.b vr3, vr7, vr3 + vmulwev.h.bu.b vr4, vr0, vr8 + vmulwev.h.bu.b vr5, vr1, vr8 + vmulwev.h.bu.b vr6, vr2, vr8 + vmulwev.h.bu.b vr7, vr3, vr8 + vmaddwod.h.bu.b vr4, vr0, vr8 + vmaddwod.h.bu.b vr5, vr1, vr8 + vmaddwod.h.bu.b vr6, vr2, vr8 + vmaddwod.h.bu.b vr7, vr3, vr8 + vssrarni.bu.h vr4, vr4, 6 + vssrarni.bu.h vr5, vr5, 6 + vssrarni.bu.h vr6, vr6, 6 + vssrarni.bu.h vr7, vr7, 6 + fst.d f4, a0, 0 + fstx.d f5, a0, a1 + fstx.d f6, a0, t5 + fstx.d f7, a0, t6 + add.d a0, a0, t7 + alsl.d a2, a3, a2, 2 + alsl.d a4, a5, a4, 2 + addi.w a7, a7, -4 + bnez a7, .LOOP_AVG_HEIGHT_W8 +endfunc_x264 + +function_x264 pixel_avg_w8_lsx + PIXEL_AVG_START_W8 +.LOOP_AVG_W8: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fldx.d f2, a2, t0 + fldx.d f3, a2, t1 + fld.d f4, a4, 0 + fldx.d f5, a4, a5 + fldx.d f6, a4, t3 + fldx.d f7, a4, t4 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr6, vr7, vr6 + vavgr.bu vr0, vr0, vr4 + vavgr.bu vr2, vr2, vr6 + fst.d f0, a0, 0 + add.d a0, a0, a1 + vstelm.d vr0, a0, 0, 1 + fstx.d f2, a0, a1 + alsl.d a0, a1, a0, 1 + vstelm.d vr2, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 2 + alsl.d a4, a5, a4, 2 + addi.w a7, a7, -4 + bnez a7, .LOOP_AVG_W8 +endfunc_x264 + +function_x264 pixel_avg_weight_w16_lsx + addi.d t0, zero, 64 + sub.d t0, t0, a6 + vreplgr2vr.b vr8, a6 + vreplgr2vr.b vr9, t0 + PIXEL_AVG_START +.LOOP_AVG_HEIGHT_W16: + LSX_LOADX_4 a2, a3, t0, t1, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a4, a5, t3, t4, vr4, vr5, vr6, vr7 + + vmulwev.h.bu.b vr10, vr0, vr8 + vmulwev.h.bu.b vr11, vr1, vr8 + vmulwev.h.bu.b vr12, vr2, vr8 + vmulwev.h.bu.b vr13, vr3, vr8 + vmulwod.h.bu.b vr14, vr0, vr8 + vmulwod.h.bu.b vr15, vr1, vr8 + vmulwod.h.bu.b vr16, vr2, vr8 + vmulwod.h.bu.b vr17, vr3, vr8 + vmaddwev.h.bu.b vr10, vr4, vr9 + vmaddwev.h.bu.b vr11, vr5, vr9 + vmaddwev.h.bu.b vr12, vr6, vr9 + vmaddwev.h.bu.b vr13, vr7, vr9 + vmaddwod.h.bu.b vr14, vr4, vr9 + vmaddwod.h.bu.b vr15, vr5, vr9 + vmaddwod.h.bu.b vr16, vr6, vr9 + vmaddwod.h.bu.b vr17, vr7, vr9 + vssrarni.bu.h vr11, vr10, 6 + vssrarni.bu.h vr13, vr12, 6 + vssrarni.bu.h vr15, vr14, 6 + vssrarni.bu.h vr17, vr16, 6 + vilvl.b vr10, vr15, vr11 + vilvh.b vr11, vr15, vr11 + vilvl.b vr12, vr17, vr13 + vilvh.b vr13, vr17, vr13 + + vst vr10, a0, 0 + vstx vr11, a0, a1 + vstx vr12, a0, t6 + vstx vr13, a0, t7 + add.d a2, a2, t2 + add.d a4, a4, t5 + add.d a0, a0, t8 + addi.d a7, a7, -4 + bnez a7, .LOOP_AVG_HEIGHT_W16 
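+    // The loop above computes each output byte as sat_u8( (src1*weight + src2*(64 - weight) + 32) >> 6 ); +    // vssrarni.bu.h supplies the rounding (+32), the shift by 6 and the unsigned 8-bit saturation.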
+endfunc_x264 + +function_x264 pixel_avg_w16_lsx + PIXEL_AVG_START +.LOOP_AVG_W16: + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t0 + vldx vr3, a2, t1 + vld vr4, a4, 0 + vldx vr5, a4, a5 + vldx vr6, a4, t3 + vldx vr7, a4, t4 + vavgr.bu vr0, vr0, vr4 + vavgr.bu vr1, vr1, vr5 + vavgr.bu vr2, vr2, vr6 + vavgr.bu vr3, vr3, vr7 + vst vr0, a0, 0 + vstx vr1, a0, a1 + vstx vr2, a0, t6 + vstx vr3, a0, t7 + add.d a0, a0, t8 + add.d a2, a2, t2 + add.d a4, a4, t5 + + vld vr0, a2, 0 + vldx vr1, a2, a3 + vldx vr2, a2, t0 + vldx vr3, a2, t1 + vld vr4, a4, 0 + vldx vr5, a4, a5 + vldx vr6, a4, t3 + vldx vr7, a4, t4 + vavgr.bu vr0, vr0, vr4 + vavgr.bu vr1, vr1, vr5 + vavgr.bu vr2, vr2, vr6 + vavgr.bu vr3, vr3, vr7 + vst vr0, a0, 0 + vstx vr1, a0, a1 + vstx vr2, a0, t6 + vstx vr3, a0, t7 + add.d a2, a2, t2 + add.d a4, a4, t5 + add.d a0, a0, t8 + addi.d a7, a7, -8 + bnez a7, .LOOP_AVG_W16 +endfunc_x264 + +/* + * void pixel_avg_wxh(pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride, + * pixel *src2, intptr_t src2_stride, int weight); + */ +.macro PIXEL_AVG_LSX w, h +function_x264 pixel_avg_\w\()x\h\()_lsx + addi.d t0, a6, -32 + addi.d a7, zero, \h + bne t0, zero, x264_8_pixel_avg_weight_w\w\()_lsx + b x264_8_pixel_avg_w\w\()_lsx +endfunc_x264 +.endm + +PIXEL_AVG_LSX 16, 16 +PIXEL_AVG_LSX 16, 8 +PIXEL_AVG_LSX 8, 16 +PIXEL_AVG_LSX 8, 8 +PIXEL_AVG_LSX 8, 4 +PIXEL_AVG_LSX 4, 16 +PIXEL_AVG_LSX 4, 8 +PIXEL_AVG_LSX 4, 4 +PIXEL_AVG_LSX 4, 2 + +function_x264 mc_weight_w20_noden_lsx + vldrepl.b vr0, a4, 36 // scale + vldrepl.h vr1, a4, 40 // offset +.LOOP_WEIGHT_W20_NODEN: + vld vr3, a2, 0 + vld vr4, a2, 16 + add.d a2, a2, a3 + vld vr5, a2, 0 + vld vr6, a2, 16 + vilvl.w vr4, vr6, vr4 + vmulwev.h.bu.b vr7, vr3, vr0 + vmulwod.h.bu.b vr8, vr3, vr0 + vmulwev.h.bu.b vr9, vr4, vr0 + vmulwod.h.bu.b vr10, vr4, vr0 + vmulwev.h.bu.b vr11, vr5, vr0 + vmulwod.h.bu.b vr12, vr5, vr0 + vadd.h vr7, vr7, vr1 + vadd.h vr8, vr8, vr1 + vadd.h vr9, vr9, vr1 + vadd.h vr10, vr10, vr1 + vadd.h vr11, vr11, vr1 + vadd.h vr12, vr12, vr1 + vssrani.bu.h vr11, vr7, 0 + vssrani.bu.h vr12, vr8, 0 + vssrani.bu.h vr9, vr9, 0 + vssrani.bu.h vr10, vr10, 0 + vilvl.b vr7, vr12, vr11 + vilvl.b vr9, vr10, vr9 + vilvh.b vr11, vr12, vr11 + + vst vr7, a0, 0 + vstelm.w vr9, a0, 16, 0 + add.d a0, a0, a1 + vst vr11, a0, 0 + vstelm.w vr9, a0, 16, 1 + add.d a0, a0, a1 + add.d a2, a2, a3 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHT_W20_NODEN +endfunc_x264 + +function_x264 mc_weight_w16_noden_lsx + vldrepl.b vr0, a4, 36 // scale + vldrepl.h vr1, a4, 40 // offset +.LOOP_WEIGHT_W16_NODEN: + vld vr3, a2, 0 + vldx vr4, a2, a3 + vmulwev.h.bu.b vr5, vr3, vr0 + vmulwod.h.bu.b vr6, vr3, vr0 + vmulwev.h.bu.b vr7, vr4, vr0 + vmulwod.h.bu.b vr8, vr4, vr0 + vadd.h vr5, vr5, vr1 + vadd.h vr6, vr6, vr1 + vadd.h vr7, vr7, vr1 + vadd.h vr8, vr8, vr1 + vssrani.bu.h vr7, vr5, 0 + vssrani.bu.h vr8, vr6, 0 + vilvl.b vr5, vr8, vr7 + vilvh.b vr7, vr8, vr7 + vst vr5, a0, 0 + vstx vr7, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHT_W16_NODEN +endfunc_x264 + +function_x264 mc_weight_w8_noden_lsx + vldrepl.b vr0, a4, 36 // scale + vldrepl.h vr1, a4, 40 // offset +.LOOP_WEIGHT_W8_NODEN: + fld.d f3, a2, 0 + fldx.d f4, a2, a3 + vilvl.d vr3, vr4, vr3 + vmulwev.h.bu.b vr5, vr3, vr0 + vmulwod.h.bu.b vr6, vr3, vr0 + vadd.h vr5, vr5, vr1 + vadd.h vr6, vr6, vr1 + vssrani.bu.h vr5, vr5, 0 + vssrani.bu.h vr6, vr6, 0 + vilvl.b vr7, vr6, vr5 + vstelm.d vr7, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr7, a0, 0, 1 + add.d a0, a0, a1 
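+    // "noden" weighting (i_denom == 0): no rounding shift is required, so each
+    // pixel is just dst = clip8(src * scale + offset); vssrani.bu.h ..., 0
+    // supplies the unsigned-byte saturation.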
+ alsl.d a2, a3, a2, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHT_W8_NODEN +endfunc_x264 + +function_x264 mc_weight_w4_noden_lsx + vldrepl.h vr0, a4, 36 // scale + vldrepl.h vr1, a4, 40 // offset +.LOOP_WEIGHT_W4_NODEN: + fld.s f3, a2, 0 + fldx.s f4, a2, a3 + vilvl.w vr3, vr4, vr3 + vsllwil.hu.bu vr3, vr3, 0 + vmul.h vr3, vr3, vr0 + vadd.h vr3, vr3, vr1 + vssrani.bu.h vr3, vr3, 0 + vstelm.w vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHT_W4_NODEN +endfunc_x264 + +function_x264 mc_weight_w20_lsx + vldrepl.h vr1, a4, 40 // offset + vldrepl.b vr0, a4, 36 // scale + vldrepl.h vr2, a4, 32 // denom + vsll.h vr1, vr1, vr2 +.LOOP_WEIGHT_W20: + vld vr3, a2, 0 + vld vr4, a2, 16 + add.d a2, a2, a3 + vld vr5, a2, 0 + vld vr6, a2, 16 + vilvl.w vr4, vr6, vr4 + + vmulwev.h.bu.b vr7, vr3, vr0 + vmulwod.h.bu.b vr8, vr3, vr0 + vmulwev.h.bu.b vr9, vr4, vr0 + vmulwod.h.bu.b vr10, vr4, vr0 + vmulwev.h.bu.b vr11, vr5, vr0 + vmulwod.h.bu.b vr12, vr5, vr0 + vsadd.h vr7, vr7, vr1 + vsadd.h vr8, vr8, vr1 + vsadd.h vr9, vr9, vr1 + vsadd.h vr10, vr10, vr1 + vsadd.h vr11, vr11, vr1 + vsadd.h vr12, vr12, vr1 + vssrarn.bu.h vr7, vr7, vr2 + vssrarn.bu.h vr8, vr8, vr2 + vssrarn.bu.h vr9, vr9, vr2 + vssrarn.bu.h vr10, vr10, vr2 + vssrarn.bu.h vr11, vr11, vr2 + vssrarn.bu.h vr12, vr12, vr2 + vilvl.b vr7, vr8, vr7 + vilvl.b vr9, vr10, vr9 + vilvl.b vr11, vr12, vr11 + + vst vr7, a0, 0 + vstelm.w vr9, a0, 16, 0 + add.d a0, a0, a1 + vst vr11, a0, 0 + vstelm.w vr9, a0, 16, 1 + add.d a0, a0, a1 + add.d a2, a2, a3 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHT_W20 +endfunc_x264 + +function_x264 mc_weight_w16_lsx + vldrepl.h vr1, a4, 40 // offset + vldrepl.b vr0, a4, 36 // scale + vldrepl.h vr2, a4, 32 // denom + vsll.h vr1, vr1, vr2 +.LOOP_WEIGHT_W16: + vld vr3, a2, 0 + vldx vr4, a2, a3 + vmulwev.h.bu.b vr5, vr3, vr0 + vmulwod.h.bu.b vr6, vr3, vr0 + vmulwev.h.bu.b vr7, vr4, vr0 + vmulwod.h.bu.b vr8, vr4, vr0 + vsadd.h vr5, vr5, vr1 + vsadd.h vr6, vr6, vr1 + vsadd.h vr7, vr7, vr1 + vsadd.h vr8, vr8, vr1 + vssrarn.bu.h vr5, vr5, vr2 + vssrarn.bu.h vr6, vr6, vr2 + vssrarn.bu.h vr7, vr7, vr2 + vssrarn.bu.h vr8, vr8, vr2 + vilvl.b vr5, vr6, vr5 + vilvl.b vr7, vr8, vr7 + vst vr5, a0, 0 + vstx vr7, a0, a1 + alsl.d a2, a3, a2, 1 + alsl.d a0, a1, a0, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHT_W16 +endfunc_x264 + +function_x264 mc_weight_w8_lsx + vldrepl.h vr1, a4, 40 // offset + vldrepl.b vr0, a4, 36 // scale + vldrepl.h vr2, a4, 32 // denom + vsll.h vr1, vr1, vr2 +.LOOP_WEIGHT_W8: + fld.d f3, a2, 0 + fldx.d f4, a2, a3 + vilvl.d vr3, vr4, vr3 + vmulwev.h.bu.b vr5, vr3, vr0 + vmulwod.h.bu.b vr6, vr3, vr0 + vsadd.h vr5, vr5, vr1 + vsadd.h vr6, vr6, vr1 + vssrarn.bu.h vr5, vr5, vr2 + vssrarn.bu.h vr6, vr6, vr2 + vilvl.b vr7, vr6, vr5 + vstelm.d vr7, a0, 0, 0 + add.d a0, a0, a1 + vstelm.d vr7, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHT_W8 +endfunc_x264 + +function_x264 mc_weight_w4_lsx + vldrepl.h vr1, a4, 40 // offset + vldrepl.h vr0, a4, 36 // scale + vldrepl.h vr2, a4, 32 // denom + vsll.h vr1, vr1, vr2 +.LOOP_WEIGHT_W4: + fld.s f3, a2, 0 + fldx.s f4, a2, a3 + vilvl.w vr3, vr4, vr3 + vsllwil.hu.bu vr3, vr3, 0 + vmul.h vr3, vr3, vr0 + vsadd.h vr3, vr3, vr1 + vssrarn.bu.h vr3, vr3, vr2 + vstelm.w vr3, a0, 0, 0 + add.d a0, a0, a1 + vstelm.w vr3, a0, 0, 1 + add.d a0, a0, a1 + alsl.d a2, a3, a2, 1 + addi.w a5, a5, -2 + blt zero, a5, .LOOP_WEIGHT_W4 +endfunc_x264 + +/* + * 
void x264_pixel_avg2_w4(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, + * intptr_t i_src_stride, uint8_t *src2, int i_height) + */ +function_x264 pixel_avg2_w4_lsx +.LOOP_AVG2_W4: + addi.d a5, a5, -2 + fld.s f0, a2, 0 + fld.s f1, a4, 0 + fldx.s f2, a2, a3 + fldx.s f3, a4, a3 + alsl.d a2, a3, a2, 1 + alsl.d a4, a3, a4, 1 + vavgr.bu vr0, vr0, vr1 + vavgr.bu vr1, vr2, vr3 + fst.s f0, a0, 0 + fstx.s f1, a0, a1 + alsl.d a0, a1, a0, 1 + blt zero, a5, .LOOP_AVG2_W4 +endfunc_x264 + +/* + * void x264_pixel_avg2_w8(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, + * intptr_t i_src_stride, uint8_t *src2, int i_height) + */ +function_x264 pixel_avg2_w8_lsx +.LOOP_AVG2_W8: + addi.d a5, a5, -2 + fld.d f0, a2, 0 + fld.d f1, a4, 0 + fldx.d f2, a2, a3 + fldx.d f3, a4, a3 + alsl.d a2, a3, a2, 1 + alsl.d a4, a3, a4, 1 + vavgr.bu vr0, vr0, vr1 + vavgr.bu vr1, vr2, vr3 + fst.d f0, a0, 0 + fstx.d f1, a0, a1 + alsl.d a0, a1, a0, 1 + blt zero, a5, .LOOP_AVG2_W8 +endfunc_x264 + +/* + * void x264_pixel_avg2_w16(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, + * intptr_t i_src_stride, uint8_t *src2, int i_height) + */ +function_x264 pixel_avg2_w16_lsx +.LOOP_AVG2_W16: + addi.d a5, a5, -2 + vld vr0, a2, 0 + vldx vr1, a2, a3 + vld vr2, a4, 0 + vldx vr3, a4, a3 + alsl.d a2, a3, a2, 1 + alsl.d a4, a3, a4, 1 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vst vr0, a0, 0 + vstx vr1, a0, a1 + alsl.d a0, a1, a0, 1 + blt zero, a5, .LOOP_AVG2_W16 +endfunc_x264 + +/* + * void x264_pixel_avg2_w20(uint8_t *dst, intptr_t i_dst_stride, uint8_t *src1, + * intptr_t i_src_stride, uint8_t *src2, int i_height) + */ +function_x264 pixel_avg2_w20_lsx +.LOOP_AVG2_W20: + addi.d a5, a5, -2 + vld vr0, a2, 0 + vld vr1, a2, 16 + vld vr2, a4, 0 + vld vr3, a4, 16 + add.d a2, a2, a3 + add.d a4, a4, a3 + vld vr4, a2, 0 + vld vr5, a2, 16 + vld vr6, a4, 0 + vld vr7, a4, 16 + vavgr.bu vr0, vr0, vr2 + vavgr.bu vr1, vr1, vr3 + vavgr.bu vr4, vr4, vr6 + vavgr.bu vr5, vr5, vr7 + + vst vr0, a0, 0 + vstelm.w vr1, a0, 16, 0 + add.d a0, a0, a1 + vst vr4, a0, 0 + vstelm.w vr5, a0, 16, 0 + add.d a2, a2, a3 + add.d a4, a4, a3 + add.d a0, a0, a1 + blt zero, a5, .LOOP_AVG2_W20 +endfunc_x264 + +/* + * void mc_copy_width16( uint8_t *p_dst, int32_t i_dst_stride, + * uint8_t *p_src, int32_t i_src_stride, + * int32_t i_height ) + */ +function_x264 mc_copy_w16_lsx + slli.d t0, a3, 1 + add.d t1, t0, a3 + slli.d t2, a1, 1 + add.d t3, t2, a1 +.LOOP_COPY_W16: + vld vr1, a2, 0 + vldx vr2, a2, a3 + vldx vr3, a2, t0 + vldx vr4, a2, t1 + + vst vr1, a0, 0 + vstx vr2, a0, a1 + vstx vr3, a0, t2 + vstx vr4, a0, t3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + addi.w a4, a4, -4 + blt zero, a4, .LOOP_COPY_W16 +endfunc_x264 + +/* + * void mc_copy_w8(uint8_t *p_dst, intptr_t i_dst_stride, + * uint8_t *p_src, intptr_t i_src_stride, + * int32_t i_height) + */ +function_x264 mc_copy_w8_lsx + slli.d t0, a3, 1 + add.d t1, t0, a3 + slli.d t2, a1, 1 + add.d t3, t2, a1 +.LOOP_COPY_W8: + fld.d f0, a2, 0 + fldx.d f1, a2, a3 + fldx.d f2, a2, t0 + fldx.d f3, a2, t1 + + fst.d f0, a0, 0 + fstx.d f1, a0, a1 + fstx.d f2, a0, t2 + fstx.d f3, a0, t3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + addi.w a4, a4, -4 + blt zero, a4, .LOOP_COPY_W8 +endfunc_x264 + +/* + * void mc_copy_w4(uint8_t *p_dst, intptr_t i_dst_stride, + * uint8_t *p_src, intptr_t i_src_stride, + * int32_t i_height) + */ +function_x264 mc_copy_w4_lsx + slli.d t0, a3, 1 + add.d t1, t0, a3 + slli.d t2, a1, 1 + add.d t3, t2, a1 +.LOOP_COPY_W4: + fld.s f0, a2, 0 + fldx.s f1, a2, a3 + fldx.s f2, a2, t0 + fldx.s f3, a2, 
t1 + + fst.s f0, a0, 0 + fstx.s f1, a0, a1 + fstx.s f2, a0, t2 + fstx.s f3, a0, t3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + addi.w a4, a4, -4 + blt zero, a4, .LOOP_COPY_W4 +endfunc_x264 + +/* + * void store_interleave_chroma(uint8_t *p_dst, intptr_t i_dst_stride, + * uint8_t *p_src0, uint8_t *p_src1, + * int32_t i_height) + */ +function_x264 store_interleave_chroma_lsx +.loop_interleave_chroma: + fld.d f0, a2, 0 + fld.d f1, a3, 0 + addi.d a2, a2, FDEC_STRIDE + addi.d a3, a3, FDEC_STRIDE + vilvl.b vr0, vr1, vr0 + vst vr0, a0, 0 + add.d a0, a0, a1 + addi.w a4, a4, -1 + blt zero, a4, .loop_interleave_chroma +endfunc_x264 + +/* + * void load_deinterleave_chroma_fenc(pixel *dst, pixel *src, + * intptr_t i_src, int height) + */ +function_x264 load_deinterleave_chroma_fenc_lsx + addi.d t0, a0, FENC_STRIDE/2 + andi t1, a3, 1 + sub.w t2, a3, t1 +.loop_deinterleave_fenc: + vld vr0, a1, 0 + vldx vr1, a1, a2 + vpickev.b vr2, vr1, vr0 + vpickod.b vr3, vr1, vr0 + fst.d f2, a0, 0 + fst.d f3, t0, 0 + vstelm.d vr2, a0, FENC_STRIDE, 1 + vstelm.d vr3, t0, FENC_STRIDE, 1 + addi.d a0, a0, FENC_STRIDE * 2 + addi.d t0, t0, FENC_STRIDE * 2 + alsl.d a1, a2, a1, 1 + addi.w t2, t2, -2 + blt zero, t2, .loop_deinterleave_fenc + + beqz t1, .loop_deinterleave_fenc_end + vld vr0, a1, 0 + vpickev.b vr1, vr0, vr0 + vpickod.b vr2, vr0, vr0 + fst.d f1, a0, 0 + fst.d f2, t0, 0 +.loop_deinterleave_fenc_end: +endfunc_x264 + +/* + * void load_deinterleave_chroma_fdec(pixel *dst, pixel *src, + * intptr_t i_src, int height) + */ +function_x264 load_deinterleave_chroma_fdec_lsx + addi.d t0, a0, FDEC_STRIDE/2 + andi t1, a3, 1 + sub.w t2, a3, t1 +.loop_deinterleave_fdec: + vld vr0, a1, 0 + vldx vr1, a1, a2 + vpickev.b vr2, vr1, vr0 + vpickod.b vr3, vr1, vr0 + fst.d f2, a0, 0 + fst.d f3, t0, 0 + vstelm.d vr2, a0, FDEC_STRIDE, 1 + vstelm.d vr3, t0, FDEC_STRIDE, 1 + addi.d a0, a0, FDEC_STRIDE * 2 + addi.d t0, t0, FDEC_STRIDE * 2 + alsl.d a1, a2, a1, 1 + addi.w t2, t2, -2 + blt zero, t2, .loop_deinterleave_fdec + + beqz t1, .loop_deinterleave_fdec_end + vld vr0, a1, 0 + vpickev.b vr1, vr0, vr0 + vpickod.b vr2, vr0, vr0 + fst.d f1, a0, 0 + fst.d f2, t0, 0 +.loop_deinterleave_fdec_end: +endfunc_x264 + +/* + * x264_plane_copy_interleave(pixel *dst, intptr_t i_dst, + * pixel *srcu, intptr_t i_srcu, + * pixel *srcv, intptr_t i_srcv, int w, int h) + */ +function_x264 plane_copy_interleave_core_lsx +.loop_h: + add.d t0, a0, zero + add.d t2, a2, zero + add.d t4, a4, zero + add.d t6, a6, zero +.loop_copy_interleavew16: + vld vr0, t2, 0 + vld vr1, t4, 0 + vilvl.b vr2, vr1, vr0 + vilvh.b vr3, vr1, vr0 + vst vr2, t0, 0 + vst vr3, t0, 16 + addi.d t2, t2, 16 + addi.d t4, t4, 16 + addi.d t0, t0, 32 + addi.w t6, t6, -16 + blt zero, t6, .loop_copy_interleavew16 + + add.d a2, a2, a3 + add.d a4, a4, a5 + add.d a0, a0, a1 + addi.w a7, a7, -1 + blt zero, a7, .loop_h +endfunc_x264 + +/* + * void x264_plane_copy_deinterleave(pixel *dsta, intptr_t i_dsta, + * pixel *dstb, intptr_t i_dstb, + * pixel *src, intptr_t i_src, int w, int h) + */ +function_x264 plane_copy_deinterleave_lsx +.LOOP_PLANE_COPY_H: + add.d t0, a0, zero + add.d t2, a2, zero + add.d t4, a4, zero + add.d t6, a6, zero +.LOOP_PLANE_COPY_W16: + vld vr0, t4, 0 + vld vr1, t4, 16 + vpickev.b vr2, vr1, vr0 + vpickod.b vr3, vr1, vr0 + vst vr2, t0, 0 + vst vr3, t2, 0 + addi.d t4, t4, 32 + addi.d t0, t0, 16 + addi.d t2, t2, 16 + addi.w t6, t6, -16 + blt zero, t6, .LOOP_PLANE_COPY_W16 + + add.d a2, a2, a3 + add.d a4, a4, a5 + add.d a0, a0, a1 + addi.w a7, a7, -1 + blt zero, a7, .LOOP_PLANE_COPY_H 
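+    /* Scalar sketch of the row loop above (the vector version produces 16 output
+     * pixels per plane at a time via vpickev.b/vpickod.b):
+     *
+     *     for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, src += i_src )
+     *         for( int x = 0; x < w; x++ )
+     *         {
+     *             dsta[x] = src[2*x];   // even bytes -> first plane
+     *             dstb[x] = src[2*x+1]; // odd bytes  -> second plane
+     *         }
+     */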
+endfunc_x264 + +function_x264 plane_copy_deinterleave_lasx +.LOOP_PLANE_COPY_H_LASX: + add.d t0, a0, zero + add.d t2, a2, zero + add.d t4, a4, zero + add.d t6, a6, zero +.LOOP_PLANE_COPY_W32_LASX: + xvld xr0, t4, 0 + xvld xr1, t4, 32 + xvpickev.b xr2, xr1, xr0 + xvpickod.b xr3, xr1, xr0 + xvpermi.d xr2, xr2, 0xd8 + xvpermi.d xr3, xr3, 0xd8 + xvst xr2, t0, 0 + xvst xr3, t2, 0 + addi.d t4, t4, 64 + addi.d t0, t0, 32 + addi.d t2, t2, 32 + addi.w t6, t6, -32 + blt zero, t6, .LOOP_PLANE_COPY_W32_LASX + + add.d a2, a2, a3 + add.d a4, a4, a5 + add.d a0, a0, a1 + addi.w a7, a7, -1 + blt zero, a7, .LOOP_PLANE_COPY_H_LASX +endfunc_x264 + +/* + * void prefetch_ref(uint8_t *pix, intptr_t stride, int32_t parity) + */ +function_x264 prefetch_ref_lsx + addi.d a2, a2, -1 + addi.d a0, a0, 64 + and a2, a2, a1 + alsl.d t1, a2, a0, 3 + alsl.d a2, a1, a1, 1 + preld 0, t1, 0 + add.d t2, t1, a1 + preld 0, t2, 0 + add.d t2, t2, a1 + preld 0, t2, 0 + add.d t1, t1, a2 + preld 0, t1, 0 + alsl.d a0, a1, t2, 1 + preld 0, a0, 0 + add.d t1, a0, a1 + preld 0, t1, 0 + add.d t1, t1, a1 + preld 0, t1, 0 + add.d a0, a0, a2 + preld 0, a0, 0 +endfunc_x264 + +/* + * void prefetch_fenc_422(uint8_t *pix_y, intptr_t stride_y, + * uint8_t *pix_uv, intptr_t stride_uv, + * int32_t mb_x) + */ +function_x264 prefetch_fenc_422_lsx + andi t0, a4, 3 + mul.d t0, t0, a1 + andi a4, a4, 6 + mul.d t1, a4, a3 + addi.d a0, a0, 64 + addi.d a2, a2, 64 + alsl.d a0, t0, a0, 2 + preld 0, a0, 0 + add.d t2, a0, a1 + preld 0, t2, 0 + add.d a0, t2, a1 + preld 0, a0, 0 + add.d a0, a0, a1 + preld 0, a0, 0 + alsl.d a2, t1, a2, 2 + preld 0, a2, 0 + add.d t3, a2, a3 + preld 0, t3, 0 + add.d a2, t3, a3 + preld 0, a2, 0 + add.d a2, a2, a3 + preld 0, a2, 0 +endfunc_x264 + +/* + * void prefetch_fenc_420(uint8_t *pix_y, intptr_t stride_y, + * uint8_t *pix_uv, intptr_t stride_uv, + * int32_t mb_x) + */ +function_x264 prefetch_fenc_420_lsx + andi t0, a4, 3 + mul.d t0, t0, a1 + andi a4, a4, 6 + mul.d t1, a4, a3 + addi.d a0, a0, 64 + addi.d a2, a2, 64 + alsl.d a0, t0, a0, 2 + preld 0, a0, 0 + add.d t2, a0, a1 + preld 0, t2, 0 + add.d a0, t2, a1 + preld 0, a0, 0 + add.d a0, a0, a1 + preld 0, a0, 0 + alsl.d a2, t1, a2, 2 + preld 0, a2, 0 + add.d a2, a2, a3 + preld 0, a2, 0 +endfunc_x264 + +/* + * void *memcpy_aligned(void *dst, const void *src, size_t n) + */ +function_x264 memcpy_aligned_lsx + andi t0, a2, 16 + beqz t0, 2f + addi.d a2, a2, -16 + vld vr0, a1, 0 + vst vr0, a0, 0 + addi.d a1, a1, 16 + addi.d a0, a0, 16 +2: + andi t0, a2, 32 + beqz t0, 3f + addi.d a2, a2, -32 + vld vr0, a1, 0 + vld vr1, a1, 16 + vst vr0, a0, 0 + vst vr1, a0, 16 + addi.d a1, a1, 32 + addi.d a0, a0, 32 +3: + beqz a2, 5f +4: + addi.d a2, a2, -64 + vld vr0, a1, 48 + vld vr1, a1, 32 + vld vr2, a1, 16 + vld vr3, a1, 0 + vst vr0, a0, 48 + vst vr1, a0, 32 + vst vr2, a0, 16 + vst vr3, a0, 0 + addi.d a1, a1, 64 + addi.d a0, a0, 64 + blt zero, a2, 4b +5: +endfunc_x264 + +/* + * void memzero_aligned(void *p_dst, size_t n) + */ +function_x264 memzero_aligned_lsx + vxor.v vr1, vr1, vr1 +.loop_memzero: + addi.d a1, a1, -128 + vst vr1, a0, 0 + vst vr1, a0, 16 + vst vr1, a0, 32 + vst vr1, a0, 48 + vst vr1, a0, 64 + vst vr1, a0, 80 + vst vr1, a0, 96 + vst vr1, a0, 112 + addi.d a0, a0, 128 + blt zero, a1, .loop_memzero +endfunc_x264 + +.macro FILT_H_LSX s1, s2, s3 + vsub.h \s1, \s1, \s2 + vsrai.h \s1, \s1, 2 + vsub.h \s1, \s1, \s2 + vadd.h \s1, \s1, \s3 + vsrai.h \s1, \s1, 2 + vadd.h \s1, \s1, \s3 +.endm + +//s1: s1.0, s2: s2.0, s3: s3.0, s4: s1.1 s5: s2.1 s6: s3.1 +.macro FILT_C_LSX s1, s2, s3, s4, s5, s6 
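+// FILT_H_LSX above computes ((((a - b) >> 2) - b + c) >> 2) + c, approximately
+// (a - 5*b + 20*c) >> 4, one side of the 6-tap (1,-5,20,20,-5,1) half-pel kernel.
+// FILT_C_LSX applies this horizontal pass to the vertically filtered rows to
+// produce the centre (h+v) half-pel plane stored through a5 in DO_FILT_C_LSX.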
+ vaddi.bu vr17, vr23, 2 //vr24 + vaddi.bu vr19, vr26, 1 //vr27 + vaddi.bu vr18, vr26, 3 //vr29 + + vshuf.b vr1, \s2, \s4, vr23 + vshuf.b vr2, \s2, \s4, vr17 + vshuf.b vr3, \s5, \s2, vr18 + vshuf.b vr4, \s5, \s2, vr19 + vadd.h vr3, vr2, vr3 + + vshuf.b vr16, \s5, \s2, vr23 + vshuf.b vr17, \s5, \s2, vr17 + vshuf.b vr18, \s3, \s5, vr18 + vshuf.b vr19, \s3, \s5, vr19 + vadd.h vr18, vr17, vr18 + + vmov vr2, \s5 + vmov \s1, \s3 + vmov vr20, \s3 + vmov \s4, \s6 + + vaddi.bu vr17, vr26, 5 //vr30 + + vshuf.b \s3, vr2, \s2, vr17 + vshuf.b \s6, vr20, \s5, vr17 + + vadd.h vr4, vr4, \s2 + vadd.h \s3, \s3, vr1 + vadd.h vr19, vr19, \s5 + vadd.h \s6, \s6, vr16 + + FILT_H_LSX \s3, vr3, vr4 + FILT_H_LSX \s6, vr18, vr19 +.endm + +.macro FILT_PACK_LSX s1, s2, s3 + vmulwev.w.h vr16, \s1, \s3 + vmulwev.w.h vr17, \s2, \s3 + vsrarni.h.w vr17, vr16, 15 + vmaxi.h vr17, vr17, 0 + vsat.hu vr17, vr17, 7 + vmulwod.w.h vr18, \s1, \s3 + vmulwod.w.h vr19, \s2, \s3 + vsrarni.h.w vr19, vr18, 15 + vmaxi.h vr19, vr19, 0 + vsat.hu vr19, vr19, 7 + vpackev.b \s1, vr19, vr17 +.endm + +//s1: s1.0, s2: s2.0, s3: s3.0, s4: s4.0 +//s5: s1.1, s6: s2.1, s7: s3.1, s8: s4.1 + +.macro DO_FILT_C_LSX s1, s2, s3, s4, s5, s6, s7, s8 + FILT_C_LSX \s1, \s2, \s3, \s5, \s6, \s7 + FILT_C_LSX \s2, \s1, \s4, \s6, \s5, \s8 + FILT_PACK_LSX \s3, \s4, vr15 + FILT_PACK_LSX \s7, \s8, vr15 + vilvl.d vr16, \s7, \s3 + vilvh.d vr17, \s7, \s3 + addi.d t3, a5, 16 + vstx vr16, a5, a4 + vstx vr17, t3, a4 +.endm + +.macro DO_FILT_H_LSX s1, s2, s3, s4, s5, s6 + vaddi.bu vr16, vr23, 2 //vr24 + vaddi.bu vr17, vr23, 3 //vr25 + vaddi.bu vr18, vr26, 1 //vr27 + vaddi.bu vr19, vr26, 2 //vr28 + vld vr3, t5, 0 + + vshuf.b vr1, \s2, \s4, vr16 + vshuf.b vr2, \s2, \s4, vr17 + vshuf.b vr4, \s5, \s2, vr26 + vshuf.b vr5, \s5, \s2, vr18 + vshuf.b vr6, \s5, \s2, vr19 + + vdp2.h.bu.b vr16, vr1, vr12 + vdp2.h.bu.b vr17, vr2, vr12 + vdp2.h.bu.b vr18, \s2, vr14 + vdp2.h.bu.b vr19, vr4, vr14 + vdp2.h.bu.b vr20, vr5, vr0 + vdp2.h.bu.b vr21, vr6, vr0 + vadd.h vr1, vr16, vr18 + vadd.h vr2, vr17, vr19 + vadd.h vr1, vr1, vr20 + vadd.h vr2, vr2, vr21 + FILT_PACK_LSX vr1, vr2, vr15 + vshuf.b vr1, vr1, vr1, vr3 + vstx vr1, a0, a4 + + vaddi.bu vr16, vr23, 2 //vr24 + vaddi.bu vr17, vr23, 3 //vr25 + vaddi.bu vr18, vr26, 1 //vr27 + vaddi.bu vr19, vr26, 2 //vr28 + + vshuf.b vr1, \s5, \s2, vr16 + vshuf.b vr2, \s5, \s2, vr17 + vshuf.b vr4, \s3, \s5, vr26 + vshuf.b vr5, \s3, \s5, vr18 + vshuf.b vr6, \s3, \s5, vr19 + + vdp2.h.bu.b vr16, vr1, vr12 + vdp2.h.bu.b vr17, vr2, vr12 + vdp2.h.bu.b vr18, \s5, vr14 + vdp2.h.bu.b vr19, vr4, vr14 + vdp2.h.bu.b vr20, vr5, vr0 + vdp2.h.bu.b vr21, vr6, vr0 + vadd.h vr1, vr16, vr18 + vadd.h vr2, vr17, vr19 + vadd.h vr1, vr1, vr20 + vadd.h vr2, vr2, vr21 + FILT_PACK_LSX vr1, vr2, vr15 + vshuf.b vr1, vr1, vr1, vr3 + addi.d a0, a0, 16 + vstx vr1, a0, a4 + addi.d a0, a0, -16 + + vmov \s1, \s2 + vmov \s2, \s3 + vmov \s4, \s5 + vmov \s5, \s6 +.endm + +/* s3: temp, s4: UNUSED, s5: imm */ +.macro DO_FILT_V0_LSX s1, s2, s3, s4, s5 + alsl.d t1, a2, a1, 1 /* t1 = a1 + 2 * a2 */ + alsl.d t2, a2, a3, 1 /* t2 = a3 + 2 * a2 */ + vld vr1, a3, 0 + vldx vr2, a3, a2 + vld \s3, t2, 0 + vld vr3, a1, 0 + vldx \s1, a1, a2 + vld \s2, t1, 0 + vilvh.b vr16, vr2, vr1 + vilvl.b vr17, vr2, vr1 + vilvh.b vr18, \s2, \s1 + vilvl.b vr19, \s2, \s1 + vilvh.b vr20, \s3, vr3 + vilvl.b vr21, \s3, vr3 + vdp2.h.bu.b vr1, vr17, vr12 + vdp2.h.bu.b vr4, vr16, vr12 + vdp2.h.bu.b \s1, vr19, vr0 + vdp2.h.bu.b vr2, vr18, vr0 + vdp2.h.bu.b vr3, vr21, vr14 + vdp2.h.bu.b \s2, vr20, vr14 + vadd.h vr1, vr1, \s1 + 
vadd.h vr4, vr4, vr2 + vadd.h vr1, vr1, vr3 + vadd.h vr4, vr4, \s2 + vmov \s1, vr1 + vmov \s2, vr4 + addi.d a3, a3, 16 + addi.d a1, a1, 16 + FILT_PACK_LSX vr1, vr4, vr15 + addi.d t3, a4, \s5 + vstx vr1, t0, t3 +.endm + +.macro DO_FILT_V1_LSX s1, s2, s3, s4, s5 + vld vr1, a3, 0 + vldx vr2, a3, a2 + vld \s3, t2, 16 + vld vr3, a1, 0 + vldx \s1, a1, a2 + vld \s2, t1, 16 + vilvh.b vr16, vr2, vr1 + vilvl.b vr17, vr2, vr1 + vilvh.b vr18, \s2, \s1 + vilvl.b vr19, \s2, \s1 + vilvh.b vr20, \s3, vr3 + vilvl.b vr21, \s3, vr3 + vdp2.h.bu.b vr1, vr17, vr12 + vdp2.h.bu.b vr4, vr16, vr12 + vdp2.h.bu.b \s1, vr19, vr0 + vdp2.h.bu.b vr2, vr18, vr0 + vdp2.h.bu.b vr3, vr21, vr14 + vdp2.h.bu.b \s2, vr20, vr14 + vadd.h vr1, vr1, \s1 + vadd.h vr4, vr4, vr2 + vadd.h vr1, vr1, vr3 + vadd.h vr4, vr4, \s2 + vmov \s1, vr1 + vmov \s2, vr4 + addi.d a3, a3, 16 + addi.d a1, a1, 16 + FILT_PACK_LSX vr1, vr4, vr15 + addi.d t3, a4, \s5 + addi.d t3, t3, 16 + vstx vr1, t0, t3 +.endm + +/* + * void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, + * uint8_t *src, intptr_t stride, int width, int height ) + */ +function_x264 hpel_filter_lsx + addi.d sp, sp, -64 + fst.d f24, sp, 0 + fst.d f25, sp, 8 + fst.d f26, sp, 16 + fst.d f27, sp, 24 + fst.d f28, sp, 32 + fst.d f29, sp, 40 + fst.d f30, sp, 48 + fst.d f31, sp, 56 + + move a7, a3 + addi.d a5, a5, -32 + move t0, a1 + andi a7, a7, 31 + sub.d a3, a3, a7 + add.d a0, a0, a5 + add.d t0, t0, a5 + add.d a7, a7, a5 + add.d a5, a5, a2 + move a2, a4 + sub.d a7, zero, a7 + add.d a1, a3, a2 + sub.d a3, a3, a2 + sub.d a3, a3, a2 + move a4, a7 + la.local t1, filt_mul51 + vld vr0, t1, 0 + la.local t2, filt_mul15 + vld vr12, t2, 0 + la.local t3, filt_mul20 + vld vr14, t3, 0 + la.local t4, pw_1024 + vld vr15, t4, 0 + la.local t5, hpel_shuf + la.local t2, shuf_12 + vld vr23, t2, 0 + la.local t3, shuf_1 + vld vr26, t3, 0 + vxor.v vr9, vr9, vr9 + vxor.v vr10, vr10, vr10 + vxor.v vr11, vr11, vr11 + vxor.v vr13, vr13, vr13 +.LOOPY_LSX: + DO_FILT_V0_LSX vr24, vr25, vr31, vr12, 0 + DO_FILT_V1_LSX vr8, vr7, vr22, vr12, 0 +.LOOPX_LSX: + DO_FILT_V0_LSX vr27, vr28, vr29, vr12, 32 + DO_FILT_V1_LSX vr6, vr5, vr30, vr12, 32 +.LSTX: + vsrli.h vr15, vr15, 1 + DO_FILT_C_LSX vr9, vr24, vr8, vr27, vr10, vr25, vr7, vr28 + vadd.h vr15, vr15, vr15 + vmov vr8, vr6 + vmov vr7, vr5 + + DO_FILT_H_LSX vr11, vr31, vr29, vr13, vr22, vr30 + addi.d a4, a4, 32 + blt a4, zero, .LOOPX_LSX + addi.d t1, a4, -32 + blt t1, zero, .LSTX + //setup regs for next y + sub.d a4, a4, a7 + sub.d a4, a4, a2 + sub.d a1, a1, a4 + sub.d a3, a3, a4 + add.d a0, a0, a2 + add.d t0, t0, a2 + add.d a5, a5, a2 + move a4, a7 + addi.d a6, a6, -1 + blt zero, a6, .LOOPY_LSX + fld.d f24, sp, 0 + fld.d f25, sp, 8 + fld.d f26, sp, 16 + fld.d f27, sp, 24 + fld.d f28, sp, 32 + fld.d f29, sp, 40 + fld.d f30, sp, 48 + fld.d f31, sp, 56 + addi.d sp, sp, 64 +endfunc_x264 + +/* + * void frame_init_lowres_core(pixel *src0, pixel *dst0, pixel *dsth, + * pixel *dstv, pixel *dstc, intptr_t src_stride, + * intptr_t dst_stride, int width, int height) + */ +function_x264 frame_init_lowres_core_lsx + addi.d t0, zero, 15 + addi.d t1, zero, 7 + addi.d t2, zero, 3 + addi.d t3, zero, 1 + ld.d t4, sp, 0 + addi.d sp, sp, -16 + st.d s0, sp, 0 + st.d s1, sp, 8 + slli.d s0, a5, 1 +.LOOPH: + bge zero, t4, .ENDLOOPH + addi.d t4, t4, -1 + add.d t5, a0, a5 + add.d t7, t5, a5 + move t6, a7 +.LOOPW16: + bge t0, t6, .LOOPW8 + vld vr0, a0, 0 + vld vr1, t5, 0 + vld vr2, t7, 0 + vld vr3, a0, 1 + vld vr4, t5, 1 + vld vr5, t7, 1 + vld vr6, a0, 16 + vld vr7, t5, 16 + vld vr8, t7, 16 + 
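+    // Each lowres output pixel is a rounded average of two rounded vertical pair
+    // averages (columns 2x and 2x+1): vavgr.bu averages the source rows, then
+    // vhaddw.hu.bu plus vssrarni.bu.h ..., 1 average the adjacent columns.
+    // dst0/dsth/dstv/dstc differ only in the half-pel row/column offsets loaded above.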
vld vr9, a0, 17 + vld vr10, t5, 17 + vld vr11, t7, 17 + + // Calculate dst0, dsth, dstv and dstc + vavgr.bu vr12, vr0, vr1 + vavgr.bu vr13, vr1, vr2 + vavgr.bu vr14, vr3, vr4 + vavgr.bu vr15, vr4, vr5 + vavgr.bu vr16, vr6, vr7 + vavgr.bu vr17, vr7, vr8 + vavgr.bu vr18, vr9, vr10 + vavgr.bu vr19, vr10, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vhaddw.hu.bu vr17, vr17, vr17 + vhaddw.hu.bu vr18, vr18, vr18 + vhaddw.hu.bu vr19, vr19, vr19 + vssrarni.bu.h vr13, vr12, 1 + vssrarni.bu.h vr15, vr14, 1 + vssrarni.bu.h vr17, vr16, 1 + vssrarni.bu.h vr19, vr18, 1 + vilvl.d vr12, vr17, vr13 + vilvl.d vr14, vr19, vr15 + vilvh.d vr13, vr17, vr13 + vilvh.d vr15, vr19, vr15 + vst vr12, a1, 0 + vst vr14, a2, 0 + vst vr13, a3, 0 + vst vr15, a4, 0 + + addi.d a1, a1, 16 + addi.d a2, a2, 16 + addi.d a3, a3, 16 + addi.d a4, a4, 16 + addi.d a0, a0, 32 + addi.d t5, t5, 32 + addi.d t7, t7, 32 + addi.d t6, t6, -16 + b .LOOPW16 +.LOOPW8: + bge t1, t6, .LOOPW4 + vld vr0, a0, 0 + vld vr1, t5, 0 + vld vr2, t7, 0 + vld vr3, a0, 1 + vld vr4, t5, 1 + vld vr5, t7, 1 + + // Calculate dst0, dsth, dstv and dstc + vavgr.bu vr12, vr0, vr1 + vavgr.bu vr13, vr1, vr2 + vavgr.bu vr14, vr3, vr4 + vavgr.bu vr15, vr4, vr5 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vssrarni.bu.h vr13, vr12, 1 + vssrarni.bu.h vr15, vr14, 1 + vstelm.d vr13, a1, 0, 0 + vstelm.d vr15, a2, 0, 0 + vstelm.d vr13, a3, 0, 1 + vstelm.d vr15, a4, 0, 1 + + addi.d a1, a1, 8 + addi.d a2, a2, 8 + addi.d a3, a3, 8 + addi.d a4, a4, 8 + addi.d a0, a0, 16 + addi.d t5, t5, 16 + addi.d t7, t7, 16 + addi.d t6, t6, -8 + b .LOOPW8 +.LOOPW4: + bge t2, t6, .LOOPW2 + vld vr0, a0, 0 + vld vr1, t5, 0 + vld vr2, t7, 0 + vld vr3, a0, 1 + vld vr4, t5, 1 + vld vr5, t7, 1 + + // Calculate dst0, dsth, dstv and dstc + vavgr.bu vr12, vr0, vr1 + vavgr.bu vr13, vr1, vr2 + vavgr.bu vr14, vr3, vr4 + vavgr.bu vr15, vr4, vr5 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vssrarni.bu.h vr13, vr12, 1 + vssrarni.bu.h vr15, vr14, 1 + vstelm.w vr13, a1, 0, 0 + vstelm.w vr15, a2, 0, 0 + vstelm.w vr13, a3, 0, 2 + vstelm.w vr15, a4, 0, 2 + + addi.d a1, a1, 4 + addi.d a2, a2, 4 + addi.d a3, a3, 4 + addi.d a4, a4, 4 + addi.d a0, a0, 8 + addi.d t5, t5, 8 + addi.d t7, t7, 8 + addi.d t6, t6, -4 + b .LOOPW4 +.LOOPW2: + bge t3, t6, .LOOPW1 + vld vr0, a0, 0 + vld vr1, t5, 0 + vld vr2, t7, 0 + vld vr3, a0, 1 + vld vr4, t5, 1 + vld vr5, t7, 1 + + // Calculate dst0, dsth, dstv and dstc + vavgr.bu vr12, vr0, vr1 + vavgr.bu vr13, vr1, vr2 + vavgr.bu vr14, vr3, vr4 + vavgr.bu vr15, vr4, vr5 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vssrarni.bu.h vr13, vr12, 1 + vssrarni.bu.h vr15, vr14, 1 + vstelm.h vr13, a1, 0, 0 + vstelm.h vr15, a2, 0, 0 + vstelm.h vr13, a3, 0, 4 + vstelm.h vr15, a4, 0, 4 + + addi.d a1, a1, 2 + addi.d a2, a2, 2 + addi.d a3, a3, 2 + addi.d a4, a4, 2 + addi.d a0, a0, 4 + addi.d t5, t5, 4 + addi.d t7, t7, 4 + addi.d t6, t6, -2 + b .LOOPW2 +.LOOPW1: + bge zero, t6, .ENDLOOPW1 + vld vr0, a0, 0 + vld vr1, t5, 0 + vld vr2, t7, 0 + vld vr3, a0, 1 + vld vr4, t5, 1 + vld vr5, t7, 1 + + // Calculate dst0, dsth, dstv and dstc + vavgr.bu vr12, vr0, vr1 + vavgr.bu vr13, vr1, vr2 + vavgr.bu vr14, vr3, vr4 + vavgr.bu vr15, vr4, vr5 + 
vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vssrarni.bu.h vr13, vr12, 1 + vssrarni.bu.h vr15, vr14, 1 + vstelm.b vr13, a1, 0, 0 + vstelm.b vr15, a2, 0, 0 + vstelm.b vr13, a3, 0, 8 + vstelm.b vr15, a4, 0, 8 +.ENDLOOPW1: + sub.d s1, a7, t6 + sub.d a0, a0, s1 + sub.d a0, a0, s1 + add.d a0, a0, s0 + sub.d a1, a1, s1 + add.d a1, a1, a6 + sub.d a2, a2, s1 + add.d a2, a2, a6 + sub.d a3, a3, s1 + add.d a3, a3, a6 + sub.d a4, a4, s1 + add.d a4, a4, a6 + b .LOOPH +.ENDLOOPH: + ld.d s0, sp, 0 + ld.d s1, sp, 8 + addi.d sp, sp, 16 +endfunc_x264 +#endif /* !HIGH_BIT_DEPTH */ diff --git a/common/loongarch/mc-c.c b/common/loongarch/mc-c.c new file mode 100644 index 000000000..fb5be6758 --- /dev/null +++ b/common/loongarch/mc-c.c @@ -0,0 +1,406 @@ +/***************************************************************************** + * mc-c.c: loongarch motion compensation + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Xiwei Gu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "common/common.h" +#include "mc.h" + +#if !HIGH_BIT_DEPTH + +#define MC_WEIGHT_LSX(func) \ +static void (* mc##func##_wtab_lsx[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) = \ +{ \ + x264_mc_weight_w4##func##_lsx, \ + x264_mc_weight_w4##func##_lsx, \ + x264_mc_weight_w8##func##_lsx, \ + x264_mc_weight_w16##func##_lsx, \ + x264_mc_weight_w16##func##_lsx, \ + x264_mc_weight_w20##func##_lsx, \ +}; + +#define MC_WEIGHT(func) \ +static void (* mc##func##_wtab_lasx[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) = \ +{ \ + x264_mc_weight_w4##func##_lasx, \ + x264_mc_weight_w4##func##_lasx, \ + x264_mc_weight_w8##func##_lasx, \ + x264_mc_weight_w16##func##_lasx, \ + x264_mc_weight_w16##func##_lasx, \ + x264_mc_weight_w20##func##_lasx, \ +}; + +#if !HIGH_BIT_DEPTH +MC_WEIGHT_LSX() +MC_WEIGHT_LSX(_noden) +MC_WEIGHT() +MC_WEIGHT(_noden) +#endif + +static void weight_cache_lsx( x264_t *h, x264_weight_t *w ) +{ + if ( w->i_denom >= 1) + { + w->weightfn = mc_wtab_lsx; + } + else + w->weightfn = mc_noden_wtab_lsx; +} + +static weight_fn_t mc_weight_wtab_lsx[6] = +{ + x264_mc_weight_w4_lsx, + x264_mc_weight_w4_lsx, + x264_mc_weight_w8_lsx, + x264_mc_weight_w16_lsx, + x264_mc_weight_w16_lsx, + x264_mc_weight_w20_lsx, +}; + +static void (* const pixel_avg_wtab_lsx[6])(uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) = +{ + NULL, + x264_pixel_avg2_w4_lsx, + x264_pixel_avg2_w8_lsx, + x264_pixel_avg2_w16_lsx, + x264_pixel_avg2_w16_lsx, + x264_pixel_avg2_w20_lsx, +}; + +static void (* const mc_copy_wtab_lsx[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) = +{ + NULL, + x264_mc_copy_w4_lsx, + x264_mc_copy_w8_lsx, + NULL, + x264_mc_copy_w16_lsx, +}; + +static void weight_cache_lasx( x264_t *h, x264_weight_t *w ) +{ + if ( w->i_denom >= 1) + { + w->weightfn = mc_wtab_lasx; + } + else + w->weightfn = mc_noden_wtab_lasx; +} + +static weight_fn_t mc_weight_wtab_lasx[6] = +{ + x264_mc_weight_w4_lasx, + x264_mc_weight_w4_lasx, + x264_mc_weight_w8_lasx, + x264_mc_weight_w16_lasx, + x264_mc_weight_w16_lasx, + x264_mc_weight_w20_lasx, +}; + +static void (* const pixel_avg_wtab_lasx[6])(uint8_t *, intptr_t, uint8_t *, + intptr_t, uint8_t *, int ) = +{ + NULL, + x264_pixel_avg2_w4_lasx, + x264_pixel_avg2_w8_lasx, + x264_pixel_avg2_w16_lasx, + x264_pixel_avg2_w16_lasx, + x264_pixel_avg2_w20_lasx, +}; + +static void (* const mc_copy_wtab_lasx[5])( uint8_t *, intptr_t, uint8_t *, + intptr_t, int ) = +{ + NULL, + x264_mc_copy_w4_lasx, + x264_mc_copy_w8_lasx, + NULL, + x264_mc_copy_w16_lasx, +}; + +static uint8_t *get_ref_lsx( uint8_t *p_dst, intptr_t *p_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ) +{ + int32_t i_qpel_idx; + int32_t i_offset; + uint8_t *p_src1; + int32_t r_vy = m_vy & 3; + int32_t r_vx = m_vx & 3; + int32_t width = i_width >> 2; + + i_qpel_idx = ( r_vy << 2 ) + r_vx; + i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); + p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + + ( 3 == r_vy ) * i_src_stride; + + if( i_qpel_idx & 5 ) + { + uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + + i_offset + ( 3 == r_vx ); + pixel_avg_wtab_lsx[width]( + p_dst, *p_dst_stride, p_src1, i_src_stride, + p_src2, i_height ); + + if( pWeight->weightfn ) + { + pWeight->weightfn[width](p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, 
i_height); + } + return p_dst; + } + else if ( pWeight->weightfn ) + { + pWeight->weightfn[width]( p_dst, *p_dst_stride, p_src1, i_src_stride, pWeight, i_height ); + return p_dst; + } + else + { + *p_dst_stride = i_src_stride; + return p_src1; + } +} + +static void mc_luma_lsx( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ) +{ + int32_t i_qpel_idx; + int32_t i_offset; + uint8_t *p_src1; + + i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 ); + i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); + p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + + ( 3 == ( m_vy & 3 ) ) * i_src_stride; + + if( i_qpel_idx & 5 ) + { + uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + + i_offset + ( 3 == ( m_vx & 3 ) ); + + pixel_avg_wtab_lsx[i_width >> 2]( + p_dst, i_dst_stride, p_src1, i_src_stride, + p_src2, i_height ); + + if( pWeight->weightfn ) + { + pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height ); + } + } + else if( pWeight->weightfn ) + { + pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, pWeight, i_height ); + } + else + { + mc_copy_wtab_lsx[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, i_height ); + } +} + +PLANE_INTERLEAVE(lsx) +PLANE_COPY_YUYV(32, lsx) + +#define x264_mc_chroma_lsx x264_template(mc_chroma_lsx) +void x264_mc_chroma_lsx( uint8_t *p_dst_u, uint8_t *p_dst_v, + intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height ); + +static uint8_t *get_ref_lasx( uint8_t *p_dst, intptr_t *p_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ) +{ + int32_t i_qpel_idx; + int32_t i_offset; + uint8_t *p_src1; + int32_t r_vy = m_vy & 3; + int32_t r_vx = m_vx & 3; + int32_t width = i_width >> 2; + + i_qpel_idx = ( r_vy << 2 ) + r_vx; + i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); + p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + + ( 3 == r_vy ) * i_src_stride; + + if( i_qpel_idx & 5 ) + { + uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + + i_offset + ( 3 == r_vx ); + pixel_avg_wtab_lasx[width]( + p_dst, *p_dst_stride, p_src1, i_src_stride, + p_src2, i_height ); + + if( pWeight->weightfn ) + { + pWeight->weightfn[width](p_dst, *p_dst_stride, p_dst, *p_dst_stride, pWeight, i_height); + } + return p_dst; + } + else if ( pWeight->weightfn ) + { + pWeight->weightfn[width]( p_dst, *p_dst_stride, p_src1, i_src_stride, pWeight, i_height ); + return p_dst; + } + else + { + *p_dst_stride = i_src_stride; + return p_src1; + } +} + +static void mc_luma_lasx( uint8_t *p_dst, intptr_t i_dst_stride, + uint8_t *p_src[4], intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height, + const x264_weight_t *pWeight ) +{ + int32_t i_qpel_idx; + int32_t i_offset; + uint8_t *p_src1; + + i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 ); + i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 ); + p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset + + ( 3 == ( m_vy & 3 ) ) * i_src_stride; + + if( i_qpel_idx & 5 ) + { + uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] + + i_offset + ( 3 == ( m_vx & 3 ) ); + + pixel_avg_wtab_lasx[i_width >> 2]( + p_dst, i_dst_stride, p_src1, i_src_stride, + p_src2, i_height ); + + if( pWeight->weightfn ) + { + 
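+        /* i_qpel_idx & 5 means a quarter-pel phase in x or y, so two half-pel
+         * planes were averaged into p_dst above; with weighted prediction enabled
+         * the weight is then applied in place on that intermediate result. */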
pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_dst, i_dst_stride, pWeight, i_height ); + } + } + else if( pWeight->weightfn ) + { + pWeight->weightfn[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, pWeight, i_height ); + } + else + { + mc_copy_wtab_lasx[i_width>>2]( p_dst, i_dst_stride, p_src1, i_src_stride, i_height ); + } +} + +PLANE_COPY_YUYV(64, lasx) + +#define x264_mc_chroma_lasx x264_template(mc_chroma_lasx) +void x264_mc_chroma_lasx( uint8_t *p_dst_u, uint8_t *p_dst_v, + intptr_t i_dst_stride, + uint8_t *p_src, intptr_t i_src_stride, + int32_t m_vx, int32_t m_vy, + int32_t i_width, int32_t i_height ); +#endif // !HIGH_BIT_DEPTH + +void x264_mc_init_loongarch( int32_t cpu, x264_mc_functions_t *pf ) +{ +#if !HIGH_BIT_DEPTH + if( cpu & X264_CPU_LSX ) + { + pf->mc_luma = mc_luma_lsx; + pf->mc_chroma = x264_mc_chroma_lsx; + pf->get_ref = get_ref_lsx; + + pf->avg[PIXEL_16x16]= x264_pixel_avg_16x16_lsx; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_lsx; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_lsx; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_lsx; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_lsx; + pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_lsx; + pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_lsx; + pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_lsx; + pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_lsx; + + pf->weight = mc_weight_wtab_lsx; + pf->offsetadd = mc_weight_wtab_lsx; + pf->offsetsub = mc_weight_wtab_lsx; + pf->weight_cache = weight_cache_lsx; + + pf->copy_16x16_unaligned = x264_mc_copy_w16_lsx; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_lsx; + pf->copy[PIXEL_8x8] = x264_mc_copy_w8_lsx; + pf->copy[PIXEL_4x4] = x264_mc_copy_w4_lsx; + + pf->store_interleave_chroma = x264_store_interleave_chroma_lsx; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_lsx; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_lsx; + + pf->plane_copy_interleave = plane_copy_interleave_lsx; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_lsx; + pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_lsx; + + pf->hpel_filter = x264_hpel_filter_lsx; + pf->memcpy_aligned = x264_memcpy_aligned_lsx; + pf->memzero_aligned = x264_memzero_aligned_lsx; + pf->frame_init_lowres_core = x264_frame_init_lowres_core_lsx; + + pf->prefetch_fenc_420 = x264_prefetch_fenc_420_lsx; + pf->prefetch_fenc_422 = x264_prefetch_fenc_422_lsx; + pf->prefetch_ref = x264_prefetch_ref_lsx; + } + + if( cpu & X264_CPU_LASX ) + { + pf->mc_luma = mc_luma_lasx; + pf->mc_chroma = x264_mc_chroma_lasx; + pf->get_ref = get_ref_lasx; + + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_lasx; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_lasx; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_lasx; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_lasx; + pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_lasx; + pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_lasx; + pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_lasx; + pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_lasx; + + pf->weight = mc_weight_wtab_lasx; + pf->offsetadd = mc_weight_wtab_lasx; + pf->offsetsub = mc_weight_wtab_lasx; + pf->weight_cache = weight_cache_lasx; + + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_lasx; + pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_lasx; + + pf->copy_16x16_unaligned = x264_mc_copy_w16_lasx; + pf->copy[PIXEL_16x16] = x264_mc_copy_w16_lasx; + pf->copy[PIXEL_8x8] = x264_mc_copy_w8_lasx; + pf->copy[PIXEL_4x4] = x264_mc_copy_w4_lasx; + + pf->hpel_filter = x264_hpel_filter_lasx; + pf->memzero_aligned = x264_memzero_aligned_lasx; 
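+        /* This block runs after the LSX one, so when the CPU reports both flags
+         * the LASX entries override their LSX counterparts; helpers without a
+         * LASX version (pixel_avg 16x16, memcpy_aligned, the prefetches and the
+         * chroma interleave/deinterleave routines) keep the LSX pointers. */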
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_lasx; + } +#endif // !HIGH_BIT_DEPTH +} diff --git a/common/loongarch/mc.h b/common/loongarch/mc.h new file mode 100644 index 000000000..6421f6e30 --- /dev/null +++ b/common/loongarch/mc.h @@ -0,0 +1,196 @@ +/***************************************************************************** + * mc.h: loongarch motion compensation + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Xiwei Gu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_LOONGARCH_MC_H +#define X264_LOONGARCH_MC_H + +#define x264_mc_init_loongarch x264_template(mc_init_loongarch) +void x264_mc_init_loongarch( int cpu, x264_mc_functions_t *pf ); + +#define x264_pixel_avg_16x16_lsx x264_template(pixel_avg_16x16_lsx) +void x264_pixel_avg_16x16_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); +#define x264_pixel_avg_16x8_lsx x264_template(pixel_avg_16x8_lsx) +void x264_pixel_avg_16x8_lsx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); +#define x264_pixel_avg_8x16_lsx x264_template(pixel_avg_8x16_lsx) +void x264_pixel_avg_8x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_8x8_lsx x264_template(pixel_avg_8x8_lsx) +void x264_pixel_avg_8x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_8x4_lsx x264_template(pixel_avg_8x4_lsx) +void x264_pixel_avg_8x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_4x16_lsx x264_template(pixel_avg_4x16_lsx) +void x264_pixel_avg_4x16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_4x8_lsx x264_template(pixel_avg_4x8_lsx) +void x264_pixel_avg_4x8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_4x4_lsx x264_template(pixel_avg_4x4_lsx) +void x264_pixel_avg_4x4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_4x2_lsx x264_template(pixel_avg_4x2_lsx) +void x264_pixel_avg_4x2_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +#define x264_pixel_avg2_w4_lsx x264_template(pixel_avg2_w4_lsx) +void x264_pixel_avg2_w4_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +#define x264_pixel_avg2_w8_lsx x264_template(pixel_avg2_w8_lsx) +void x264_pixel_avg2_w8_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +#define x264_pixel_avg2_w16_lsx x264_template(pixel_avg2_w16_lsx) +void 
x264_pixel_avg2_w16_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +#define x264_pixel_avg2_w20_lsx x264_template(pixel_avg2_w20_lsx) +void x264_pixel_avg2_w20_lsx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); + +#define x264_mc_weight_w20_lsx x264_template(mc_weight_w20_lsx) +void x264_mc_weight_w20_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w20_noden_lsx x264_template(mc_weight_w20_noden_lsx) +void x264_mc_weight_w20_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w16_lsx x264_template(mc_weight_w16_lsx) +void x264_mc_weight_w16_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w16_noden_lsx x264_template(mc_weight_w16_noden_lsx) +void x264_mc_weight_w16_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w8_lsx x264_template(mc_weight_w8_lsx) +void x264_mc_weight_w8_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w8_noden_lsx x264_template(mc_weight_w8_noden_lsx) +void x264_mc_weight_w8_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w4_lsx x264_template(mc_weight_w4_lsx) +void x264_mc_weight_w4_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w4_noden_lsx x264_template(mc_weight_w4_noden_lsx) +void x264_mc_weight_w4_noden_lsx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); + +#define x264_mc_copy_w16_lsx x264_template(mc_copy_w16_lsx) +void x264_mc_copy_w16_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_mc_copy_w8_lsx x264_template(mc_copy_w8_lsx) +void x264_mc_copy_w8_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_mc_copy_w4_lsx x264_template(mc_copy_w4_lsx) +void x264_mc_copy_w4_lsx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +#define x264_store_interleave_chroma_lsx x264_template(store_interleave_chroma_lsx) +void x264_store_interleave_chroma_lsx( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); +#define x264_load_deinterleave_chroma_fenc_lsx x264_template(load_deinterleave_chroma_fenc_lsx) +void x264_load_deinterleave_chroma_fenc_lsx( pixel *dst, pixel *src, intptr_t i_src, int height ); +#define x264_load_deinterleave_chroma_fdec_lsx x264_template(load_deinterleave_chroma_fdec_lsx) +void x264_load_deinterleave_chroma_fdec_lsx( pixel *dst, pixel *src, intptr_t i_src, int height ); + +#define x264_plane_copy_interleave_core_lsx x264_template(plane_copy_interleave_core_lsx) +void x264_plane_copy_interleave_core_lsx( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); +#define x264_plane_copy_deinterleave_lsx x264_template(plane_copy_deinterleave_lsx) +void x264_plane_copy_deinterleave_lsx( pixel *dstu, intptr_t i_dstu, + pixel *dstv, intptr_t i_dstv, + pixel *src, intptr_t i_src, int w, int h ); + +#define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx) +void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu, + pixel *dstv, intptr_t i_dstv, + pixel *src, intptr_t i_src, int w, int h ); + +#define x264_prefetch_fenc_420_lsx x264_template(prefetch_fenc_420_lsx) +void x264_prefetch_fenc_420_lsx( uint8_t *pix_y, intptr_t stride_y, + uint8_t *pix_uv, intptr_t stride_uv, + int32_t mb_x ); +#define 
x264_prefetch_fenc_422_lsx x264_template(prefetch_fenc_422_lsx) +void x264_prefetch_fenc_422_lsx( uint8_t *pix_y, intptr_t stride_y, + uint8_t *pix_uv, intptr_t stride_uv, + int32_t mb_x ); +#define x264_prefetch_ref_lsx x264_template(prefetch_ref_lsx) +void x264_prefetch_ref_lsx( uint8_t *pix, intptr_t stride, int32_t parity ); + +#define x264_memcpy_aligned_lsx x264_template(memcpy_aligned_lsx) +void *x264_memcpy_aligned_lsx( void *dst, const void *src, size_t n ); +#define x264_memzero_aligned_lsx x264_template(memzero_aligned_lsx) +void x264_memzero_aligned_lsx( void *p_dst, size_t n ); + +#define x264_hpel_filter_lsx x264_template(hpel_filter_lsx) +void x264_hpel_filter_lsx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * ); +#define x264_frame_init_lowres_core_lsx x264_template(frame_init_lowres_core_lsx) +void x264_frame_init_lowres_core_lsx( uint8_t *, uint8_t *, uint8_t *, uint8_t *, + uint8_t *, intptr_t, intptr_t, int, int ); + +#define x264_pixel_avg_16x8_lasx x264_template(pixel_avg_16x8_lasx) +void x264_pixel_avg_16x8_lasx( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ); +#define x264_pixel_avg_8x16_lasx x264_template(pixel_avg_8x16_lasx) +void x264_pixel_avg_8x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_8x8_lasx x264_template(pixel_avg_8x8_lasx) +void x264_pixel_avg_8x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_8x4_lasx x264_template(pixel_avg_8x4_lasx) +void x264_pixel_avg_8x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_4x16_lasx x264_template(pixel_avg_4x16_lasx) +void x264_pixel_avg_4x16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_4x8_lasx x264_template(pixel_avg_4x8_lasx) +void x264_pixel_avg_4x8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_4x4_lasx x264_template(pixel_avg_4x4_lasx) +void x264_pixel_avg_4x4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_pixel_avg_4x2_lasx x264_template(pixel_avg_4x2_lasx) +void x264_pixel_avg_4x2_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +#define x264_pixel_avg2_w4_lasx x264_template(pixel_avg2_w4_lasx) +void x264_pixel_avg2_w4_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +#define x264_pixel_avg2_w8_lasx x264_template(pixel_avg2_w8_lasx) +void x264_pixel_avg2_w8_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +#define x264_pixel_avg2_w16_lasx x264_template(pixel_avg2_w16_lasx) +void x264_pixel_avg2_w16_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +#define x264_pixel_avg2_w20_lasx x264_template(pixel_avg2_w20_lasx) +void x264_pixel_avg2_w20_lasx ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); + +#define x264_mc_weight_w20_lasx x264_template(mc_weight_w20_lasx) +void x264_mc_weight_w20_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w20_noden_lasx x264_template(mc_weight_w20_noden_lasx) +void x264_mc_weight_w20_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w16_lasx x264_template(mc_weight_w16_lasx) +void x264_mc_weight_w16_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w16_noden_lasx 
x264_template(mc_weight_w16_noden_lasx) +void x264_mc_weight_w16_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w8_lasx x264_template(mc_weight_w8_lasx) +void x264_mc_weight_w8_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w8_noden_lasx x264_template(mc_weight_w8_noden_lasx) +void x264_mc_weight_w8_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w4_lasx x264_template(mc_weight_w4_lasx) +void x264_mc_weight_w4_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); +#define x264_mc_weight_w4_noden_lasx x264_template(mc_weight_w4_noden_lasx) +void x264_mc_weight_w4_noden_lasx( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ); + +#define x264_mc_copy_w16_lasx x264_template(mc_copy_w16_lasx) +void x264_mc_copy_w16_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_mc_copy_w8_lasx x264_template(mc_copy_w8_lasx) +void x264_mc_copy_w8_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +#define x264_mc_copy_w4_lasx x264_template(mc_copy_w4_lasx) +void x264_mc_copy_w4_lasx( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); + +#define x264_plane_copy_interleave_core_lasx x264_template(plane_copy_interleave_core_lasx) +void x264_plane_copy_interleave_core_lasx( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); + +#define x264_plane_copy_deinterleave_lasx x264_template(plane_copy_deinterleave_lasx) +void x264_plane_copy_deinterleave_lasx( pixel *dstu, intptr_t i_dstu, + pixel *dstv, intptr_t i_dstv, + pixel *src, intptr_t i_src, int w, int h ); + +#define x264_memzero_aligned_lasx x264_template(memzero_aligned_lasx) +void x264_memzero_aligned_lasx( void *p_dst, size_t n ); + +#define x264_hpel_filter_lasx x264_template(hpel_filter_lasx) +void x264_hpel_filter_lasx( pixel *, pixel *, pixel *, pixel *, intptr_t, int, int, int16_t * ); +#define x264_frame_init_lowres_core_lasx x264_template(frame_init_lowres_core_lasx) +void x264_frame_init_lowres_core_lasx( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, + intptr_t, intptr_t, int, int ); + +#endif diff --git a/common/loongarch/pixel-a.S b/common/loongarch/pixel-a.S new file mode 100644 index 000000000..b1f84225d --- /dev/null +++ b/common/loongarch/pixel-a.S @@ -0,0 +1,3548 @@ +/***************************************************************************** + * pixel-a.S: LoongArch pixel metrics + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Hecai Yuan + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. 
+ * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "loongson_asm.S" +#include "loongson_util.S" +#if !HIGH_BIT_DEPTH + +const hmul_8p +.byte 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1 +.byte 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1 +endconst + +const mask_ac4b +.short 0, -1, 0, -1, -1, -1, -1, -1 +.short 0, -1, 0, -1, -1, -1, -1, -1 +endconst + +const mask_ac8 +.short 0, -1, -1, -1, -1, -1, -1, -1 +.short 0, -1, -1, -1, -1, -1, -1, -1 +endconst + + +.macro LOAD_INC_8x4W n1, n2, n3, n4, n5 + vld $vr\n1, a0, 0 + vldx $vr\n2, a0, a1 + vldx $vr\n3, a0, t0 + vldx $vr\n4, a0, t1 + xvpermi.d xr18, $xr\n1, 0x05 + xvpermi.d xr19, $xr\n2, 0x05 + xvpermi.d xr20, $xr\n3, 0x05 + xvpermi.d xr21, $xr\n4, 0x05 + add.d a0, a0, t2 + xvdp2.h.bu.b $xr\n1, xr18, $xr\n5 + xvdp2.h.bu.b $xr\n2, xr19, $xr\n5 + xvdp2.h.bu.b $xr\n3, xr20, $xr\n5 + xvdp2.h.bu.b $xr\n4, xr21, $xr\n5 +.endm + +.macro SUMSUB_BADC a, b, c, d + xvadd.h \a, \a, \b + xvadd.h \c, \c, \d + xvadd.h \b, \b, \b + xvadd.h \d, \d, \d + xvsub.h \b, \b, \a + xvsub.h \d, \d, \c +.endm + +.macro HADAMARD4_V a, b, c, d + SUMSUB_BADC \a, \b, \c, \d + SUMSUB_BADC \a, \c, \b, \d +.endm + +.macro HADAMARD_1 a, b, tmp + xmov \tmp, \a + xvpackod.h \a, \b, \a + xvpackev.h \b, \b, \tmp + xvadd.h \tmp, \a, \b + xvsub.h \b, \b, \a + xmov \a, \tmp +.endm + +.macro HADAMARD_2 a, b, c + xvpickod.w \c, \b, \a + xvpickev.w \a, \b, \a + xvadda.h \a, \a, xr17 + xvadda.h \c, \c, xr17 + xvmax.h \a, \a, \c +.endm + +.macro HADAMARD_AC_WXH_LASX w, h +function_x264 pixel_hadamard_ac_\w\()x\h\()_lasx + add.d t0, a1, a1 + add.d t1, a1, t0 + add.d t2, t1, a1 + xvxor.v xr17, xr17, xr17 + move t4, ra + bl x264_8_hadamard_ac_16x8_lasx +.if \h == 16 + xmov xr11, xr9 + xmov xr10, xr8 + bl x264_8_hadamard_ac_16x8_lasx + xvadd.h xr9, xr9, xr11 + xvadd.h xr8, xr8, xr10 +.endif + move ra, t4 + xvhaddw.wu.hu xr8, xr8, xr8 + xvhaddw.du.wu xr8, xr8, xr8 + xvhaddw.qu.du xr8, xr8, xr8 + xvpickve2gr.wu t0, xr8, 0 + xvpickve2gr.wu t1, xr8, 4 + add.d t0, t0, t1 + xvhaddw.wu.hu xr9, xr9, xr9 + xvhaddw.du.wu xr9, xr9, xr9 + xvhaddw.qu.du xr9, xr9, xr9 + xvpickve2gr.wu t1, xr9, 0 + xvpickve2gr.wu t2, xr9, 4 + add.d t1, t1, t2 + srli.d t0, t0, 2 + srli.d t1, t1, 1 + slli.d t0, t0, 32 + add.d a0, t0, t1 +endfunc_x264 +.endm + +function_x264 hadamard_ac_16x8_lasx +/* Load intermediate variable */ + la.local t3, hmul_8p + xvld xr8, t3, 0 + LOAD_INC_8x4W 0, 1, 2, 3, 8 + HADAMARD4_V xr0, xr1, xr2, xr3 + LOAD_INC_8x4W 4, 5, 6, 7, 8 + HADAMARD4_V xr4, xr5, xr6, xr7 + HADAMARD_1 xr0, xr1, xr8 + HADAMARD_1 xr2, xr3, xr8 + xmov xr18, xr1 + HADAMARD_1 xr4, xr5, xr8 + HADAMARD_1 xr6, xr7, xr8 + xmov xr19, xr2 + xmov xr20, xr3 + xvadda.h xr1, xr0, xr4 + xvsub.h xr21, xr4, xr0 + xvadd.h xr0, xr4, xr0 + la.local t3, mask_ac4b + xvld xr8, t3, 0 + xvand.v xr1, xr1, xr8 + xvadda.h xr1, xr1, xr5 + xvadda.h xr1, xr1, xr18 + xvadda.h xr1, xr1, xr19 + xvadda.h xr1, xr1, xr20 + xvadda.h xr1, xr1, xr6 + xvadda.h xr9, xr1, xr7 + + xvadd.h xr3, xr7, xr20 + xvsub.h xr7, xr7, xr20 + xvadd.h xr2, xr6, xr19 + xvsub.h xr6, xr6, xr19 + xvadd.h xr1, xr5, xr18 + xvsub.h xr5, xr5, xr18 + + HADAMARD_2 xr3, xr7, xr18 + HADAMARD_2 xr2, xr6, xr19 + HADAMARD_2 xr1, xr5, xr20 + + xvpickod.w xr5, xr21, xr0 + xvpickev.w xr0, xr21, xr0 + xmov xr4, xr5 + xvadd.h xr5, xr0, xr4 + xvsub.h xr4, xr4, xr0 + + xvadd.h xr2, xr2, xr3 + xvadd.h xr2, xr2, xr1 + xvadd.h xr2, xr2, xr2 + + la.local t3, mask_ac8 + xvld xr8, t3, 0 + 
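+    // The mask_ac* constants zero the DC coefficient of each transformed block so
+    // that only AC magnitudes are accumulated, which is what hadamard_ac reports.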
xvand.v xr0, xr5, xr8 + + xvadda.h xr2, xr2, xr4 + xvadda.h xr8, xr2, xr0 +endfunc_x264 + +HADAMARD_AC_WXH_LASX 16, 8 +HADAMARD_AC_WXH_LASX 16, 16 + +/* uint64_t hadamard_ac_8x8_lasx(uint8_t *p_pix, + * int32_t i_stride) + */ +function_x264 hadamard_ac_8x8_lasx +/* Load intermediate variable */ + slli.d t0, a1, 1 + add.d t1, a1, t0 + slli.d t2, a1, 2 + + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + add.d a0, a0, t2 + LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 + + vilvl.d vr8, vr1, vr0 + vilvl.d vr9, vr3, vr2 + vilvl.d vr10, vr5, vr4 + vilvl.d vr11, vr7, vr6 + xvpermi.q xr8, xr10, 0x02 + xvpermi.q xr9, xr11, 0x02 + xvpickev.b xr12, xr9, xr8 + xvpickod.b xr13, xr9, xr8 + xvaddwev.h.bu xr8, xr12, xr13 + xvaddwod.h.bu xr9, xr12, xr13 + xvsubwev.h.bu xr10, xr12, xr13 + xvsubwod.h.bu xr11, xr12, xr13 + xvadd.h xr12, xr8, xr9 + xvadd.h xr13, xr10, xr11 + xvsub.h xr14, xr8, xr9 + xvsub.h xr15, xr10, xr11 + + xvilvl.h xr8, xr13, xr12 + xvilvh.h xr9, xr13, xr12 + xvilvl.h xr10, xr15, xr14 + xvilvh.h xr11, xr15, xr14 + xvilvl.w xr12, xr10, xr8 + xvilvh.w xr13, xr10, xr8 + xvilvl.w xr14, xr11, xr9 + xvilvh.w xr15, xr11, xr9 + xvadd.h xr8, xr12, xr13 + xvadd.h xr9, xr14, xr15 + xvsub.h xr10, xr12, xr13 + xvsub.h xr11, xr14, xr15 + xvadd.h xr12, xr8, xr9 + xvadd.h xr13, xr10, xr11 + xvsub.h xr14, xr8, xr9 + xvsub.h xr15, xr10, xr11 + + vpickve2gr.hu t3, vr12, 0 + vpickve2gr.hu t4, vr12, 4 + xvor.v xr16, xr12, xr12 + xvpermi.q xr16, xr16, 0x31 + vpickve2gr.hu t5, vr16, 0 + vpickve2gr.hu t6, vr16, 4 + add.d t3, t3, t4 + add.d t5, t5, t6 + add.d t3, t3, t5 + + xvadda.h xr16, xr12, xr13 + xvadda.h xr18, xr14, xr15 + xvadd.h xr16, xr16, xr18 + xvpermi.d xr17, xr16, 0x4e + xvadd.h xr18, xr16, xr17 + xvhaddw.wu.hu xr18, xr18, xr18 + xvhaddw.du.wu xr18, xr18, xr18 + xvhaddw.qu.du xr18, xr18, xr18 + xvpickve2gr.wu t4, xr18, 0 + + xvpackev.h xr8, xr13, xr12 + xvpackev.h xr9, xr15, xr14 + xvpackod.h xr10, xr13, xr12 + xvpackod.h xr11, xr15, xr14 + xvilvl.d xr12, xr9, xr8 + xvilvh.d xr13, xr9, xr8 + xvilvl.d xr14, xr11, xr10 + xvilvh.d xr15, xr11, xr10 + xvor.v xr16, xr12, xr12 + xvor.v xr17, xr13, xr13 + xvpermi.q xr12, xr14, 0x02 + xvpermi.q xr13, xr14, 0x12 + xvpermi.q xr16, xr15, 0x03 + xvpermi.q xr17, xr15, 0x13 + + xvadd.h xr8, xr12, xr13 + xvsub.h xr9, xr12, xr13 + xvadd.h xr10, xr16, xr17 + xvsub.h xr11, xr16, xr17 + xvadd.h xr12, xr8, xr10 + xvadd.h xr13, xr9, xr11 + xvsub.h xr14, xr8, xr10 + xvsub.h xr15, xr9, xr11 + xvadda.h xr16, xr12, xr13 + xvadda.h xr17, xr14, xr15 + xvadd.h xr18, xr16, xr17 + xvpermi.d xr19, xr18, 0x4e + xvadd.d xr19, xr18, xr19 + xvhaddw.wu.hu xr19, xr19, xr19 + xvhaddw.du.wu xr19, xr19, xr19 + xvhaddw.qu.du xr19, xr19, xr19 + xvpickve2gr.wu t5, xr19, 0 + + sub.d t4, t4, t3 + sub.d t5, t5, t3 + slli.d t5, t5, 32 + add.d a0, t5, t4 +endfunc_x264 + +/* int x264_pixel_satd_16x16_lasx(pixel *pix1, intptr_t i_pix1, + * pixel *pix2, intptr_t i_pix2) + */ +function_x264 pixel_satd_16x16_lasx + slli.d t2, a1, 1 + slli.d t3, a3, 1 + slli.d t4, a1, 2 + slli.d t5, a3, 2 + add.d t6, a1, t2 + add.d t7, a3, t3 + + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t2, t6, vr0, vr1, vr2, vr3 + add.d a0, a0, t4 + LSX_LOADX_4 a0, a1, t2, t6, vr4, vr5, vr6, vr7 + LSX_LOADX_4 a2, a3, t3, t7, vr8, vr9, vr10, vr11 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t7, vr12, vr13, vr14, vr15 + xvpermi.q xr0, xr4, 0x02 + xvpermi.q xr1, xr5, 0x02 + xvpermi.q xr2, xr6, 0x02 + xvpermi.q xr3, xr7, 0x02 + xvpermi.q xr8, xr12, 0x02 + xvpermi.q xr9, xr13, 0x02 + xvpermi.q xr10, xr14, 0x02 + xvpermi.q xr11, 
xr15, 0x02 + + // HADAMARD4 + xvsubwev.h.bu xr4, xr0, xr8 + xvsubwod.h.bu xr5, xr0, xr8 + xvsubwev.h.bu xr6, xr1, xr9 + xvsubwod.h.bu xr7, xr1, xr9 + xvsubwev.h.bu xr8, xr2, xr10 + xvsubwod.h.bu xr9, xr2, xr10 + xvsubwev.h.bu xr12, xr3, xr11 + xvsubwod.h.bu xr13, xr3, xr11 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvadd.h xr4, xr8, xr9 + xvsub.h xr5, xr8, xr9 + xvadd.h xr6, xr12, xr13 + xvsub.h xr7, xr12, xr13 + xvpackev.h xr8, xr5, xr4 + xvpackod.h xr9, xr5, xr4 + xvpackev.h xr10, xr7, xr6 + xvpackod.h xr11, xr7, xr6 + xvpackev.h xr4, xr1, xr0 + xvpackod.h xr5, xr1, xr0 + xvpackev.h xr6, xr3, xr2 + xvpackod.h xr7, xr3, xr2 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvadd.h xr4, xr8, xr9 + xvsub.h xr5, xr8, xr9 + xvadd.h xr6, xr10, xr11 + xvsub.h xr7, xr10, xr11 + xvilvl.h xr8, xr1, xr0 + xvilvl.h xr9, xr3, xr2 + xvilvl.h xr10, xr5, xr4 + xvilvl.h xr11, xr7, xr6 + xvilvh.h xr0, xr1, xr0 + xvilvh.h xr1, xr3, xr2 + xvilvh.h xr2, xr5, xr4 + xvilvh.h xr3, xr7, xr6 + xvadd.h xr4, xr8, xr9 + xvadd.h xr6, xr10, xr11 + xvsub.h xr5, xr8, xr9 + xvsub.h xr7, xr10, xr11 + xvadd.h xr8, xr4, xr6 + xvadd.h xr9, xr5, xr7 + xvsub.h xr10, xr4, xr6 + xvsub.h xr11, xr5, xr7 + xvadd.h xr4, xr0, xr1 + xvadd.h xr6, xr2, xr3 + xvsub.h xr5, xr0, xr1 + xvsub.h xr7, xr2, xr3 + xvadd.h xr0, xr4, xr6 + xvadd.h xr1, xr5, xr7 + xvsub.h xr2, xr4, xr6 + xvsub.h xr3, xr5, xr7 + xvadda.h xr8, xr8, xr9 + xvadda.h xr9, xr10, xr11 + xvadda.h xr0, xr0, xr1 + xvadda.h xr1, xr2, xr3 + xvadd.h xr8, xr8, xr9 + xvadd.h xr0, xr0, xr1 + xvadd.h xr16, xr0, xr8 + + add.d a0, a0, t4 + add.d a2, a2, t5 + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t2, t6, vr0, vr1, vr2, vr3 + add.d a0, a0, t4 + LSX_LOADX_4 a0, a1, t2, t6, vr4, vr5, vr6, vr7 + LSX_LOADX_4 a2, a3, t3, t7, vr8, vr9, vr10, vr11 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t7, vr12, vr13, vr14, vr15 + xvpermi.q xr0, xr4, 0x02 + xvpermi.q xr1, xr5, 0x02 + xvpermi.q xr2, xr6, 0x02 + xvpermi.q xr3, xr7, 0x02 + xvpermi.q xr8, xr12, 0x02 + xvpermi.q xr9, xr13, 0x02 + xvpermi.q xr10, xr14, 0x02 + xvpermi.q xr11, xr15, 0x02 + + // HADAMARD4 + xvsubwev.h.bu xr4, xr0, xr8 + xvsubwod.h.bu xr5, xr0, xr8 + xvsubwev.h.bu xr6, xr1, xr9 + xvsubwod.h.bu xr7, xr1, xr9 + xvsubwev.h.bu xr8, xr2, xr10 + xvsubwod.h.bu xr9, xr2, xr10 + xvsubwev.h.bu xr12, xr3, xr11 + xvsubwod.h.bu xr13, xr3, xr11 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvadd.h xr4, xr8, xr9 + xvsub.h xr5, xr8, xr9 + xvadd.h xr6, xr12, xr13 + xvsub.h xr7, xr12, xr13 + xvpackev.h xr8, xr5, xr4 + xvpackod.h xr9, xr5, xr4 + xvpackev.h xr10, xr7, xr6 + xvpackod.h xr11, xr7, xr6 + xvpackev.h xr4, xr1, xr0 + xvpackod.h xr5, xr1, xr0 + xvpackev.h xr6, xr3, xr2 + xvpackod.h xr7, xr3, xr2 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvadd.h xr4, xr8, xr9 + xvsub.h xr5, xr8, xr9 + xvadd.h xr6, xr10, xr11 + xvsub.h xr7, xr10, xr11 + xvilvl.h xr8, xr1, xr0 + xvilvl.h xr9, xr3, xr2 + xvilvl.h xr10, xr5, xr4 + xvilvl.h xr11, xr7, xr6 + xvilvh.h xr0, xr1, xr0 + xvilvh.h xr1, xr3, xr2 + xvilvh.h xr2, xr5, xr4 + xvilvh.h xr3, xr7, xr6 + xvadd.h xr4, xr8, xr9 + xvadd.h xr6, xr10, xr11 + xvsub.h xr5, xr8, xr9 + xvsub.h xr7, xr10, xr11 + xvadd.h xr8, xr4, xr6 + xvadd.h xr9, xr5, xr7 + xvsub.h xr10, xr4, xr6 + xvsub.h xr11, xr5, xr7 + xvadd.h xr4, xr0, xr1 + xvadd.h xr6, xr2, xr3 + xvsub.h xr5, xr0, xr1 + xvsub.h xr7, xr2, 
xr3 + xvadd.h xr0, xr4, xr6 + xvadd.h xr1, xr5, xr7 + xvsub.h xr2, xr4, xr6 + xvsub.h xr3, xr5, xr7 + xvadda.h xr8, xr8, xr9 + xvadda.h xr9, xr10, xr11 + xvadda.h xr0, xr0, xr1 + xvadda.h xr1, xr2, xr3 + xvadd.h xr8, xr8, xr9 + xvadd.h xr0, xr0, xr1 + xvadd.h xr0, xr0, xr8 + xvadd.h xr0, xr0, xr16 + + xvhaddw.wu.hu xr0, xr0, xr0 + xvhaddw.du.wu xr0, xr0, xr0 + xvhaddw.qu.du xr0, xr0, xr0 + xvpickve2gr.wu t0, xr0, 0 + xvpickve2gr.wu t1, xr0, 4 + add.w t0, t0, t1 + srli.d a0, t0, 1 +endfunc_x264 + +/* int x264_pixel_satd_16x8_lasx(pixel *pix1, intptr_t i_pix1, + * pixel *pix2, intptr_t i_pix2) + */ +function_x264 pixel_satd_16x8_lasx + slli.d t2, a1, 1 + slli.d t3, a3, 1 + slli.d t4, t2, 1 + slli.d t5, t3, 1 + add.d t6, a1, t2 + add.d t7, a3, t3 + + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t2, t6, vr0, vr1, vr2, vr3 + add.d a0, a0, t4 + LSX_LOADX_4 a0, a1, t2, t6, vr4, vr5, vr6, vr7 + LSX_LOADX_4 a2, a3, t3, t7, vr8, vr9, vr10, vr11 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t7, vr12, vr13, vr14, vr15 + xvpermi.q xr0, xr4, 0x02 + xvpermi.q xr1, xr5, 0x02 + xvpermi.q xr2, xr6, 0x02 + xvpermi.q xr3, xr7, 0x02 + xvpermi.q xr8, xr12, 0x02 + xvpermi.q xr9, xr13, 0x02 + xvpermi.q xr10, xr14, 0x02 + xvpermi.q xr11, xr15, 0x02 + + // HADAMARD4 + xvsubwev.h.bu xr4, xr0, xr8 + xvsubwod.h.bu xr5, xr0, xr8 + xvsubwev.h.bu xr6, xr1, xr9 + xvsubwod.h.bu xr7, xr1, xr9 + xvsubwev.h.bu xr8, xr2, xr10 + xvsubwod.h.bu xr9, xr2, xr10 + xvsubwev.h.bu xr12, xr3, xr11 + xvsubwod.h.bu xr13, xr3, xr11 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvadd.h xr4, xr8, xr9 + xvsub.h xr5, xr8, xr9 + xvadd.h xr6, xr12, xr13 + xvsub.h xr7, xr12, xr13 + xvpackev.h xr8, xr5, xr4 + xvpackod.h xr9, xr5, xr4 + xvpackev.h xr10, xr7, xr6 + xvpackod.h xr11, xr7, xr6 + xvpackev.h xr4, xr1, xr0 + xvpackod.h xr5, xr1, xr0 + xvpackev.h xr6, xr3, xr2 + xvpackod.h xr7, xr3, xr2 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvadd.h xr4, xr8, xr9 + xvsub.h xr5, xr8, xr9 + xvadd.h xr6, xr10, xr11 + xvsub.h xr7, xr10, xr11 + xvilvl.h xr8, xr1, xr0 + xvilvl.h xr9, xr3, xr2 + xvilvl.h xr10, xr5, xr4 + xvilvl.h xr11, xr7, xr6 + xvilvh.h xr0, xr1, xr0 + xvilvh.h xr1, xr3, xr2 + xvilvh.h xr2, xr5, xr4 + xvilvh.h xr3, xr7, xr6 + xvadd.h xr4, xr8, xr9 + xvadd.h xr6, xr10, xr11 + xvsub.h xr5, xr8, xr9 + xvsub.h xr7, xr10, xr11 + xvadd.h xr8, xr4, xr6 + xvadd.h xr9, xr5, xr7 + xvsub.h xr10, xr4, xr6 + xvsub.h xr11, xr5, xr7 + xvadd.h xr4, xr0, xr1 + xvadd.h xr6, xr2, xr3 + xvsub.h xr5, xr0, xr1 + xvsub.h xr7, xr2, xr3 + xvadd.h xr0, xr4, xr6 + xvadd.h xr1, xr5, xr7 + xvsub.h xr2, xr4, xr6 + xvsub.h xr3, xr5, xr7 + xvadda.h xr8, xr8, xr9 + xvadda.h xr9, xr10, xr11 + xvadda.h xr0, xr0, xr1 + xvadda.h xr1, xr2, xr3 + xvadd.h xr8, xr8, xr9 + xvadd.h xr0, xr0, xr1 + xvadd.h xr0, xr0, xr8 + + xvhaddw.wu.hu xr0, xr0, xr0 + xvhaddw.du.wu xr0, xr0, xr0 + xvhaddw.qu.du xr0, xr0, xr0 + xvpickve2gr.wu t0, xr0, 0 + xvpickve2gr.wu t1, xr0, 4 + add.w t0, t0, t1 + srli.d a0, t0, 1 +endfunc_x264 + +/* int x264_pixel_satd_8x16_lasx(pixel *pix1, intptr_t i_pix1, + * pixel *pix2, intptr_t i_pix2) + */ +function_x264 pixel_satd_8x16_lasx + slli.d t2, a1, 1 + add.d t3, a1, t2 + slli.d t4, a1, 2 + slli.d t5, a3, 1 + add.d t6, a3, t5 + slli.d t7, a3, 2 + + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t2, t3, vr0, vr1, vr2, vr3 + add.d a0, a0, t4 + LSX_LOADX_4 a0, a1, t2, t3, vr4, vr5, vr6, vr7 + LSX_LOADX_4 a2, a3, t5, t6, vr8, vr9, vr10, vr11 + 
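
The xvhaddw/xvpickve2gr sequence that closes pixel_satd_16x16_lasx and pixel_satd_16x8_lasx above is the horizontal-reduction epilogue shared by most kernels in this file: the 16-bit lane sums are pairwise widened and folded until each 128-bit half holds one total, the two halves are added in scalar registers, and the SATD total is then halved. A scalar sketch of what that epilogue computes (placeholder name, not patch code):

    #include <stdint.h>

    /* Fold all sixteen 16-bit lanes of a 256-bit accumulator into one
     * scalar, then halve it, as the SATD epilogues above do. */
    static uint32_t satd_epilogue( const uint16_t lanes[16] )
    {
        uint32_t total = 0;
        for( int i = 0; i < 16; i++ )
            total += lanes[i];
        return total >> 1;
    }
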
add.d a2, a2, t7 + LSX_LOADX_4 a2, a3, t5, t6, vr12, vr13, vr14, vr15 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr2, vr5, vr4 + vilvl.d vr3, vr7, vr6 + xvpermi.q xr0, xr2, 0x02 + xvpermi.q xr1, xr3, 0x02 + vilvl.d vr2, vr9, vr8 + vilvl.d vr3, vr11, vr10 + vilvl.d vr4, vr13, vr12 + vilvl.d vr5, vr15, vr14 + xvpermi.q xr2, xr4, 0x02 + xvpermi.q xr3, xr5, 0x02 + + // HADAMARD4 + xvsubwev.h.bu xr4, xr0, xr2 + xvsubwod.h.bu xr5, xr0, xr2 + xvsubwev.h.bu xr6, xr1, xr3 + xvsubwod.h.bu xr7, xr1, xr3 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvpackev.h xr4, xr1, xr0 + xvpackod.h xr5, xr1, xr0 + xvpackev.h xr6, xr3, xr2 + xvpackod.h xr7, xr3, xr2 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvilvl.h xr4, xr1, xr0 + xvilvh.h xr5, xr1, xr0 + xvilvl.h xr6, xr3, xr2 + xvilvh.h xr7, xr3, xr2 + xvadd.h xr0, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr1, xr4, xr5 + xvsub.h xr3, xr6, xr7 + xvadd.h xr4, xr0, xr2 + xvadd.h xr5, xr1, xr3 + xvsub.h xr6, xr0, xr2 + xvsub.h xr7, xr1, xr3 + xvadda.h xr0, xr4, xr5 + xvadda.h xr1, xr6, xr7 + xvadd.h xr16, xr0, xr1 + add.d a0, a0, t4 + add.d a2, a2, t7 + + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t2, t3, vr0, vr1, vr2, vr3 + add.d a0, a0, t4 + LSX_LOADX_4 a0, a1, t2, t3, vr4, vr5, vr6, vr7 + LSX_LOADX_4 a2, a3, t5, t6, vr8, vr9, vr10, vr11 + add.d a2, a2, t7 + LSX_LOADX_4 a2, a3, t5, t6, vr12, vr13, vr14, vr15 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr2, vr5, vr4 + vilvl.d vr3, vr7, vr6 + xvpermi.q xr0, xr2, 0x02 + xvpermi.q xr1, xr3, 0x02 + vilvl.d vr2, vr9, vr8 + vilvl.d vr3, vr11, vr10 + vilvl.d vr4, vr13, vr12 + vilvl.d vr5, vr15, vr14 + xvpermi.q xr2, xr4, 0x02 + xvpermi.q xr3, xr5, 0x02 + + // HADAMARD4 + xvsubwev.h.bu xr4, xr0, xr2 + xvsubwod.h.bu xr5, xr0, xr2 + xvsubwev.h.bu xr6, xr1, xr3 + xvsubwod.h.bu xr7, xr1, xr3 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvpackev.h xr4, xr1, xr0 + xvpackod.h xr5, xr1, xr0 + xvpackev.h xr6, xr3, xr2 + xvpackod.h xr7, xr3, xr2 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvilvl.h xr4, xr1, xr0 + xvilvh.h xr5, xr1, xr0 + xvilvl.h xr6, xr3, xr2 + xvilvh.h xr7, xr3, xr2 + xvadd.h xr0, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr1, xr4, xr5 + xvsub.h xr3, xr6, xr7 + xvadd.h xr4, xr0, xr2 + xvadd.h xr5, xr1, xr3 + xvsub.h xr6, xr0, xr2 + xvsub.h xr7, xr1, xr3 + xvadda.h xr0, xr4, xr5 + xvadda.h xr1, xr6, xr7 + xvadd.h xr0, xr0, xr1 + xvadd.h xr0, xr0, xr16 + xvhaddw.wu.hu xr0, xr0, xr0 + xvhaddw.du.wu xr0, xr0, xr0 + xvhaddw.qu.du xr0, xr0, xr0 + xvpickve2gr.wu t0, xr0, 0 + xvpickve2gr.wu t1, xr0, 4 + add.w t0, t0, t1 + srli.d a0, t0, 1 +endfunc_x264 + +/* int x264_pixel_satd_8x8_lasx(pixel *pix1, intptr_t i_pix1, + * pixel *pix2, intptr_t i_pix2) + */ +function_x264 pixel_satd_8x8_lasx + slli.d t2, a1, 1 + slli.d t5, a3, 1 + add.d t3, a1, t2 + add.d t6, a3, t5 + slli.d t4, t2, 1 + slli.d t7, t5, 1 + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t2, t3, vr0, vr1, vr2, vr3 + add.d a0, a0, t4 + LSX_LOADX_4 a0, a1, t2, t3, vr4, vr5, vr6, vr7 + LSX_LOADX_4 a2, a3, t5, t6, vr8, vr9, vr10, vr11 + add.d a2, a2, t7 + LSX_LOADX_4 a2, a3, t5, t6, vr12, vr13, vr14, vr15 + + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr2, vr5, vr4 + vilvl.d vr3, vr7, vr6 + xvpermi.q xr0, xr2, 0x02 + xvpermi.q xr1, xr3, 0x02 + vilvl.d vr2, vr9, vr8 + vilvl.d vr3, vr11, vr10 
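
All of the pixel_satd_* kernels in this file compute the usual SATD metric: the residual pix1 - pix2 is run through a 4x4 Hadamard transform in both directions, the absolute transformed coefficients are summed, and the total is halved (the final srli-by-1 in each epilogue). The scalar C sketch below is illustrative only, written for this annotation under standard SATD semantics; it is not the project's reference implementation, and satd_4x4_ref is a placeholder name.

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative 4x4 SATD: Hadamard-transform the difference block
     * horizontally and vertically, sum absolute coefficients, halve. */
    static int satd_4x4_ref( const uint8_t *pix1, intptr_t i_pix1,
                             const uint8_t *pix2, intptr_t i_pix2 )
    {
        int d[4][4], sum = 0;

        for( int y = 0; y < 4; y++, pix1 += i_pix1, pix2 += i_pix2 )
            for( int x = 0; x < 4; x++ )
                d[y][x] = pix1[x] - pix2[x];

        for( int y = 0; y < 4; y++ )        /* horizontal 4-point Hadamard */
        {
            int s01 = d[y][0] + d[y][1], t01 = d[y][0] - d[y][1];
            int s23 = d[y][2] + d[y][3], t23 = d[y][2] - d[y][3];
            d[y][0] = s01 + s23; d[y][1] = t01 + t23;
            d[y][2] = s01 - s23; d[y][3] = t01 - t23;
        }
        for( int x = 0; x < 4; x++ )        /* vertical 4-point Hadamard */
        {
            int s01 = d[0][x] + d[1][x], t01 = d[0][x] - d[1][x];
            int s23 = d[2][x] + d[3][x], t23 = d[2][x] - d[3][x];
            sum += abs( s01 + s23 ) + abs( t01 + t23 )
                 + abs( s01 - s23 ) + abs( t01 - t23 );
        }
        return sum >> 1;
    }
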
+ vilvl.d vr4, vr13, vr12 + vilvl.d vr5, vr15, vr14 + xvpermi.q xr2, xr4, 0x02 + xvpermi.q xr3, xr5, 0x02 + + // HADAMARD4 + xvsubwev.h.bu xr4, xr0, xr2 + xvsubwod.h.bu xr5, xr0, xr2 + xvsubwev.h.bu xr6, xr1, xr3 + xvsubwod.h.bu xr7, xr1, xr3 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvpackev.h xr4, xr1, xr0 + xvpackod.h xr5, xr1, xr0 + xvpackev.h xr6, xr3, xr2 + xvpackod.h xr7, xr3, xr2 + xvadd.h xr0, xr4, xr5 + xvsub.h xr1, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr3, xr6, xr7 + xvilvl.h xr4, xr1, xr0 + xvilvh.h xr5, xr1, xr0 + xvilvl.h xr6, xr3, xr2 + xvilvh.h xr7, xr3, xr2 + xvadd.h xr0, xr4, xr5 + xvadd.h xr2, xr6, xr7 + xvsub.h xr1, xr4, xr5 + xvsub.h xr3, xr6, xr7 + xvadd.h xr4, xr0, xr2 + xvadd.h xr5, xr1, xr3 + xvsub.h xr6, xr0, xr2 + xvsub.h xr7, xr1, xr3 + xvadda.h xr0, xr4, xr5 + xvadda.h xr1, xr6, xr7 + xvadd.h xr0, xr0, xr1 + xvhaddw.wu.hu xr0, xr0, xr0 + xvhaddw.du.wu xr0, xr0, xr0 + xvhaddw.qu.du xr0, xr0, xr0 + xvpickve2gr.wu t0, xr0, 0 + xvpickve2gr.wu t1, xr0, 4 + add.w t0, t0, t1 + srli.d a0, t0, 1 +endfunc_x264 + +/* int x264_pixel_satd_8x4_lasx(pixel *pix1, intptr_t i_pix1, + * pixel *pix2, intptr_t i_pix2) + */ +function_x264 pixel_satd_8x4_lasx + slli.d t2, a1, 1 + slli.d t3, a3, 1 + add.d t4, a1, t2 + add.d t5, a3, t3 + + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 + LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, xr12, 0x13 + xvadd.h xr9, xr11, xr13 + xvsub.h xr10, xr11, xr13 + xvpackev.d xr11, xr10, xr9 + xvpackod.d xr12, xr10, xr9 + xvadda.h xr11, xr11, xr12 + xvhaddw.wu.hu xr11, xr11, xr11 + xvhaddw.du.wu xr11, xr11, xr11 + xvhaddw.qu.du xr11, xr11, xr11 + xvpickve2gr.wu t4, xr11, 0 + xvpickve2gr.wu t5, xr11, 4 + add.d t4, t4, t5 + srli.d a0, t4, 1 +endfunc_x264 + +/* int x264_pixel_satd_4x16_lasx(pixel *pix1, intptr_t i_pix1, + * pixel *pix2, intptr_t i_pix2) + */ +function_x264 pixel_satd_4x16_lasx + slli.d t2, a1, 1 + slli.d t3, a3, 1 + add.d t4, a1, t2 + add.d t5, a3, t3 + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 + LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 + vilvl.w vr1, vr2, vr1 + vilvl.w vr3, vr4, vr3 + vilvl.d vr9, vr3, vr1 + vilvl.w vr5, vr6, vr5 + vilvl.w vr7, vr8, vr7 + vilvl.d vr10, vr7, vr5 + + slli.d t0, a1, 2 + slli.d t1, a3, 2 + // Load data from pix1 and pix2 + add.d a0, a0, t0 + LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 + add.d a2, a2, t1 + LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 + vilvl.w vr1, vr2, vr1 + vilvl.w vr3, vr4, vr3 + vilvl.d vr1, vr3, vr1 + vilvl.w vr5, vr6, vr5 + vilvl.w vr7, vr8, vr7 + vilvl.d vr5, vr7, vr5 + xvpermi.q xr1, xr9, 0x20 + xvpermi.q xr5, xr10, 0x20 + + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* b0 + b1 */ + 
xvsub.h xr12, xr9, xr10 /* b0 - b1 */ + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadda.h xr9, xr9, xr10 + xvhaddw.wu.hu xr9, xr9, xr9 + xvhaddw.du.wu xr9, xr9, xr9 + xvhaddw.qu.du xr9, xr9, xr9 + xvpickve2gr.wu t6, xr9, 0 + xvpickve2gr.wu t7, xr9, 4 + add.d t7, t6, t7 + + // Load data from pix1 and pix2 + add.d a0, a0, t0 + LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 + add.d a2, a2, t1 + LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 + vilvl.w vr1, vr2, vr1 + vilvl.w vr3, vr4, vr3 + vilvl.d vr9, vr3, vr1 + vilvl.w vr5, vr6, vr5 + vilvl.w vr7, vr8, vr7 + vilvl.d vr10, vr7, vr5 + + // Load data from pix1 and pix2 + add.d a0, a0, t0 + LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 + add.d a2, a2, t1 + LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 + vilvl.w vr1, vr2, vr1 + vilvl.w vr3, vr4, vr3 + vilvl.d vr1, vr3, vr1 + vilvl.w vr5, vr6, vr5 + vilvl.w vr7, vr8, vr7 + vilvl.d vr5, vr7, vr5 + xvpermi.q xr1, xr9, 0x20 + xvpermi.q xr5, xr10, 0x20 + + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* b0 + b1 */ + xvsub.h xr12, xr9, xr10 /* b0 - b1 */ + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadda.h xr9, xr9, xr10 + xvhaddw.wu.hu xr9, xr9, xr9 + xvhaddw.du.wu xr9, xr9, xr9 + xvhaddw.qu.du xr9, xr9, xr9 + xvpickve2gr.wu t6, xr9, 0 + xvpickve2gr.wu t5, xr9, 4 + add.d t6, t5, t6 + add.d t7, t6, t7 + srli.d a0, t7, 1 +endfunc_x264 + +/* int x264_pixel_satd_4x8_lasx(pixel *pix1, intptr_t i_pix1, + * pixel *pix2, intptr_t i_pix2) + */ +function_x264 pixel_satd_4x8_lasx + slli.d t2, a1, 1 + slli.d t3, a3, 1 + add.d t4, a1, t2 + add.d t5, a3, t3 + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 + LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 + vilvl.w vr1, vr2, vr1 + vilvl.w vr3, vr4, vr3 + vilvl.d vr9, vr3, vr1 + vilvl.w vr5, vr6, vr5 + vilvl.w vr7, vr8, vr7 + vilvl.d vr10, vr7, vr5 + + slli.d t0, a1, 2 + slli.d t1, a3, 2 + add.d a0, a0, t0 + add.d a2, a2, t1 + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t2, t4, vr1, vr2, vr3, vr4 + LSX_LOADX_4 a2, a3, t3, t5, vr5, vr6, vr7, vr8 + vilvl.w vr1, vr2, vr1 + vilvl.w vr3, vr4, vr3 + vilvl.d vr1, vr3, vr1 + vilvl.w vr5, vr6, vr5 + vilvl.w vr7, vr8, vr7 + vilvl.d vr5, vr7, vr5 + xvpermi.q xr1, xr9, 0x20 + xvpermi.q xr5, xr10, 0x20 + + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* b0 + b1 */ + xvsub.h xr12, xr9, xr10 /* b0 - b1 */ + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadda.h xr9, xr9, xr10 + 
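
The 4x8 and 4x16 variants around this point reuse the same 4x4 transform core on successive groups of four rows and only reduce and halve once at the end. Because each 4x4 block's coefficient total is even, halving per block or halving the grand total gives the same result, so conceptually the larger sizes are just tiled sums, as in this sketch (building on the satd_4x4_ref sketch above; names are placeholders, not patch code):

    /* Larger SATD sizes tile the 4x4 core; e.g. 4x8 is the sum of two
     * vertically stacked 4x4 blocks. */
    static int satd_4x8_ref( const uint8_t *pix1, intptr_t i_pix1,
                             const uint8_t *pix2, intptr_t i_pix2 )
    {
        return satd_4x4_ref( pix1,              i_pix1, pix2,              i_pix2 )
             + satd_4x4_ref( pix1 + 4 * i_pix1, i_pix1, pix2 + 4 * i_pix2, i_pix2 );
    }
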
xvhaddw.wu.hu xr9, xr9, xr9 + xvhaddw.du.wu xr9, xr9, xr9 + xvhaddw.qu.du xr9, xr9, xr9 + xvpickve2gr.wu t6, xr9, 0 + xvpickve2gr.wu t7, xr9, 4 + add.d t6, t6, t7 + srli.d a0, t6, 1 +endfunc_x264 + +/* int x264_pixel_satd_4x4_lsx(pixel *pix1, intptr_t i_pix1, + * pixel *pix2, intptr_t i_pix2) + */ +.macro pixel_satd_4x4_lsx_core out + vilvl.w vr1, vr2, vr1 + vilvl.w vr3, vr4, vr3 + vilvl.d vr1, vr3, vr1 + vilvl.w vr5, vr6, vr5 + vilvl.w vr7, vr8, vr7 + vilvl.d vr5, vr7, vr5 + + vsubwev.h.bu vr9, vr1, vr5 + vsubwod.h.bu vr10, vr1, vr5 + vadd.h vr11, vr9, vr10 /* a0 + a1 */ + vsub.h vr12, vr9, vr10 /* a0 - a1 */ + vpackev.h vr9, vr12, vr11 + vpackod.h vr10, vr12, vr11 + vadd.h vr11, vr9, vr10 /* b0 + b1 */ + vsub.h vr12, vr9, vr10 /* b0 - b1 */ + vpackev.w vr9, vr12, vr11 + vpackod.w vr10, vr12, vr11 + vadd.h vr11, vr9, vr10 /* HADAMARD4 */ + vsub.h vr12, vr9, vr10 + vpackev.d vr9, vr12, vr11 + vpackod.d vr10, vr12, vr11 + vadd.h vr11, vr9, vr10 + vsub.h vr12, vr9, vr10 + vpackev.d vr9, vr12, vr11 + vpackod.d vr10, vr12, vr11 + vadda.h \out, vr9, vr10 +.endm + +function_x264 pixel_satd_4x4_lsx + slli.d t2, a1, 1 + slli.d t3, a3, 1 + add.d t4, a1, t2 + add.d t5, a3, t3 + + // Load data from pix1 and pix2 + FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + pixel_satd_4x4_lsx_core vr13 + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.du.wu vr13, vr13, vr13 + vhaddw.qu.du vr13, vr13, vr13 + vpickve2gr.wu t5, vr13, 0 + srli.d a0, t5, 1 +endfunc_x264 + +/* + * int pixel_ssd_16x16_lasx(const Pixel *pix1, intptr_t stride_pix1, + * const Pixel *pix2, intptr_t stride_pix2) + */ +function_x264 pixel_ssd_16x16_lasx + slli.d t0, a1, 1 + add.d t1, a1, t0 + add.d t2, a1, t1 + slli.d t3, a3, 1 + add.d t4, a3, t3 + add.d t5, a3, t4 + + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + add.d a0, a0, t2 + LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 + LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + vext2xv.hu.bu xr4, xr4 + vext2xv.hu.bu xr5, xr5 + vext2xv.hu.bu xr6, xr6 + vext2xv.hu.bu xr7, xr7 + vext2xv.hu.bu xr8, xr8 + vext2xv.hu.bu xr9, xr9 + vext2xv.hu.bu xr10, xr10 + vext2xv.hu.bu xr11, xr11 + vext2xv.hu.bu xr12, xr12 + vext2xv.hu.bu xr13, xr13 + vext2xv.hu.bu xr14, xr14 + vext2xv.hu.bu xr15, xr15 + + // Calculate the square of the difference + xvsub.h xr0, xr0, xr8 + xvsub.h xr1, xr1, xr9 + xvsub.h xr2, xr2, xr10 + xvsub.h xr3, xr3, xr11 + xvsub.h xr4, xr4, xr12 + xvsub.h xr5, xr5, xr13 + xvsub.h xr6, xr6, xr14 + xvsub.h xr7, xr7, xr15 + xvmul.h xr0, xr0, xr0 + xvmul.h xr1, xr1, xr1 + xvmul.h xr2, xr2, xr2 + xvmul.h xr3, xr3, xr3 + xvmul.h xr4, xr4, xr4 + xvmul.h xr5, xr5, xr5 + xvmul.h xr6, xr6, xr6 + xvmul.h xr7, xr7, xr7 + xvhaddw.wu.hu xr0, xr0, xr0 + xvhaddw.wu.hu xr1, xr1, xr1 + xvhaddw.wu.hu xr2, xr2, xr2 + xvhaddw.wu.hu xr3, xr3, xr3 + xvhaddw.wu.hu xr4, xr4, xr4 + xvhaddw.wu.hu xr5, xr5, xr5 + xvhaddw.wu.hu xr6, xr6, xr6 + xvhaddw.wu.hu xr7, xr7, xr7 + xvadd.w xr16, xr0, xr1 + xvadd.w xr17, xr2, xr3 + xvadd.w xr18, xr4, xr5 + xvadd.w xr19, xr6, xr7 + xvadd.w xr16, xr16, xr17 + xvadd.w xr18, xr18, xr19 + xvadd.w xr16, xr16, xr18 + + // Load data from pix1 and pix2 + add.d a0, a0, t2 + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + add.d a0, a0, t2 + LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, 
vr10, vr11 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + vext2xv.hu.bu xr4, xr4 + vext2xv.hu.bu xr5, xr5 + vext2xv.hu.bu xr6, xr6 + vext2xv.hu.bu xr7, xr7 + vext2xv.hu.bu xr8, xr8 + vext2xv.hu.bu xr9, xr9 + vext2xv.hu.bu xr10, xr10 + vext2xv.hu.bu xr11, xr11 + vext2xv.hu.bu xr12, xr12 + vext2xv.hu.bu xr13, xr13 + vext2xv.hu.bu xr14, xr14 + vext2xv.hu.bu xr15, xr15 + + // Calculate the square of the difference + xvsub.h xr0, xr0, xr8 + xvsub.h xr1, xr1, xr9 + xvsub.h xr2, xr2, xr10 + xvsub.h xr3, xr3, xr11 + xvsub.h xr4, xr4, xr12 + xvsub.h xr5, xr5, xr13 + xvsub.h xr6, xr6, xr14 + xvsub.h xr7, xr7, xr15 + xvmul.h xr0, xr0, xr0 + xvmul.h xr1, xr1, xr1 + xvmul.h xr2, xr2, xr2 + xvmul.h xr3, xr3, xr3 + xvmul.h xr4, xr4, xr4 + xvmul.h xr5, xr5, xr5 + xvmul.h xr6, xr6, xr6 + xvmul.h xr7, xr7, xr7 + xvhaddw.wu.hu xr0, xr0, xr0 + xvhaddw.wu.hu xr1, xr1, xr1 + xvhaddw.wu.hu xr2, xr2, xr2 + xvhaddw.wu.hu xr3, xr3, xr3 + xvhaddw.wu.hu xr4, xr4, xr4 + xvhaddw.wu.hu xr5, xr5, xr5 + xvhaddw.wu.hu xr6, xr6, xr6 + xvhaddw.wu.hu xr7, xr7, xr7 + xvadd.w xr0, xr0, xr1 + xvadd.w xr2, xr2, xr3 + xvadd.w xr4, xr4, xr5 + xvadd.w xr6, xr6, xr7 + xvadd.w xr0, xr0, xr2 + xvadd.w xr4, xr4, xr6 + xvadd.w xr0, xr0, xr4 + xvadd.w xr0, xr0, xr16 + + // Calculate the sum + xvhaddw.d.w xr0, xr0, xr0 + xvhaddw.q.d xr0, xr0, xr0 + xvpickve2gr.w t2, xr0, 0 + xvpickve2gr.w t3, xr0, 4 + add.d a0, t2, t3 +endfunc_x264 + +/* + * int pixel_ssd_16x8_lasx(const Pixel *pix1, intptr_t stride_pix1, + * const Pixel *pix2, intptr_t stride_pix2) + */ +function_x264 pixel_ssd_16x8_lasx + slli.d t0, a1, 1 + add.d t1, a1, t0 + add.d t2, a1, t1 + slli.d t3, a3, 1 + add.d t4, a3, t3 + add.d t5, a3, t4 + + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + add.d a0, a0, t2 + LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 + LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + vext2xv.hu.bu xr4, xr4 + vext2xv.hu.bu xr5, xr5 + vext2xv.hu.bu xr6, xr6 + vext2xv.hu.bu xr7, xr7 + vext2xv.hu.bu xr8, xr8 + vext2xv.hu.bu xr9, xr9 + vext2xv.hu.bu xr10, xr10 + vext2xv.hu.bu xr11, xr11 + vext2xv.hu.bu xr12, xr12 + vext2xv.hu.bu xr13, xr13 + vext2xv.hu.bu xr14, xr14 + vext2xv.hu.bu xr15, xr15 + + // Calculate the square of the difference + xvsub.h xr0, xr0, xr8 + xvsub.h xr1, xr1, xr9 + xvsub.h xr2, xr2, xr10 + xvsub.h xr3, xr3, xr11 + xvsub.h xr4, xr4, xr12 + xvsub.h xr5, xr5, xr13 + xvsub.h xr6, xr6, xr14 + xvsub.h xr7, xr7, xr15 + xvmul.h xr0, xr0, xr0 + xvmul.h xr1, xr1, xr1 + xvmul.h xr2, xr2, xr2 + xvmul.h xr3, xr3, xr3 + xvmul.h xr4, xr4, xr4 + xvmul.h xr5, xr5, xr5 + xvmul.h xr6, xr6, xr6 + xvmul.h xr7, xr7, xr7 + xvhaddw.wu.hu xr0, xr0, xr0 + xvhaddw.wu.hu xr1, xr1, xr1 + xvhaddw.wu.hu xr2, xr2, xr2 + xvhaddw.wu.hu xr3, xr3, xr3 + xvhaddw.wu.hu xr4, xr4, xr4 + xvhaddw.wu.hu xr5, xr5, xr5 + xvhaddw.wu.hu xr6, xr6, xr6 + xvhaddw.wu.hu xr7, xr7, xr7 + xvadd.w xr0, xr0, xr1 + xvadd.w xr2, xr2, xr3 + xvadd.w xr4, xr4, xr5 + xvadd.w xr6, xr6, xr7 + xvadd.w xr0, xr0, xr2 + xvadd.w xr4, xr4, xr6 + xvadd.w xr0, xr0, xr4 + + // Calculate the sum + xvhaddw.d.w xr0, xr0, xr0 + xvhaddw.q.d xr0, xr0, xr0 + xvpickve2gr.w t2, xr0, 0 + xvpickve2gr.w t3, xr0, 4 + add.d a0, t2, t3 +endfunc_x264 + +/* + * int pixel_ssd_8x16_lasx(const Pixel 
*pix1, intptr_t stride_pix1, + * const Pixel *pix2, intptr_t stride_pix2) + */ +function_x264 pixel_ssd_8x16_lasx + slli.d t0, a1, 1 + add.d t1, a1, t0 + add.d t2, a1, t1 + slli.d t3, a3, 1 + add.d t4, a3, t3 + add.d t5, a3, t4 + + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + add.d a0, a0, t2 + LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 + LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 + + vilvl.d vr0, vr4, vr0 + vilvl.d vr1, vr5, vr1 + vilvl.d vr2, vr6, vr2 + vilvl.d vr3, vr7, vr3 + vilvl.d vr8, vr12, vr8 + vilvl.d vr9, vr13, vr9 + vilvl.d vr10, vr14, vr10 + vilvl.d vr11, vr15, vr11 + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + vext2xv.hu.bu xr8, xr8 + vext2xv.hu.bu xr9, xr9 + vext2xv.hu.bu xr10, xr10 + vext2xv.hu.bu xr11, xr11 + + // Calculate the square of the difference + xvsub.h xr0, xr0, xr8 + xvsub.h xr1, xr1, xr9 + xvsub.h xr2, xr2, xr10 + xvsub.h xr3, xr3, xr11 + xvmul.h xr0, xr0, xr0 + xvmul.h xr1, xr1, xr1 + xvmul.h xr2, xr2, xr2 + xvmul.h xr3, xr3, xr3 + xvhaddw.wu.hu xr0, xr0, xr0 + xvhaddw.wu.hu xr1, xr1, xr1 + xvhaddw.wu.hu xr2, xr2, xr2 + xvhaddw.wu.hu xr3, xr3, xr3 + xvadd.w xr0, xr0, xr1 + xvadd.w xr2, xr2, xr3 + xvadd.w xr16, xr0, xr2 + + // Load data from pix1 and pix2 + add.d a0, a0, t2 + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + add.d a0, a0, t2 + LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 + + vilvl.d vr0, vr4, vr0 + vilvl.d vr1, vr5, vr1 + vilvl.d vr2, vr6, vr2 + vilvl.d vr3, vr7, vr3 + vilvl.d vr8, vr12, vr8 + vilvl.d vr9, vr13, vr9 + vilvl.d vr10, vr14, vr10 + vilvl.d vr11, vr15, vr11 + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + vext2xv.hu.bu xr8, xr8 + vext2xv.hu.bu xr9, xr9 + vext2xv.hu.bu xr10, xr10 + vext2xv.hu.bu xr11, xr11 + + // Calculate the square of the difference + xvsub.h xr0, xr0, xr8 + xvsub.h xr1, xr1, xr9 + xvsub.h xr2, xr2, xr10 + xvsub.h xr3, xr3, xr11 + xvmul.h xr0, xr0, xr0 + xvmul.h xr1, xr1, xr1 + xvmul.h xr2, xr2, xr2 + xvmul.h xr3, xr3, xr3 + xvhaddw.wu.hu xr0, xr0, xr0 + xvhaddw.wu.hu xr1, xr1, xr1 + xvhaddw.wu.hu xr2, xr2, xr2 + xvhaddw.wu.hu xr3, xr3, xr3 + xvadd.w xr0, xr0, xr1 + xvadd.w xr2, xr2, xr3 + xvadd.w xr0, xr0, xr2 + xvadd.w xr0, xr0, xr16 + + // Calculate the sum + xvhaddw.d.w xr0, xr0, xr0 + xvhaddw.q.d xr0, xr0, xr0 + xvpickve2gr.w t2, xr0, 0 + xvpickve2gr.w t3, xr0, 4 + add.d a0, t2, t3 +endfunc_x264 + +/* + * int pixel_ssd_8x8_lasx(const Pixel *pix1, intptr_t stride_pix1, + * const Pixel *pix2, intptr_t stride_pix2) + */ +function_x264 pixel_ssd_8x8_lasx + slli.d t0, a1, 1 + add.d t1, a1, t0 + add.d t2, a1, t1 + slli.d t3, a3, 1 + add.d t4, a3, t3 + add.d t5, a3, t4 + + // Load data from pix1 and pix2 + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + add.d a0, a0, t2 + LSX_LOADX_4 a0, a1, t0, t1, vr4, vr5, vr6, vr7 + LSX_LOADX_4 a2, a3, t3, t4, vr8, vr9, vr10, vr11 + add.d a2, a2, t5 + LSX_LOADX_4 a2, a3, t3, t4, vr12, vr13, vr14, vr15 + + vilvl.d vr0, vr4, vr0 + vilvl.d vr1, vr5, vr1 + vilvl.d vr2, vr6, vr2 + vilvl.d vr3, vr7, vr3 + vilvl.d vr8, vr12, vr8 + vilvl.d vr9, vr13, vr9 + vilvl.d vr10, vr14, vr10 + vilvl.d vr11, vr15, vr11 + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + vext2xv.hu.bu 
xr8, xr8 + vext2xv.hu.bu xr9, xr9 + vext2xv.hu.bu xr10, xr10 + vext2xv.hu.bu xr11, xr11 + + // Calculate the square of the difference + xvsub.h xr0, xr0, xr8 + xvsub.h xr1, xr1, xr9 + xvsub.h xr2, xr2, xr10 + xvsub.h xr3, xr3, xr11 + xvmul.h xr0, xr0, xr0 + xvmul.h xr1, xr1, xr1 + xvmul.h xr2, xr2, xr2 + xvmul.h xr3, xr3, xr3 + xvhaddw.wu.hu xr0, xr0, xr0 + xvhaddw.wu.hu xr1, xr1, xr1 + xvhaddw.wu.hu xr2, xr2, xr2 + xvhaddw.wu.hu xr3, xr3, xr3 + xvadd.w xr0, xr0, xr1 + xvadd.w xr2, xr2, xr3 + xvadd.w xr0, xr0, xr2 + + // Calculate the sum + xvhaddw.d.w xr0, xr0, xr0 + xvhaddw.q.d xr0, xr0, xr0 + xvpickve2gr.w t2, xr0, 0 + xvpickve2gr.w t3, xr0, 4 + add.d a0, t2, t3 +endfunc_x264 + +/* + * int pixel_sa8d_16x16_lasx(const Pixel *pix1, intptr_t i_pix1, + * const Pixel *pix2, intptr_t i_pix2) + */ +function_x264 pixel_sa8d_16x16_lasx + addi.d sp, sp, -8 + fst.d f24, sp, 0 + + slli.d t2, a1, 1 + slli.d t3, a3, 1 + add.d t4, a1, t2 + add.d t5, a3, t3 + slli.d t6, a1, 2 + slli.d t7, a3, 2 + slli.d t0, a1, 3 + slli.d t1, a3, 3 + + // Load data from pix1 and pix2 + FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, xr12, 0x13 + xvadd.h xr15, xr11, xr13 + xvsub.h xr16, xr11, xr13 + + add.d a0, a0, t6 + add.d a2, a2, t7 + // Load data from pix1 and pix2 + FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, xr12, 0x13 + xvadd.h xr9, xr11, xr13 + xvsub.h xr10, xr11, xr13 + xvadd.h xr17, xr15, xr9 + xvadd.h xr18, xr16, xr10 + xvsub.h xr19, xr15, xr9 + xvsub.h xr20, xr16, xr10 + xvadda.h xr17, xr17, xr18 + xvadda.h xr19, xr19, xr20 + xvadd.h xr21, xr17, xr19 + + add.d a0, a0, t6 + add.d a2, a2, t7 + // Load data from pix1 and pix2 + FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 
- a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, xr12, 0x13 + xvadd.h xr15, xr11, xr13 + xvsub.h xr16, xr11, xr13 + + add.d a0, a0, t6 + add.d a2, a2, t7 + // Load data from pix1 and pix2 + FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, xr12, 0x13 + xvadd.h xr9, xr11, xr13 + xvsub.h xr10, xr11, xr13 + xvadd.h xr17, xr15, xr9 + xvadd.h xr18, xr16, xr10 + xvsub.h xr19, xr15, xr9 + xvsub.h xr20, xr16, xr10 + xvadda.h xr17, xr17, xr18 + xvadda.h xr19, xr19, xr20 + xvadd.h xr22, xr17, xr19 + + sub.d a0, a0, t6 + sub.d a2, a2, t7 + addi.d a0, a0, 8 + addi.d a2, a2, 8 + // Load data from pix1 and pix2 + FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, xr12, 0x13 + xvadd.h xr15, xr11, xr13 + xvsub.h xr16, xr11, xr13 + + add.d a0, a0, t6 + add.d a2, a2, t7 + // Load data from pix1 and pix2 + FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, 
xr12, 0x13 + xvadd.h xr9, xr11, xr13 + xvsub.h xr10, xr11, xr13 + xvadd.h xr17, xr15, xr9 + xvadd.h xr18, xr16, xr10 + xvsub.h xr19, xr15, xr9 + xvsub.h xr20, xr16, xr10 + xvadda.h xr17, xr17, xr18 + xvadda.h xr19, xr19, xr20 + xvadd.h xr23, xr17, xr19 + + sub.d a0, a0, t0 + sub.d a2, a2, t1 + sub.d a0, a0, t6 + sub.d a2, a2, t7 + // Load data from pix1 and pix2 + FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, xr12, 0x13 + xvadd.h xr15, xr11, xr13 + xvsub.h xr16, xr11, xr13 + + add.d a0, a0, t6 + add.d a2, a2, t7 + // Load data from pix1 and pix2 + FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, xr12, 0x13 + xvadd.h xr9, xr11, xr13 + xvsub.h xr10, xr11, xr13 + xvadd.h xr17, xr15, xr9 + xvadd.h xr18, xr16, xr10 + xvsub.h xr19, xr15, xr9 + xvsub.h xr20, xr16, xr10 + xvadda.h xr17, xr17, xr18 + xvadda.h xr19, xr19, xr20 + xvadd.h xr24, xr17, xr19 + + xvadd.h xr21, xr21, xr22 + xvadd.h xr23, xr23, xr24 + xvhaddw.wu.hu xr21, xr21, xr21 + xvhaddw.wu.hu xr23, xr23, xr23 + xvadd.w xr21, xr21, xr23 + xvhaddw.du.wu xr21, xr21, xr21 + xvhaddw.qu.du xr21, xr21, xr21 + xvpickve2gr.du t4, xr21, 0 + xvpickve2gr.du t5, xr21, 2 + add.d t4, t4, t5 + addi.d t4, t4, 2 + srli.d a0, t4, 2 + + fld.d f24, sp, 0 + addi.d sp, sp, 8 +endfunc_x264 + +/* + * int pixel_sa8d_8x8_lasx(const Pixel *pix1, intptr_t i_pix1, + * const Pixel *pix2, intptr_t i_pix2) + */ +function_x264 pixel_sa8d_8x8_lasx + slli.d t2, a1, 1 + slli.d t3, a3, 1 + add.d t4, a1, t2 + add.d t5, a3, t3 + slli.d t6, a1, 2 + slli.d t7, a3, 2 + + // Load data from pix1 and pix2 + FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* 
HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvor.v xr14, xr12, xr12 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, xr14, 0x13 + xvadd.h xr15, xr11, xr13 + xvsub.h xr16, xr11, xr13 + + add.d a0, a0, t6 + add.d a2, a2, t7 + // Load data from pix1 and pix2 + FLDD_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDD_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + vilvl.d vr1, vr2, vr1 + vilvl.d vr3, vr4, vr3 + vilvl.d vr5, vr6, vr5 + vilvl.d vr7, vr8, vr7 + xvpermi.q xr1, xr3, 0x02 + xvpermi.q xr5, xr7, 0x02 + xvsubwev.h.bu xr9, xr1, xr5 + xvsubwod.h.bu xr10, xr1, xr5 + xvadd.h xr11, xr9, xr10 /* a0 + a1 */ + xvsub.h xr12, xr9, xr10 /* a0 - a1 */ + xvpackev.h xr9, xr12, xr11 + xvpackod.h xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvpackev.w xr9, xr12, xr11 + xvpackod.w xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 + xvsub.h xr12, xr9, xr10 + xvpackev.d xr9, xr12, xr11 + xvpackod.d xr10, xr12, xr11 + xvadd.h xr11, xr9, xr10 /* HADAMARD4 */ + xvsub.h xr12, xr9, xr10 + xvor.v xr13, xr11, xr11 + xvor.v xr14, xr12, xr12 + xvpermi.q xr11, xr12, 0x02 + xvpermi.q xr13, xr14, 0x13 + xvadd.h xr9, xr11, xr13 + xvsub.h xr10, xr11, xr13 + + xvadd.h xr17, xr15, xr9 + xvadd.h xr18, xr16, xr10 + xvsub.h xr19, xr15, xr9 + xvsub.h xr20, xr16, xr10 + xvadda.h xr17, xr17, xr18 + xvadda.h xr19, xr19, xr20 + xvadd.h xr17, xr17, xr19 + xvhaddw.wu.hu xr17, xr17, xr17 + xvhaddw.du.wu xr17, xr17, xr17 + xvhaddw.qu.du xr17, xr17, xr17 + xvpickve2gr.wu t4, xr17, 0 + xvpickve2gr.wu t5, xr17, 4 + add.d t4, t4, t5 + addi.d t4, t4, 2 + srli.d a0, t4, 2 +endfunc_x264 + +.macro sse_diff_8width_lasx in0, in1 + fld.d f0, \in0, 0 + fld.d f1, \in0, FENC_STRIDE + fld.d f2, \in0, FENC_STRIDE * 2 + fld.d f3, \in0, FENC_STRIDE * 3 + fld.d f4, \in1, 0 + fld.d f5, \in1, FDEC_STRIDE + fld.d f6, \in1, FDEC_STRIDE * 2 + fld.d f7, \in1, FDEC_STRIDE * 3 + + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + xvpermi.q xr1, xr0, 0x20 + xvpermi.q xr5, xr4, 0x20 + + xvilvl.b xr2, xr5, xr1 + xvilvh.b xr6, xr5, xr1 + xvhsubw.hu.bu xr3, xr2, xr2 + xvhsubw.hu.bu xr4, xr6, xr6 + xvdp2add.w.h xr8, xr3, xr3 + xvdp2add.w.h xr8, xr4, xr4 + xvadd.h xr9, xr9, xr3 + xvadd.h xr9, xr9, xr4 +.endm + +/* + * int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2, + * int32_t ssd[2] ) + */ +function_x264 pixel_var2_8x16_lasx + add.d t0, a0, zero + add.d t1, a1, zero + xvxor.v xr8, xr8, xr8 + xvxor.v xr9, xr9, xr9 + + sse_diff_8width_lasx a0, a1 + addi.d a0, a0, FENC_STRIDE * 4 + addi.d a1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lasx a0, a1 + addi.d a0, a0, FENC_STRIDE * 4 + addi.d a1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lasx a0, a1 + addi.d a0, a0, FENC_STRIDE * 4 + addi.d a1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lasx a0, a1 + + xvhaddw.w.h xr9, xr9, xr9 + xvhaddw.d.w xr9, xr9, xr9 + xvhaddw.q.d xr9, xr9, xr9 + xvpickve2gr.wu t2, xr9, 0 + xvpickve2gr.wu t3, xr9, 4 + add.w t2, t2, t3 + xvhaddw.d.w xr8, xr8, xr8 + xvhaddw.q.d xr8, xr8, xr8 + xvpickve2gr.wu t3, xr8, 0 + xvpickve2gr.wu t4, xr8, 4 + add.w t3, t4, t3 + st.w t3, a2, 0 + mul.w t2, t2, t2 + srai.w t2, t2, 7 + sub.w t3, t3, t2 + + xvxor.v xr8, xr8, xr8 + xvxor.v xr9, xr9, xr9 + addi.d a0, t0, FENC_STRIDE / 2 + addi.d a1, t1, FDEC_STRIDE / 2 + sse_diff_8width_lasx 
a0, a1 + addi.d a0, a0, FENC_STRIDE * 4 + addi.d a1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lasx a0, a1 + addi.d a0, a0, FENC_STRIDE * 4 + addi.d a1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lasx a0, a1 + addi.d a0, a0, FENC_STRIDE * 4 + addi.d a1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lasx a0, a1 + + xvhaddw.w.h xr9, xr9, xr9 + xvhaddw.d.w xr9, xr9, xr9 + xvhaddw.q.d xr9, xr9, xr9 + xvpickve2gr.wu t4, xr9, 0 + xvpickve2gr.wu t5, xr9, 4 + add.w t4, t4, t5 + xvhaddw.d.w xr8, xr8, xr8 + xvhaddw.q.d xr8, xr8, xr8 + xvpickve2gr.wu t5, xr8, 0 + xvpickve2gr.wu t6, xr8, 4 + add.w t5, t6, t5 + st.w t5, a2, 4 + mul.w t4, t4, t4 + srai.w t4, t4, 7 + sub.w t5, t5, t4 + add.w a0, t3, t5 +endfunc_x264 + +/* + * int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2, + * int32_t ssd[2] ) + */ +function_x264 pixel_var2_8x8_lasx + add.d t0, a0, zero + add.d t1, a1, zero + xvxor.v xr8, xr8, xr8 + xvxor.v xr9, xr9, xr9 + + sse_diff_8width_lasx a0, a1 + addi.d a0, a0, FENC_STRIDE * 4 + addi.d a1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lasx a0, a1 + + xvhaddw.w.h xr9, xr9, xr9 + xvhaddw.d.w xr9, xr9, xr9 + xvhaddw.q.d xr9, xr9, xr9 + xvpickve2gr.wu t2, xr9, 0 + xvpickve2gr.wu t3, xr9, 4 + add.w t2, t2, t3 + xvhaddw.d.w xr8, xr8, xr8 + xvhaddw.q.d xr8, xr8, xr8 + xvpickve2gr.wu t3, xr8, 0 + xvpickve2gr.wu t4, xr8, 4 + add.w t3, t4, t3 + st.w t3, a2, 0 + mul.w t2, t2, t2 + srai.w t2, t2, 6 + sub.w t3, t3, t2 + + xvxor.v xr8, xr8, xr8 + xvxor.v xr9, xr9, xr9 + addi.d a0, t0, FENC_STRIDE / 2 + addi.d a1, t1, FDEC_STRIDE / 2 + sse_diff_8width_lasx a0, a1 + addi.d a0, a0, FENC_STRIDE * 4 + addi.d a1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lasx a0, a1 + + xvhaddw.w.h xr9, xr9, xr9 + xvhaddw.d.w xr9, xr9, xr9 + xvhaddw.q.d xr9, xr9, xr9 + xvpickve2gr.wu t4, xr9, 0 + xvpickve2gr.wu t5, xr9, 4 + add.w t4, t4, t5 + xvhaddw.d.w xr8, xr8, xr8 + xvhaddw.q.d xr8, xr8, xr8 + xvpickve2gr.wu t5, xr8, 0 + xvpickve2gr.wu t6, xr8, 4 + add.w t5, t6, t5 + st.w t5, a2, 4 + mul.w t4, t4, t4 + srai.w t4, t4, 6 + sub.w t5, t5, t4 + add.w a0, t3, t5 +endfunc_x264 + + +/* + * uint64_t x264_pixel_hadamard_ac_8x8( pixel *pix, intptr_t stride ) + */ +function_x264 hadamard_ac_8x8_lsx + slli.d t0, a1, 1 + add.d t1, t0, a1 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + alsl.d a0, a1, a0, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7 + + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + + vpickev.b vr2, vr1, vr0 + vpickod.b vr3, vr1, vr0 + vaddwev.h.bu vr6, vr2, vr3 + vaddwod.h.bu vr7, vr2, vr3 + vsubwev.h.bu vr8, vr2, vr3 + vsubwod.h.bu vr9, vr2, vr3 + vadd.h vr10, vr6, vr7 + vadd.h vr11, vr8, vr9 + vsub.h vr12, vr6, vr7 + vsub.h vr13, vr8, vr9 + + vilvl.h vr6, vr11, vr10 + vilvh.h vr7, vr11, vr10 + vilvl.h vr8, vr13, vr12 + vilvh.h vr9, vr13, vr12 + vilvl.w vr10, vr8, vr6 + vilvh.w vr11, vr8, vr6 + vilvl.w vr12, vr9, vr7 + vilvh.w vr13, vr9, vr7 + + vadd.h vr6, vr10, vr11 + vadd.h vr7, vr12, vr13 + vsub.h vr8, vr10, vr11 + vsub.h vr9, vr12, vr13 + vadd.h vr10, vr6, vr7 + vadd.h vr11, vr8, vr9 + vsub.h vr12, vr6, vr7 + vsub.h vr13, vr8, vr9 + + vpickev.b vr2, vr5, vr4 + vpickod.b vr3, vr5, vr4 + vaddwev.h.bu vr6, vr2, vr3 + vaddwod.h.bu vr7, vr2, vr3 + vsubwev.h.bu vr8, vr2, vr3 + vsubwod.h.bu vr9, vr2, vr3 + vadd.h vr14, vr6, vr7 + vadd.h vr15, vr8, vr9 + vsub.h vr16, vr6, vr7 + vsub.h vr17, vr8, vr9 + + vilvl.h vr6, vr15, vr14 + vilvh.h vr7, vr15, vr14 + vilvl.h vr8, vr17, vr16 + vilvh.h vr9, vr17, vr16 + vilvl.w vr14, vr8, vr6 + vilvh.w vr15, vr8, vr6 + vilvl.w vr16, vr9, vr7 + vilvh.w 
vr17, vr9, vr7 + + vadd.h vr6, vr14, vr15 + vadd.h vr7, vr16, vr17 + vsub.h vr8, vr14, vr15 + vsub.h vr9, vr16, vr17 + vadd.h vr14, vr6, vr7 + vadd.h vr15, vr8, vr9 + vsub.h vr16, vr6, vr7 + vsub.h vr17, vr8, vr9 + + vadd.h vr18, vr10, vr14 + vpickve2gr.hu t0, vr18, 0 + vpickve2gr.hu t1, vr18, 4 + add.d t1, t0, t1 // dc + + vadda.h vr4, vr11, vr10 + vadda.h vr5, vr13, vr12 + vadda.h vr6, vr15, vr14 + vadda.h vr7, vr17, vr16 + vadd.h vr4, vr5, vr4 + vadd.h vr6, vr7, vr6 + vadd.h vr4, vr4, vr6 + vhaddw.wu.hu vr4, vr4, vr4 + vhaddw.du.wu vr4, vr4, vr4 + vhaddw.qu.du vr4, vr4, vr4 + vpickve2gr.wu t0, vr4, 0 // sum4 + + vpackev.h vr0, vr11, vr10 + vpackev.h vr1, vr13, vr12 + vpackev.h vr2, vr15, vr14 + vpackev.h vr3, vr17, vr16 + vpackod.h vr4, vr11, vr10 + vpackod.h vr5, vr13, vr12 + vpackod.h vr6, vr15, vr14 + vpackod.h vr7, vr17, vr16 + + vilvl.d vr10, vr1, vr0 + vilvh.d vr11, vr1, vr0 + vilvl.d vr12, vr3, vr2 + vilvh.d vr13, vr3, vr2 + vilvl.d vr14, vr5, vr4 + vilvh.d vr15, vr5, vr4 + vilvl.d vr16, vr7, vr6 + vilvh.d vr17, vr7, vr6 + + vadd.h vr0, vr10, vr11 + vadd.h vr1, vr12, vr13 + vadd.h vr2, vr14, vr16 + vadd.h vr3, vr15, vr17 + vsub.h vr4, vr10, vr11 + vsub.h vr5, vr12, vr13 + vsub.h vr6, vr14, vr16 + vsub.h vr7, vr15, vr17 + + vadd.h vr10, vr0, vr1 + vadd.h vr11, vr2, vr3 + vadd.h vr12, vr4, vr5 + vadd.h vr13, vr6, vr7 + vsub.h vr14, vr0, vr1 + vsub.h vr15, vr2, vr3 + vsub.h vr16, vr4, vr5 + vsub.h vr17, vr6, vr7 + + vadda.h vr10, vr10, vr11 + vadda.h vr11, vr12, vr13 + vadda.h vr12, vr14, vr15 + vadda.h vr13, vr16, vr17 + vadd.h vr10, vr10, vr11 + vadd.h vr11, vr12, vr13 + vadd.h vr10, vr10, vr11 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.du.wu vr10, vr10, vr10 + vhaddw.qu.du vr10, vr10, vr10 + vpickve2gr.wu t2, vr10, 0 // sum8 + + sub.d t0, t0, t1 + sub.d t2, t2, t1 + slli.d t2, t2, 32 + add.d a0, t2, t0 +endfunc_x264 + +/* + * int x264_pixel_satd_4x8( pixel *pix1, intptr_t i_pix1, + * pixel *pix2, intptr_t i_pix2 ) + */ +function_x264 pixel_satd_4x8_lsx + slli.d t2, a1, 1 + slli.d t3, a3, 1 + add.d t4, a1, t2 + add.d t5, a3, t3 + + // Load data from pix1 and pix2 + FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + pixel_satd_4x4_lsx_core vr13 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + pixel_satd_4x4_lsx_core vr14 + vadd.h vr13, vr14, vr13 + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.du.wu vr13, vr13, vr13 + vhaddw.qu.du vr13, vr13, vr13 + vpickve2gr.wu t5, vr13, 0 + srli.d a0, t5, 1 +endfunc_x264 + +/* + * int x264_pixel_satd_4x16( uint8_t *p_pix1, intptr_t i_stride, + * uint8_t *p_pix2, intptr_t i_stride2 ) + */ +function_x264 pixel_satd_4x16_lsx + slli.d t2, a1, 1 + slli.d t3, a3, 1 + add.d t4, a1, t2 + add.d t5, a3, t3 + + // Load data from pix1 and pix2 + FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + pixel_satd_4x4_lsx_core vr13 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + pixel_satd_4x4_lsx_core vr14 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + pixel_satd_4x4_lsx_core vr15 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDS_LOADX_4 a0, a1, t2, t4, f1, f2, f3, f4 + FLDS_LOADX_4 a2, a3, t3, t5, f5, f6, f7, f8 + pixel_satd_4x4_lsx_core vr16 + + vadd.h vr13, vr14, vr13 + vadd.h vr15, vr16, vr15 + vadd.h vr13, 
vr15, vr13 + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.du.wu vr13, vr13, vr13 + vhaddw.qu.du vr13, vr13, vr13 + vpickve2gr.wu t5, vr13, 0 + srli.d a0, t5, 1 +endfunc_x264 + +.macro pixel_satd_8x4_lsx_core out0, out1, out2, out3 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr2, vr5, vr4 + vilvl.d vr3, vr7, vr6 + + vsubwev.h.bu vr4, vr0, vr2 + vsubwod.h.bu vr5, vr0, vr2 + vsubwev.h.bu vr6, vr1, vr3 + vsubwod.h.bu vr7, vr1, vr3 + vadd.h vr0, vr4, vr5 + vsub.h vr1, vr4, vr5 + vadd.h vr2, vr6, vr7 + vsub.h vr3, vr6, vr7 + vpackev.h vr4, vr1, vr0 + vpackod.h vr5, vr1, vr0 + vpackev.h vr6, vr3, vr2 + vpackod.h vr7, vr3, vr2 + vadd.h vr8, vr4, vr5 + vsub.h vr9, vr4, vr5 + vadd.h vr10, vr6, vr7 + vsub.h vr11, vr6, vr7 + vilvl.d vr4, vr9, vr8 + vilvh.d vr5, vr9, vr8 + vilvl.d vr6, vr11, vr10 + vilvh.d vr7, vr11, vr10 + vadd.h vr8, vr4, vr5 + vsub.h vr9, vr4, vr5 + vadd.h vr10, vr6, vr7 + vsub.h vr11, vr6, vr7 + vadd.h \out0, vr8, vr10 + vsub.h \out1, vr8, vr10 + vadd.h \out2, vr9, vr11 + vsub.h \out3, vr9, vr11 +.endm + +/* + * int x264_pixel_satd_8x4( uint8_t *p_pix1, intptr_t i_stride, + * uint8_t *p_pix2, intptr_t i_stride2 ) + */ +function_x264 pixel_satd_8x4_lsx + slli.d t0, a1, 1 + add.d t1, t0, a1 + slli.d t2, a3, 1 + add.d t3, t2, a3 + + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 + vadda.h vr12, vr13, vr12 + vadda.h vr13, vr15, vr14 + + vadd.h vr12, vr13, vr12 + vhaddw.wu.hu vr12, vr12, vr12 + vhaddw.du.wu vr12, vr12, vr12 + vhaddw.qu.du vr12, vr12, vr12 + vpickve2gr.wu t4, vr12, 0 + srli.d a0, t4, 1 +endfunc_x264 + +/* + * int x264_pixel_satd_8x8( uint8_t *p_pix1, intptr_t i_stride, + * uint8_t *p_pix2, intptr_t i_stride2 ) + */ +function_x264 pixel_satd_8x8_lsx + slli.d t0, a1, 1 + add.d t1, t0, a1 + slli.d t2, a3, 1 + add.d t3, t2, a3 + + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 + vadda.h vr12, vr13, vr12 + vadda.h vr13, vr15, vr14 + vadd.h vr12, vr13, vr12 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16 + vadda.h vr13, vr14, vr13 + vadda.h vr14, vr16, vr15 + vadd.h vr13, vr14, vr13 + + vadd.h vr12, vr13, vr12 + vhaddw.wu.hu vr12, vr12, vr12 + vhaddw.du.wu vr12, vr12, vr12 + vhaddw.qu.du vr12, vr12, vr12 + vpickve2gr.wu t4, vr12, 0 + srli.d a0, t4, 1 +endfunc_x264 + +/* + * int x264_pixel_satd_8x8( uint8_t *p_pix1, intptr_t i_stride, + * uint8_t *p_pix2, intptr_t i_stride2 ) + */ +function_x264 pixel_satd_8x16_lsx + slli.d t0, a1, 1 + add.d t1, t0, a1 + slli.d t2, a3, 1 + add.d t3, t2, a3 + + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 + vadda.h vr12, vr13, vr12 + vadda.h vr13, vr15, vr14 + vadd.h vr12, vr13, vr12 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16 + vadda.h vr13, vr14, vr13 + vadda.h vr14, vr16, vr15 + vadd.h vr13, vr14, vr13 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17 + vadda.h vr14, vr15, vr14 + vadda.h vr15, vr17, vr16 + vadd.h vr14, vr15, vr14 + + alsl.d a0, 
a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18 + vadda.h vr15, vr16, vr15 + vadda.h vr16, vr18, vr17 + vadd.h vr15, vr16, vr15 + + vadd.h vr12, vr12, vr13 + vadd.h vr14, vr14, vr15 + vadd.h vr12, vr12, vr14 + vhaddw.wu.hu vr12, vr12, vr12 + vhaddw.du.wu vr12, vr12, vr12 + vhaddw.qu.du vr12, vr12, vr12 + vpickve2gr.wu t4, vr12, 0 + srli.d a0, t4, 1 +endfunc_x264 + +/* + * int x264_pixel_satd_16x8( uint8_t *p_pix1, intptr_t i_stride, + * uint8_t *p_pix2, intptr_t i_stride2 ) + */ +function_x264 pixel_satd_16x8_lsx + slli.d t0, a1, 1 + add.d t1, t0, a1 + slli.d t2, a3, 1 + add.d t3, t2, a3 + + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 + vadda.h vr12, vr13, vr12 + vadda.h vr13, vr15, vr14 + vadd.h vr12, vr13, vr12 + + addi.d t5, a0, 8 + addi.d t6, a2, 8 + FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16 + vadda.h vr13, vr14, vr13 + vadda.h vr14, vr16, vr15 + vadd.h vr13, vr14, vr13 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17 + vadda.h vr14, vr15, vr14 + vadda.h vr15, vr17, vr16 + vadd.h vr14, vr15, vr14 + + addi.d t5, a0, 8 + addi.d t6, a2, 8 + FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18 + vadda.h vr15, vr16, vr15 + vadda.h vr16, vr18, vr17 + vadd.h vr15, vr16, vr15 + + vadd.h vr12, vr13, vr12 + vadd.h vr14, vr15, vr14 + vadd.h vr12, vr14, vr12 + vhaddw.wu.hu vr12, vr12, vr12 + vhaddw.du.wu vr12, vr12, vr12 + vhaddw.qu.du vr12, vr12, vr12 + vpickve2gr.wu t4, vr12, 0 + srli.d a0, t4, 1 +endfunc_x264 + +/* + * int x264_pixel_satd_16x16( uint8_t *p_pix1, intptr_t i_stride, + * uint8_t *p_pix2, intptr_t i_stride2 ) + */ +function_x264 pixel_satd_16x16_lsx + slli.d t0, a1, 1 + add.d t1, t0, a1 + slli.d t2, a3, 1 + add.d t3, t2, a3 + + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 + vadda.h vr12, vr13, vr12 + vadda.h vr13, vr15, vr14 + vadd.h vr12, vr13, vr12 + + addi.d t5, a0, 8 + addi.d t6, a2, 8 + FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16 + vadda.h vr13, vr14, vr13 + vadda.h vr14, vr16, vr15 + vadd.h vr13, vr14, vr13 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17 + vadda.h vr14, vr15, vr14 + vadda.h vr15, vr17, vr16 + vadd.h vr14, vr15, vr14 + + addi.d t5, a0, 8 + addi.d t6, a2, 8 + FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18 + vadda.h vr15, vr16, vr15 + vadda.h vr16, vr18, vr17 + vadd.h vr15, vr16, vr15 + + vadd.h vr12, vr13, vr12 + vadd.h vr14, vr15, vr14 + vadd.h vr19, vr14, vr12 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr12, vr13, vr14, vr15 + vadda.h vr12, vr13, vr12 + vadda.h vr13, vr15, vr14 + vadd.h 
vr12, vr13, vr12 + + addi.d t5, a0, 8 + addi.d t6, a2, 8 + FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr13, vr14, vr15, vr16 + vadda.h vr13, vr14, vr13 + vadda.h vr14, vr16, vr15 + vadd.h vr13, vr14, vr13 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr14, vr15, vr16, vr17 + vadda.h vr14, vr15, vr14 + vadda.h vr15, vr17, vr16 + vadd.h vr14, vr15, vr14 + + addi.d t5, a0, 8 + addi.d t6, a2, 8 + FLDD_LOADX_4 t5, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 t6, a3, t2, t3, f4, f5, f6, f7 + pixel_satd_8x4_lsx_core vr15, vr16, vr17, vr18 + vadda.h vr15, vr16, vr15 + vadda.h vr16, vr18, vr17 + vadd.h vr15, vr16, vr15 + + vadd.h vr12, vr13, vr12 + vadd.h vr14, vr15, vr14 + vadd.h vr12, vr14, vr12 + vadd.h vr12, vr19, vr12 + vhaddw.wu.hu vr12, vr12, vr12 + vhaddw.du.wu vr12, vr12, vr12 + vhaddw.qu.du vr12, vr12, vr12 + vpickve2gr.wu t4, vr12, 0 + srli.d a0, t4, 1 +endfunc_x264 + +/* + * int x264_pixel_ssd_4x4( pixel *pix1, intptr_t i_stride_pix1, + * pixel *pix2, intptr_t i_stride_pix2 ) + */ +function_x264 pixel_ssd_4x4_lsx + slli.d t0, a1, 1 + add.d t1, a1, t0 + slli.d t2, a3, 1 + add.d t3, a3, t2 + + FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + + vilvl.w vr0, vr1, vr0 + vilvl.w vr1, vr3, vr2 + vilvl.w vr4, vr5, vr4 + vilvl.w vr5, vr7, vr6 + vilvl.d vr0, vr1, vr0 + vilvl.d vr4, vr5, vr4 + vsubwev.h.bu vr1, vr0, vr4 + vsubwod.h.bu vr2, vr0, vr4 + vmul.h vr5, vr1, vr1 + vmul.h vr6, vr2, vr2 + vhaddw.wu.hu vr5, vr5, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vadd.w vr5, vr5, vr6 + vhaddw.d.w vr5, vr5, vr5 + vhaddw.q.d vr5, vr5, vr5 + vpickve2gr.w a0, vr5, 0 +endfunc_x264 + +/* + * int x264_pixel_ssd_4x8( pixel *pix1, intptr_t i_stride_pix1, + * pixel *pix2, intptr_t i_stride_pix2 ) + */ +function_x264 pixel_ssd_4x8_lsx + slli.d t0, a1, 1 + add.d t1, a1, t0 + slli.d t2, a3, 1 + add.d t3, a3, t2 + + FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + vilvl.w vr0, vr1, vr0 + vilvl.w vr1, vr3, vr2 + vilvl.w vr4, vr5, vr4 + vilvl.w vr5, vr7, vr6 + vilvl.d vr0, vr1, vr0 + vilvl.d vr4, vr5, vr4 + vsubwev.h.bu vr1, vr0, vr4 + vsubwod.h.bu vr2, vr0, vr4 + vmul.h vr5, vr1, vr1 + vmul.h vr6, vr2, vr2 + vhaddw.wu.hu vr5, vr5, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vadd.w vr10, vr5, vr6 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + vilvl.w vr0, vr1, vr0 + vilvl.w vr1, vr3, vr2 + vilvl.w vr4, vr5, vr4 + vilvl.w vr5, vr7, vr6 + vilvl.d vr0, vr1, vr0 + vilvl.d vr4, vr5, vr4 + vsubwev.h.bu vr1, vr0, vr4 + vsubwod.h.bu vr2, vr0, vr4 + vmul.h vr5, vr1, vr1 + vmul.h vr6, vr2, vr2 + vhaddw.wu.hu vr5, vr5, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vadd.w vr5, vr5, vr6 + + vadd.w vr5, vr5, vr10 + vhaddw.d.w vr5, vr5, vr5 + vhaddw.q.d vr5, vr5, vr5 + vpickve2gr.w a0, vr5, 0 +endfunc_x264 + +/* + * int x264_pixel_ssd_4x16( pixel *pix1, intptr_t i_stride_pix1, + * pixel *pix2, intptr_t i_stride_pix2 ) + */ +function_x264 pixel_ssd_4x16_lsx + slli.d t0, a1, 1 + add.d t1, a1, t0 + slli.d t2, a3, 1 + add.d t3, a3, t2 + + FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + vilvl.w vr0, vr1, vr0 + vilvl.w vr1, vr3, vr2 + vilvl.w vr4, vr5, vr4 + vilvl.w vr5, vr7, vr6 + vilvl.d vr0, vr1, vr0 + vilvl.d vr4, vr5, vr4 + vsubwev.h.bu vr1, vr0, vr4 + 
vsubwod.h.bu vr2, vr0, vr4 + vmul.h vr5, vr1, vr1 + vmul.h vr6, vr2, vr2 + vhaddw.wu.hu vr5, vr5, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vadd.w vr10, vr5, vr6 + +.rept 3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDS_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDS_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + vilvl.w vr0, vr1, vr0 + vilvl.w vr1, vr3, vr2 + vilvl.w vr4, vr5, vr4 + vilvl.w vr5, vr7, vr6 + vilvl.d vr0, vr1, vr0 + vilvl.d vr4, vr5, vr4 + vsubwev.h.bu vr1, vr0, vr4 + vsubwod.h.bu vr2, vr0, vr4 + vmul.h vr5, vr1, vr1 + vmul.h vr6, vr2, vr2 + vhaddw.wu.hu vr5, vr5, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vadd.w vr5, vr5, vr6 + vadd.w vr10, vr5, vr10 +.endr + + vhaddw.d.w vr10, vr10, vr10 + vhaddw.q.d vr10, vr10, vr10 + vpickve2gr.w a0, vr10, 0 +endfunc_x264 + +/* + * int x264_pixel_ssd_8x4( pixel *pix1, intptr_t i_stride_pix1, + * pixel *pix2, intptr_t i_stride_pix2 ) + */ +function_x264 pixel_ssd_8x4_lsx + slli.d t0, a1, 1 + add.d t1, a1, t0 + slli.d t2, a3, 1 + add.d t3, a3, t2 + + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + vsubwev.h.bu vr2, vr0, vr4 + vsubwod.h.bu vr3, vr0, vr4 + vsubwev.h.bu vr6, vr1, vr5 + vsubwod.h.bu vr7, vr1, vr5 + vmul.h vr2, vr2, vr2 + vmul.h vr3, vr3, vr3 + vmul.h vr6, vr6, vr6 + vmul.h vr7, vr7, vr7 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.wu.hu vr3, vr3, vr3 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.wu.hu vr7, vr7, vr7 + vadd.w vr2, vr2, vr3 + vadd.w vr6, vr6, vr7 + vadd.w vr2, vr2, vr6 + vhaddw.d.w vr2, vr2, vr2 + vhaddw.q.d vr2, vr2, vr2 + vpickve2gr.w a0, vr2, 0 +endfunc_x264 + +/* + * int x264_pixel_ssd_8x8( pixel *pix1, intptr_t i_stride_pix1, + * pixel *pix2, intptr_t i_stride_pix2 ) + */ +function_x264 pixel_ssd_8x8_lsx + slli.d t0, a1, 1 + add.d t1, a1, t0 + slli.d t2, a3, 1 + add.d t3, a3, t2 + + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + vsubwev.h.bu vr2, vr0, vr4 + vsubwod.h.bu vr3, vr0, vr4 + vsubwev.h.bu vr6, vr1, vr5 + vsubwod.h.bu vr7, vr1, vr5 + vmul.h vr2, vr2, vr2 + vmul.h vr3, vr3, vr3 + vmul.h vr6, vr6, vr6 + vmul.h vr7, vr7, vr7 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.wu.hu vr3, vr3, vr3 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.wu.hu vr7, vr7, vr7 + vadd.w vr2, vr2, vr3 + vadd.w vr6, vr6, vr7 + vadd.w vr10, vr2, vr6 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + vsubwev.h.bu vr2, vr0, vr4 + vsubwod.h.bu vr3, vr0, vr4 + vsubwev.h.bu vr6, vr1, vr5 + vsubwod.h.bu vr7, vr1, vr5 + vmul.h vr2, vr2, vr2 + vmul.h vr3, vr3, vr3 + vmul.h vr6, vr6, vr6 + vmul.h vr7, vr7, vr7 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.wu.hu vr3, vr3, vr3 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.wu.hu vr7, vr7, vr7 + vadd.w vr2, vr2, vr3 + vadd.w vr6, vr6, vr7 + vadd.w vr11, vr2, vr6 + + vadd.w vr10, vr10, vr11 + vhaddw.d.w vr10, vr10, vr10 + vhaddw.q.d vr10, vr10, vr10 + vpickve2gr.w a0, vr10, 0 +endfunc_x264 + +/* + * int x264_pixel_ssd_8x16( pixel *pix1, intptr_t i_stride_pix1, + * pixel *pix2, intptr_t i_stride_pix2 ) + */ +function_x264 pixel_ssd_8x16_lsx + slli.d t0, a1, 1 + add.d t1, a1, t0 + slli.d t2, a3, 1 + add.d t3, a3, t2 + + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, 
f5, f6, f7 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + vsubwev.h.bu vr2, vr0, vr4 + vsubwod.h.bu vr3, vr0, vr4 + vsubwev.h.bu vr6, vr1, vr5 + vsubwod.h.bu vr7, vr1, vr5 + vmul.h vr2, vr2, vr2 + vmul.h vr3, vr3, vr3 + vmul.h vr6, vr6, vr6 + vmul.h vr7, vr7, vr7 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.wu.hu vr3, vr3, vr3 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.wu.hu vr7, vr7, vr7 + vadd.w vr2, vr2, vr3 + vadd.w vr6, vr6, vr7 + vadd.w vr10, vr2, vr6 + +.rept 3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + vsubwev.h.bu vr2, vr0, vr4 + vsubwod.h.bu vr3, vr0, vr4 + vsubwev.h.bu vr6, vr1, vr5 + vsubwod.h.bu vr7, vr1, vr5 + vmul.h vr2, vr2, vr2 + vmul.h vr3, vr3, vr3 + vmul.h vr6, vr6, vr6 + vmul.h vr7, vr7, vr7 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.wu.hu vr3, vr3, vr3 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.wu.hu vr7, vr7, vr7 + vadd.w vr2, vr2, vr3 + vadd.w vr6, vr6, vr7 + vadd.w vr11, vr2, vr6 + vadd.w vr10, vr10, vr11 +.endr + + vhaddw.d.w vr10, vr10, vr10 + vhaddw.q.d vr10, vr10, vr10 + vpickve2gr.w a0, vr10, 0 +endfunc_x264 + +/* + * int x264_pixel_ssd_16x8( pixel *pix1, intptr_t i_stride_pix1, + * pixel *pix2, intptr_t i_stride_pix2 ) + */ +function_x264 pixel_ssd_16x8_lsx + slli.d t0, a1, 1 + add.d t1, a1, t0 + slli.d t2, a3, 1 + add.d t3, a3, t2 + + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7 + vsubwev.h.bu vr8, vr0, vr4 + vsubwod.h.bu vr9, vr0, vr4 + vsubwev.h.bu vr10, vr1, vr5 + vsubwod.h.bu vr11, vr1, vr5 + vsubwev.h.bu vr12, vr2, vr6 + vsubwod.h.bu vr13, vr2, vr6 + vsubwev.h.bu vr14, vr3, vr7 + vsubwod.h.bu vr15, vr3, vr7 + vmul.h vr8, vr8, vr8 + vmul.h vr9, vr9, vr9 + vmul.h vr10, vr10, vr10 + vmul.h vr11, vr11, vr11 + vmul.h vr12, vr12, vr12 + vmul.h vr13, vr13, vr13 + vmul.h vr14, vr14, vr14 + vmul.h vr15, vr15, vr15 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + vhaddw.wu.hu vr12, vr12, vr12 + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.wu.hu vr14, vr14, vr14 + vhaddw.wu.hu vr15, vr15, vr15 + vadd.w vr8, vr8, vr9 + vadd.w vr9, vr10, vr11 + vadd.w vr10, vr12, vr13 + vadd.w vr11, vr14, vr15 + vadd.w vr8, vr8, vr9 + vadd.w vr9, vr10, vr11 + vadd.w vr16, vr8, vr9 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7 + vsubwev.h.bu vr8, vr0, vr4 + vsubwod.h.bu vr9, vr0, vr4 + vsubwev.h.bu vr10, vr1, vr5 + vsubwod.h.bu vr11, vr1, vr5 + vsubwev.h.bu vr12, vr2, vr6 + vsubwod.h.bu vr13, vr2, vr6 + vsubwev.h.bu vr14, vr3, vr7 + vsubwod.h.bu vr15, vr3, vr7 + vmul.h vr8, vr8, vr8 + vmul.h vr9, vr9, vr9 + vmul.h vr10, vr10, vr10 + vmul.h vr11, vr11, vr11 + vmul.h vr12, vr12, vr12 + vmul.h vr13, vr13, vr13 + vmul.h vr14, vr14, vr14 + vmul.h vr15, vr15, vr15 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + vhaddw.wu.hu vr12, vr12, vr12 + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.wu.hu vr14, vr14, vr14 + vhaddw.wu.hu vr15, vr15, vr15 + vadd.w vr8, vr8, vr9 + vadd.w vr9, vr10, vr11 + vadd.w vr10, vr12, vr13 + vadd.w vr11, vr14, vr15 + vadd.w vr8, vr8, vr9 + vadd.w vr9, vr10, vr11 + vadd.w vr17, vr8, vr9 + + vadd.w vr10, vr16, vr17 + vhaddw.d.w vr10, vr10, vr10 + vhaddw.q.d vr10, 
vr10, vr10 + vpickve2gr.w a0, vr10, 0 +endfunc_x264 + +/* + * int x264_pixel_ssd_16x16( pixel *pix1, intptr_t i_stride_pix1, + * pixel *pix2, intptr_t i_stride_pix2 ) + */ +function_x264 pixel_ssd_16x16_lsx + slli.d t0, a1, 1 + add.d t1, a1, t0 + slli.d t2, a3, 1 + add.d t3, a3, t2 + + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7 + vsubwev.h.bu vr8, vr0, vr4 + vsubwod.h.bu vr9, vr0, vr4 + vsubwev.h.bu vr10, vr1, vr5 + vsubwod.h.bu vr11, vr1, vr5 + vsubwev.h.bu vr12, vr2, vr6 + vsubwod.h.bu vr13, vr2, vr6 + vsubwev.h.bu vr14, vr3, vr7 + vsubwod.h.bu vr15, vr3, vr7 + vmul.h vr8, vr8, vr8 + vmul.h vr9, vr9, vr9 + vmul.h vr10, vr10, vr10 + vmul.h vr11, vr11, vr11 + vmul.h vr12, vr12, vr12 + vmul.h vr13, vr13, vr13 + vmul.h vr14, vr14, vr14 + vmul.h vr15, vr15, vr15 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + vhaddw.wu.hu vr12, vr12, vr12 + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.wu.hu vr14, vr14, vr14 + vhaddw.wu.hu vr15, vr15, vr15 + vadd.w vr8, vr8, vr9 + vadd.w vr9, vr10, vr11 + vadd.w vr10, vr12, vr13 + vadd.w vr11, vr14, vr15 + vadd.w vr8, vr8, vr9 + vadd.w vr9, vr10, vr11 + vadd.w vr16, vr8, vr9 + +.rept 3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t3, vr4, vr5, vr6, vr7 + vsubwev.h.bu vr8, vr0, vr4 + vsubwod.h.bu vr9, vr0, vr4 + vsubwev.h.bu vr10, vr1, vr5 + vsubwod.h.bu vr11, vr1, vr5 + vsubwev.h.bu vr12, vr2, vr6 + vsubwod.h.bu vr13, vr2, vr6 + vsubwev.h.bu vr14, vr3, vr7 + vsubwod.h.bu vr15, vr3, vr7 + vmul.h vr8, vr8, vr8 + vmul.h vr9, vr9, vr9 + vmul.h vr10, vr10, vr10 + vmul.h vr11, vr11, vr11 + vmul.h vr12, vr12, vr12 + vmul.h vr13, vr13, vr13 + vmul.h vr14, vr14, vr14 + vmul.h vr15, vr15, vr15 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + vhaddw.wu.hu vr12, vr12, vr12 + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.wu.hu vr14, vr14, vr14 + vhaddw.wu.hu vr15, vr15, vr15 + vadd.w vr8, vr8, vr9 + vadd.w vr9, vr10, vr11 + vadd.w vr10, vr12, vr13 + vadd.w vr11, vr14, vr15 + vadd.w vr8, vr8, vr9 + vadd.w vr9, vr10, vr11 + vadd.w vr17, vr8, vr9 + vadd.w vr16, vr16, vr17 +.endr + vhaddw.d.w vr16, vr16, vr16 + vhaddw.q.d vr16, vr16, vr16 + vpickve2gr.w a0, vr16, 0 +endfunc_x264 + +/* + * int x264_pixel_sa8d_8x8( pixel *pix1, intptr_t i_pix1, pixel *pix2, intptr_t i_pix2 ) + */ +.macro pixel_sa8d_8x8_lsx_core out0, out1, out2, out3 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 a2, a3, t2, t3, f4, f5, f6, f7 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + vsubwev.h.bu vr2, vr0, vr4 + vsubwod.h.bu vr3, vr0, vr4 + vsubwev.h.bu vr6, vr1, vr5 + vsubwod.h.bu vr7, vr1, vr5 + vadd.h vr8, vr2, vr3 + vsub.h vr9, vr2, vr3 + vadd.h vr10, vr6, vr7 + vsub.h vr11, vr6, vr7 + vpackev.h vr0, vr9, vr8 + vpackod.h vr1, vr9, vr8 + vpackev.h vr2, vr11, vr10 + vpackod.h vr3, vr11, vr10 + vadd.h vr4, vr0, vr1 + vsub.h vr5, vr0, vr1 + vadd.h vr6, vr2, vr3 + vsub.h vr7, vr2, vr3 + vilvl.d vr0, vr5, vr4 + vilvh.d vr1, vr5, vr4 + vilvl.d vr2, vr7, vr6 + vilvh.d vr3, vr7, vr6 + vadd.h vr12, vr0, vr1 + vsub.h vr13, vr0, vr1 + vadd.h vr14, vr2, vr3 + vsub.h vr15, vr2, vr3 + + alsl.d t4, a1, a0, 2 + alsl.d t5, a3, a2, 2 + FLDD_LOADX_4 t4, a1, t0, t1, f0, f1, f2, f3 + FLDD_LOADX_4 t5, a3, t2, t3, f4, f5, f6, f7 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 
+ vilvl.d vr5, vr7, vr6 + vsubwev.h.bu vr2, vr0, vr4 + vsubwod.h.bu vr3, vr0, vr4 + vsubwev.h.bu vr6, vr1, vr5 + vsubwod.h.bu vr7, vr1, vr5 + vadd.h vr8, vr2, vr3 + vsub.h vr9, vr2, vr3 + vadd.h vr10, vr6, vr7 + vsub.h vr11, vr6, vr7 + vpackev.h vr0, vr9, vr8 + vpackod.h vr1, vr9, vr8 + vpackev.h vr2, vr11, vr10 + vpackod.h vr3, vr11, vr10 + vadd.h vr4, vr0, vr1 + vsub.h vr5, vr0, vr1 + vadd.h vr6, vr2, vr3 + vsub.h vr7, vr2, vr3 + vilvl.d vr0, vr5, vr4 + vilvh.d vr1, vr5, vr4 + vilvl.d vr2, vr7, vr6 + vilvh.d vr3, vr7, vr6 + vadd.h vr4, vr0, vr1 + vsub.h vr5, vr0, vr1 + vadd.h vr6, vr2, vr3 + vsub.h vr7, vr2, vr3 + + // vr12 vr13 vr14 vr15 + vpickev.w vr0, vr13, vr12 + vpickod.w vr1, vr13, vr12 + vpickev.w vr2, vr15, vr14 + vpickod.w vr3, vr15, vr14 + vadd.h vr8, vr0, vr1 + vsub.h vr9, vr0, vr1 + vadd.h vr10, vr2, vr3 + vsub.h vr11, vr2, vr3 + vadd.h vr12, vr8, vr10 + vadd.h vr13, vr9, vr11 + vsub.h vr14, vr8, vr10 + vsub.h vr15, vr9, vr11 + + // vr4 vr5 vr6 vr7 + vpickev.w vr0, vr5, vr4 + vpickod.w vr1, vr5, vr4 + vpickev.w vr2, vr7, vr6 + vpickod.w vr3, vr7, vr6 + vadd.h vr8, vr0, vr1 + vsub.h vr9, vr0, vr1 + vadd.h vr10, vr2, vr3 + vsub.h vr11, vr2, vr3 + vadd.h vr4, vr8, vr10 + vadd.h vr5, vr9, vr11 + vsub.h vr6, vr8, vr10 + vsub.h vr7, vr9, vr11 + + vadd.h vr0, vr12, vr4 + vadd.h vr1, vr13, vr5 + vadd.h vr2, vr14, vr6 + vadd.h vr3, vr15, vr7 + vsub.h vr8, vr12, vr4 + vsub.h vr9, vr13, vr5 + vsub.h vr10, vr14, vr6 + vsub.h vr11, vr15, vr7 + vadda.h \out0, vr0, vr8 + vadda.h \out1, vr1, vr9 + vadda.h \out2, vr2, vr10 + vadda.h \out3, vr3, vr11 +.endm + +function_x264 pixel_sa8d_8x8_lsx + slli.d t0, a1, 1 + add.d t1, t0, a1 + slli.d t2, a3, 1 + add.d t3, t2, a3 + pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3 + vadd.h vr0, vr0, vr1 + vadd.h vr1, vr2, vr3 + vadd.h vr17, vr0, vr1 + vhaddw.wu.hu vr17, vr17, vr17 + vhaddw.du.wu vr17, vr17, vr17 + vhaddw.qu.du vr17, vr17, vr17 + vpickve2gr.wu t5, vr17, 0 + addi.d t5, t5, 2 + srli.d a0, t5, 2 +endfunc_x264 + +/* + * int x264_pixel_sa8d_16x16( pixel *pix1, intptr_t i_pix1, + * pixel *pix2, intptr_t i_pix2 ) + */ +function_x264 pixel_sa8d_16x16_lsx + slli.d t0, a1, 1 + add.d t1, t0, a1 + slli.d t2, a3, 1 + add.d t3, t2, a3 + add.d t6, a0, zero + add.d t7, a2, zero + pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3 + vadd.h vr0, vr0, vr1 + vadd.h vr1, vr2, vr3 + vadd.h vr16, vr0, vr1 + + addi.d a0, t6, 8 + addi.d a2, t7, 8 + pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3 + vadd.h vr0, vr0, vr1 + vadd.h vr1, vr2, vr3 + vadd.h vr17, vr0, vr1 + + alsl.d a0, a1, t6, 3 + alsl.d a2, a3, t7, 3 + pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3 + vadd.h vr0, vr0, vr1 + vadd.h vr1, vr2, vr3 + vadd.h vr18, vr0, vr1 + + addi.d a0, a0, 8 + addi.d a2, a2, 8 + pixel_sa8d_8x8_lsx_core vr0, vr1, vr2, vr3 + vadd.h vr0, vr0, vr1 + vadd.h vr1, vr2, vr3 + vadd.h vr19, vr0, vr1 + + vhaddw.wu.hu vr16, vr16, vr16 + vhaddw.wu.hu vr17, vr17, vr17 + vhaddw.wu.hu vr18, vr18, vr18 + vhaddw.wu.hu vr19, vr19, vr19 + vadd.w vr16, vr17, vr16 + vadd.w vr18, vr19, vr18 + vadd.w vr17, vr18, vr16 + vhaddw.du.wu vr17, vr17, vr17 + vhaddw.qu.du vr17, vr17, vr17 + vpickve2gr.wu t5, vr17, 0 + addi.d t5, t5, 2 + srli.d a0, t5, 2 +endfunc_x264 + +/* + * uint64_t pixel_var_8x8( pixel *pix, intptr_t i_stride ) + */ +function_x264 pixel_var_8x8_lsx + slli.d t0, a1, 1 + add.d t1, a1, t0 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + alsl.d a0, a1, a0, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + vhaddw.hu.bu 
vr2, vr0, vr0 + vhaddw.hu.bu vr3, vr1, vr1 + vhaddw.hu.bu vr6, vr4, vr4 + vhaddw.hu.bu vr7, vr5, vr5 + vadd.h vr2, vr2, vr3 + vadd.h vr6, vr6, vr7 + vadd.h vr2, vr2, vr6 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.du.wu vr2, vr2, vr2 + vhaddw.qu.du vr2, vr2, vr2 + vpickve2gr.wu t5, vr2, 0 // sum + + vmulwev.h.bu vr2, vr0, vr0 + vmulwod.h.bu vr3, vr0, vr0 + vmulwev.h.bu vr6, vr1, vr1 + vmulwod.h.bu vr7, vr1, vr1 + vmulwev.h.bu vr8, vr4, vr4 + vmulwod.h.bu vr9, vr4, vr4 + vmulwev.h.bu vr10, vr5, vr5 + vmulwod.h.bu vr11, vr5, vr5 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.wu.hu vr3, vr3, vr3 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + + vadd.w vr2, vr2, vr3 + vadd.w vr6, vr6, vr7 + vadd.w vr8, vr8, vr9 + vadd.w vr10, vr10, vr11 + vadd.w vr2, vr2, vr6 + vadd.w vr8, vr8, vr10 + vadd.w vr2, vr2, vr8 + vhaddw.du.wu vr2, vr2, vr2 + vhaddw.qu.du vr2, vr2, vr2 + vpickve2gr.du t6, vr2, 0 // sqr + + slli.d t4, t6, 32 + add.d a0, t4, t5 +endfunc_x264 + +/* + * uint64_t pixel_var_8x16( pixel *pix, intptr_t i_stride ) + */ +function_x264 pixel_var_8x16_lsx + slli.d t0, a1, 1 + add.d t1, a1, t0 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + alsl.d a0, a1, a0, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + vhaddw.hu.bu vr2, vr0, vr0 + vhaddw.hu.bu vr3, vr1, vr1 + vhaddw.hu.bu vr6, vr4, vr4 + vhaddw.hu.bu vr7, vr5, vr5 + vadd.h vr2, vr2, vr3 + vadd.h vr6, vr6, vr7 + vadd.h vr16, vr2, vr6 + + vmulwev.h.bu vr2, vr0, vr0 + vmulwod.h.bu vr3, vr0, vr0 + vmulwev.h.bu vr6, vr1, vr1 + vmulwod.h.bu vr7, vr1, vr1 + vmulwev.h.bu vr8, vr4, vr4 + vmulwod.h.bu vr9, vr4, vr4 + vmulwev.h.bu vr10, vr5, vr5 + vmulwod.h.bu vr11, vr5, vr5 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.wu.hu vr3, vr3, vr3 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + vadd.w vr12, vr2, vr3 + vadd.w vr13, vr6, vr7 + vadd.w vr14, vr8, vr9 + vadd.w vr15, vr10, vr11 + vadd.w vr12, vr12, vr13 + vadd.w vr14, vr14, vr15 + vadd.w vr12, vr12, vr14 + + alsl.d a0, a1, a0, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f0, f1, f2, f3 + alsl.d a0, a1, a0, 2 + FLDD_LOADX_4 a0, a1, t0, t1, f4, f5, f6, f7 + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr4, vr5, vr4 + vilvl.d vr5, vr7, vr6 + vhaddw.hu.bu vr2, vr0, vr0 + vhaddw.hu.bu vr3, vr1, vr1 + vhaddw.hu.bu vr6, vr4, vr4 + vhaddw.hu.bu vr7, vr5, vr5 + vadd.h vr2, vr2, vr3 + vadd.h vr6, vr6, vr7 + vadd.h vr2, vr2, vr6 + vadd.h vr2, vr2, vr16 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.du.wu vr2, vr2, vr2 + vhaddw.qu.du vr2, vr2, vr2 + vpickve2gr.wu t5, vr2, 0 // sum + + vmulwev.h.bu vr2, vr0, vr0 + vmulwod.h.bu vr3, vr0, vr0 + vmulwev.h.bu vr6, vr1, vr1 + vmulwod.h.bu vr7, vr1, vr1 + vmulwev.h.bu vr8, vr4, vr4 + vmulwod.h.bu vr9, vr4, vr4 + vmulwev.h.bu vr10, vr5, vr5 + vmulwod.h.bu vr11, vr5, vr5 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.wu.hu vr3, vr3, vr3 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + vadd.w vr2, vr2, vr3 + vadd.w vr6, vr6, vr7 + vadd.w vr8, vr8, vr9 + vadd.w vr10, vr10, vr11 + vadd.w vr2, vr2, vr6 + vadd.w vr8, vr8, vr10 + vadd.w vr2, vr2, vr8 + vadd.w vr2, vr2, vr12 + vhaddw.du.wu vr2, vr2, vr2 + vhaddw.qu.du vr2, vr2, 
vr2 + vpickve2gr.du t6, vr2, 0 // sqr + slli.d t4, t6, 32 + add.d a0, t4, t5 +endfunc_x264 + +/* + * uint64_t pixel_var_16x16( pixel *pix, intptr_t i_stride ) + */ +function_x264 pixel_var_16x16_lsx + slli.d t0, a1, 1 + add.d t1, t0, a1 + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + vhaddw.hu.bu vr4, vr0, vr0 + vhaddw.hu.bu vr5, vr1, vr1 + vhaddw.hu.bu vr6, vr2, vr2 + vhaddw.hu.bu vr7, vr3, vr3 + vadd.h vr4, vr5, vr4 + vadd.h vr5, vr7, vr6 + vadd.h vr13, vr5, vr4 + + vmulwev.h.bu vr5, vr0, vr0 + vmulwod.h.bu vr6, vr0, vr0 + vmulwev.h.bu vr7, vr1, vr1 + vmulwod.h.bu vr8, vr1, vr1 + vmulwev.h.bu vr9, vr2, vr2 + vmulwod.h.bu vr10, vr2, vr2 + vmulwev.h.bu vr11, vr3, vr3 + vmulwod.h.bu vr12, vr3, vr3 + vhaddw.wu.hu vr5, vr5, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + vhaddw.wu.hu vr12, vr12, vr12 + vadd.w vr5, vr5, vr6 + vadd.w vr6, vr8, vr7 + vadd.w vr7, vr10, vr9 + vadd.w vr8, vr12, vr11 + vadd.w vr0, vr5, vr6 + vadd.w vr1, vr8, vr7 + vadd.w vr14, vr1, vr0 + +.rept 3 + alsl.d a0, a1, a0, 2 + LSX_LOADX_4 a0, a1, t0, t1, vr0, vr1, vr2, vr3 + vhaddw.hu.bu vr4, vr0, vr0 + vhaddw.hu.bu vr5, vr1, vr1 + vhaddw.hu.bu vr6, vr2, vr2 + vhaddw.hu.bu vr7, vr3, vr3 + vadd.h vr4, vr5, vr4 + vadd.h vr5, vr7, vr6 + vadd.h vr4, vr5, vr4 + vadd.h vr13, vr4, vr13 + + vmulwev.h.bu vr5, vr0, vr0 + vmulwod.h.bu vr6, vr0, vr0 + vmulwev.h.bu vr7, vr1, vr1 + vmulwod.h.bu vr8, vr1, vr1 + vmulwev.h.bu vr9, vr2, vr2 + vmulwod.h.bu vr10, vr2, vr2 + vmulwev.h.bu vr11, vr3, vr3 + vmulwod.h.bu vr12, vr3, vr3 + vhaddw.wu.hu vr5, vr5, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + vhaddw.wu.hu vr12, vr12, vr12 + vadd.w vr5, vr5, vr6 + vadd.w vr6, vr8, vr7 + vadd.w vr7, vr10, vr9 + vadd.w vr8, vr12, vr11 + vadd.w vr0, vr5, vr6 + vadd.w vr1, vr8, vr7 + vadd.w vr0, vr1, vr0 + vadd.w vr14, vr0, vr14 +.endr + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.du.wu vr13, vr13, vr13 + vhaddw.qu.du vr13, vr13, vr13 + vpickve2gr.wu t4, vr13, 0 + + vhaddw.du.wu vr14, vr14, vr14 + vhaddw.qu.du vr14, vr14, vr14 + vpickve2gr.du t6, vr14, 0 // sqr + + slli.d t5, t6, 32 + add.d a0, t4, t5 +endfunc_x264 + +.macro sse_diff_8width_lsx in0, in1, in2, in3 + fld.d f0, \in0, 0 + fld.d f1, \in0, FENC_STRIDE + fld.d f2, \in0, FENC_STRIDE * 2 + fld.d f3, \in0, FENC_STRIDE * 3 + fld.d f4, \in1, 0 + fld.d f5, \in1, FDEC_STRIDE + fld.d f6, \in1, FDEC_STRIDE * 2 + fld.d f7, \in1, FDEC_STRIDE * 3 + + vilvl.d vr0, vr1, vr0 + vilvl.d vr1, vr3, vr2 + vilvl.d vr2, vr5, vr4 + vilvl.d vr3, vr7, vr6 + vsubwev.h.bu vr4, vr0, vr2 + vsubwod.h.bu vr5, vr0, vr2 + vsubwev.h.bu vr6, vr1, vr3 + vsubwod.h.bu vr7, vr1, vr3 + // sqr_u + vdp2add.w.h \in2, vr4, vr4 + vdp2add.w.h \in2, vr5, vr5 + vdp2add.w.h \in2, vr6, vr6 + vdp2add.w.h \in2, vr7, vr7 + // sum_u + vadd.h vr4, vr4, vr5 + vadd.h vr6, vr6, vr7 + vadd.h \in3, vr4, vr6 +.endm + +/* + * int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] ) + */ +function_x264 pixel_var2_8x8_lsx + vxor.v vr8, vr8, vr8 + sse_diff_8width_lsx a0, a1, vr8, vr9 + addi.d t0, a0, FENC_STRIDE * 4 + addi.d t1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lsx t0, t1, vr8, vr10 + vhaddw.d.w vr8, vr8, vr8 + vhaddw.q.d vr8, vr8, vr8 + vpickve2gr.w t2, vr8, 0 // sqr_u + vadd.h vr8, vr10, vr9 + vhaddw.w.h vr8, vr8, vr8 + vhaddw.d.w vr8, vr8, vr8 + vhaddw.q.d vr8, vr8, vr8 + 
vpickve2gr.w t3, vr8, 0 // sum_u + + addi.d a0, a0, FENC_STRIDE / 2 + addi.d a1, a1, FDEC_STRIDE / 2 + vxor.v vr8, vr8, vr8 + sse_diff_8width_lsx a0, a1, vr8, vr9 + addi.d t0, a0, FENC_STRIDE * 4 + addi.d t1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lsx t0, t1, vr8, vr10 + vhaddw.d.w vr8, vr8, vr8 + vhaddw.q.d vr8, vr8, vr8 + vpickve2gr.w t4, vr8, 0 // sqr_v + vadd.h vr8, vr10, vr9 + vhaddw.w.h vr8, vr8, vr8 + vhaddw.d.w vr8, vr8, vr8 + vhaddw.q.d vr8, vr8, vr8 + vpickve2gr.w t5, vr8, 0 // sum_v + + st.w t2, a2, 0 + st.w t4, a2, 4 + mul.w t3, t3, t3 + mul.w t5, t5, t5 + srai.w t3, t3, 6 + srai.w t5, t5, 6 + sub.w t2, t2, t3 + sub.w t4, t4, t5 + add.w a0, t2, t4 +endfunc_x264 + +/* + * int pixel_var2_8x16( pixel *fenc, pixel *fdec, int ssd[2] ) + */ +function_x264 pixel_var2_8x16_lsx + vxor.v vr8, vr8, vr8 + sse_diff_8width_lsx a0, a1, vr8, vr9 + addi.d t0, a0, FENC_STRIDE * 4 + addi.d t1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lsx t0, t1, vr8, vr10 + addi.d t0, t0, FENC_STRIDE * 4 + addi.d t1, t1, FDEC_STRIDE * 4 + sse_diff_8width_lsx t0, t1, vr8, vr11 + addi.d t0, t0, FENC_STRIDE * 4 + addi.d t1, t1, FDEC_STRIDE * 4 + sse_diff_8width_lsx t0, t1, vr8, vr12 + vhaddw.d.w vr8, vr8, vr8 + vhaddw.q.d vr8, vr8, vr8 + vpickve2gr.w t2, vr8, 0 // sqr_u + vadd.h vr8, vr10, vr9 + vadd.h vr8, vr11, vr8 + vadd.h vr8, vr12, vr8 + vhaddw.w.h vr8, vr8, vr8 + vhaddw.d.w vr8, vr8, vr8 + vhaddw.q.d vr8, vr8, vr8 + vpickve2gr.w t3, vr8, 0 // sum_u + + addi.d a0, a0, FENC_STRIDE / 2 + addi.d a1, a1, FDEC_STRIDE / 2 + vxor.v vr8, vr8, vr8 + sse_diff_8width_lsx a0, a1, vr8, vr9 + addi.d t0, a0, FENC_STRIDE * 4 + addi.d t1, a1, FDEC_STRIDE * 4 + sse_diff_8width_lsx t0, t1, vr8, vr10 + addi.d t0, t0, FENC_STRIDE * 4 + addi.d t1, t1, FDEC_STRIDE * 4 + sse_diff_8width_lsx t0, t1, vr8, vr11 + addi.d t0, t0, FENC_STRIDE * 4 + addi.d t1, t1, FDEC_STRIDE * 4 + sse_diff_8width_lsx t0, t1, vr8, vr12 + vhaddw.d.w vr8, vr8, vr8 + vhaddw.q.d vr8, vr8, vr8 + vpickve2gr.w t4, vr8, 0 // sqr_v + vadd.h vr8, vr10, vr9 + vadd.h vr8, vr11, vr8 + vadd.h vr8, vr12, vr8 + vhaddw.w.h vr8, vr8, vr8 + vhaddw.d.w vr8, vr8, vr8 + vhaddw.q.d vr8, vr8, vr8 + vpickve2gr.w t5, vr8, 0 // sum_v + + st.w t2, a2, 0 + st.w t4, a2, 4 + mul.w t3, t3, t3 + mul.w t5, t5, t5 + srai.w t3, t3, 7 + srai.w t5, t5, 7 + sub.w t2, t2, t3 + sub.w t4, t4, t5 + add.w a0, t2, t4 +endfunc_x264 +#endif /* !HIGH_BIT_DEPTH */ diff --git a/common/loongarch/pixel-c.c b/common/loongarch/pixel-c.c new file mode 100644 index 000000000..e417ef58c --- /dev/null +++ b/common/loongarch/pixel-c.c @@ -0,0 +1,259 @@ +/***************************************************************************** + * pixel-c.c: loongarch pixel metrics + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Hecai Yuan + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "pixel.h" +#include "predict.h" + +#if !HIGH_BIT_DEPTH + +uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = x264_hadamard_ac_8x8_lsx( p_pix, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = x264_hadamard_ac_8x8_lsx( p_pix, i_stride ); + u_sum += x264_hadamard_ac_8x8_lsx( p_pix + 8 * i_stride, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = x264_hadamard_ac_8x8_lsx( p_pix, i_stride ); + u_sum += x264_hadamard_ac_8x8_lsx( p_pix + 8, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = x264_hadamard_ac_8x8_lsx( p_pix, i_stride ); + u_sum += x264_hadamard_ac_8x8_lsx( p_pix + 8, i_stride ); + u_sum += x264_hadamard_ac_8x8_lsx( p_pix + 8 * i_stride, i_stride ); + u_sum += x264_hadamard_ac_8x8_lsx( p_pix + 8 * i_stride + 8, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = x264_hadamard_ac_8x8_lasx( p_pix, i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride ) +{ + uint64_t u_sum; + + u_sum = x264_hadamard_ac_8x8_lasx( p_pix, i_stride ); + u_sum += x264_hadamard_ac_8x8_lasx( p_pix + ( i_stride << 3 ), i_stride ); + + return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 ); +} + +void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ) +{ + ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); + + x264_predict_8x8_v_lsx( pix, p_edge ); + p_sad_array[0] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_8x8_h_lsx( pix, p_edge ); + p_sad_array[1] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_8x8_dc_lsx( pix, p_edge ); + p_sad_array[2] = x264_pixel_sa8d_8x8_lsx( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ) +{ + ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); + + x264_predict_8x8_v_lsx( pix, p_edge ); + p_sad_array[0] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_8x8_h_lasx( pix, p_edge ); + p_sad_array[1] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_8x8_dc_lsx( pix, p_edge ); + p_sad_array[2] = x264_pixel_sa8d_8x8_lasx( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_predict_4x4_v_lsx( 
p_dec ); + p_sad_array[0] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_4x4_h_lsx( p_dec ); + p_sad_array[1] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_4x4_dc_lsx( p_dec ); + p_sad_array[2] = x264_pixel_satd_4x4_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_predict_16x16_v_lsx( p_dec ); + p_sad_array[0] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_16x16_h_lsx( p_dec ); + p_sad_array[1] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_16x16_dc_lsx( p_dec ); + p_sad_array[2] = x264_pixel_satd_16x16_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_predict_16x16_v_lsx( p_dec ); + p_sad_array[0] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_16x16_h_lsx( p_dec ); + p_sad_array[1] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_16x16_dc_lsx( p_dec ); + p_sad_array[2] = x264_pixel_satd_16x16_lasx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_predict_8x8c_dc_lsx( p_dec ); + p_sad_array[0] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_8x8c_h_lsx( p_dec ); + p_sad_array[1] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_8x8c_v_lsx( p_dec ); + p_sad_array[2] = x264_pixel_satd_8x8_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_predict_4x4_v_lsx( p_dec ); + p_sad_array[0] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_4x4_h_lsx( p_dec ); + p_sad_array[1] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_4x4_dc_lsx( p_dec ); + p_sad_array[2] = x264_pixel_sad_4x4_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_predict_16x16_v_lsx( p_dec ); + p_sad_array[0] = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_16x16_h_lsx( p_dec ); + p_sad_array[1] = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_16x16_dc_lsx( p_dec ); + p_sad_array[2] = x264_pixel_sad_16x16_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ) +{ + ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] ); + + x264_predict_8x8_v_lsx( pix, p_edge ); + p_sad_array[0] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_8x8_h_lsx( pix, p_edge ); + p_sad_array[1] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_8x8_dc_lsx( pix, p_edge ); + p_sad_array[2] = x264_pixel_sad_8x8_lsx( pix, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + +void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ) +{ + x264_predict_8x8c_dc_lsx( p_dec ); + p_sad_array[0] = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + 
x264_predict_8x8c_h_lsx( p_dec ); + p_sad_array[1] = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); + + x264_predict_8x8c_v_lsx( p_dec ); + p_sad_array[2] = x264_pixel_sad_8x8_lsx( p_dec, FDEC_STRIDE, + p_enc, FENC_STRIDE ); +} + + +#endif diff --git a/common/loongarch/pixel.h b/common/loongarch/pixel.h new file mode 100644 index 000000000..6d34b03bf --- /dev/null +++ b/common/loongarch/pixel.h @@ -0,0 +1,335 @@ +/***************************************************************************** + * pixel.h: loongarch pixel metrics + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Lu Wang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#ifndef X264_LOONGARCH_PIXEL_H +#define X264_LOONGARCH_PIXEL_H + +#define x264_pixel_satd_4x4_lsx x264_template(pixel_satd_4x4_lsx) +int32_t x264_pixel_satd_4x4_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_4x8_lsx x264_template(pixel_satd_4x8_lsx) +int32_t x264_pixel_satd_4x8_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_4x16_lsx x264_template(pixel_satd_4x16_lsx) +int32_t x264_pixel_satd_4x16_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_8x4_lsx x264_template(pixel_satd_8x4_lsx) +int32_t x264_pixel_satd_8x4_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_8x8_lsx x264_template(pixel_satd_8x8_lsx) +int32_t x264_pixel_satd_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_8x16_lsx x264_template(pixel_satd_8x16_lsx) +int32_t x264_pixel_satd_8x16_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_16x8_lsx x264_template(pixel_satd_16x8_lsx) +int32_t x264_pixel_satd_16x8_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_16x16_lsx x264_template(pixel_satd_16x16_lsx) +int32_t x264_pixel_satd_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); + +#define x264_pixel_satd_4x8_lasx x264_template(pixel_satd_4x8_lasx) +int32_t x264_pixel_satd_4x8_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_4x16_lasx x264_template(pixel_satd_4x16_lasx) +int32_t x264_pixel_satd_4x16_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define 
x264_pixel_satd_8x4_lasx x264_template(pixel_satd_8x4_lasx) +int32_t x264_pixel_satd_8x4_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_8x8_lasx x264_template(pixel_satd_8x8_lasx) +int32_t x264_pixel_satd_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_8x16_lasx x264_template(pixel_satd_8x16_lasx) +int32_t x264_pixel_satd_8x16_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_16x8_lasx x264_template(pixel_satd_16x8_lasx) +int32_t x264_pixel_satd_16x8_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_satd_16x16_lasx x264_template(pixel_satd_16x16_lasx) +int32_t x264_pixel_satd_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); + +#define x264_pixel_sad_x4_16x16_lsx x264_template(pixel_sad_x4_16x16_lsx) +void x264_pixel_sad_x4_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_16x8_lsx x264_template(pixel_sad_x4_16x8_lsx) +void x264_pixel_sad_x4_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_8x16_lsx x264_template(pixel_sad_x4_8x16_lsx) +void x264_pixel_sad_x4_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_8x8_lsx x264_template(pixel_sad_x4_8x8_lsx) +void x264_pixel_sad_x4_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_8x4_lsx x264_template(pixel_sad_x4_8x4_lsx) +void x264_pixel_sad_x4_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_4x8_lsx x264_template(pixel_sad_x4_4x8_lsx) +void x264_pixel_sad_x4_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); + +#define x264_pixel_sad_x4_16x16_lasx x264_template(pixel_sad_x4_16x16_lasx) +void x264_pixel_sad_x4_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_16x8_lasx x264_template(pixel_sad_x4_16x8_lasx) +void x264_pixel_sad_x4_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_8x8_lasx x264_template(pixel_sad_x4_8x8_lasx) +void x264_pixel_sad_x4_8x8_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_8x4_lasx x264_template(pixel_sad_x4_8x4_lasx) +void x264_pixel_sad_x4_8x4_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); +#define x264_pixel_sad_x4_4x4_lsx x264_template(pixel_sad_x4_4x4_lsx) +void x264_pixel_sad_x4_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + 
uint8_t *p_ref3, intptr_t i_ref_stride, + int32_t p_sad_array[4] ); + +#define x264_pixel_sad_x3_16x16_lsx x264_template(pixel_sad_x3_16x16_lsx) +void x264_pixel_sad_x3_16x16_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_16x8_lsx x264_template(pixel_sad_x3_16x8_lsx) +void x264_pixel_sad_x3_16x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_8x16_lsx x264_template(pixel_sad_x3_8x16_lsx) +void x264_pixel_sad_x3_8x16_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_8x8_lsx x264_template(pixel_sad_x3_8x8_lsx) +void x264_pixel_sad_x3_8x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_8x4_lsx x264_template(pixel_sad_x3_8x4_lsx) +void x264_pixel_sad_x3_8x4_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_4x4_lsx x264_template(pixel_sad_x3_4x4_lsx) +void x264_pixel_sad_x3_4x4_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_4x8_lsx x264_template(pixel_sad_x3_4x8_lsx) +void x264_pixel_sad_x3_4x8_lsx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); + +#define x264_pixel_sad_x3_16x16_lasx x264_template(pixel_sad_x3_16x16_lasx) +void x264_pixel_sad_x3_16x16_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); +#define x264_pixel_sad_x3_16x8_lasx x264_template(pixel_sad_x3_16x8_lasx) +void x264_pixel_sad_x3_16x8_lasx( uint8_t *p_src, uint8_t *p_ref0, + uint8_t *p_ref1, uint8_t *p_ref2, + intptr_t i_ref_stride, + int32_t p_sad_array[3] ); + +#define x264_pixel_sad_16x16_lsx x264_template(pixel_sad_16x16_lsx) +int32_t x264_pixel_sad_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_16x8_lsx x264_template(pixel_sad_16x8_lsx) +int32_t x264_pixel_sad_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_8x16_lsx x264_template(pixel_sad_8x16_lsx) +int32_t x264_pixel_sad_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_8x8_lsx x264_template(pixel_sad_8x8_lsx) +int32_t x264_pixel_sad_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_8x4_lsx x264_template(pixel_sad_8x4_lsx) +int32_t x264_pixel_sad_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_4x16_lsx x264_template(pixel_sad_4x16_lsx) +int32_t x264_pixel_sad_4x16_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_4x8_lsx x264_template(pixel_sad_4x8_lsx) +int32_t x264_pixel_sad_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_sad_4x4_lsx x264_template(pixel_sad_4x4_lsx) +int32_t x264_pixel_sad_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride, + 
uint8_t *p_ref, intptr_t i_ref_stride ); + +#define x264_pixel_sad_8x4_lasx x264_template(pixel_sad_8x4_lasx) +int32_t x264_pixel_sad_8x4_lasx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); + +#define x264_hadamard_ac_8x8_lsx x264_template(hadamard_ac_8x8_lsx) +uint64_t x264_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_8x8_lsx x264_template(pixel_hadamard_ac_8x8_lsx) +uint64_t x264_pixel_hadamard_ac_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_8x16_lsx x264_template(pixel_hadamard_ac_8x16_lsx) +uint64_t x264_pixel_hadamard_ac_8x16_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_16x8_lsx x264_template(pixel_hadamard_ac_16x8_lsx) +uint64_t x264_pixel_hadamard_ac_16x8_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_16x16_lsx x264_template(pixel_hadamard_ac_16x16_lsx) +uint64_t x264_pixel_hadamard_ac_16x16_lsx( uint8_t *p_pix, intptr_t i_stride ); + +#define x264_hadamard_ac_8x8_lasx x264_template(hadamard_ac_8x8_lasx) +uint64_t x264_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_8x8_lasx x264_template(pixel_hadamard_ac_8x8_lasx) +uint64_t x264_pixel_hadamard_ac_8x8_lasx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_8x16_lasx x264_template(pixel_hadamard_ac_8x16_lasx) +uint64_t x264_pixel_hadamard_ac_8x16_lasx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_16x8_lasx x264_template(pixel_hadamard_ac_16x8_lasx) +uint64_t x264_pixel_hadamard_ac_16x8_lasx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_hadamard_ac_16x16_lasx x264_template(pixel_hadamard_ac_16x16_lasx) +uint64_t x264_pixel_hadamard_ac_16x16_lasx( uint8_t *p_pix, intptr_t i_stride ); + +#define x264_intra_satd_x3_16x16_lsx x264_template(intra_satd_x3_16x16_lsx) +void x264_intra_satd_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +#define x264_intra_satd_x3_8x8c_lsx x264_template(intra_satd_x3_8x8c_lsx) +void x264_intra_satd_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +#define x264_intra_satd_x3_4x4_lsx x264_template(intra_satd_x3_4x4_lsx) +void x264_intra_satd_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +#define x264_intra_satd_x3_16x16_lasx x264_template(intra_satd_x3_16x16_lasx) +void x264_intra_satd_x3_16x16_lasx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); + +#define x264_pixel_ssd_16x16_lsx x264_template(pixel_ssd_16x16_lsx) +int32_t x264_pixel_ssd_16x16_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_16x8_lsx x264_template(pixel_ssd_16x8_lsx) +int32_t x264_pixel_ssd_16x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_8x16_lsx x264_template(pixel_ssd_8x16_lsx) +int32_t x264_pixel_ssd_8x16_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_8x8_lsx x264_template(pixel_ssd_8x8_lsx) +int32_t x264_pixel_ssd_8x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_8x4_lsx x264_template(pixel_ssd_8x4_lsx) +int32_t x264_pixel_ssd_8x4_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_4x16_lsx x264_template(pixel_ssd_4x16_lsx) +int32_t x264_pixel_ssd_4x16_lsx( uint8_t *p_src, 
intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_4x8_lsx x264_template(pixel_ssd_4x8_lsx) +int32_t x264_pixel_ssd_4x8_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_4x4_lsx x264_template(pixel_ssd_4x4_lsx) +int32_t x264_pixel_ssd_4x4_lsx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); + +#define x264_pixel_ssd_16x16_lasx x264_template(pixel_ssd_16x16_lasx) +int32_t x264_pixel_ssd_16x16_lasx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_16x8_lasx x264_template(pixel_ssd_16x8_lasx) +int32_t x264_pixel_ssd_16x8_lasx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_8x16_lasx x264_template(pixel_ssd_8x16_lasx) +int32_t x264_pixel_ssd_8x16_lasx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); +#define x264_pixel_ssd_8x8_lasx x264_template(pixel_ssd_8x8_lasx) +int32_t x264_pixel_ssd_8x8_lasx( uint8_t *p_src, intptr_t i_src_stride, + uint8_t *p_ref, intptr_t i_ref_stride ); + +#define x264_pixel_var2_8x16_lsx x264_template(pixel_var2_8x16_lsx) +int32_t x264_pixel_var2_8x16_lsx( uint8_t *p_pix1, uint8_t *p_pix2, + int32_t ssd[2] ); +#define x264_pixel_var2_8x8_lsx x264_template(pixel_var2_8x8_lsx) +int32_t x264_pixel_var2_8x8_lsx( uint8_t *p_pix1, uint8_t *p_pix2, + int32_t ssd[2] ); +#define x264_pixel_var_16x16_lsx x264_template(pixel_var_16x16_lsx) +uint64_t x264_pixel_var_16x16_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_var_8x16_lsx x264_template(pixel_var_8x16_lsx) +uint64_t x264_pixel_var_8x16_lsx( uint8_t *p_pix, intptr_t i_stride ); +#define x264_pixel_var_8x8_lsx x264_template(pixel_var_8x8_lsx) +uint64_t x264_pixel_var_8x8_lsx( uint8_t *p_pix, intptr_t i_stride ); + +#define x264_pixel_var2_8x16_lasx x264_template(pixel_var2_8x16_lasx) +int32_t x264_pixel_var2_8x16_lasx( uint8_t *p_pix1, uint8_t *p_pix2, + int32_t ssd[2] ); +#define x264_pixel_var2_8x8_lasx x264_template(pixel_var2_8x8_lasx) +int32_t x264_pixel_var2_8x8_lasx( uint8_t *p_pix1, uint8_t *p_pix2, + int32_t ssd[2] ); + +#define x264_pixel_sa8d_8x8_lsx x264_template(pixel_sa8d_8x8_lsx) +int32_t x264_pixel_sa8d_8x8_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_sa8d_16x16_lsx x264_template(pixel_sa8d_16x16_lsx) +int32_t x264_pixel_sa8d_16x16_lsx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); + +#define x264_intra_sa8d_x3_8x8_lsx x264_template(intra_sa8d_x3_8x8_lsx) +void x264_intra_sa8d_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +#define x264_intra_sa8d_x3_8x8_lasx x264_template(intra_sa8d_x3_8x8_lasx) +void x264_intra_sa8d_x3_8x8_lasx( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +#define x264_pixel_sa8d_8x8_lasx x264_template(pixel_sa8d_8x8_lasx) +int32_t x264_pixel_sa8d_8x8_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); +#define x264_pixel_sa8d_16x16_lasx x264_template(pixel_sa8d_16x16_lasx) +int32_t x264_pixel_sa8d_16x16_lasx( uint8_t *p_pix1, intptr_t i_stride, + uint8_t *p_pix2, intptr_t i_stride2 ); + +#define x264_intra_sad_x3_16x16_lsx x264_template(intra_sad_x3_16x16_lsx) +void x264_intra_sad_x3_16x16_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +#define x264_intra_sad_x3_8x8_lsx x264_template(intra_sad_x3_8x8_lsx) +void 
x264_intra_sad_x3_8x8_lsx( uint8_t *p_enc, uint8_t p_edge[36], + int32_t p_sad_array[3] ); +#define x264_intra_sad_x3_8x8c_lsx x264_template(intra_sad_x3_8x8c_lsx) +void x264_intra_sad_x3_8x8c_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); +#define x264_intra_sad_x3_4x4_lsx x264_template(intra_sad_x3_4x4_lsx) +void x264_intra_sad_x3_4x4_lsx( uint8_t *p_enc, uint8_t *p_dec, + int32_t p_sad_array[3] ); + +#endif diff --git a/common/loongarch/predict-a.S b/common/loongarch/predict-a.S new file mode 100644 index 000000000..ede46cd97 --- /dev/null +++ b/common/loongarch/predict-a.S @@ -0,0 +1,1383 @@ +/***************************************************************************** + * predict-a.S: loongarch predict functions + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Xiwei Gu + * Lu Wang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "loongson_asm.S" +#include "loongson_util.S" + +#if !HIGH_BIT_DEPTH + +/**************************************************************************** + * 4x4 prediction for intra luma block + ****************************************************************************/ +/* void x264_predict_4x4_v_c( pixel *src ) + */ +function_x264 predict_4x4_v_lsx + ld.wu t0, a0, -FDEC_STRIDE + st.w t0, a0, 0 + st.w t0, a0, FDEC_STRIDE + st.w t0, a0, FDEC_STRIDE * 2 + st.w t0, a0, FDEC_STRIDE * 3 +endfunc_x264 + +/* void x264_predict_4x4_h_c( pixel *src ) + */ +function_x264 predict_4x4_h_lsx + vldrepl.b vr0, a0, -1 + vldrepl.b vr1, a0, FDEC_STRIDE - 1 + vldrepl.b vr2, a0, FDEC_STRIDE * 2 - 1 + vldrepl.b vr3, a0, FDEC_STRIDE * 3 - 1 + fst.s f0, a0, 0 + fst.s f1, a0, FDEC_STRIDE + fst.s f2, a0, FDEC_STRIDE * 2 + fst.s f3, a0, FDEC_STRIDE * 3 +endfunc_x264 + +/* void x264_predict_4x4_dc_c( pixel *src ) + */ +function_x264 predict_4x4_dc_lsx + fld.s f0, a0, -FDEC_STRIDE + ld.bu t0, a0, -1 + ld.bu t1, a0, FDEC_STRIDE - 1 + ld.bu t2, a0, FDEC_STRIDE * 2 - 1 + ld.bu t3, a0, FDEC_STRIDE * 3 - 1 + + vhaddw.hu.bu vr1, vr0, vr0 + vhaddw.wu.hu vr2, vr1, vr1 + vpickve2gr.w t4, vr2, 0 + add.w t0, t0, t1 + add.w t0, t0, t2 + add.w t0, t0, t3 + add.w t0, t0, t4 + addi.w t0, t0, 4 + srai.w t0, t0, 3 + + vreplgr2vr.b vr0, t0 + vstelm.w vr0, a0, 0, 0 + vstelm.w vr0, a0, FDEC_STRIDE, 0 + vstelm.w vr0, a0, FDEC_STRIDE * 2, 0 + vstelm.w vr0, a0, FDEC_STRIDE * 3, 0 +endfunc_x264 + +/* void predict_4x4_dc_top_c( pixel *src ) + */ +function_x264 predict_4x4_dc_top_lsx + fld.s f0, a0, -FDEC_STRIDE + vhaddw.hu.bu vr1, vr0, vr0 + vhaddw.wu.hu vr2, vr1, vr1 + vsrari.w vr2, vr2, 2 + + vreplvei.b vr3, 
vr2, 0 + fst.s f3, a0, 0 + fst.s f3, a0, FDEC_STRIDE + fst.s f3, a0, FDEC_STRIDE * 2 + fst.s f3, a0, FDEC_STRIDE * 3 +endfunc_x264 + +/* void predict_4x4_dc_left_c( pixel *src ) + */ +function_x264 predict_4x4_dc_left_lsx + ld.bu t0, a0, -1 + ld.bu t1, a0, FDEC_STRIDE - 1 + ld.bu t2, a0, FDEC_STRIDE * 2 - 1 + ld.bu t3, a0, FDEC_STRIDE * 3 - 1 + add.w t0, t0, t1 + add.w t0, t0, t2 + add.w t0, t0, t3 + addi.w t0, t0, 2 + srai.w t0, t0, 2 + + vreplgr2vr.b vr3, t0 + fst.s f3, a0, 0 + fst.s f3, a0, FDEC_STRIDE + fst.s f3, a0, FDEC_STRIDE * 2 + fst.s f3, a0, FDEC_STRIDE * 3 +endfunc_x264 + +/* void predict_4x4_dc_128_c( pixel *src ) + */ +function_x264 predict_4x4_dc_128_lsx + addi.w t0, zero, 1 + slli.w t0, t0, BIT_DEPTH - 1 + + vreplgr2vr.b vr3, t0 + fst.s f3, a0, 0 + fst.s f3, a0, FDEC_STRIDE + fst.s f3, a0, FDEC_STRIDE * 2 + fst.s f3, a0, FDEC_STRIDE * 3 +endfunc_x264 + +/* void predict_4x4_ddl_c( pixel *src ) + */ +function_x264 predict_4x4_ddl_lsx + fld.d f0, a0, -FDEC_STRIDE + + vxor.v vr10, vr10, vr10 + vilvl.b vr0, vr10, vr0 + vbsrl.v vr1, vr0, 2 + vbsrl.v vr2, vr0, 4 + + // t7 + vextrins.h vr2, vr0, 0x67 + + vslli.h vr1, vr1, 1 + vadd.h vr0, vr0, vr1 + vadd.h vr2, vr0, vr2 + vssrarni.bu.h vr3, vr2, 2 + + fst.s f3, a0, 0 + vbsrl.v vr4, vr3, 1 + fst.s f4, a0, FDEC_STRIDE + vbsrl.v vr4, vr4, 1 + fst.s f4, a0, FDEC_STRIDE * 2 + vbsrl.v vr4, vr4, 1 + fst.s f4, a0, FDEC_STRIDE * 3 +endfunc_x264 + +/**************************************************************************** + * 8x8 prediction for intra chroma block (4:2:0) + ****************************************************************************/ +/* void x264_predict_8x8c_p_lsx( pixel *src ) + */ +const mula +.short 1, 2, 3, 4, 0, 0, 0, 0 +endconst + +const mulb +.short 0, 1, 2, 3, 4, 5, 6, 7 +endconst + +function_x264 predict_8x8c_p_lsx + la.local t0, mula + fld.d f3, t0, 0 + fld.s f4, a0, 4 - FDEC_STRIDE + fld.s f5, a0, -1 - FDEC_STRIDE + vxor.v vr0, vr0, vr0 + vilvl.b vr4, vr0, vr4 + vilvl.b vr5, vr0, vr5 + vshuf4i.h vr5, vr5, 0x1b + vsub.h vr4, vr4, vr5 + vmul.h vr4, vr4, vr3 + vhaddw.w.h vr4, vr4, vr4 + vhaddw.d.w vr4, vr4, vr4 + vpickve2gr.w t0, vr4, 0 /* H */ + + fld.s f6, a0, FDEC_STRIDE * 4 - 1 + fld.s f7, a0, FDEC_STRIDE * 5 - 1 + fld.s f8, a0, FDEC_STRIDE * 6 - 1 + fld.s f9, a0, FDEC_STRIDE * 7 - 1 + fld.s f10, a0, FDEC_STRIDE * 2 - 1 + fld.s f11, a0, FDEC_STRIDE - 1 + fld.s f12, a0, -1 + fld.s f13, a0, -1 - FDEC_STRIDE + vilvl.b vr6, vr7, vr6 + vilvl.b vr9, vr9, vr8 + vilvl.h vr6, vr9, vr6 + vilvl.b vr10, vr11, vr10 + vilvl.b vr12, vr13, vr12 + vilvl.h vr10, vr12, vr10 + vilvl.b vr6, vr0, vr6 + vilvl.b vr10, vr0, vr10 + vsub.h vr6, vr6, vr10 + vmul.h vr6, vr6, vr3 + vhaddw.w.h vr6, vr6, vr6 + vhaddw.d.w vr6, vr6, vr6 + vpickve2gr.w t1, vr6, 0 /* V */ + + ld.bu t2, a0, FDEC_STRIDE * 7 - 1 + ld.bu t3, a0, 7 - FDEC_STRIDE + add.w t2, t2, t3 + slli.w t2, t2, 4 /* a */ + + slli.w t3, t0, 4 + add.w t0, t0, t3 + addi.w t0, t0, 16 + srai.w t0, t0, 5 /* b */ + + slli.w t3, t1, 4 + add.w t1, t1, t3 + addi.w t1, t1, 16 + srai.w t1, t1, 5 /* c */ + + add.w t3, t0, t1 + slli.w t4, t3, 1 + add.w t4, t4, t3 + sub.w t5, t2, t4 + addi.w t5, t5, 16 /* i00 */ + + la.local t3, mulb + vld vr14, t3, 0 + vreplgr2vr.h vr12, t0 + vmul.h vr12, vr12, vr14 + + vreplgr2vr.h vr14, t5 + add.w t5, t5, t1 + vreplgr2vr.h vr15, t5 + add.w t5, t5, t1 + vreplgr2vr.h vr16, t5 + add.w t5, t5, t1 + vreplgr2vr.h vr17, t5 + add.w t5, t5, t1 + vreplgr2vr.h vr18, t5 + add.w t5, t5, t1 + vreplgr2vr.h vr19, t5 + add.w t5, t5, t1 + vreplgr2vr.h vr20, t5 + add.w t5, t5, 
t1 + vreplgr2vr.h vr21, t5 + + vadd.h vr14, vr12, vr14 + vadd.h vr15, vr12, vr15 + vadd.h vr16, vr12, vr16 + vadd.h vr17, vr12, vr17 + vadd.h vr18, vr12, vr18 + vadd.h vr19, vr12, vr19 + vadd.h vr20, vr12, vr20 + vadd.h vr21, vr12, vr21 + + vssrani.bu.h vr14, vr14, 5 + vssrani.bu.h vr15, vr15, 5 + vssrani.bu.h vr16, vr16, 5 + vssrani.bu.h vr17, vr17, 5 + vssrani.bu.h vr18, vr18, 5 + vssrani.bu.h vr19, vr19, 5 + vssrani.bu.h vr20, vr20, 5 + vssrani.bu.h vr21, vr21, 5 + + fst.d f14, a0, 0 + fst.d f15, a0, FDEC_STRIDE + fst.d f16, a0, FDEC_STRIDE * 2 + fst.d f17, a0, FDEC_STRIDE * 3 + fst.d f18, a0, FDEC_STRIDE * 4 + fst.d f19, a0, FDEC_STRIDE * 5 + fst.d f20, a0, FDEC_STRIDE * 6 + fst.d f21, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void x264_predict_8x8c_v_lsx( pixel *src ) + */ +function_x264 predict_8x8c_v_lsx + fld.d f0, a0, -FDEC_STRIDE + fst.d f0, a0, 0 + fst.d f0, a0, FDEC_STRIDE + fst.d f0, a0, FDEC_STRIDE * 2 + fst.d f0, a0, FDEC_STRIDE * 3 + fst.d f0, a0, FDEC_STRIDE * 4 + fst.d f0, a0, FDEC_STRIDE * 5 + fst.d f0, a0, FDEC_STRIDE * 6 + fst.d f0, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void x264_predict_8x8c_h_lsx( pixel *src ) + */ +function_x264 predict_8x8c_h_lsx + vldrepl.b vr0, a0, -1 + vldrepl.b vr1, a0, FDEC_STRIDE - 1 + vldrepl.b vr2, a0, FDEC_STRIDE * 2 - 1 + vldrepl.b vr3, a0, FDEC_STRIDE * 3 - 1 + vldrepl.b vr4, a0, FDEC_STRIDE * 4 - 1 + vldrepl.b vr5, a0, FDEC_STRIDE * 5 - 1 + vldrepl.b vr6, a0, FDEC_STRIDE * 6 - 1 + vldrepl.b vr7, a0, FDEC_STRIDE * 7 - 1 + fst.d f0, a0, 0 + fst.d f1, a0, FDEC_STRIDE + fst.d f2, a0, FDEC_STRIDE * 2 + fst.d f3, a0, FDEC_STRIDE * 3 + fst.d f4, a0, FDEC_STRIDE * 4 + fst.d f5, a0, FDEC_STRIDE * 5 + fst.d f6, a0, FDEC_STRIDE * 6 + fst.d f7, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void x264_predict_8x8c_dc_lsx( pixel *src ) + */ +function_x264 predict_8x8c_dc_lsx + fld.s f0, a0, -FDEC_STRIDE + fld.s f1, a0, 4 - FDEC_STRIDE + vhaddw.hu.bu vr2, vr0, vr0 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.hu.bu vr3, vr1, vr1 + vhaddw.wu.hu vr3, vr3, vr3 + vpickve2gr.w t0, vr2, 0 /* s0 */ + vpickve2gr.w t1, vr3, 0 /* s1 */ + ld.bu t2, a0, -1 + ld.bu t3, a0, FDEC_STRIDE - 1 + ld.bu t4, a0, FDEC_STRIDE * 2 - 1 + ld.bu t5, a0, FDEC_STRIDE * 3 - 1 + add.w t2, t2, t3 + add.w t2, t2, t4 + add.w t2, t2, t5 /* s2 */ + ld.bu t3, a0, FDEC_STRIDE * 4 - 1 + ld.bu t4, a0, FDEC_STRIDE * 5 - 1 + ld.bu t5, a0, FDEC_STRIDE * 6 - 1 + ld.bu t6, a0, FDEC_STRIDE * 7 - 1 + add.w t3, t3, t4 + add.w t3, t3, t5 + add.w t3, t3, t6 /* s3 */ + + add.w t4, t0, t2 + addi.w t4, t4, 4 + srai.w t4, t4, 3 /* ( s0 + s2 + 4 ) >> 3 */ + addi.w t5, t1, 2 + srai.w t5, t5, 2 /* ( s1 + 2 ) >> 2 */ + addi.w t6, t3, 2 + srai.w t6, t6, 2 /* ( s3 + 2 ) >> 2 */ + add.w t7, t1, t3 + addi.w t7, t7, 4 + srai.w t7, t7, 3 /* ( s1 + s3 + 4 ) >> 3 */ + vreplgr2vr.b vr4, t4 + vreplgr2vr.b vr5, t5 + vreplgr2vr.b vr6, t6 + vreplgr2vr.b vr7, t7 + vpackev.w vr4, vr5, vr4 + vpackev.w vr6, vr7, vr6 + + fst.d f4, a0, 0 + fst.d f4, a0, FDEC_STRIDE + fst.d f4, a0, FDEC_STRIDE * 2 + fst.d f4, a0, FDEC_STRIDE * 3 + + fst.d f6, a0, FDEC_STRIDE * 4 + fst.d f6, a0, FDEC_STRIDE * 5 + fst.d f6, a0, FDEC_STRIDE * 6 + fst.d f6, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void x264_predict_8x8c_dc_128_lsx( pixel *src ) + */ +function_x264 predict_8x8c_dc_128_lsx + ori t1, t0, 1 + slli.d t1, t1, BIT_DEPTH - 1 + vreplgr2vr.b vr4, t1 + fst.d f4, a0, 0 + fst.d f4, a0, FDEC_STRIDE + fst.d f4, a0, FDEC_STRIDE * 2 + fst.d f4, a0, FDEC_STRIDE * 3 + fst.d f4, a0, FDEC_STRIDE * 4 + fst.d f4, a0, FDEC_STRIDE * 5 + fst.d f4, a0, FDEC_STRIDE * 6 + 
fst.d f4, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void x264_predict_8x8c_dc_top_lsx( pixel *src ) + */ +function_x264 predict_8x8c_dc_top_lsx + fld.s f0, a0, -FDEC_STRIDE + fld.s f1, a0, 4 - FDEC_STRIDE + vhaddw.hu.bu vr0, vr0, vr0 + vhaddw.wu.hu vr0, vr0, vr0 + vhaddw.hu.bu vr1, vr1, vr1 + vhaddw.wu.hu vr1, vr1, vr1 + vpickve2gr.w t0, vr0, 0 /* dc0 */ + vpickve2gr.w t1, vr1, 0 /* dc1 */ + + addi.w t0, t0, 2 + srai.w t0, t0, 2 + addi.w t1, t1, 2 + srai.w t1, t1, 2 + vreplgr2vr.b vr4, t0 + vreplgr2vr.b vr5, t1 + vpackev.w vr4, vr5, vr4 + fst.d f4, a0, 0 + fst.d f4, a0, FDEC_STRIDE + fst.d f4, a0, FDEC_STRIDE * 2 + fst.d f4, a0, FDEC_STRIDE * 3 + fst.d f4, a0, FDEC_STRIDE * 4 + fst.d f4, a0, FDEC_STRIDE * 5 + fst.d f4, a0, FDEC_STRIDE * 6 + fst.d f4, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void x264_predict_8x8c_dc_left_lsx( pixel *src ) + */ +function_x264 predict_8x8c_dc_left_lsx + ld.bu t0, a0, -1 + ld.bu t1, a0, FDEC_STRIDE - 1 + ld.bu t2, a0, FDEC_STRIDE * 2 - 1 + ld.bu t3, a0, FDEC_STRIDE * 3 - 1 + add.w t0, t0, t1 + add.w t0, t0, t2 + add.w t0, t0, t3 + ld.bu t1, a0, FDEC_STRIDE * 4 - 1 + ld.bu t2, a0, FDEC_STRIDE * 5 - 1 + ld.bu t3, a0, FDEC_STRIDE * 6 - 1 + ld.bu t4, a0, FDEC_STRIDE * 7 - 1 + add.w t1, t1, t2 + add.w t1, t1, t3 + add.w t1, t1, t4 + addi.w t0, t0, 2 + srai.w t0, t0, 2 + addi.w t1, t1, 2 + srai.w t1, t1, 2 + vreplgr2vr.b vr4, t0 /* ( dc0 + 2 ) >> 2 */ + vreplgr2vr.b vr5, t1 /* ( dc1 + 2 ) >> 2 */ + fst.d f4, a0, 0 + fst.d f4, a0, FDEC_STRIDE + fst.d f4, a0, FDEC_STRIDE * 2 + fst.d f4, a0, FDEC_STRIDE * 3 + fst.d f5, a0, FDEC_STRIDE * 4 + fst.d f5, a0, FDEC_STRIDE * 5 + fst.d f5, a0, FDEC_STRIDE * 6 + fst.d f5, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/**************************************************************************** + * 8x8 prediction for intra luma block + ****************************************************************************/ +/* void predict_8x8_v_c( pixel *src, pixel edge[36] ) + */ +function_x264 predict_8x8_v_lsx + fld.d f0, a1, 16 + fst.d f0, a0, 0 + fst.d f0, a0, FDEC_STRIDE + fst.d f0, a0, FDEC_STRIDE * 2 + fst.d f0, a0, FDEC_STRIDE * 3 + fst.d f0, a0, FDEC_STRIDE * 4 + fst.d f0, a0, FDEC_STRIDE * 5 + fst.d f0, a0, FDEC_STRIDE * 6 + fst.d f0, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void predict_8x8_h_c( pixel *src, pixel edge[36] ) + */ +function_x264 predict_8x8_h_lasx + fld.d f0, a1, 7 + xvinsve0.w xr0, xr0, 5 + xvrepl128vei.b xr4, xr0, 7 + xvrepl128vei.b xr3, xr0, 6 + xvrepl128vei.b xr2, xr0, 5 + xvrepl128vei.b xr1, xr0, 4 + + fst.d f4, a0, 0 + fst.d f3, a0, FDEC_STRIDE + fst.d f2, a0, FDEC_STRIDE * 2 + fst.d f1, a0, FDEC_STRIDE * 3 + + xvstelm.d xr4, a0, FDEC_STRIDE * 4, 2 + xvstelm.d xr3, a0, FDEC_STRIDE * 5, 2 + xvstelm.d xr2, a0, FDEC_STRIDE * 6, 2 + xvstelm.d xr1, a0, FDEC_STRIDE * 7, 2 +endfunc_x264 + +function_x264 predict_8x8_h_lsx + fld.d f0, a1, 7 + vreplvei.w vr1, vr0, 0 + + vreplvei.b vr4, vr0, 7 + vreplvei.b vr5, vr1, 7 + vreplvei.b vr6, vr0, 6 + vreplvei.b vr7, vr1, 6 + vreplvei.b vr8, vr0, 5 + vreplvei.b vr9, vr1, 5 + vreplvei.b vr10, vr0, 4 + vreplvei.b vr11, vr1, 4 + + fst.d f4, a0, 0 + fst.d f6, a0, FDEC_STRIDE + fst.d f8, a0, FDEC_STRIDE * 2 + fst.d f10, a0, FDEC_STRIDE * 3 + + vstelm.d vr5, a0, FDEC_STRIDE * 4, 0 + vstelm.d vr7, a0, FDEC_STRIDE * 5, 0 + vstelm.d vr9, a0, FDEC_STRIDE * 6, 0 + vstelm.d vr11, a0, FDEC_STRIDE * 7, 0 +endfunc_x264 + +/* void predict_8x8_dc_c( pixel *src, pixel edge[36] ) + */ +function_x264 predict_8x8_dc_lsx + fld.d f0, a1, 7 + fld.d f1, a1, 16 + vilvl.d vr0, vr1, vr0 + vhaddw.hu.bu vr1, vr0, 
vr0 + vhaddw.wu.hu vr2, vr1, vr1 + vhaddw.du.wu vr3, vr2, vr2 + vhaddw.qu.du vr4, vr3, vr3 + vsrari.w vr4, vr4, 4 + + vreplvei.b vr5, vr4, 0 + fst.d f5, a0, 0 + fst.d f5, a0, FDEC_STRIDE + fst.d f5, a0, FDEC_STRIDE * 2 + fst.d f5, a0, FDEC_STRIDE * 3 + fst.d f5, a0, FDEC_STRIDE * 4 + fst.d f5, a0, FDEC_STRIDE * 5 + fst.d f5, a0, FDEC_STRIDE * 6 + fst.d f5, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void predict_8x8_dc_left_c( pixel *src, pixel edge[36] ) + */ +function_x264 predict_8x8_dc_left_lsx + fld.d f0, a1, 7 + vhaddw.hu.bu vr1, vr0, vr0 + vhaddw.wu.hu vr2, vr1, vr1 + vhaddw.du.wu vr3, vr2, vr2 + vsrari.w vr3, vr3, 3 + + vreplvei.b vr5, vr3, 0 + fst.d f5, a0, 0 + fst.d f5, a0, FDEC_STRIDE + fst.d f5, a0, FDEC_STRIDE * 2 + fst.d f5, a0, FDEC_STRIDE * 3 + fst.d f5, a0, FDEC_STRIDE * 4 + fst.d f5, a0, FDEC_STRIDE * 5 + fst.d f5, a0, FDEC_STRIDE * 6 + fst.d f5, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void predict_8x8_dc_top_c( pixel *src, pixel edge[36] ) + */ +function_x264 predict_8x8_dc_top_lsx + fld.d f0, a1, 16 + vhaddw.hu.bu vr1, vr0, vr0 + vhaddw.wu.hu vr2, vr1, vr1 + vhaddw.du.wu vr3, vr2, vr2 + vsrari.w vr3, vr3, 3 + + vreplvei.b vr5, vr3, 0 + fst.d f5, a0, 0 + fst.d f5, a0, FDEC_STRIDE + fst.d f5, a0, FDEC_STRIDE * 2 + fst.d f5, a0, FDEC_STRIDE * 3 + fst.d f5, a0, FDEC_STRIDE * 4 + fst.d f5, a0, FDEC_STRIDE * 5 + fst.d f5, a0, FDEC_STRIDE * 6 + fst.d f5, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void predict_8x8_dc_128_c( pixel *src, pixel edge[36] ) + */ +function_x264 predict_8x8_dc_128_lsx + addi.w t0, zero, 1 + slli.d t1, t0, (BIT_DEPTH-1) + vreplgr2vr.b vr5, t1 + fst.d f5, a0, 0 + fst.d f5, a0, FDEC_STRIDE + fst.d f5, a0, FDEC_STRIDE * 2 + fst.d f5, a0, FDEC_STRIDE * 3 + fst.d f5, a0, FDEC_STRIDE * 4 + fst.d f5, a0, FDEC_STRIDE * 5 + fst.d f5, a0, FDEC_STRIDE * 6 + fst.d f5, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void predict_8x8_ddl_c( pixel *src, pixel edge[36] ) + */ +function_x264 predict_8x8_ddl_lasx + vld vr1, a1, 16 + vbsrl.v vr2, vr1, 1 + vbsrl.v vr3, vr1, 2 + + vextrins.b vr3, vr1, 0xef + vext2xv.hu.bu xr5, xr1 + vext2xv.hu.bu xr6, xr2 + vext2xv.hu.bu xr7, xr3 + + xvslli.h xr6, xr6, 1 + xvadd.h xr8, xr5, xr6 + xvadd.h xr9, xr8, xr7 + xvssrarni.bu.h xr9, xr9, 2 + xvpermi.d xr9, xr9, 0x08 + vbsrl.v vr10, vr9, 1 + vbsrl.v vr11, vr9, 2 + vbsrl.v vr12, vr9, 3 + vbsrl.v vr13, vr9, 4 + vbsrl.v vr14, vr9, 5 + vbsrl.v vr15, vr9, 6 + vbsrl.v vr16, vr9, 7 + + fst.d f9, a0, 0 + fst.d f10, a0, FDEC_STRIDE + fst.d f11, a0, FDEC_STRIDE * 2 + fst.d f12, a0, FDEC_STRIDE * 3 + fst.d f13, a0, FDEC_STRIDE * 4 + fst.d f14, a0, FDEC_STRIDE * 5 + fst.d f15, a0, FDEC_STRIDE * 6 + fst.d f16, a0, FDEC_STRIDE * 7 +endfunc_x264 + +function_x264 predict_8x8_ddl_lsx + vld vr1, a1, 16 + vbsrl.v vr2, vr1, 1 + vbsrl.v vr3, vr1, 2 + + vextrins.b vr3, vr1, 0xef + vsllwil.hu.bu vr5, vr1, 0 + vexth.hu.bu vr15, vr1 + vsllwil.hu.bu vr6, vr2, 0 + vexth.hu.bu vr16, vr2 + vsllwil.hu.bu vr7, vr3, 0 + vexth.hu.bu vr17, vr3 + + vslli.h vr6, vr6, 1 + vslli.h vr16, vr16, 1 + vadd.h vr8, vr5, vr6 + vadd.h vr18, vr15, vr16 + vadd.h vr19, vr8, vr7 + vadd.h vr9, vr18, vr17 + vssrarni.bu.h vr9, vr19, 2 + vbsrl.v vr10, vr9, 1 + vbsrl.v vr11, vr9, 2 + vbsrl.v vr12, vr9, 3 + vbsrl.v vr13, vr9, 4 + vbsrl.v vr14, vr9, 5 + vbsrl.v vr15, vr9, 6 + vbsrl.v vr16, vr9, 7 + + fst.d f9, a0, 0 + fst.d f10, a0, FDEC_STRIDE + fst.d f11, a0, FDEC_STRIDE * 2 + fst.d f12, a0, FDEC_STRIDE * 3 + fst.d f13, a0, FDEC_STRIDE * 4 + fst.d f14, a0, FDEC_STRIDE * 5 + fst.d f15, a0, FDEC_STRIDE * 6 + fst.d f16, a0, FDEC_STRIDE * 7 +endfunc_x264 + 
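For reference, a minimal C sketch of the diagonal-down-left rule that the two predict_8x8_ddl implementations above vectorize. It assumes x264's usual edge[] layout (filtered top plus top-right row at edge + 16) and the pixel/FDEC_STRIDE definitions used throughout this file; it is an illustrative sketch rather than the project's reference implementation.

static void predict_8x8_ddl_sketch( pixel *src, pixel edge[36] )
{
    const pixel *top = edge + 16;                 /* t0..t15: top + top-right samples */
    for( int y = 0; y < 8; y++ )
        for( int x = 0; x < 8; x++ )
        {
            int i = x + y;                        /* diagonal index, 0..14 */
            int c = i + 2 > 15 ? 15 : i + 2;      /* the last sample repeats at the corner */
            /* 3-tap lowpass along the diagonal: ( a + 2*b + c + 2 ) >> 2 */
            src[y * FDEC_STRIDE + x] = ( top[i] + 2 * top[i + 1] + top[c] + 2 ) >> 2;
        }
}

The plane (P) modes follow the arithmetic that the register comments H, V, a, b, c and i00 annotate in predict_8x8c_p_lsx above (and, with different scaling, in the 16x16 variants further down). A rough scalar sketch of the 8x8 chroma case, again assuming the pixel/FDEC_STRIDE conventions of this file and clamping to 8-bit since this path is built only for !HIGH_BIT_DEPTH:

static inline pixel clamp_u8( int v )             /* stands in for the saturating narrow (vssrani.bu.h) */
{
    return v < 0 ? 0 : v > 255 ? 255 : v;
}

static void predict_8x8c_p_sketch( pixel *src )
{
    int H = 0, V = 0;
    for( int i = 0; i < 4; i++ )
    {
        H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );
        V += ( i + 1 ) * ( src[-1 + ( i + 4 ) * FDEC_STRIDE] - src[-1 + ( 2 - i ) * FDEC_STRIDE] );
    }
    int a   = 16 * ( src[-1 + 7 * FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
    int b   = ( 17 * H + 16 ) >> 5;
    int c   = ( 17 * V + 16 ) >> 5;
    int i00 = a - 3 * b - 3 * c + 16;
    for( int y = 0; y < 8; y++, src += FDEC_STRIDE, i00 += c )
        for( int x = 0; x < 8; x++ )
            src[x] = clamp_u8( ( i00 + b * x ) >> 5 );   /* row base steps by c, columns by b */
}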
+/* void predict_8x8_ddr_c( pixel *src, pixel edge[36] ) + */ +function_x264 predict_8x8_ddr_lasx + vld vr1, a1, 7 + vbsrl.v vr2, vr1, 1 + vbsrl.v vr3, vr1, 2 + + // edge[23] + ld.bu t0, a1, 23 + vinsgr2vr.b vr3, t0, 0xe + + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + vext2xv.hu.bu xr3, xr3 + xvslli.h xr2, xr2, 1 + xvadd.h xr4, xr1, xr2 + xvadd.h xr5, xr4, xr3 + xvssrarni.bu.h xr5, xr5, 2 + xvpermi.d xr6, xr5, 0x08 + + vbsrl.v vr7, vr6, 7 + vbsrl.v vr8, vr6, 6 + vbsrl.v vr9, vr6, 5 + vbsrl.v vr10, vr6, 4 + vbsrl.v vr11, vr6, 3 + vbsrl.v vr12, vr6, 2 + vbsrl.v vr13, vr6, 1 + + fst.d f7, a0, 0 + fst.d f8, a0, FDEC_STRIDE + fst.d f9, a0, FDEC_STRIDE * 2 + fst.d f10, a0, FDEC_STRIDE * 3 + fst.d f11, a0, FDEC_STRIDE * 4 + fst.d f12, a0, FDEC_STRIDE * 5 + fst.d f13, a0, FDEC_STRIDE * 6 + fst.d f6, a0, FDEC_STRIDE * 7 +endfunc_x264 + +function_x264 predict_8x8_ddr_lsx + vld vr1, a1, 7 + vbsrl.v vr2, vr1, 1 + vbsrl.v vr3, vr1, 2 + + // edge[23] + ld.bu t0, a1, 23 + vinsgr2vr.b vr3, t0, 0xe + + vexth.hu.bu vr11, vr1 + vsllwil.hu.bu vr1, vr1, 0 + vexth.hu.bu vr12, vr2 + vsllwil.hu.bu vr2, vr2, 0 + vexth.hu.bu vr13, vr3 + vsllwil.hu.bu vr3, vr3, 0 + + vslli.h vr2, vr2, 1 + vslli.h vr12, vr12, 1 + vadd.h vr4, vr1, vr2 + vadd.h vr14, vr11, vr12 + vadd.h vr5, vr4, vr3 + vadd.h vr15, vr14, vr13 + vssrarni.bu.h vr15, vr5, 2 + + vbsrl.v vr7, vr15, 7 + vbsrl.v vr8, vr15, 6 + vbsrl.v vr9, vr15, 5 + vbsrl.v vr10, vr15, 4 + vbsrl.v vr11, vr15, 3 + vbsrl.v vr12, vr15, 2 + vbsrl.v vr13, vr15, 1 + + fst.d f7, a0, 0 + fst.d f8, a0, FDEC_STRIDE + fst.d f9, a0, FDEC_STRIDE * 2 + fst.d f10, a0, FDEC_STRIDE * 3 + fst.d f11, a0, FDEC_STRIDE * 4 + fst.d f12, a0, FDEC_STRIDE * 5 + fst.d f13, a0, FDEC_STRIDE * 6 + fst.d f15, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/* void predict_8x8_vr_c( pixel *src, pixel edge[36] ) + */ +function_x264 predict_8x8_vr_lasx + vld vr0, a1, 8 + vbsrl.v vr1, vr0, 1 + vbsrl.v vr2, vr0, 2 + + vext2xv.hu.bu xr5, xr0 + vext2xv.hu.bu xr6, xr1 + vext2xv.hu.bu xr7, xr2 + + xvadd.h xr10, xr5, xr6 + xvadd.h xr11, xr10, xr6 + xvadd.h xr12, xr11, xr7 + xvssrarni.bu.h xr12, xr12, 2 + xvssrarni.bu.h xr10, xr10, 1 + xvpermi.d xr13, xr12, 0x08 + xvpermi.d xr14, xr10, 0x08 + + vbsrl.v vr15, vr13, 6 + vbsll.v vr16, vr15, 1 + vextrins.b vr16, vr13, 0x04 + vbsll.v vr17, vr16, 1 + vextrins.b vr17, vr13, 0x02 + vbsll.v vr18, vr17, 1 + vextrins.b vr18, vr13, 0x00 + + fst.d f15, a0, FDEC_STRIDE + fst.d f16, a0, FDEC_STRIDE * 3 + fst.d f17, a0, FDEC_STRIDE * 5 + fst.d f18, a0, FDEC_STRIDE * 7 + + vbsrl.v vr16, vr14, 7 + vbsll.v vr17, vr16, 1 + vextrins.b vr17, vr13, 0x05 + vbsll.v vr18, vr17, 1 + vextrins.b vr18, vr13, 0x03 + vbsll.v vr19, vr18, 1 + vextrins.b vr19, vr13, 0x01 + + fst.d f16, a0, 0 + fst.d f17, a0, FDEC_STRIDE * 2 + fst.d f18, a0, FDEC_STRIDE * 4 + fst.d f19, a0, FDEC_STRIDE * 6 +endfunc_x264 + +function_x264 predict_8x8_vr_lsx + vld vr0, a1, 8 + vbsrl.v vr1, vr0, 1 + vbsrl.v vr2, vr0, 2 + + vexth.hu.bu vr5, vr0 + vsllwil.hu.bu vr0, vr0, 0 + vexth.hu.bu vr6, vr1 + vsllwil.hu.bu vr1, vr1, 0 + vexth.hu.bu vr7, vr2 + vsllwil.hu.bu vr2, vr2, 0 + + vadd.h vr9, vr0, vr1 + vadd.h vr10, vr5, vr6 + vadd.h vr11, vr9, vr1 + vadd.h vr12, vr10, vr6 + vadd.h vr13, vr11, vr2 + vadd.h vr14, vr12, vr7 + vssrarni.bu.h vr14, vr13, 2 + vssrarni.bu.h vr10, vr9, 1 + + vbsrl.v vr15, vr14, 6 + vbsll.v vr16, vr15, 1 + vextrins.b vr16, vr14, 0x04 + vbsll.v vr17, vr16, 1 + vextrins.b vr17, vr14, 0x02 + vbsll.v vr18, vr17, 1 + vextrins.b vr18, vr14, 0x00 + + fst.d f15, a0, FDEC_STRIDE + fst.d f16, a0, FDEC_STRIDE * 3 + 
fst.d f17, a0, FDEC_STRIDE * 5 + fst.d f18, a0, FDEC_STRIDE * 7 + + vbsrl.v vr16, vr10, 7 + vbsll.v vr17, vr16, 1 + vextrins.b vr17, vr14, 0x05 + vbsll.v vr18, vr17, 1 + vextrins.b vr18, vr14, 0x03 + vbsll.v vr19, vr18, 1 + vextrins.b vr19, vr14, 0x01 + + fst.d f16, a0, 0 + fst.d f17, a0, FDEC_STRIDE * 2 + fst.d f18, a0, FDEC_STRIDE * 4 + fst.d f19, a0, FDEC_STRIDE * 6 +endfunc_x264 + +/* void predict_8x8_vl_c( pixel *src, pixel edge[36] ); + */ +function_x264 predict_8x8_vl_lasx + vld vr0, a1, 16 + vbsrl.v vr1, vr0, 1 + vbsrl.v vr2, vr0, 2 + + vext2xv.hu.bu xr0, xr0 + vext2xv.hu.bu xr1, xr1 + vext2xv.hu.bu xr2, xr2 + + xvadd.h xr3, xr0, xr1 + xvadd.h xr4, xr3, xr1 + xvadd.h xr5, xr4, xr2 + xvssrarni.bu.h xr3, xr3, 1 + xvssrarni.bu.h xr5, xr5, 2 + xvpermi.d xr6, xr3, 0x8 + xvpermi.d xr7, xr5, 0x8 + + vbsrl.v vr8, vr6, 1 + vbsrl.v vr9, vr7, 1 + + fst.d f6, a0, 0 + fst.d f7, a0, FDEC_STRIDE + fst.d f8, a0, FDEC_STRIDE * 2 + fst.d f9, a0, FDEC_STRIDE * 3 + + vbsrl.v vr10, vr8, 1 + vbsrl.v vr11, vr9, 1 + vbsrl.v vr12, vr10, 1 + vbsrl.v vr13, vr11, 1 + fst.d f10, a0, FDEC_STRIDE * 4 + fst.d f11, a0, FDEC_STRIDE * 5 + fst.d f12, a0, FDEC_STRIDE * 6 + fst.d f13, a0, FDEC_STRIDE * 7 +endfunc_x264 + +function_x264 predict_8x8_vl_lsx + vld vr0, a1, 16 + vbsrl.v vr1, vr0, 1 + vbsrl.v vr2, vr0, 2 + + vexth.hu.bu vr5, vr0 + vsllwil.hu.bu vr0, vr0, 0 + vexth.hu.bu vr6, vr1 + vsllwil.hu.bu vr1, vr1, 0 + vexth.hu.bu vr7, vr2 + vsllwil.hu.bu vr2, vr2, 0 + + vadd.h vr3, vr0, vr1 + vadd.h vr13, vr5, vr6 + vadd.h vr4, vr3, vr1 + vadd.h vr14, vr13, vr6 + vadd.h vr5, vr4, vr2 + vadd.h vr15, vr14, vr7 + vssrarni.bu.h vr13, vr3, 1 + vssrarni.bu.h vr15, vr5, 2 + + vbsrl.v vr8, vr13, 1 + vbsrl.v vr9, vr15, 1 + fst.d f13, a0, 0 + fst.d f15, a0, FDEC_STRIDE + fst.d f8, a0, FDEC_STRIDE * 2 + fst.d f9, a0, FDEC_STRIDE * 3 + + vbsrl.v vr8, vr8, 1 + vbsrl.v vr9, vr9, 1 + vbsrl.v vr10, vr8, 1 + vbsrl.v vr11, vr9, 1 + fst.d f8, a0, FDEC_STRIDE * 4 + fst.d f9, a0, FDEC_STRIDE * 5 + fst.d f10, a0, FDEC_STRIDE * 6 + fst.d f11, a0, FDEC_STRIDE * 7 +endfunc_x264 + +/**************************************************************************** + * 16x16 prediction for intra luma block + ****************************************************************************/ +/* void x264_predict_16x16_dc_lsx( pixel *src ) + */ +function_x264 predict_16x16_dc_lsx + ld.bu t4, a0, -1 + ld.bu t5, a0, FDEC_STRIDE - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 2 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 3 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 4 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 5 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 6 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 7 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 8 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 9 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 10 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 11 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 12 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 13 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 14 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 15 - 1 + add.d t4, t4, t5 + + vld vr4, a0, -FDEC_STRIDE + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.wu.hu vr4, vr4, vr4 + vhaddw.du.wu vr4, vr4, vr4 + vhaddw.qu.du vr4, vr4, vr4 + vpickve2gr.wu t5, vr4, 0 + add.d t4, t4, t5 + + addi.d t5, t4, 16 + srai.w t5, t5, 5 + vreplgr2vr.b vr5, t5 + + vst vr5, a0, 0 + vst vr5, a0, FDEC_STRIDE + vst vr5, a0, FDEC_STRIDE * 2 + vst vr5, a0, 
FDEC_STRIDE * 3 + vst vr5, a0, FDEC_STRIDE * 4 + vst vr5, a0, FDEC_STRIDE * 5 + vst vr5, a0, FDEC_STRIDE * 6 + vst vr5, a0, FDEC_STRIDE * 7 + + vst vr5, a0, FDEC_STRIDE * 8 + vst vr5, a0, FDEC_STRIDE * 9 + vst vr5, a0, FDEC_STRIDE * 10 + vst vr5, a0, FDEC_STRIDE * 11 + vst vr5, a0, FDEC_STRIDE * 12 + vst vr5, a0, FDEC_STRIDE * 13 + vst vr5, a0, FDEC_STRIDE * 14 + vst vr5, a0, FDEC_STRIDE * 15 +endfunc_x264 + +/* void x264_predict_16x16_dc_left_lsx( pixel *src ) + */ +function_x264 predict_16x16_dc_left_lsx + ld.bu t4, a0, -1 + ld.bu t5, a0, FDEC_STRIDE - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 2 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 3 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 4 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 5 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 6 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 7 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 8 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 9 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 10 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 11 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 12 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 13 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 14 - 1 + add.d t4, t4, t5 + ld.bu t5, a0, FDEC_STRIDE * 15 - 1 + add.d t4, t4, t5 + + addi.d t5, t4, 8 + srai.w t5, t5, 4 + vreplgr2vr.b vr5, t5 + + vst vr5, a0, 0 + vst vr5, a0, FDEC_STRIDE + vst vr5, a0, FDEC_STRIDE * 2 + vst vr5, a0, FDEC_STRIDE * 3 + vst vr5, a0, FDEC_STRIDE * 4 + vst vr5, a0, FDEC_STRIDE * 5 + vst vr5, a0, FDEC_STRIDE * 6 + vst vr5, a0, FDEC_STRIDE * 7 + + vst vr5, a0, FDEC_STRIDE * 8 + vst vr5, a0, FDEC_STRIDE * 9 + vst vr5, a0, FDEC_STRIDE * 10 + vst vr5, a0, FDEC_STRIDE * 11 + vst vr5, a0, FDEC_STRIDE * 12 + vst vr5, a0, FDEC_STRIDE * 13 + vst vr5, a0, FDEC_STRIDE * 14 + vst vr5, a0, FDEC_STRIDE * 15 +endfunc_x264 + +/* void x264_predict_16x16_dc_top_lsx( pixel *src ) + */ +function_x264 predict_16x16_dc_top_lsx + vld vr4, a0, -FDEC_STRIDE + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.wu.hu vr4, vr4, vr4 + vhaddw.du.wu vr4, vr4, vr4 + vhaddw.qu.du vr4, vr4, vr4 + vpickve2gr.wu t5, vr4, 0 + + addi.d t5, t5, 8 + srai.w t5, t5, 4 + vreplgr2vr.b vr5, t5 + + vst vr5, a0, 0 + vst vr5, a0, FDEC_STRIDE + vst vr5, a0, FDEC_STRIDE * 2 + vst vr5, a0, FDEC_STRIDE * 3 + vst vr5, a0, FDEC_STRIDE * 4 + vst vr5, a0, FDEC_STRIDE * 5 + vst vr5, a0, FDEC_STRIDE * 6 + vst vr5, a0, FDEC_STRIDE * 7 + + vst vr5, a0, FDEC_STRIDE * 8 + vst vr5, a0, FDEC_STRIDE * 9 + vst vr5, a0, FDEC_STRIDE * 10 + vst vr5, a0, FDEC_STRIDE * 11 + vst vr5, a0, FDEC_STRIDE * 12 + vst vr5, a0, FDEC_STRIDE * 13 + vst vr5, a0, FDEC_STRIDE * 14 + vst vr5, a0, FDEC_STRIDE * 15 +endfunc_x264 + +/* void x264_predict_16x16_dc_128_lsx( pixel *src ) + */ +function_x264 predict_16x16_dc_128_lsx + ori t1, t0, 1 + slli.d t1, t1, BIT_DEPTH - 1 + vreplgr2vr.b vr5, t1 + + vst vr5, a0, 0 + vst vr5, a0, FDEC_STRIDE + vst vr5, a0, FDEC_STRIDE * 2 + vst vr5, a0, FDEC_STRIDE * 3 + vst vr5, a0, FDEC_STRIDE * 4 + vst vr5, a0, FDEC_STRIDE * 5 + vst vr5, a0, FDEC_STRIDE * 6 + vst vr5, a0, FDEC_STRIDE * 7 + + vst vr5, a0, FDEC_STRIDE * 8 + vst vr5, a0, FDEC_STRIDE * 9 + vst vr5, a0, FDEC_STRIDE * 10 + vst vr5, a0, FDEC_STRIDE * 11 + vst vr5, a0, FDEC_STRIDE * 12 + vst vr5, a0, FDEC_STRIDE * 13 + vst vr5, a0, FDEC_STRIDE * 14 + vst vr5, a0, FDEC_STRIDE * 15 +endfunc_x264 + +/* void x264_predict_16x16_h_lsx( pixel *src ) + */ +function_x264 predict_16x16_h_lsx + ld.bu t0, a0, -1 + ld.bu t1, 
a0, FDEC_STRIDE - 1 + ld.bu t2, a0, FDEC_STRIDE * 2 - 1 + ld.bu t3, a0, FDEC_STRIDE * 3 - 1 + ld.bu t4, a0, FDEC_STRIDE * 4 - 1 + ld.bu t5, a0, FDEC_STRIDE * 5 - 1 + ld.bu t6, a0, FDEC_STRIDE * 6 - 1 + ld.bu t7, a0, FDEC_STRIDE * 7 - 1 + vreplgr2vr.b vr0, t0 + vreplgr2vr.b vr1, t1 + vreplgr2vr.b vr2, t2 + vreplgr2vr.b vr3, t3 + vreplgr2vr.b vr4, t4 + vreplgr2vr.b vr5, t5 + vreplgr2vr.b vr6, t6 + vreplgr2vr.b vr7, t7 + vst vr0, a0, 0 + vst vr1, a0, FDEC_STRIDE + vst vr2, a0, FDEC_STRIDE * 2 + vst vr3, a0, FDEC_STRIDE * 3 + vst vr4, a0, FDEC_STRIDE * 4 + vst vr5, a0, FDEC_STRIDE * 5 + vst vr6, a0, FDEC_STRIDE * 6 + vst vr7, a0, FDEC_STRIDE * 7 + + ld.bu t0, a0, FDEC_STRIDE * 8 - 1 + ld.bu t1, a0, FDEC_STRIDE * 9 - 1 + ld.bu t2, a0, FDEC_STRIDE * 10 - 1 + ld.bu t3, a0, FDEC_STRIDE * 11 - 1 + ld.bu t4, a0, FDEC_STRIDE * 12 - 1 + ld.bu t5, a0, FDEC_STRIDE * 13 - 1 + ld.bu t6, a0, FDEC_STRIDE * 14 - 1 + ld.bu t7, a0, FDEC_STRIDE * 15 - 1 + vreplgr2vr.b vr0, t0 + vreplgr2vr.b vr1, t1 + vreplgr2vr.b vr2, t2 + vreplgr2vr.b vr3, t3 + vreplgr2vr.b vr4, t4 + vreplgr2vr.b vr5, t5 + vreplgr2vr.b vr6, t6 + vreplgr2vr.b vr7, t7 + vst vr0, a0, FDEC_STRIDE * 8 + vst vr1, a0, FDEC_STRIDE * 9 + vst vr2, a0, FDEC_STRIDE * 10 + vst vr3, a0, FDEC_STRIDE * 11 + vst vr4, a0, FDEC_STRIDE * 12 + vst vr5, a0, FDEC_STRIDE * 13 + vst vr6, a0, FDEC_STRIDE * 14 + vst vr7, a0, FDEC_STRIDE * 15 +endfunc_x264 + +/* void x264_predict_16x16_v_lsx( pixel *src ) + */ +function_x264 predict_16x16_v_lsx + fld.d f4, a0, -FDEC_STRIDE + fld.d f5, a0, 4 - FDEC_STRIDE + fld.d f6, a0, 8 - FDEC_STRIDE + fld.d f7, a0, 12 - FDEC_STRIDE + vilvl.w vr4, vr5, vr4 + vilvl.w vr6, vr7, vr6 + vilvl.d vr4, vr6, vr4 + + vst vr4, a0, 0 + vst vr4, a0, FDEC_STRIDE + vst vr4, a0, FDEC_STRIDE * 2 + vst vr4, a0, FDEC_STRIDE * 3 + vst vr4, a0, FDEC_STRIDE * 4 + vst vr4, a0, FDEC_STRIDE * 5 + vst vr4, a0, FDEC_STRIDE * 6 + vst vr4, a0, FDEC_STRIDE * 7 + + vst vr4, a0, FDEC_STRIDE * 8 + vst vr4, a0, FDEC_STRIDE * 9 + vst vr4, a0, FDEC_STRIDE * 10 + vst vr4, a0, FDEC_STRIDE * 11 + vst vr4, a0, FDEC_STRIDE * 12 + vst vr4, a0, FDEC_STRIDE * 13 + vst vr4, a0, FDEC_STRIDE * 14 + vst vr4, a0, FDEC_STRIDE * 15 +endfunc_x264 + +/* void x264_predict_16x16_p_lasx( pixel *src ) + */ +const mulc +.short 1, 2, 3, 4, 5, 6, 7, 8 +endconst + +const muld +.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +endconst + +function_x264 predict_16x16_p_lasx + la.local t0, mulc + vld vr3, t0, 0 + fld.d f4, a0, 8 - FDEC_STRIDE + fld.d f5, a0, -1 - FDEC_STRIDE + vxor.v vr0, vr0, vr0 + vilvl.b vr4, vr0, vr4 + vilvl.b vr5, vr0, vr5 + vshuf4i.h vr5, vr5, 0x1b + vbsll.v vr6, vr5, 8 + vpackod.d vr5, vr6, vr5 + vsub.h vr4, vr4, vr5 + vmul.h vr4, vr4, vr3 + vhaddw.w.h vr4, vr4, vr4 + vhaddw.d.w vr4, vr4, vr4 + vhaddw.q.d vr4, vr4, vr4 + vpickve2gr.w t0, vr4, 0 /* H */ + + fld.d f6, a0, FDEC_STRIDE * 8 - 1 + fld.d f7, a0, FDEC_STRIDE * 9 - 1 + fld.d f8, a0, FDEC_STRIDE * 10 - 1 + fld.d f9, a0, FDEC_STRIDE * 11 - 1 + fld.d f10, a0, FDEC_STRIDE * 12 - 1 + fld.d f11, a0, FDEC_STRIDE * 13 - 1 + fld.d f12, a0, FDEC_STRIDE * 14 - 1 + fld.d f13, a0, FDEC_STRIDE * 15 - 1 + vilvl.b vr6, vr7, vr6 + vilvl.b vr8, vr9, vr8 + vilvl.b vr10, vr11, vr10 + vilvl.b vr12, vr13, vr12 + vilvl.h vr6, vr8, vr6 + vilvl.h vr10, vr12, vr10 + vilvl.w vr6, vr10, vr6 + + fld.d f7, a0, FDEC_STRIDE * 6 - 1 + fld.d f8, a0, FDEC_STRIDE * 5 - 1 + fld.d f9, a0, FDEC_STRIDE * 4 - 1 + fld.d f10, a0, FDEC_STRIDE * 3 - 1 + fld.d f11, a0, FDEC_STRIDE * 2 - 1 + fld.d f12, a0, FDEC_STRIDE - 1 + fld.d f13, a0, -1 + fld.d 
f14, a0, -FDEC_STRIDE - 1 + vilvl.b vr7, vr8, vr7 + vilvl.b vr9, vr10, vr9 + vilvl.b vr11, vr12, vr11 + vilvl.b vr13, vr14, vr13 + vilvl.h vr7, vr9, vr7 + vilvl.h vr11, vr13, vr11 + vilvl.w vr7, vr11, vr7 + + vilvl.b vr6, vr0, vr6 + vilvl.b vr7, vr0, vr7 + vsub.h vr6, vr6, vr7 + vmul.h vr6, vr6, vr3 + vhaddw.w.h vr6, vr6, vr6 + vhaddw.d.w vr6, vr6, vr6 + vhaddw.q.d vr6, vr6, vr6 + vpickve2gr.w t1, vr6, 0 /* V */ + + ld.bu t2, a0, FDEC_STRIDE * 15 - 1 + ld.bu t3, a0, 15 - FDEC_STRIDE + add.w t2, t2, t3 + slli.w t2, t2, 4 /* a */ + + slli.w t3, t0, 2 + add.w t0, t0, t3 + addi.w t0, t0, 32 + srai.w t0, t0, 6 /* b */ + + slli.w t3, t1, 2 + add.w t1, t1, t3 + addi.w t1, t1, 32 + srai.w t1, t1, 6 /* c */ + + add.w t3, t0, t1 + slli.w t4, t3, 3 + sub.w t4, t4, t3 + sub.w t5, t2, t4 + addi.w t5, t5, 16 /* i00 */ + + la.local t3, muld + xvld xr14, t3, 0 + xvreplgr2vr.h xr12, t0 + xvmul.h xr12, xr12, xr14 + +.rept 16 + xvreplgr2vr.h xr14, t5 + xvadd.h xr13, xr12, xr14 + xvssrani.bu.h xr15, xr13, 5 + xvstelm.d xr15, a0, 0, 0 + xvstelm.d xr15, a0, 8, 2 + addi.d a0, a0, FDEC_STRIDE + add.w t5, t5, t1 +.endr +endfunc_x264 + +function_x264 predict_16x16_p_lsx + la.local t0, mulc + vld vr3, t0, 0 + fld.d f4, a0, 8 - FDEC_STRIDE + fld.d f5, a0, -1 - FDEC_STRIDE + vxor.v vr0, vr0, vr0 + vilvl.b vr4, vr0, vr4 + vilvl.b vr5, vr0, vr5 + vshuf4i.h vr5, vr5, 0x1b + vbsll.v vr6, vr5, 8 + vpackod.d vr5, vr6, vr5 + vsub.h vr4, vr4, vr5 + vmul.h vr4, vr4, vr3 + vhaddw.w.h vr4, vr4, vr4 + vhaddw.d.w vr4, vr4, vr4 + vhaddw.q.d vr4, vr4, vr4 + vpickve2gr.w t0, vr4, 0 /* H */ + + fld.d f6, a0, FDEC_STRIDE * 8 - 1 + fld.d f7, a0, FDEC_STRIDE * 9 - 1 + fld.d f8, a0, FDEC_STRIDE * 10 - 1 + fld.d f9, a0, FDEC_STRIDE * 11 - 1 + fld.d f10, a0, FDEC_STRIDE * 12 - 1 + fld.d f11, a0, FDEC_STRIDE * 13 - 1 + fld.d f12, a0, FDEC_STRIDE * 14 - 1 + fld.d f13, a0, FDEC_STRIDE * 15 - 1 + vilvl.b vr6, vr7, vr6 + vilvl.b vr8, vr9, vr8 + vilvl.b vr10, vr11, vr10 + vilvl.b vr12, vr13, vr12 + vilvl.h vr6, vr8, vr6 + vilvl.h vr10, vr12, vr10 + vilvl.w vr6, vr10, vr6 + + fld.d f7, a0, FDEC_STRIDE * 6 - 1 + fld.d f8, a0, FDEC_STRIDE * 5 - 1 + fld.d f9, a0, FDEC_STRIDE * 4 - 1 + fld.d f10, a0, FDEC_STRIDE * 3 - 1 + fld.d f11, a0, FDEC_STRIDE * 2 - 1 + fld.d f12, a0, FDEC_STRIDE - 1 + fld.d f13, a0, -1 + fld.d f14, a0, -FDEC_STRIDE - 1 + vilvl.b vr7, vr8, vr7 + vilvl.b vr9, vr10, vr9 + vilvl.b vr11, vr12, vr11 + vilvl.b vr13, vr14, vr13 + vilvl.h vr7, vr9, vr7 + vilvl.h vr11, vr13, vr11 + vilvl.w vr7, vr11, vr7 + + vilvl.b vr6, vr0, vr6 + vilvl.b vr7, vr0, vr7 + vsub.h vr6, vr6, vr7 + vmul.h vr6, vr6, vr3 + vhaddw.w.h vr6, vr6, vr6 + vhaddw.d.w vr6, vr6, vr6 + vhaddw.q.d vr6, vr6, vr6 + vpickve2gr.w t1, vr6, 0 /* V */ + + ld.bu t2, a0, FDEC_STRIDE * 15 - 1 + ld.bu t3, a0, 15 - FDEC_STRIDE + add.w t2, t2, t3 + slli.w t2, t2, 4 /* a */ + + slli.w t3, t0, 2 + add.w t0, t0, t3 + addi.w t0, t0, 32 + srai.w t0, t0, 6 /* b */ + + slli.w t3, t1, 2 + add.w t1, t1, t3 + addi.w t1, t1, 32 + srai.w t1, t1, 6 /* c */ + + add.w t3, t0, t1 + slli.w t4, t3, 3 + sub.w t4, t4, t3 + sub.w t5, t2, t4 + addi.w t5, t5, 16 /* i00 */ + + la.local t3, muld + vld vr14, t3, 0 + vld vr20, t3, 16 + vreplgr2vr.h vr12, t0 + vmul.h vr22, vr12, vr14 + vmul.h vr23, vr12, vr20 +.rept 16 + vreplgr2vr.h vr14, t5 + vadd.h vr13, vr22, vr14 + vadd.h vr16, vr23, vr14 + vssrani.bu.h vr15, vr13, 5 + vssrani.bu.h vr17, vr16, 5 + vpermi.w vr17, vr15, 0x44 + vst vr17, a0, 0 + addi.d a0, a0, FDEC_STRIDE + add.w t5, t5, t1 +.endr +endfunc_x264 +#endif /* !HIGH_BIT_DEPT H */ diff --git 
a/common/loongarch/predict-c.c b/common/loongarch/predict-c.c new file mode 100644 index 000000000..ce246e4bf --- /dev/null +++ b/common/loongarch/predict-c.c @@ -0,0 +1,106 @@ +/***************************************************************************** + * predict-c.c: loongarch intra prediction + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Xiwei Gu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "common/common.h" +#include "predict.h" + +void x264_predict_16x16_init_loongarch( int cpu, x264_predict_t pf[7] ) +{ +#if !HIGH_BIT_DEPTH + if( cpu&X264_CPU_LSX ) + { + pf[I_PRED_16x16_V ] = x264_predict_16x16_v_lsx; + pf[I_PRED_16x16_H ] = x264_predict_16x16_h_lsx; + pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_lsx; + pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_lsx; + pf[I_PRED_16x16_DC_TOP ]= x264_predict_16x16_dc_top_lsx; + pf[I_PRED_16x16_DC_128 ]= x264_predict_16x16_dc_128_lsx; + pf[I_PRED_16x16_P ] = x264_predict_16x16_p_lsx; + } + if( cpu&X264_CPU_LASX ) + { + pf[I_PRED_16x16_P ] = x264_predict_16x16_p_lasx; + } +#endif +} + +void x264_predict_8x8c_init_loongarch( int cpu, x264_predict_t pf[7] ) +{ +#if !HIGH_BIT_DEPTH + if( cpu&X264_CPU_LSX ) + { + pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_lsx; + pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_lsx; + pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_lsx; + pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_lsx; + pf[I_PRED_CHROMA_DC_128] = x264_predict_8x8c_dc_128_lsx; + pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_lsx; + pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x8c_dc_left_lsx; + } +#endif +} + +void x264_predict_8x8_init_loongarch( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ) +{ +#if !HIGH_BIT_DEPTH + if( cpu&X264_CPU_LSX ) + { + pf[I_PRED_8x8_V] = x264_predict_8x8_v_lsx; + pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_lsx; + pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_lsx; + pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_lsx; + pf[I_PRED_8x8_DC_128] = x264_predict_8x8_dc_128_lsx; + pf[I_PRED_8x8_H] = x264_predict_8x8_h_lsx; + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_lsx; + pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_lsx; + pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_lsx; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_lsx; + } + if( cpu&X264_CPU_LASX ) + { + pf[I_PRED_8x8_H] = x264_predict_8x8_h_lasx; + pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_lasx; + pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_lasx; + pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_lasx; + pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_lasx; + } +#endif +} + +void 
x264_predict_4x4_init_loongarch( int cpu, x264_predict_t pf[12] ) +{ +#if !HIGH_BIT_DEPTH + if( cpu&X264_CPU_LSX ) + { + pf[I_PRED_4x4_V] = x264_predict_4x4_v_lsx; + pf[I_PRED_4x4_H] = x264_predict_4x4_h_lsx; + pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_lsx; + pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_lsx; + pf[I_PRED_4x4_DC_LEFT]= x264_predict_4x4_dc_left_lsx; + pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_lsx; + pf[I_PRED_4x4_DC_128] = x264_predict_4x4_dc_128_lsx; + } +#endif +} diff --git a/common/loongarch/predict.h b/common/loongarch/predict.h new file mode 100644 index 000000000..b246ad60e --- /dev/null +++ b/common/loongarch/predict.h @@ -0,0 +1,150 @@ +/***************************************************************************** + * predict.h: loongarch intra prediction + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Xiwei Gu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#ifndef X264_LOONGARCH_PREDICT_H +#define X264_LOONGARCH_PREDICT_H + +#define x264_predict_8x8c_p_lsx x264_template(predict_8x8c_p_lsx) +void x264_predict_8x8c_p_lsx(uint8_t *p_src); + +#define x264_predict_8x8c_v_lsx x264_template(predict_8x8c_v_lsx) +void x264_predict_8x8c_v_lsx(uint8_t *p_src); + +#define x264_predict_8x8c_h_lsx x264_template(predict_8x8c_h_lsx) +void x264_predict_8x8c_h_lsx(uint8_t *p_src); + +#define x264_predict_8x8c_dc_lsx x264_template(predict_8x8c_dc_lsx) +void x264_predict_8x8c_dc_lsx(pixel *src); + +#define x264_predict_8x8c_dc_128_lsx x264_template(predict_8x8c_dc_128_lsx) +void x264_predict_8x8c_dc_128_lsx(pixel *src); + +#define x264_predict_8x8c_dc_top_lsx x264_template(predict_8x8c_dc_top_lsx) +void x264_predict_8x8c_dc_top_lsx(pixel *src); + +#define x264_predict_8x8c_dc_left_lsx x264_template(predict_8x8c_dc_left_lsx) +void x264_predict_8x8c_dc_left_lsx(pixel *src); + +#define x264_predict_16x16_dc_lsx x264_template(predict_16x16_dc_lsx) +void x264_predict_16x16_dc_lsx( pixel *src ); + +#define x264_predict_16x16_dc_left_lsx x264_template(predict_16x16_dc_left_lsx) +void x264_predict_16x16_dc_left_lsx( pixel *src ); + +#define x264_predict_16x16_dc_top_lsx x264_template(predict_16x16_dc_top_lsx) +void x264_predict_16x16_dc_top_lsx( pixel *src ); + +#define x264_predict_16x16_dc_128_lsx x264_template(predict_16x16_dc_128_lsx) +void x264_predict_16x16_dc_128_lsx( pixel *src ); + +#define x264_predict_16x16_h_lsx x264_template(predict_16x16_h_lsx) +void x264_predict_16x16_h_lsx( pixel *src ); + +#define x264_predict_16x16_v_lsx x264_template(predict_16x16_v_lsx) +void x264_predict_16x16_v_lsx( pixel *src ); + +#define x264_predict_16x16_p_lasx x264_template(predict_16x16_p_lasx) +void x264_predict_16x16_p_lasx( pixel *src ); + +#define x264_predict_16x16_p_lsx x264_template(predict_16x16_p_lsx) +void x264_predict_16x16_p_lsx( pixel *src ); + +#define x264_predict_8x8_v_lsx x264_template(predict_8x8_v_lsx) +void x264_predict_8x8_v_lsx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_h_lasx x264_template(predict_8x8_h_lasx) +void x264_predict_8x8_h_lasx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_h_lsx x264_template(predict_8x8_h_lsx) +void x264_predict_8x8_h_lsx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_dc_lsx x264_template(predict_8x8_dc_lsx) +void x264_predict_8x8_dc_lsx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_dc_left_lsx x264_template(predict_8x8_dc_left_lsx) +void x264_predict_8x8_dc_left_lsx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_dc_top_lsx x264_template(predict_8x8_dc_top_lsx) +void x264_predict_8x8_dc_top_lsx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_dc_128_lsx x264_template(predict_8x8_dc_128_lsx) +void x264_predict_8x8_dc_128_lsx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_ddl_lasx x264_template(predict_8x8_ddl_lasx) +void x264_predict_8x8_ddl_lasx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_ddl_lsx x264_template(predict_8x8_ddl_lsx) +void x264_predict_8x8_ddl_lsx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_ddr_lasx x264_template(predict_8x8_ddr_lasx) +void x264_predict_8x8_ddr_lasx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_ddr_lsx x264_template(predict_8x8_ddr_lsx) +void x264_predict_8x8_ddr_lsx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_vr_lasx x264_template(predict_8x8_vr_lasx) +void 
x264_predict_8x8_vr_lasx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_vr_lsx x264_template(predict_8x8_vr_lsx) +void x264_predict_8x8_vr_lsx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_vl_lasx x264_template(predict_8x8_vl_lasx) +void x264_predict_8x8_vl_lasx( pixel *src, pixel edge[36] ); + +#define x264_predict_8x8_vl_lsx x264_template(predict_8x8_vl_lsx) +void x264_predict_8x8_vl_lsx( pixel *src, pixel edge[36] ); + +#define x264_predict_4x4_v_lsx x264_template(predict_4x4_v_lsx) +void x264_predict_4x4_v_lsx( pixel *p_src ); + +#define x264_predict_4x4_h_lsx x264_template(predict_4x4_h_lsx) +void x264_predict_4x4_h_lsx( pixel *p_src ); + +#define x264_predict_4x4_dc_lsx x264_template(predict_4x4_dc_lsx) +void x264_predict_4x4_dc_lsx( pixel *p_src ); + +#define x264_predict_4x4_ddl_lsx x264_template(predict_4x4_ddl_lsx) +void x264_predict_4x4_ddl_lsx( pixel *p_src ); + +#define x264_predict_4x4_dc_top_lsx x264_template(predict_4x4_dc_top_lsx) +void x264_predict_4x4_dc_top_lsx( pixel *p_src ); + +#define x264_predict_4x4_dc_left_lsx x264_template(predict_4x4_dc_left_lsx) +void x264_predict_4x4_dc_left_lsx( pixel *p_src ); + +#define x264_predict_4x4_dc_128_lsx x264_template(predict_4x4_dc_128_lsx) +void x264_predict_4x4_dc_128_lsx( pixel *p_src ); + +#define x264_predict_4x4_init_loongarch x264_template(predict_4x4_init_loongarch) +void x264_predict_4x4_init_loongarch( int cpu, x264_predict_t pf[12] ); +#define x264_predict_8x8_init_loongarch x264_template(predict_8x8_init_loongarch) +void x264_predict_8x8_init_loongarch( int cpu, x264_predict8x8_t pf[12], + x264_predict_8x8_filter_t *predict_filter ); +#define x264_predict_8x8c_init_loongarch x264_template(predict_8x8c_init_loongarch) +void x264_predict_8x8c_init_loongarch( int cpu, x264_predict_t pf[7] ); +#define x264_predict_16x16_init_loongarch x264_template(predict_16x16_init_loongarch) +void x264_predict_16x16_init_loongarch( int cpu, x264_predict_t pf[7] ); + +#endif diff --git a/common/loongarch/quant-a.S b/common/loongarch/quant-a.S new file mode 100644 index 000000000..a2f099d97 --- /dev/null +++ b/common/loongarch/quant-a.S @@ -0,0 +1,1231 @@ +/***************************************************************************** + * quant-a.S: LoongArch quantization and level-run + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Shiyou Yin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/ + +#include "loongson_asm.S" +#include "loongson_util.S" + +const last64_shuf +.int 0, 4, 1, 5, 2, 6, 3, 7 +endconst + +/* + * int quant_4x4x4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] ) + */ +.macro QUANT_ONE_LASX s1, s2, s3, s4 + xvld xr1, \s1, 0 /* Load dctcoef */ + xvadda.h \s4, xr1, \s3 + xvmuh.hu \s4, \s4, \s2 + xvsigncov.h \s4, xr1, \s4 + xvst \s4, \s1, 0 +.endm + +function_x264 quant_4x4x4_lasx + xvld xr2, a1, 0 + xvld xr3, a2, 0 + QUANT_ONE_LASX a0, xr2, xr3, xr4 + addi.d a0, a0, 32 + QUANT_ONE_LASX a0, xr2, xr3, xr0 + xvssrlni.h.w xr0, xr4, 0 + addi.d a0, a0, 32 + QUANT_ONE_LASX a0, xr2, xr3, xr4 + addi.d a0, a0, 32 + QUANT_ONE_LASX a0, xr2, xr3, xr5 + xvssrlni.h.w xr5, xr4, 0 + xvssrlni.h.w xr5, xr0, 0 + xvseqi.w xr5, xr5, 0 + xvmskltz.w xr5, xr5 + xvpickve2gr.w t0, xr5, 0 + xvpickve2gr.w t1, xr5, 4 + alsl.d t0, t1, t0, 4 + and t0, t0, t1 + xori a0, t0, 0xf +endfunc_x264 + +.macro QUANT_ONE_LSX tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 + vld vr0, \tmp1, 0 + vld vr1, \tmp1, 16 + vadda.h \tmp6, vr0, \tmp4 + vadda.h \tmp7, vr1, \tmp5 + vmuh.hu \tmp6, \tmp6, \tmp2 + vmuh.hu \tmp7, \tmp7, \tmp3 + vsigncov.h \tmp6, vr0, \tmp6 + vsigncov.h \tmp7, vr1, \tmp7 + vst \tmp6, \tmp1, 0 + vst \tmp7, \tmp1, 16 +.endm + +function_x264 quant_4x4x4_lsx + vld vr2, a1, 0 + vld vr3, a1, 16 + vld vr4, a2, 0 + vld vr5, a2, 16 + QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr6, vr7 + addi.d a0, a0, 32 + QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr8, vr9 + vssrlni.h.w vr8, vr6, 0 + vssrlni.h.w vr9, vr7, 0 + addi.d a0, a0, 32 + QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr10, vr11 + addi.d a0, a0, 32 + QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr12, vr13 + vssrlni.h.w vr12, vr10, 0 + vssrlni.h.w vr13, vr11, 0 + vssrlni.h.w vr12, vr8, 0 + vssrlni.h.w vr13, vr9, 0 + vseqi.w vr12, vr12, 0 + vseqi.w vr13, vr13, 0 + vmskltz.w vr12, vr12 + vmskltz.w vr13, vr13 + vpickve2gr.w t0, vr12, 0 + vpickve2gr.w t1, vr13, 0 + alsl.d t0, t1, t0, 4 + and t0, t0, t1 + xori a0, t0, 0xf +endfunc_x264 + +function_x264 quant_4x4_lsx + vld vr2, a1, 0 + vld vr3, a1, 16 + vld vr4, a2, 0 + vld vr5, a2, 16 + QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr10, vr11 + vor.v vr22, vr10, vr11 + vpickve2gr.d t0, vr22, 0 + vpickve2gr.d t1, vr22, 1 + or t2, t0, t1 + addi.w t3, zero, 1 + maskeqz a0, t3, t2 +endfunc_x264 + +function_x264 quant_8x8_lsx + vld vr2, a1, 0 + vld vr3, a1, 16 + vld vr4, a2, 0 + vld vr5, a2, 16 + QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr12, vr13 + + addi.d a0, a0, 32 + vld vr2, a1, 32 + vld vr3, a1, 48 + vld vr4, a2, 32 + vld vr5, a2, 48 + QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr14, vr15 + + addi.d a0, a0, 32 + vld vr2, a1, 64 + vld vr3, a1, 80 + vld vr4, a2, 64 + vld vr5, a2, 80 + QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr22, vr23 + + addi.d a0, a0, 32 + vld vr2, a1, 96 + vld vr3, a1, 112 + vld vr4, a2, 96 + vld vr5, a2, 112 + QUANT_ONE_LSX a0, vr2, vr3, vr4, vr5, vr7, vr8 + + vor.v vr12, vr12, vr14 + vor.v vr13, vr13, vr15 + vor.v vr22, vr22, vr7 + vor.v vr23, vr23, vr8 + vor.v vr12, vr12, vr22 + vor.v vr13, vr13, vr23 + vor.v vr11, vr12, vr13 + vpickve2gr.d t0, vr11, 0 + vpickve2gr.d t1, vr11, 1 + or t2, t0, t1 + addi.w t3, zero, 1 + maskeqz a0, t3, t2 +endfunc_x264 + +function_x264 quant_4x4_dc_lsx + vld vr0, a0, 0 + vld vr1, a0, 16 + vreplgr2vr.w vr2, a1 + vreplgr2vr.w vr3, a2 + vslei.h vr4, vr0, 0 + vslei.h vr5, vr1, 0 + + vexth.w.h vr7, vr0 + vsllwil.w.h vr6, vr0, 0 + vexth.w.h vr9, vr1 + vsllwil.w.h vr8, vr1, 0 + vadda.w vr6, vr3, vr6 + vadda.w 
vr7, vr3, vr7 + vadda.w vr8, vr3, vr8 + vadda.w vr9, vr3, vr9 + vmul.w vr6, vr6, vr2 + vmul.w vr7, vr7, vr2 + vmul.w vr8, vr8, vr2 + vmul.w vr9, vr9, vr2 + vsrani.h.w vr8, vr6, 16 + vsrani.h.w vr9, vr7, 16 + vpermi.w vr10, vr9, 0x0E + vpermi.w vr9, vr8, 0x44 + vpermi.w vr10, vr8, 0x4E + vneg.h vr11, vr9 + vneg.h vr12, vr10 + vbitsel.v vr13, vr9, vr11, vr4 + vbitsel.v vr14, vr10, vr12, vr5 + vst vr13, a0, 0 + vst vr14, a0, 16 + + vor.v vr15, vr11, vr12 + vpickve2gr.d t0, vr15, 0 + vpickve2gr.d t1, vr15, 1 + or t2, t0, t1 + addi.w t3, zero, 1 + maskeqz a0, t3, t2 +endfunc_x264 + +/* + * int quant_2x2_dc( dctcoef dct[4], int mf, int bias ) + */ +function_x264 quant_2x2_dc_lsx + fld.d f0, a0, 0 + vreplgr2vr.w vr1, a1 + vreplgr2vr.w vr2, a2 + + vslei.h vr3, vr0, 0 + + vsllwil.w.h vr4, vr0, 0 + vadda.w vr4, vr4, vr2 + vmul.w vr4, vr4, vr1 + vsrani.h.w vr4, vr4, 16 + vneg.h vr8, vr4 + vbitsel.v vr9, vr4, vr8, vr3 + vstelm.d vr9, a0, 0, 0 + + vpickve2gr.w t0, vr9, 0 + vpickve2gr.w t1, vr9, 1 + or t2, t0, t1 + addi.w t3, zero, 1 + maskeqz a0, t3, t2 +endfunc_x264 + +/* + * int coeff_last64_c(dctcoef *l) + */ +function_x264 coeff_last64_lasx + addi.w t0, zero, 63 + xvxor.v xr20, xr0, xr0 + + xvld xr0, a0, 0 + xvld xr1, a0, 32 + xvld xr2, a0, 64 + xvld xr3, a0, 96 + + xvldi xr4, 1 + la.local t1, last64_shuf + xvld xr7, t1, 0 + xvldi xr9, 0x408 + xvldi xr10, 0x401 + + xvssrlni.bu.h xr1, xr0, 0 + xvssrlni.bu.h xr3, xr2, 0 + xvsle.bu xr5, xr4, xr1 + xvsle.bu xr6, xr4, xr3 + xvssrlni.bu.h xr6, xr5, 4 + xvperm.w xr6, xr6, xr7 + xvclz.w xr7, xr6 + xvssrlni.hu.w xr7, xr7, 2 + xvpermi.d xr8, xr7, 0xd8 + + xvsub.h xr9, xr9, xr8 + xvsll.h xr10, xr10, xr9 + xvssrlni.bu.h xr10, xr10, 1 + xvclz.d xr11, xr10 + xvpickve2gr.w t3, xr11, 0 + sub.w a0, t0, t3 +endfunc_x264 + +function_x264 coeff_last64_lsx + addi.w t0, zero, 63 + vxor.v vr20, vr0, vr0 + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + vld vr4, a0, 64 + vld vr5, a0, 80 + vld vr6, a0, 96 + vld vr7, a0, 112 + vldi vr8, 1 + vldi vr9, 0x408 + vldi vr10, 0x401 + + vssrlni.bu.h vr0, vr0, 0 + vssrlni.bu.h vr1, vr1, 0 + vssrlni.bu.h vr2, vr2, 0 + vssrlni.bu.h vr3, vr3, 0 + vssrlni.bu.h vr4, vr4, 0 + vssrlni.bu.h vr5, vr5, 0 + vssrlni.bu.h vr6, vr6, 0 + vssrlni.bu.h vr7, vr7, 0 + vpermi.w vr2, vr0, 0x44 + vpermi.w vr3, vr1, 0x44 + vpermi.w vr6, vr4, 0x44 + vpermi.w vr7, vr5, 0x44 + vsle.bu vr2, vr8, vr2 + vsle.bu vr3, vr8, vr3 + vsle.bu vr6, vr8, vr6 + vsle.bu vr7, vr8, vr7 + vssrlni.bu.h vr2, vr2, 4 + vssrlni.bu.h vr3, vr3, 4 + vssrlni.bu.h vr6, vr6, 4 + vssrlni.bu.h vr7, vr7, 4 + vpermi.w vr6, vr2, 0x44 + vpermi.w vr7, vr3, 0x44 + vpermi.w vr11, vr7, 0x0E + vpermi.w vr7, vr6, 0x44 + vpermi.w vr7, vr7, 0xD8 + vpermi.w vr11, vr6, 0x4E + vpermi.w vr11, vr11, 0xD8 + vclz.w vr7, vr7 + vclz.w vr11, vr11 + vssrlni.hu.w vr7, vr7, 2 + vssrlni.hu.w vr11, vr11, 2 + vpermi.w vr12, vr11, 0x0E + vpermi.w vr11, vr7, 0x44 + vpermi.w vr12, vr7, 0x4E + vsub.h vr11, vr9, vr11 + vsub.h vr12, vr9, vr12 + vsll.h vr13, vr10, vr11 + vsll.h vr14, vr10, vr12 + vssrlni.bu.h vr13, vr13, 1 + vssrlni.bu.h vr14, vr14, 1 + + vclz.d vr15, vr14 + vpickve2gr.w t1, vr15, 0 + sub.w a0, t0, t1 +endfunc_x264 + +/* + * int coeff_last16_c(dctcoef *l) + */ +function_x264 coeff_last16_lasx + addi.w t0, zero, 15 + + xvld xr0, a0, 0 + xvldi xr2, 1 + + xvssrlni.bu.h xr0, xr0, 0 + xvpermi.d xr1, xr0, 0xd8 + xvsle.bu xr3, xr2, xr1 + xvssrlni.bu.h xr3, xr3, 4 + xvclz.d xr4, xr3 + xvpickve2gr.w t1, xr4, 0 + + srai.w t1, t1, 2 + sub.w a0, t0, t1 +endfunc_x264 + +function_x264 
coeff_last16_lsx + addi.w t0, zero, 15 + vld vr0, a0, 0 + vld vr1, a0, 16 + vldi vr2, 1 + + vssrlni.bu.h vr0, vr0, 0 + vssrlni.bu.h vr1, vr1, 0 + vpermi.w vr1, vr0, 0x44 + vsle.bu vr3, vr2, vr1 + vssrlni.bu.h vr3, vr3, 4 + vclz.d vr4, vr3 + vpickve2gr.w t1, vr4, 0 + + srai.w t1, t1, 2 + sub.w a0, t0, t1 +endfunc_x264 + +/* + * int coeff_last15_c(dctcoef *l) + */ +function_x264 coeff_last15_lasx + addi.w t0, zero, 15 + + vld vr0, a0, 0 + vld vr1, a0, 16 + xvldi xr3, 1 + + vinsgr2vr.h vr1, zero, 7 + xvpermi.q xr1, xr0, 0x20 + + xvssrlni.bu.h xr1, xr1, 0 + xvpermi.d xr2, xr1, 0xd8 + xvsle.bu xr4, xr3, xr2 + xvssrlni.bu.h xr4, xr4, 4 + xvclz.d xr5, xr4 + xvpickve2gr.w t1, xr5, 0 + + srai.w t1, t1, 2 + sub.w a0, t0, t1 +endfunc_x264 + +function_x264 coeff_last15_lsx + addi.w t0, zero, 15 + vld vr0, a0, 0 + vld vr1, a0, 16 + vldi vr2, 1 + vinsgr2vr.h vr1, zero, 7 + + vssrlni.bu.h vr0, vr0, 0 + vssrlni.bu.h vr1, vr1, 0 + vpermi.w vr1, vr0, 0x44 + vsle.bu vr3, vr2, vr1 + vssrlni.bu.h vr3, vr3, 4 + vclz.d vr4, vr3 + vpickve2gr.w t1, vr4, 0 + + srai.w t1, t1, 2 + sub.w a0, t0, t1 +endfunc_x264 + +/* + * int coeff_last8_c(dctcoef *l) + */ +function_x264 coeff_last8_lsx + addi.w t0, zero, 7 + vld vr0, a0, 0 + vclz.d vr1, vr0 + vpickve2gr.w t1, vr1, 0 + vpickve2gr.w t2, vr1, 2 + li.d t3, 64 + bne t2, t3, .LAST8_LOW_LSX + addi.d t4, t1, 0 + addi.d t0, t0, -4 + b .LAST8_END_LSX +.LAST8_LOW_LSX: + addi.d t4, t2, 0 +.LAST8_END_LSX: + srai.w t4, t4, 4 + sub.w a0, t0, t4 +endfunc_x264 + +/* + * int coeff_last4_c(dctcoef *l) + */ +function_x264 coeff_last4_lsx + addi.w t0, zero, 3 + vld vr0, a0, 0 + vclz.d vr1, vr0 + vpickve2gr.w t1, vr1, 0 + srai.w t1, t1, 4 + sub.w a0, t0, t1 +endfunc_x264 + +// (dct[i] * dequant_mf[i]) << (i_qbits) +.macro DCT_MF a0, a1, in0, out0, out1 + vld vr1, \a0, 0 + xvld xr2, \a1, 0 + + vext2xv.w.h xr5, xr1 + xvmul.w xr5, xr5, xr2 + xvsll.w \out0, xr5, \in0 + + vld vr1, \a0, 16 + xvld xr2, \a1, 32 + vext2xv.w.h xr5, xr1 + xvmul.w xr5, xr5, xr2 + xvsll.w \out1, xr5, \in0 +.endm + +// (dct[i] * dequant_mf[i] + f) >> (-i_qbits) +.macro DCT_MF_F a0, a1, in0, out0, out1 + vld vr1, \a0, 0 + xvld xr2, \a1, 0 + + vext2xv.w.h xr5, xr1 + xvmul.w xr5, xr5, xr2 + xvsrar.w \out0, xr5, \in0 + + vld vr1, \a0, 16 + xvld xr2, \a1, 32 + vext2xv.w.h xr5, xr1 + xvmul.w xr5, xr5, xr2 + xvsrar.w \out1, xr5, \in0 +.endm + +/* + * void dequant_4x4( dctcoef dct[16], int dequant_mf[6][16], int i_qp ) + */ +function_x264 dequant_4x4_lasx + addi.w t1, zero, 6 + addi.w t2, zero, 4 + div.w t0, a2, t1 + sub.w t0, t0, t2 // i_qp/6 - 4 + mod.w t1, a2, t1 // i_qp%6 + slli.w t1, t1, 6 + add.d a1, a1, t1 + + blt t0, zero, .DQ4x4_DEQUANT_SHR + + // i_qbits >= 0 + xvreplgr2vr.w xr0, t0 + DCT_MF a0, a1, xr0, xr6, xr7 + b .DQ4x4_END + +.DQ4x4_DEQUANT_SHR: + sub.w t4, zero, t0 + xvreplgr2vr.w xr4, t4 + DCT_MF_F a0, a1, xr4, xr6, xr7 + +.DQ4x4_END: + xvpickev.h xr8, xr7, xr6 + xvpermi.d xr8, xr8, 0xd8 + xvst xr8, a0, 0 +endfunc_x264 + +.macro DCT_MF_LSX tmp0, tmp1, in0, out0, out1, out2, out3 + vld vr0, \tmp0, 0 + vld vr1, \tmp1, 0 + vld vr2, \tmp1, 16 + vexth.w.h vr4, vr0 + vsllwil.w.h vr3, vr0, 0 + vmul.w vr3, vr3, vr1 + vmul.w vr4, vr4, vr2 + vsll.w \out0, vr3, \in0 + vsll.w \out1, vr4, \in0 + + vld vr0, \tmp0, 16 + vld vr1, \tmp1, 32 + vld vr2, \tmp1, 48 + vsllwil.w.h vr3, vr0, 0 + vpermi.w vr4, vr0, 0x0E + vsllwil.w.h vr4, vr4, 0 + vmul.w vr3, vr3, vr1 + vmul.w vr4, vr4, vr2 + vsll.w \out2, vr3, \in0 + vsll.w \out3, vr4, \in0 +.endm + +.macro DCT_MF_F_LSX tmp0, tmp1, in0, out0, out1, out2, out3 + vld vr0, \tmp0, 0 + 
vld vr1, \tmp1, 0 + vld vr2, \tmp1, 16 + vexth.w.h vr4, vr0 + vsllwil.w.h vr3, vr0, 0 + vmul.w vr3, vr3, vr1 + vmul.w vr4, vr4, vr2 + vsrar.w \out0, vr3, \in0 + vsrar.w \out1, vr4, \in0 + + vld vr0, \tmp0, 16 + vld vr1, \tmp1, 32 + vld vr2, \tmp1, 48 + vexth.w.h vr4, vr0 + vsllwil.w.h vr3, vr0, 0 + vmul.w vr3, vr3, vr1 + vmul.w vr4, vr4, vr2 + vsrar.w \out2, vr3, \in0 + vsrar.w \out3, vr4, \in0 +.endm + +function_x264 dequant_4x4_lsx + addi.w t1, zero, 6 + addi.w t2, zero, 4 + div.w t0, a2, t1 + sub.w t0, t0, t2 + mod.w t1, a2, t1 + slli.w t1, t1, 6 + add.d a1, a1, t1 + blt t0, zero, .DQ4x4_DEQUANT_SHR_LSX + + vreplgr2vr.w vr6, t0 + DCT_MF_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 + b .DQ4x4_END_LSX + +.DQ4x4_DEQUANT_SHR_LSX: + sub.w t4, zero, t0 + vreplgr2vr.w vr6, t4 + DCT_MF_F_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 +.DQ4x4_END_LSX: + vpickev.h vr11, vr9, vr7 + vpickev.h vr12, vr10, vr8 + vpermi.w vr13, vr12, 0x0E + vpermi.w vr12, vr11, 0x44 + vpermi.w vr13, vr11, 0x4E + vst vr12, a0, 0 + vst vr13, a0, 16 +endfunc_x264 +/* + * void dequant_8x8( dctcoef dct[64], int dequant_mf[6][64], int i_qp ) + */ +function_x264 dequant_8x8_lasx + addi.w t1, zero, 6 + div.w t0, a2, t1 + sub.w t0, t0, t1 + mod.w t1, a2, t1 // i_qp%6 + slli.w t1, t1, 8 + add.d a1, a1, t1 + + blt t0, zero, .DQ8x8_DEQUANT_SHR + // i_qbits >= 0 + xvreplgr2vr.w xr0, t0 + DCT_MF a0, a1, xr0, xr6, xr7 + xvpickev.h xr8, xr7, xr6 + xvpermi.d xr8, xr8, 0xd8 + xvst xr8, a0, 0 + +.rept 3 + addi.d a0, a0, 32 + addi.d a1, a1, 64 + DCT_MF a0, a1, xr0, xr6, xr7 + xvpickev.h xr8, xr7, xr6 + xvpermi.d xr8, xr8, 0xd8 + xvst xr8, a0, 0 +.endr + b .DQ8x8_END + +// i_qbits < 0 +.DQ8x8_DEQUANT_SHR: + sub.w t4, zero, t0 + xvreplgr2vr.w xr4, t4 + + DCT_MF_F a0, a1, xr4, xr6, xr7 + xvpickev.h xr8, xr7, xr6 + xvpermi.d xr8, xr8, 0xd8 + xvst xr8, a0, 0 + +.rept 3 + addi.d a0, a0, 32 + addi.d a1, a1, 64 + DCT_MF_F a0, a1, xr4, xr6, xr7 + xvpickev.h xr8, xr7, xr6 + xvpermi.d xr8, xr8, 0xd8 + xvst xr8, a0, 0 +.endr + +.DQ8x8_END: +endfunc_x264 + +function_x264 dequant_8x8_lsx + addi.w t1, zero, 6 + div.w t0, a2, t1 + sub.w t0, t0, t1 + mod.w t1, a2, t1 + slli.w t1, t1, 8 + add.d a1, a1, t1 + + blt t0, zero, .DQ8x8_DEQUANT_SHR_LSX + vreplgr2vr.w vr6, t0 + DCT_MF_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 + vpickev.h vr11, vr9, vr7 + vpickev.h vr12, vr10, vr8 + vpermi.w vr13, vr12, 0x0E + vpermi.w vr12, vr11, 0x44 + vpermi.w vr13, vr11, 0x4E + vst vr12, a0, 0 + vst vr13, a0, 16 +.rept 3 + addi.d a0, a0, 32 + addi.d a1, a1, 64 + DCT_MF_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 + vpickev.h vr11, vr9, vr7 + vpickev.h vr12, vr10, vr8 + vpermi.w vr13, vr12, 0x0E + vpermi.w vr12, vr11, 0x44 + vpermi.w vr13, vr11, 0x4E + vst vr12, a0, 0 + vst vr13, a0, 16 +.endr + b .DQ8x8_END_LSX + +.DQ8x8_DEQUANT_SHR_LSX: + sub.w t4, zero, t0 + vreplgr2vr.w vr6, t4 + DCT_MF_F_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 + vpickev.h vr11, vr9, vr7 + vpickev.h vr12, vr10, vr8 + vpermi.w vr13, vr12, 0x0E + vpermi.w vr12, vr11, 0x44 + vpermi.w vr13, vr11, 0x4E + vst vr12, a0, 0 + vst vr13, a0, 16 +.rept 3 + addi.d a0, a0, 32 + addi.d a1, a1, 64 + DCT_MF_F_LSX a0, a1, vr6, vr7, vr8, vr9, vr10 + vpickev.h vr11, vr9, vr7 + vpickev.h vr12, vr10, vr8 + vpermi.w vr13, vr12, 0x0E + vpermi.w vr12, vr11, 0x44 + vpermi.w vr13, vr11, 0x4E + vst vr12, a0, 0 + vst vr13, a0, 16 +.endr +.DQ8x8_END_LSX: +endfunc_x264 + +/* + * void dequant_4x4_dc( dctcoef dct[16], int dequant_mf[6][16], int i_qp ) + */ +function_x264 dequant_4x4_dc_lasx + addi.w t0, zero, 6 + div.w t1, a2, t0 + sub.w t1, t1, t0 + + blt t1, zero, 
.DQ4x4DC_LT_ZERO + // i_qbits >= 0 + mod.w t2, a2, t0 + slli.w t2, t2, 6 + ldx.w t0, a1, t2 + sll.w t0, t0, t1 + + vld vr1, a0, 0 + vld vr10, a0, 16 + xvreplgr2vr.w xr2, t0 + + vext2xv.w.h xr3, xr1 + xvmul.w xr6, xr3, xr2 + + vext2xv.w.h xr3, xr10 + xvmul.w xr7, xr3, xr2 + b .DQ4x4DC_END + +// i_qbits < 0 +.DQ4x4DC_LT_ZERO: + mod.w t2, a2, t0 + slli.w t2, t2, 6 + ldx.w t0, a1, t2 + sub.w t3, zero, t1 + + vld vr1, a0, 0 + vld vr10, a0, 16 + xvreplgr2vr.w xr2, t0 + xvreplgr2vr.w xr4, t3 + + vext2xv.w.h xr5, xr1 + xvmul.w xr5, xr5, xr2 + xvsrar.w xr6, xr5, xr4 + + vext2xv.w.h xr5, xr10 + xvmul.w xr5, xr5, xr2 + xvsrar.w xr7, xr5, xr4 + +.DQ4x4DC_END: + xvpickev.h xr8, xr7, xr6 + xvpermi.d xr8, xr8, 0xd8 + xvst xr8, a0, 0 +endfunc_x264 + +function_x264 dequant_4x4_dc_lsx + addi.w t0, zero, 6 + div.w t1, a2, t0 + sub.w t1, t1, t0 + + blt t1, zero, .DQ4x4DC_LT_ZERO_LSX + mod.w t2, a2, t0 + slli.w t2, t2, 6 + ldx.w t0, a1, t2 + sll.w t0, t0, t1 + vld vr1, a0, 0 + vld vr2, a0, 16 + vreplgr2vr.w vr3, t0 + vexth.w.h vr6, vr1 + vsllwil.w.h vr5, vr1, 0 + vmul.w vr5, vr5, vr3 + vmul.w vr6, vr6, vr3 + + vexth.w.h vr8, vr2 + vsllwil.w.h vr7, vr2, 0 + vmul.w vr7, vr7, vr3 + vmul.w vr8, vr8, vr3 + b .DQ4x4DC_END_LSX +.DQ4x4DC_LT_ZERO_LSX: + mod.w t2, a2, t0 + slli.w t2, t2, 6 + ldx.w t0, a1, t2 + sub.w t3, zero, t1 + vld vr1, a0, 0 + vld vr2, a0, 16 + vreplgr2vr.w vr3, t0 + vreplgr2vr.w vr4, t3 + vexth.w.h vr6, vr1 + vsllwil.w.h vr5, vr1, 0 + vexth.w.h vr8, vr2 + vsllwil.w.h vr7, vr2, 0 + vmul.w vr5, vr5, vr3 + vmul.w vr6, vr6, vr3 + vmul.w vr7, vr7, vr3 + vmul.w vr8, vr8, vr3 + vsrar.w vr5, vr5, vr4 + vsrar.w vr6, vr6, vr4 + vsrar.w vr7, vr7, vr4 + vsrar.w vr8, vr8, vr4 +.DQ4x4DC_END_LSX: + vpickev.h vr9, vr7, vr5 + vpickev.h vr10, vr8, vr6 + vpermi.w vr11, vr10, 0x0E + vpermi.w vr10, vr9, 0x44 + vpermi.w vr11, vr9, 0x4E + vst vr10, a0, 0 + vst vr11, a0, 16 +endfunc_x264 + +/* + * int decimate_score15( dctcoef *dct ) + */ +function_x264 decimate_score15_lsx + addi.w t0, zero, 15 + la.local t3, x264_decimate_table4 + addi.d t4, a0, 2 + + vld vr0, t4, 0 + vld vr1, t4, 16 + vldi vr3, 1 + vinsgr2vr.h vr1, zero, 7 + vssrlni.bu.h vr0, vr0, 0 + vssrlni.bu.h vr1, vr1, 0 + vpermi.w vr2, vr1, 0x0E + vpermi.w vr1, vr0, 0x44 + vpermi.w vr2, vr0, 0x4E + vsle.bu vr4, vr3, vr1 + vsle.bu vr5, vr3, vr2 + vssrlni.bu.h vr4, vr4, 4 + vssrlni.bu.h vr5, vr5, 4 + vclz.d vr4, vr4 + vclz.d vr5, vr5 + vpickve2gr.w t1, vr4, 0 + + srai.w t1, t1, 2 + sub.w t2, t0, t1 + addi.w t0, zero, 2 + move a0, zero + slli.d t2, t2, 1 +.LOOP_SCORE_15_LSX: + blt t2, zero, .END_SCORE_15_LSX + ldx.h t5, t4, t2 + addi.d t6, t5, 1 + bltu t0, t6, .RET_SCORE_15_1_LSX + addi.d t2, t2, -2 + move t5, zero +.WHILE_SCORE_15_LSX: + blt t2, zero, .END_WHILE_15_LSX + ldx.h t1, t4, t2 + bnez t1, .END_WHILE_15_LSX + addi.d t2, t2, -2 + addi.d t5, t5, 1 + b .WHILE_SCORE_15_LSX +.END_WHILE_15_LSX: + ldx.b t1, t3, t5 + add.d a0, a0, t1 + b .LOOP_SCORE_15_LSX +.RET_SCORE_15_1_LSX: + addi.d a0, zero, 9 + jirl $r0, $r1, 0x0 +.END_SCORE_15_LSX: +endfunc_x264 + +/* + * int decimate_score16( dctcoef *dct ) + */ +function_x264 decimate_score16_lsx + addi.w t0, zero, 15 + la.local t3, x264_decimate_table4 + addi.w t0, zero, 15 + vld vr0, a0, 0 + vld vr1, a0, 16 + vldi vr2, 1 + + vssrlni.bu.h vr0, vr0, 0 + vssrlni.bu.h vr1, vr1, 0 + vpermi.w vr3, vr1, 0x0E + vpermi.w vr1, vr0, 0x44 + vpermi.w vr3, vr0, 0x4E + vsle.bu vr4, vr2, vr1 + vsle.bu vr5, vr2, vr3 + vssrlni.bu.h vr4, vr4, 4 + vssrlni.bu.h vr5, vr5, 4 + vclz.d vr4, vr4 + vclz.d vr5, vr5 + vpickve2gr.w t1, vr4, 0 + + 
srai.w t1, t1, 2 + sub.w t2, t0, t1 + move t4, a0 + addi.d t0, zero, 2 + move a0, zero + slli.d t2, t2, 1 +.LOOP_SCORE_16_LSX: + blt t2, zero, .END_SCORE_16_LSX + ldx.h t5, t4, t2 + addi.d t6, t5, 1 + bltu t0, t6, .RET_SCORE_16_1_LSX + addi.d t2, t2, -2 + move t5, zero +.WHILE_SCORE_16_LSX: + blt t2, zero, .END_WHILE_16_LSX + ldx.h t1, t4, t2 + bnez t1, .END_WHILE_16_LSX + addi.d t2, t2, -2 + addi.d t5, t5, 1 + b .WHILE_SCORE_16_LSX +.END_WHILE_16_LSX: + ldx.b t1, t3, t5 + add.d a0, a0, t1 + b .LOOP_SCORE_16_LSX +.RET_SCORE_16_1_LSX: + addi.d a0, zero, 9 + jirl $r0, $r1, 0x0 +.END_SCORE_16_LSX: +endfunc_x264 + +/* + * int decimate_score64( dctcoef *dct ) + */ +function_x264 decimate_score64_lsx + addi.w t0, zero, 63 + la.local t3, x264_decimate_table8 + vxor.v vr20, vr0, vr0 + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + vld vr4, a0, 64 + vld vr5, a0, 80 + vld vr6, a0, 96 + vld vr7, a0, 112 + vldi vr8, 1 + vldi vr9, 0x408 + vldi vr10, 0x401 + + vssrlni.bu.h vr0, vr0, 0 + vssrlni.bu.h vr1, vr1, 0 + vssrlni.bu.h vr2, vr2, 0 + vssrlni.bu.h vr3, vr3, 0 + vssrlni.bu.h vr4, vr4, 0 + vssrlni.bu.h vr5, vr5, 0 + vssrlni.bu.h vr6, vr6, 0 + vssrlni.bu.h vr7, vr7, 0 + vpermi.w vr2, vr0, 0x44 + vpermi.w vr3, vr1, 0x44 + vpermi.w vr6, vr4, 0x44 + vpermi.w vr7, vr5, 0x44 + vsle.bu vr2, vr8, vr2 + vsle.bu vr3, vr8, vr3 + vsle.bu vr6, vr8, vr6 + vsle.bu vr7, vr8, vr7 + vssrlni.bu.h vr2, vr2, 4 + vssrlni.bu.h vr3, vr3, 4 + vssrlni.bu.h vr6, vr6, 4 + vssrlni.bu.h vr7, vr7, 4 + vpermi.w vr6, vr2, 0x44 + vpermi.w vr7, vr3, 0x44 + vpermi.w vr11, vr7, 0x0E + vpermi.w vr7, vr6, 0x44 + vpermi.w vr7, vr7, 0xD8 + vpermi.w vr11, vr6, 0x4E + vpermi.w vr11, vr11, 0xD8 + vclz.w vr7, vr7 + vclz.w vr11, vr11 + vssrlni.hu.w vr7, vr7, 2 + vssrlni.hu.w vr11, vr11, 2 + vpermi.w vr12, vr11, 0x0E + vpermi.w vr11, vr7, 0x44 + vpermi.w vr12, vr7, 0x4E + vsub.h vr11, vr9, vr11 + vsub.h vr12, vr9, vr12 + vsll.h vr13, vr10, vr11 + vsll.h vr14, vr10, vr12 + vssrlni.bu.h vr13, vr13, 1 + vssrlni.bu.h vr14, vr14, 1 + + vclz.d vr15, vr14 + vpickve2gr.w t1, vr15, 0 + sub.w t2, t0, t1 + move t4, a0 + addi.d t0, zero, 2 + slli.d t2, t2, 1 + move a0, zero +.LOOP_SCORE_64_LSX: + blt t2, zero, .END_SCORE_64_LSX + ldx.h t5, t4, t2 + addi.d t6, t5, 1 + bltu t0, t6, .RET_SCORE_64_1_LSX + addi.d t2, t2, -2 + move t5, zero +.WHILE_SCORE_64_LSX: + blt t2, zero, .END_WHILE_64_LSX + ldx.h t1, t4, t2 + bnez t1, .END_WHILE_64_LSX + addi.d t2, t2, -2 + addi.d t5, t5, 1 + b .WHILE_SCORE_64_LSX +.END_WHILE_64_LSX: + ldx.b t1, t3, t5 + add.d a0, a0, t1 + b .LOOP_SCORE_64_LSX +.RET_SCORE_64_1_LSX: + addi.d a0, zero, 9 + jirl $r0, $r1, 0x0 +.END_SCORE_64_LSX: +endfunc_x264 + +/* + * int coeff_level_run16( dctcoef *dct, x264_run_level_t *runlevel ) + */ +function_x264 coeff_level_run16_lasx + addi.w t0, zero, 15 + + xvld xr0, a0, 0 + xvldi xr2, 1 + + xvssrlni.bu.h xr0, xr0, 0 + xvpermi.d xr1, xr0, 0xd8 + xvsle.bu xr3, xr2, xr1 + xvsrlni.b.h xr3, xr3, 4 + xvpickve2gr.du t8, xr3, 0 + clz.d t1, t8 + + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + st.w t0, a1, 0x00 // Store runlevel->last + addi.d t3, a1, 23 + nor t2, zero, zero + addi.d t2, t2, -15 + and t3, t3, t2 // runlevel->level + xor t4, t4, t4 // mask + xor t5, t5, t5 // total: number of non-zero elements + addi.w t6, zero, 1 // const 1 +.LOOP_COEFF_LEVEL_RUN16_LASX: + slli.w t7, t0, 1 + ldx.h t2, a0, t7 + st.h t2, t3, 0 + addi.d t3, t3, 2 + + addi.w t5, t5, 1 + sll.w t2, t6, t0 + or t4, t4, t2 + bge zero, t4, 
.END_COEFF_LEVEL_RUN16_LASX + + addi.w t0, t0, -1 + slli.w t1, t1, 2 + addi.w t1, t1, 4 + sll.d t8, t8, t1 + clz.d t1, t8 + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + bge t0, zero, .LOOP_COEFF_LEVEL_RUN16_LASX +.END_COEFF_LEVEL_RUN16_LASX: + st.w t4, a1, 4 + move a0, t5 +endfunc_x264 + +function_x264 coeff_level_run15_lasx + addi.w t0, zero, 15 + + vld vr0, a0, 0 + vld vr1, a0, 16 + xvldi xr3, 1 + + vinsgr2vr.h vr1, zero, 7 + xvpermi.q xr1, xr0, 0x20 + + xvssrlni.bu.h xr1, xr1, 0 + xvpermi.d xr2, xr1, 0xd8 + xvsle.bu xr4, xr3, xr2 + xvsrlni.b.h xr4, xr4, 4 + xvpickve2gr.du t8, xr4, 0 + clz.d t1, t8 + + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + st.w t0, a1, 0x00 // Store runlevel->last + addi.d t3, a1, 23 + nor t2, zero, zero + addi.d t2, t2, -15 + and t3, t3, t2 // runlevel->level + xor t4, t4, t4 // mask + xor t5, t5, t5 // total: number of non-zero elements + addi.w t6, zero, 1 // const 1 +.LOOP_COEFF_LEVEL_RUN15_LASX: + slli.w t7, t0, 1 + ldx.h t2, a0, t7 + st.h t2, t3, 0 + addi.d t3, t3, 2 + + addi.w t5, t5, 1 + sll.w t2, t6, t0 + or t4, t4, t2 + bge zero, t4, .END_COEFF_LEVEL_RUN15_LASX + + addi.w t0, t0, -1 + slli.w t1, t1, 2 + addi.w t1, t1, 4 + sll.d t8, t8, t1 + clz.d t1, t8 + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + bge t0, zero, .LOOP_COEFF_LEVEL_RUN15_LASX +.END_COEFF_LEVEL_RUN15_LASX: + st.w t4, a1, 4 + move a0, t5 +endfunc_x264 + +function_x264 coeff_level_run16_lsx + addi.w t0, zero, 15 + vld vr0, a0, 0 + vld vr1, a0, 16 + vldi vr2, 1 + + vssrlni.bu.h vr0, vr0, 0 + vssrlni.bu.h vr1, vr1, 0 + vpermi.w vr1, vr0, 0x44 + vsle.bu vr3, vr2, vr1 + vsrlni.b.h vr3, vr3, 4 + vpickve2gr.du t8, vr3, 0 + clz.d t1, t8 + + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + st.w t0, a1, 0x00 // Store runlevel->last + addi.d t3, a1, 23 + nor t2, zero, zero + addi.d t2, t2, -15 + and t3, t3, t2 // runlevel->level + xor t4, t4, t4 // mask + xor t5, t5, t5 // total: number of non-zero elements + addi.w t6, zero, 1 // const 1 +.LOOP_COEFF_LEVEL_RUN16_LSX: + slli.w t7, t0, 1 + ldx.h t2, a0, t7 + st.h t2, t3, 0 + addi.d t3, t3, 2 + + addi.w t5, t5, 1 + sll.w t2, t6, t0 + or t4, t4, t2 + bge zero, t4, .END_COEFF_LEVEL_RUN16_LSX + + addi.w t0, t0, -1 + slli.w t1, t1, 2 + addi.w t1, t1, 4 + sll.d t8, t8, t1 + clz.d t1, t8 + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + bge t0, zero, .LOOP_COEFF_LEVEL_RUN16_LSX +.END_COEFF_LEVEL_RUN16_LSX: + st.w t4, a1, 4 + move a0, t5 +endfunc_x264 + +function_x264 coeff_level_run15_lsx + addi.w t0, zero, 15 + vld vr0, a0, 0 + vld vr1, a0, 16 + vldi vr2, 1 + vinsgr2vr.h vr1, zero, 7 + + vssrlni.bu.h vr0, vr0, 0 + vssrlni.bu.h vr1, vr1, 0 + vpermi.w vr1, vr0, 0x44 + vsle.bu vr3, vr2, vr1 + vsrlni.b.h vr3, vr3, 4 + vpickve2gr.du t8, vr3, 0 + clz.d t1, t8 + + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + st.w t0, a1, 0x00 // Store runlevel->last + addi.d t3, a1, 23 + nor t2, zero, zero + addi.d t2, t2, -15 + and t3, t3, t2 // runlevel->level + xor t4, t4, t4 // mask + xor t5, t5, t5 // total: number of non-zero elements + addi.w t6, zero, 1 // const 1 +.LOOP_COEFF_LEVEL_RUN15_LSX: + slli.w t7, t0, 1 + ldx.h t2, a0, t7 + st.h t2, t3, 0 + addi.d t3, t3, 2 + + addi.w t5, t5, 1 + sll.w t2, t6, t0 + or t4, 
t4, t2 + bge zero, t4, .END_COEFF_LEVEL_RUN15_LSX + + addi.w t0, t0, -1 + slli.w t1, t1, 2 + addi.w t1, t1, 4 + sll.d t8, t8, t1 + clz.d t1, t8 + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + bge t0, zero, .LOOP_COEFF_LEVEL_RUN15_LSX +.END_COEFF_LEVEL_RUN15_LSX: + st.w t4, a1, 4 + move a0, t5 +endfunc_x264 + +function_x264 coeff_level_run8_lsx + addi.w t0, zero, 15 + vld vr0, a0, 0 + vxor.v vr1, vr1, vr1 + vldi vr2, 1 + + vssrlni.bu.h vr0, vr0, 0 + vpermi.w vr1, vr0, 0x44 + vsle.bu vr3, vr2, vr1 + vsrlni.b.h vr3, vr3, 4 + vpickve2gr.du t8, vr3, 0 + clz.d t1, t8 + + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + st.w t0, a1, 0x00 // Store runlevel->last + addi.d t3, a1, 23 + nor t2, zero, zero + addi.d t2, t2, -15 + and t3, t3, t2 // runlevel->level + xor t4, t4, t4 // mask + xor t5, t5, t5 // total: number of non-zero elements + addi.w t6, zero, 1 // const 1 +.LOOP_COEFF_LEVEL_RUN8_LSX: + slli.w t7, t0, 1 + ldx.h t2, a0, t7 + st.h t2, t3, 0 + addi.d t3, t3, 2 + + addi.w t5, t5, 1 + sll.w t2, t6, t0 + or t4, t4, t2 + bge zero, t4, .END_COEFF_LEVEL_RUN8_LSX + + addi.w t0, t0, -1 + slli.w t1, t1, 2 + addi.w t1, t1, 4 + sll.d t8, t8, t1 + clz.d t1, t8 + srai.w t1, t1, 2 + sub.w t0, t0, t1 // Index of the first non-zero element starting from the highest bit + bge t0, zero, .LOOP_COEFF_LEVEL_RUN8_LSX +.END_COEFF_LEVEL_RUN8_LSX: + st.w t4, a1, 4 + move a0, t5 +endfunc_x264 diff --git a/common/loongarch/quant.h b/common/loongarch/quant.h new file mode 100644 index 000000000..cc3a53999 --- /dev/null +++ b/common/loongarch/quant.h @@ -0,0 +1,96 @@ +/***************************************************************************** + * quant.h: loongarch quantization and level-run + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Shiyou Yin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. 
+ *****************************************************************************/
+
+#ifndef X264_LOONGARCH_QUANT_H
+#define X264_LOONGARCH_QUANT_H
+
+#define x264_coeff_last64_lsx x264_template(coeff_last64_lsx)
+int32_t x264_coeff_last64_lsx( int16_t *p_src );
+#define x264_coeff_last16_lsx x264_template(coeff_last16_lsx)
+int32_t x264_coeff_last16_lsx( int16_t *p_src );
+#define x264_coeff_last15_lsx x264_template(coeff_last15_lsx)
+int32_t x264_coeff_last15_lsx( int16_t *p_src );
+#define x264_coeff_last8_lsx x264_template(coeff_last8_lsx)
+int32_t x264_coeff_last8_lsx( int16_t *p_src );
+#define x264_coeff_last4_lsx x264_template(coeff_last4_lsx)
+int32_t x264_coeff_last4_lsx( int16_t *p_src );
+
+#define x264_quant_4x4_lsx x264_template(quant_4x4_lsx)
+int32_t x264_quant_4x4_lsx( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
+#define x264_quant_4x4x4_lsx x264_template(quant_4x4x4_lsx)
+int32_t x264_quant_4x4x4_lsx( int16_t p_dct[4][16],
+                              uint16_t pu_mf[16], uint16_t pu_bias[16] );
+#define x264_quant_8x8_lsx x264_template(quant_8x8_lsx)
+int32_t x264_quant_8x8_lsx( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
+#define x264_quant_4x4_dc_lsx x264_template(quant_4x4_dc_lsx)
+int32_t x264_quant_4x4_dc_lsx( dctcoef dct[16], int32_t mf, int32_t bias );
+#define x264_quant_2x2_dc_lsx x264_template(quant_2x2_dc_lsx)
+int32_t x264_quant_2x2_dc_lsx( dctcoef dct[4], int32_t mf, int32_t bias );
+
+#define x264_dequant_4x4_lsx x264_template(dequant_4x4_lsx)
+void x264_dequant_4x4_lsx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+#define x264_dequant_8x8_lsx x264_template(dequant_8x8_lsx)
+void x264_dequant_8x8_lsx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
+#define x264_dequant_4x4_dc_lsx x264_template(dequant_4x4_dc_lsx)
+void x264_dequant_4x4_dc_lsx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+
+#define x264_decimate_score15_lsx x264_template(decimate_score15_lsx)
+int x264_decimate_score15_lsx( dctcoef *dct );
+#define x264_decimate_score16_lsx x264_template(decimate_score16_lsx)
+int x264_decimate_score16_lsx( dctcoef *dct );
+#define x264_decimate_score64_lsx x264_template(decimate_score64_lsx)
+int x264_decimate_score64_lsx( dctcoef *dct );
+
+#define x264_coeff_last64_lasx x264_template(coeff_last64_lasx)
+int32_t x264_coeff_last64_lasx( int16_t *p_src );
+#define x264_coeff_last16_lasx x264_template(coeff_last16_lasx)
+int32_t x264_coeff_last16_lasx( int16_t *p_src );
+#define x264_coeff_last15_lasx x264_template(coeff_last15_lasx)
+int32_t x264_coeff_last15_lasx( int16_t *p_src );
+
+#define x264_quant_4x4x4_lasx x264_template(quant_4x4x4_lasx)
+int32_t x264_quant_4x4x4_lasx( int16_t p_dct[4][16],
+                               uint16_t pu_mf[16], uint16_t pu_bias[16] );
+
+#define x264_dequant_4x4_lasx x264_template(dequant_4x4_lasx)
+void x264_dequant_4x4_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+#define x264_dequant_8x8_lasx x264_template(dequant_8x8_lasx)
+void x264_dequant_8x8_lasx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
+#define x264_dequant_4x4_dc_lasx x264_template(dequant_4x4_dc_lasx)
+void x264_dequant_4x4_dc_lasx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
+
+#define x264_coeff_level_run16_lasx x264_template(coeff_level_run16_lasx)
+int x264_coeff_level_run16_lasx( dctcoef *, x264_run_level_t * );
+#define x264_coeff_level_run15_lasx x264_template(coeff_level_run15_lasx)
+int x264_coeff_level_run15_lasx( dctcoef *, x264_run_level_t * );
+
+#define x264_coeff_level_run16_lsx x264_template(coeff_level_run16_lsx)
+int
x264_coeff_level_run16_lsx( dctcoef *, x264_run_level_t * ); +#define x264_coeff_level_run15_lsx x264_template(coeff_level_run15_lsx) +int x264_coeff_level_run15_lsx( dctcoef *, x264_run_level_t * ); +#define x264_coeff_level_run8_lsx x264_template(coeff_level_run8_lsx) +int x264_coeff_level_run8_lsx( dctcoef *, x264_run_level_t * ); + +#endif/* X264_LOONGARCH_QUANT_H */ diff --git a/common/loongarch/sad-a.S b/common/loongarch/sad-a.S new file mode 100644 index 000000000..1612f3c0c --- /dev/null +++ b/common/loongarch/sad-a.S @@ -0,0 +1,2585 @@ +/***************************************************************************** + * sad-a.S: loongarch sad functions + ***************************************************************************** + * Copyright (C) 2023-2024 x264 project + * + * Authors: Lu Wang + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "loongson_asm.S" +#include "loongson_util.S" + +#if !HIGH_BIT_DEPTH + + +/* void x264_pixel_sad_x4_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_16x16_lasx + slli.d t1, a5, 1 + add.d t2, a5, t1 + slli.d t3, a5, 2 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 0 + xvld xr16, a0, 32 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr12, xr8, xr8 + xvhaddw.hu.bu xr13, xr9, xr9 + xvhaddw.hu.bu xr14, xr10, xr10 + xvhaddw.hu.bu xr15, xr11, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr16, xr4 + xvabsd.bu xr9, xr16, xr5 + xvabsd.bu xr10, xr16, xr6 + xvabsd.bu xr11, xr16, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + 
add.d a4, a4, t3 + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 64 + xvld xr16, a0, 96 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr16, xr4 + xvabsd.bu xr9, xr16, xr5 + xvabsd.bu xr10, xr16, xr6 + xvabsd.bu xr11, xr16, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + add.d a4, a4, t3 + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 128 + xvld xr16, a0, 160 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr16, xr4 + xvabsd.bu xr9, xr16, xr5 + xvabsd.bu xr10, xr16, xr6 + xvabsd.bu xr11, xr16, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + add.d a4, a4, t3 + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 192 + xvld xr16, a0, 224 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + 
xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr16, xr4 + xvabsd.bu xr9, xr16, xr5 + xvabsd.bu xr10, xr16, xr6 + xvabsd.bu xr11, xr16, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + xvori.b xr17, xr12, 0 + xvori.b xr18, xr13, 0 + xvpermi.q xr12, xr14, 0x02 + xvpermi.q xr14, xr17, 0x31 + xvpermi.q xr13, xr15, 0x02 + xvpermi.q xr15, xr18, 0x31 + xvadd.h xr12, xr12, xr14 + xvadd.h xr13, xr13, xr15 + xvhaddw.w.h xr12, xr12, xr12 + xvhaddw.w.h xr13, xr13, xr13 + xvhaddw.d.w xr12, xr12, xr12 + xvhaddw.d.w xr13, xr13, xr13 + xvhaddw.q.d xr12, xr12, xr12 + xvhaddw.q.d xr13, xr13, xr13 + xvpackev.w xr13, xr13, xr12 + // Store data to p_sad_array + xvstelm.d xr13, a6, 0, 0 + xvstelm.d xr13, a6, 8, 2 +endfunc_x264 + +/* void x264_pixel_sad_x4_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_16x8_lasx + slli.d t1, a5, 1 + add.d t2, a5, t1 + slli.d t3, a5, 2 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 0 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr12, xr8, xr8 + xvhaddw.hu.bu xr13, xr9, xr9 + xvhaddw.hu.bu xr14, xr10, xr10 + xvhaddw.hu.bu xr15, xr11, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 32 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + add.d a4, a4, t3 + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 64 + vld vr4, a1, 0 + vldx vr8, a1, a5 + vld vr5, a2, 0 + vldx vr9, a2, a5 + vld vr6, a3, 0 + vldx vr10, a3, a5 + vld vr7, a4, 0 + vldx vr11, a4, a5 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, 
xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + xvld xr3, a0, 96 + vldx vr4, a1, t1 + vldx vr8, a1, t2 + vldx vr5, a2, t1 + vldx vr9, a2, t2 + vldx vr6, a3, t1 + vldx vr10, a3, t2 + vldx vr7, a4, t1 + vldx vr11, a4, t2 + xvpermi.q xr4, xr8, 0x02 + xvpermi.q xr5, xr9, 0x02 + xvpermi.q xr6, xr10, 0x02 + xvpermi.q xr7, xr11, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvadd.h xr12, xr12, xr8 + xvadd.h xr13, xr13, xr9 + xvadd.h xr14, xr14, xr10 + xvadd.h xr15, xr15, xr11 + + xvori.b xr17, xr12, 0 + xvori.b xr18, xr13, 0 + xvpermi.q xr12, xr14, 0x02 + xvpermi.q xr14, xr17, 0x31 + xvpermi.q xr13, xr15, 0x02 + xvpermi.q xr15, xr18, 0x31 + xvadd.h xr12, xr12, xr14 + xvadd.h xr13, xr13, xr15 + xvhaddw.w.h xr12, xr12, xr12 + xvhaddw.w.h xr13, xr13, xr13 + xvhaddw.d.w xr12, xr12, xr12 + xvhaddw.d.w xr13, xr13, xr13 + xvhaddw.q.d xr12, xr12, xr12 + xvhaddw.q.d xr13, xr13, xr13 + xvpackev.w xr13, xr13, xr12 + // Store data to p_sad_array + xvstelm.d xr13, a6, 0, 0 + xvstelm.d xr13, a6, 8, 2 +endfunc_x264 + +/* void x264_pixel_sad_x4_8x8_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_8x8_lasx + slli.d t1, a5, 1 + add.d t2, t1, a5 + slli.d t3, a5, 2 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21 + vilvl.d vr4, vr5, vr4 + vilvl.d vr6, vr7, vr6 + vilvl.d vr8, vr9, vr8 + vilvl.d vr10, vr11, vr10 + vilvl.d vr14, vr15, vr14 + vilvl.d vr16, vr17, vr16 + vilvl.d vr18, vr19, vr18 + vilvl.d vr20, vr21, vr20 + xvpermi.q xr4, xr6, 0x02 + xvpermi.q xr8, xr10, 0x02 + xvpermi.q xr14, xr16, 0x02 + xvpermi.q xr18, xr20, 0x02 + // Calculate the absolute value of the difference + xvldrepl.d xr3, a0, 0 + xvabsd.bu xr5, xr3, xr4 + xvldrepl.d xr3, a0, 16 + xvabsd.bu xr9, xr3, xr8 + xvldrepl.d xr3, a0, 32 + xvabsd.bu xr10, xr3, xr14 + xvldrepl.d xr3, a0, 48 + xvabsd.bu xr11, xr3, xr18 + xvaddwev.h.bu xr0, xr5, xr9 + xvaddwod.h.bu xr1, xr5, xr9 + xvaddwev.h.bu xr2, xr10, xr11 + xvaddwod.h.bu xr22, xr10, xr11 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + add.d a4, a4, t3 + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21 + vilvl.d vr4, vr5, vr4 + vilvl.d vr6, vr7, vr6 + vilvl.d vr8, vr9, vr8 + vilvl.d vr10, vr11, vr10 + vilvl.d vr14, vr15, vr14 + vilvl.d vr16, vr17, vr16 + vilvl.d vr18, vr19, vr18 + vilvl.d vr20, vr21, vr20 + xvpermi.q xr4, xr6, 0x02 + xvpermi.q xr8, 
xr10, 0x02 + xvpermi.q xr14, xr16, 0x02 + xvpermi.q xr18, xr20, 0x02 + // Calculate the absolute value of the difference + xvldrepl.d xr3, a0, 64 + xvabsd.bu xr5, xr3, xr4 + xvldrepl.d xr3, a0, 80 + xvabsd.bu xr9, xr3, xr8 + xvldrepl.d xr3, a0, 96 + xvabsd.bu xr10, xr3, xr14 + xvldrepl.d xr3, a0, 112 + xvabsd.bu xr11, xr3, xr18 + xvaddwev.h.bu xr12, xr5, xr9 + xvaddwod.h.bu xr13, xr5, xr9 + xvaddwev.h.bu xr14, xr10, xr11 + xvaddwod.h.bu xr15, xr10, xr11 + xvadd.h xr5, xr0, xr12 + xvadd.h xr9, xr1, xr13 + xvadd.h xr10, xr2, xr14 + xvadd.h xr11, xr22, xr15 + xvadd.h xr5, xr5, xr9 + xvadd.h xr10, xr10, xr11 + xvadd.h xr10, xr10, xr5 + xvhaddw.wu.hu xr10, xr10, xr10 + xvhaddw.du.wu xr10, xr10, xr10 + xvpermi.q xr5, xr10, 0x01 + xvpickev.w xr10, xr5, xr10 + // Store data to p_sad_array + vst vr10, a6, 0 +endfunc_x264 + +/* void x264_pixel_sad_x4_8x4_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_8x4_lasx + slli.d t1, a5, 1 + add.d t2, t1, a5 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + fld.d f2, a0, 0 + fld.d f3, a0, 16 + fld.d f12, a0, 32 + fld.d f13, a0, 48 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f14, f18 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f15, f19 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f16, f20 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f17, f21 + + vilvl.d vr3, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr7, vr11, vr7 + vilvl.d vr13, vr13, vr12 + vilvl.d vr14, vr18, vr14 + vilvl.d vr15, vr19, vr15 + vilvl.d vr16, vr20, vr16 + vilvl.d vr17, vr21, vr17 + xvpermi.q xr3, xr13, 0x02 + xvpermi.q xr4, xr16, 0x02 + xvpermi.q xr5, xr17, 0x02 + xvpermi.q xr6, xr14, 0x02 + xvpermi.q xr7, xr15, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr8, xr3, xr4 + xvabsd.bu xr9, xr3, xr5 + xvabsd.bu xr10, xr3, xr6 + xvabsd.bu xr11, xr3, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvpermi.d xr10, xr10, 0x4e + xvpermi.d xr11, xr11, 0x4e + xvadd.h xr8, xr8, xr10 + xvadd.h xr9, xr9, xr11 + xvhaddw.w.h xr8, xr8, xr8 + xvhaddw.w.h xr9, xr9, xr9 + xvhaddw.d.w xr8, xr8, xr8 + xvhaddw.d.w xr9, xr9, xr9 + xvhaddw.q.d xr8, xr8, xr8 + xvhaddw.q.d xr9, xr9, xr9 + xvpackev.w xr9, xr9, xr8 + + // Store data to p_sad_array + xvstelm.d xr9, a6, 0, 0 + xvstelm.d xr9, a6, 8, 2 +endfunc_x264 + +/* void x264_pixel_sad_x4_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_4x4_lsx + slli.d t0, a5, 1 + add.d t1, a5, t0 + slli.d t2, a5, 2 + + // Load data from p_src, p_ref0, p_ref1, p_ref2 and p_ref3 + fld.s f2, a0, 0 + fld.s f3, a0, 16 + fld.s f4, a1, 0 + fldx.s f8, a1, a5 + fld.s f5, a2, 0 + fldx.s f9, a2, a5 + fld.s f6, a3, 0 + fldx.s f10, a3, a5 + fld.s f7, a4, 0 + fldx.s f11, a4, a5 + vilvl.w vr3, vr3, vr2 + vilvl.w vr4, vr8, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr7, vr11, vr7 + + fld.s f2, a0, 32 + fld.s f0, a0, 48 + fldx.s f8, a1, t0 + fldx.s f12, a1, t1 + fldx.s f9, a2, t0 + fldx.s f13, a2, t1 + fldx.s f10, a3, t0 + fldx.s f14, a3, t1 + fldx.s f11, a4, t0 + fldx.s f15, a4, t1 + vilvl.w vr2, vr0, vr2 + vilvl.w vr8, vr12, vr8 + vilvl.w vr9, vr13, vr9 + vilvl.w vr10, vr14, vr10 + vilvl.w vr11, vr15, vr11 + vilvl.d vr3, vr2, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, 
vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr7, vr11, vr7 + + // Calculate the absolute value of the difference + vabsd.bu vr8, vr3, vr4 + vabsd.bu vr9, vr3, vr5 + vabsd.bu vr10, vr3, vr6 + vabsd.bu vr11, vr3, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.wu.hu vr10, vr10, vr10 + vhaddw.wu.hu vr11, vr11, vr11 + vhaddw.du.wu vr8, vr8, vr8 + vhaddw.du.wu vr9, vr9, vr9 + vhaddw.du.wu vr10, vr10, vr10 + vhaddw.du.wu vr11, vr11, vr11 + vhaddw.qu.du vr8, vr8, vr8 + vhaddw.qu.du vr9, vr9, vr9 + vhaddw.qu.du vr10, vr10, vr10 + vhaddw.qu.du vr11, vr11, vr11 + + // Store data to p_sad_array + vstelm.w vr8, a6, 0, 0 + vstelm.w vr9, a6, 4, 0 + vstelm.w vr10, a6, 8, 0 + vstelm.w vr11, a6, 12, 0 +endfunc_x264 + +/* void x264_pixel_sad_x3_16x16_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_16x16_lasx + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + slli.d t1, a4, 1 + add.d t2, a4, t1 + slli.d t3, a4, 2 + + xvld xr2, a0, 0 + xvld xr3, a0, 32 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr16, xr7, xr7 + xvhaddw.hu.bu xr17, xr8, xr8 + xvhaddw.hu.bu xr18, xr9, xr9 + xvhaddw.hu.bu xr19, xr10, xr10 + xvhaddw.hu.bu xr20, xr11, xr11 + xvhaddw.hu.bu xr21, xr12, xr12 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + xvld xr2, a0, 64 + xvld xr3, a0, 96 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr7, xr7, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvhaddw.hu.bu xr12, xr12, xr12 + xvadd.h xr16, xr16, xr7 + xvadd.h xr17, xr17, xr8 + xvadd.h xr18, xr18, xr9 + xvadd.h xr19, xr19, xr10 + xvadd.h xr20, xr20, xr11 + xvadd.h xr21, xr21, xr12 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + xvld xr2, a0, 128 + xvld xr3, a0, 160 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + 
xvhaddw.hu.bu xr7, xr7, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvhaddw.hu.bu xr12, xr12, xr12 + xvadd.h xr16, xr16, xr7 + xvadd.h xr17, xr17, xr8 + xvadd.h xr18, xr18, xr9 + xvadd.h xr19, xr19, xr10 + xvadd.h xr20, xr20, xr11 + xvadd.h xr21, xr21, xr12 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + xvld xr2, a0, 192 + xvld xr3, a0, 224 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr7, xr7, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvhaddw.hu.bu xr12, xr12, xr12 + xvadd.h xr16, xr16, xr7 + xvadd.h xr17, xr17, xr8 + xvadd.h xr18, xr18, xr9 + xvadd.h xr19, xr19, xr10 + xvadd.h xr20, xr20, xr11 + xvadd.h xr21, xr21, xr12 + xvadd.h xr11, xr16, xr19 + xvadd.h xr12, xr17, xr20 + xvadd.h xr13, xr18, xr21 + + xvhaddw.wu.hu xr11, xr11, xr11 + xvhaddw.wu.hu xr12, xr12, xr12 + xvhaddw.wu.hu xr13, xr13, xr13 + xvhaddw.du.wu xr11, xr11, xr11 + xvhaddw.du.wu xr12, xr12, xr12 + xvhaddw.du.wu xr13, xr13, xr13 + xvhaddw.qu.du xr11, xr11, xr11 + xvhaddw.qu.du xr12, xr12, xr12 + xvhaddw.qu.du xr13, xr13, xr13 + xvpickve.w xr17, xr11, 4 + xvpickve.w xr18, xr12, 4 + xvpickve.w xr19, xr13, 4 + xvadd.w xr11, xr11, xr17 + xvadd.w xr12, xr12, xr18 + xvadd.w xr13, xr13, xr19 + + // Store data to p_sad_array + vstelm.w vr11, a5, 0, 0 + vstelm.w vr12, a5, 4, 0 + vstelm.w vr13, a5, 8, 0 +endfunc_x264 + +/* void x264_pixel_sad_x3_16x8_lasx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_16x8_lasx + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + slli.d t1, a4, 1 + add.d t2, a4, t1 + slli.d t3, a4, 2 + + xvld xr2, a0, 0 + xvld xr3, a0, 32 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + + // Calculate the absolute value of the difference + xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr16, xr7, xr7 + xvhaddw.hu.bu xr17, xr8, xr8 + xvhaddw.hu.bu xr18, xr9, xr9 + xvhaddw.hu.bu xr19, xr10, xr10 + xvhaddw.hu.bu xr20, xr11, xr11 + xvhaddw.hu.bu xr21, xr12, xr12 + + add.d a1, a1, t3 + add.d a2, a2, t3 + add.d a3, a3, t3 + xvld xr2, a0, 64 + xvld xr3, a0, 96 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + xvpermi.q xr4, xr7, 0x02 + xvpermi.q xr5, xr8, 0x02 + xvpermi.q xr6, xr9, 0x02 + xvpermi.q xr10, xr13, 0x02 + xvpermi.q xr11, xr14, 0x02 + xvpermi.q xr12, xr15, 0x02 + + // Calculate the absolute value of the difference + 
xvabsd.bu xr7, xr2, xr4 + xvabsd.bu xr8, xr2, xr5 + xvabsd.bu xr9, xr2, xr6 + xvabsd.bu xr10, xr3, xr10 + xvabsd.bu xr11, xr3, xr11 + xvabsd.bu xr12, xr3, xr12 + xvhaddw.hu.bu xr7, xr7, xr7 + xvhaddw.hu.bu xr8, xr8, xr8 + xvhaddw.hu.bu xr9, xr9, xr9 + xvhaddw.hu.bu xr10, xr10, xr10 + xvhaddw.hu.bu xr11, xr11, xr11 + xvhaddw.hu.bu xr12, xr12, xr12 + xvadd.h xr16, xr16, xr7 + xvadd.h xr17, xr17, xr8 + xvadd.h xr18, xr18, xr9 + xvadd.h xr19, xr19, xr10 + xvadd.h xr20, xr20, xr11 + xvadd.h xr21, xr21, xr12 + xvadd.h xr11, xr16, xr19 + xvadd.h xr12, xr17, xr20 + xvadd.h xr13, xr18, xr21 + + xvhaddw.wu.hu xr11, xr11, xr11 + xvhaddw.wu.hu xr12, xr12, xr12 + xvhaddw.wu.hu xr13, xr13, xr13 + xvhaddw.du.wu xr11, xr11, xr11 + xvhaddw.du.wu xr12, xr12, xr12 + xvhaddw.du.wu xr13, xr13, xr13 + xvhaddw.qu.du xr11, xr11, xr11 + xvhaddw.qu.du xr12, xr12, xr12 + xvhaddw.qu.du xr13, xr13, xr13 + xvpickve.w xr17, xr11, 4 + xvpickve.w xr18, xr12, 4 + xvpickve.w xr19, xr13, 4 + xvadd.w xr11, xr11, xr17 + xvadd.w xr12, xr12, xr18 + xvadd.w xr13, xr13, xr19 + + // Store data to p_sad_array + vstelm.w vr11, a5, 0, 0 + vstelm.w vr12, a5, 4, 0 + vstelm.w vr13, a5, 8, 0 +endfunc_x264 + +/* void x264_pixel_sad_x3_4x4_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_4x4_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.s f3, a0, 0 + fld.s f7, a0, 16 + fld.s f11, a0, 32 + fld.s f15, a0, 48 + FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + + vilvl.w vr3, vr7, vr3 + vilvl.w vr4, vr8, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr11, vr15, vr11 + vilvl.w vr12, vr16, vr12 + vilvl.w vr13, vr17, vr13 + vilvl.w vr14, vr18, vr14 + vilvl.d vr3, vr11, vr3 + vilvl.d vr4, vr12, vr4 + vilvl.d vr5, vr13, vr5 + vilvl.d vr6, vr14, vr6 + + // Calculate the absolute value of the difference + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.du.wu vr7, vr7, vr7 + vhaddw.du.wu vr8, vr8, vr8 + vhaddw.du.wu vr9, vr9, vr9 + vhaddw.qu.du vr7, vr7, vr7 + vhaddw.qu.du vr8, vr8, vr8 + vhaddw.qu.du vr9, vr9, vr9 + + // Store data to p_sad_array + vstelm.w vr7, a5, 0, 0 + vstelm.w vr8, a5, 4, 0 + vstelm.w vr9, a5, 8, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_8x4_lasx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_8x4_lasx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + // Load data from p_src and p_ref + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr4, vr6, vr4 + vilvl.d vr7, vr9, vr7 + vilvl.d vr8, vr10, vr8 + xvpermi.q xr3, xr7, 0x02 + xvpermi.q xr4, xr8, 0x02 + // Calculate the absolute value of the difference + xvabsd.bu xr5, xr3, xr4 + xvhaddw.hu.bu xr6, xr5, xr5 + xvhaddw.wu.hu xr6, xr6, xr6 + xvhaddw.du.wu xr6, xr6, xr6 + xvhaddw.qu.du xr6, xr6, xr6 + + xvpickve2gr.wu t2, xr6, 0 + xvpickve2gr.wu t3, xr6, 4 + add.d a0, t2, t3 +endfunc_x264 + +/* int32_t x264_pixel_sad_4x4_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 
pixel_sad_4x4_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + // Load data from p_src and p_ref + FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.w vr3, vr5, vr3 + vilvl.w vr4, vr6, vr4 + vilvl.w vr7, vr9, vr7 + vilvl.w vr8, vr10, vr8 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + + // Calculate the absolute value of the difference + vabsd.bu vr5, vr3, vr4 + vhaddw.hu.bu vr6, vr5, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.du.wu vr6, vr6, vr6 + vhaddw.qu.du vr6, vr6, vr6 + vpickve2gr.wu a0, vr6, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_4x8_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_4x8_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + // Load data from p_src and p_ref + FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.w vr3, vr5, vr3 + vilvl.w vr4, vr6, vr4 + vilvl.w vr7, vr9, vr7 + vilvl.w vr8, vr10, vr8 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vabsd.bu vr11, vr3, vr4 + vhaddw.hu.bu vr11, vr11, vr11 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.w vr3, vr5, vr3 + vilvl.w vr4, vr6, vr4 + vilvl.w vr7, vr9, vr7 + vilvl.w vr8, vr10, vr8 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vabsd.bu vr5, vr3, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + + vadd.h vr6, vr11, vr5 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.du.wu vr6, vr6, vr6 + vhaddw.qu.du vr6, vr6, vr6 + vpickve2gr.wu a0, vr6, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_4x16_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_4x16_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + // Load data from p_src and p_ref + FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.w vr3, vr5, vr3 + vilvl.w vr4, vr6, vr4 + vilvl.w vr7, vr9, vr7 + vilvl.w vr8, vr10, vr8 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vabsd.bu vr11, vr3, vr4 + vhaddw.hu.bu vr11, vr11, vr11 + +.rept 3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDS_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDS_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.w vr3, vr5, vr3 + vilvl.w vr4, vr6, vr4 + vilvl.w vr7, vr9, vr7 + vilvl.w vr8, vr10, vr8 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vabsd.bu vr12, vr3, vr4 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr11, vr11, vr12 +.endr + + vhaddw.wu.hu vr11, vr11, vr11 + vhaddw.du.wu vr11, vr11, vr11 + vhaddw.qu.du vr11, vr11, vr11 + vpickve2gr.wu a0, vr11, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_8x4_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_8x4_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr7, vr9, vr7 + vilvl.d vr4, vr6, vr4 + vilvl.d vr8, vr10, vr8 + vabsd.bu vr11, vr3, vr4 + vabsd.bu vr12, vr7, vr8 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr6, vr11, vr12 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.du.wu vr6, vr6, vr6 + vhaddw.qu.du vr6, vr6, vr6 + vpickve2gr.wu a0, vr6, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_8x8_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, 
intptr_t i_ref_stride) + */ +function_x264 pixel_sad_8x8_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr7, vr9, vr7 + vilvl.d vr4, vr6, vr4 + vilvl.d vr8, vr10, vr8 + vabsd.bu vr11, vr3, vr4 + vabsd.bu vr12, vr7, vr8 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr13, vr11, vr12 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr7, vr9, vr7 + vilvl.d vr4, vr6, vr4 + vilvl.d vr8, vr10, vr8 + vabsd.bu vr11, vr3, vr4 + vabsd.bu vr12, vr7, vr8 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr6, vr11, vr12 + vadd.h vr6, vr6, vr13 + vhaddw.wu.hu vr6, vr6, vr6 + vhaddw.du.wu vr6, vr6, vr6 + vhaddw.qu.du vr6, vr6, vr6 + vpickve2gr.wu a0, vr6, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_8x16_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_8x16_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr7, vr9, vr7 + vilvl.d vr4, vr6, vr4 + vilvl.d vr8, vr10, vr8 + vabsd.bu vr11, vr3, vr4 + vabsd.bu vr12, vr7, vr8 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr13, vr11, vr12 + +.rept 3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + FLDD_LOADX_4 a0, a1, t1, t3, f3, f5, f7, f9 + FLDD_LOADX_4 a2, a3, t2, t4, f4, f6, f8, f10 + vilvl.d vr3, vr5, vr3 + vilvl.d vr7, vr9, vr7 + vilvl.d vr4, vr6, vr4 + vilvl.d vr8, vr10, vr8 + vabsd.bu vr11, vr3, vr4 + vabsd.bu vr12, vr7, vr8 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vadd.h vr14, vr11, vr12 + vadd.h vr13, vr13, vr14 +.endr + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.du.wu vr13, vr13, vr13 + vhaddw.qu.du vr13, vr13, vr13 + vpickve2gr.wu a0, vr13, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_16x8_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_16x8_lsx + slli.d t1, a1, 1 + slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 + vabsd.bu vr8, vr0, vr4 + vabsd.bu vr9, vr1, vr5 + vabsd.bu vr10, vr2, vr6 + vabsd.bu vr11, vr3, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vadd.h vr8, vr8, vr9 + vadd.h vr9, vr10, vr11 + vadd.h vr14, vr8, vr9 + + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 + vabsd.bu vr8, vr0, vr4 + vabsd.bu vr9, vr1, vr5 + vabsd.bu vr10, vr2, vr6 + vabsd.bu vr11, vr3, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vadd.h vr8, vr8, vr9 + vadd.h vr9, vr10, vr11 + vadd.h vr12, vr8, vr9 + + vadd.h vr13, vr12, vr14 + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.du.wu vr13, vr13, vr13 + vhaddw.qu.du vr13, vr13, vr13 + vpickve2gr.wu a0, vr13, 0 +endfunc_x264 + +/* int32_t x264_pixel_sad_16x16_lsx(uint8_t *p_src, intptr_t i_src_stride, + * uint8_t *p_ref, intptr_t i_ref_stride) + */ +function_x264 pixel_sad_16x16_lsx + slli.d t1, a1, 1 + 
slli.d t2, a3, 1 + add.d t3, a1, t1 + add.d t4, a3, t2 + + LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 + vabsd.bu vr8, vr0, vr4 + vabsd.bu vr9, vr1, vr5 + vabsd.bu vr10, vr2, vr6 + vabsd.bu vr11, vr3, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vadd.h vr8, vr8, vr9 + vadd.h vr9, vr10, vr11 + vadd.h vr13, vr8, vr9 + +.rept 3 + alsl.d a0, a1, a0, 2 + alsl.d a2, a3, a2, 2 + LSX_LOADX_4 a0, a1, t1, t3, vr0, vr1, vr2, vr3 + LSX_LOADX_4 a2, a3, t2, t4, vr4, vr5, vr6, vr7 + vabsd.bu vr8, vr0, vr4 + vabsd.bu vr9, vr1, vr5 + vabsd.bu vr10, vr2, vr6 + vabsd.bu vr11, vr3, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vadd.h vr8, vr8, vr9 + vadd.h vr9, vr10, vr11 + vadd.h vr12, vr8, vr9 + vadd.h vr13, vr12, vr13 +.endr + + vhaddw.wu.hu vr13, vr13, vr13 + vhaddw.du.wu vr13, vr13, vr13 + vhaddw.qu.du vr13, vr13, vr13 + vpickve2gr.wu a0, vr13, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_4x8_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.s f3, a0, 0 + fld.s f7, a0, 16 + fld.s f11, a0, 32 + fld.s f15, a0, 48 + FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.w vr3, vr7, vr3 + vilvl.w vr4, vr8, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr11, vr15, vr11 + vilvl.w vr12, vr16, vr12 + vilvl.w vr13, vr17, vr13 + vilvl.w vr14, vr18, vr14 + vilvl.d vr3, vr11, vr3 + vilvl.d vr4, vr12, vr4 + vilvl.d vr5, vr13, vr5 + vilvl.d vr6, vr14, vr6 + vabsd.bu vr0, vr3, vr4 + vabsd.bu vr1, vr3, vr5 + vabsd.bu vr2, vr3, vr6 + + alsl.d a1, a4, a1, 2 + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + fld.s f3, a0, 64 + fld.s f7, a0, 80 + fld.s f11, a0, 96 + fld.s f15, a0, 112 + FLDS_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDS_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDS_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.w vr3, vr7, vr3 + vilvl.w vr4, vr8, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr11, vr15, vr11 + vilvl.w vr12, vr16, vr12 + vilvl.w vr13, vr17, vr13 + vilvl.w vr14, vr18, vr14 + vilvl.d vr3, vr11, vr3 + vilvl.d vr4, vr12, vr4 + vilvl.d vr5, vr13, vr5 + vilvl.d vr6, vr14, vr6 + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + + vhaddw.hu.bu vr0, vr0, vr0 + vhaddw.hu.bu vr1, vr1, vr1 + vhaddw.hu.bu vr2, vr2, vr2 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vadd.h vr7, vr7, vr0 + vadd.h vr8, vr8, vr1 + vadd.h vr9, vr9, vr2 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.du.wu vr7, vr7, vr7 + vhaddw.du.wu vr8, vr8, vr8 + vhaddw.du.wu vr9, vr9, vr9 + vhaddw.qu.du vr7, vr7, vr7 + vhaddw.qu.du vr8, vr8, vr8 + vhaddw.qu.du vr9, vr9, vr9 + + // Store data to p_sad_array + vstelm.w vr7, a5, 0, 0 + vstelm.w vr8, a5, 4, 0 + vstelm.w vr9, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_8x4_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + // Load data from 
p_src, p_ref0, p_ref1 and p_ref2 + fld.d f3, a0, 0 + fld.d f7, a0, 16 + fld.d f11, a0, 32 + fld.d f15, a0, 48 + FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr11, vr15, vr11 + vilvl.d vr12, vr16, vr12 + vilvl.d vr13, vr17, vr13 + vilvl.d vr14, vr18, vr14 + vabsd.bu vr0, vr3, vr4 + vabsd.bu vr1, vr3, vr5 + vabsd.bu vr2, vr3, vr6 + vabsd.bu vr3, vr11, vr12 + vabsd.bu vr4, vr11, vr13 + vabsd.bu vr5, vr11, vr14 + vhaddw.hu.bu vr0, vr0, vr0 + vhaddw.hu.bu vr1, vr1, vr1 + vhaddw.hu.bu vr2, vr2, vr2 + vhaddw.hu.bu vr3, vr3, vr3 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vadd.h vr7, vr0, vr3 + vadd.h vr8, vr1, vr4 + vadd.h vr9, vr2, vr5 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.du.wu vr7, vr7, vr7 + vhaddw.du.wu vr8, vr8, vr8 + vhaddw.du.wu vr9, vr9, vr9 + vhaddw.qu.du vr7, vr7, vr7 + vhaddw.qu.du vr8, vr8, vr8 + vhaddw.qu.du vr9, vr9, vr9 + + // Store data to p_sad_array + vstelm.w vr7, a5, 0, 0 + vstelm.w vr8, a5, 4, 0 + vstelm.w vr9, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_8x8_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f3, a0, 0 + fld.d f7, a0, 16 + fld.d f11, a0, 32 + fld.d f15, a0, 48 + FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr11, vr15, vr11 + vilvl.d vr12, vr16, vr12 + vilvl.d vr13, vr17, vr13 + vilvl.d vr14, vr18, vr14 + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + vabsd.bu vr10, vr11, vr12 + vabsd.bu vr15, vr11, vr13 + vabsd.bu vr16, vr11, vr14 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vadd.h vr0, vr7, vr10 + vadd.h vr1, vr8, vr15 + vadd.h vr2, vr9, vr16 + + alsl.d a1, a4, a1, 2 + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + fld.d f3, a0, 64 + fld.d f7, a0, 80 + fld.d f11, a0, 96 + fld.d f15, a0, 112 + FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr11, vr15, vr11 + vilvl.d vr12, vr16, vr12 + vilvl.d vr13, vr17, vr13 + vilvl.d vr14, vr18, vr14 + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + vabsd.bu vr10, vr11, vr12 + vabsd.bu vr15, vr11, vr13 + vabsd.bu vr16, vr11, vr14 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vadd.h vr7, vr7, vr10 + vadd.h vr8, vr8, vr15 + vadd.h vr9, vr9, vr16 + + vadd.h vr7, vr7, vr0 + vadd.h vr8, vr8, vr1 + vadd.h vr9, vr9, vr2 + vhaddw.wu.hu vr7, vr7, vr7 + vhaddw.wu.hu vr8, vr8, vr8 + vhaddw.wu.hu vr9, vr9, vr9 + vhaddw.du.wu vr7, vr7, vr7 + vhaddw.du.wu vr8, vr8, vr8 + vhaddw.du.wu vr9, vr9, vr9 + 
vhaddw.qu.du vr7, vr7, vr7 + vhaddw.qu.du vr8, vr8, vr8 + vhaddw.qu.du vr9, vr9, vr9 + + // Store data to p_sad_array + vstelm.w vr7, a5, 0, 0 + vstelm.w vr8, a5, 4, 0 + vstelm.w vr9, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_8x16_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f3, a0, 0 + fld.d f7, a0, 16 + fld.d f11, a0, 32 + fld.d f15, a0, 48 + FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr11, vr15, vr11 + vilvl.d vr12, vr16, vr12 + vilvl.d vr13, vr17, vr13 + vilvl.d vr14, vr18, vr14 + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + vabsd.bu vr10, vr11, vr12 + vabsd.bu vr15, vr11, vr13 + vabsd.bu vr16, vr11, vr14 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vadd.h vr0, vr7, vr10 + vadd.h vr1, vr8, vr15 + vadd.h vr2, vr9, vr16 + +.rept 3 + alsl.d a1, a4, a1, 2 + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + addi.d a0, a0, 64 + fld.d f3, a0, 0 + fld.d f7, a0, 16 + fld.d f11, a0, 32 + fld.d f15, a0, 48 + FLDD_LOADX_4 a1, a4, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a4, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a4, t1, t2, f6, f10, f14, f18 + vilvl.d vr3, vr7, vr3 + vilvl.d vr4, vr8, vr4 + vilvl.d vr5, vr9, vr5 + vilvl.d vr6, vr10, vr6 + vilvl.d vr11, vr15, vr11 + vilvl.d vr12, vr16, vr12 + vilvl.d vr13, vr17, vr13 + vilvl.d vr14, vr18, vr14 + vabsd.bu vr7, vr3, vr4 + vabsd.bu vr8, vr3, vr5 + vabsd.bu vr9, vr3, vr6 + vabsd.bu vr10, vr11, vr12 + vabsd.bu vr15, vr11, vr13 + vabsd.bu vr16, vr11, vr14 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vadd.h vr7, vr7, vr10 + vadd.h vr8, vr8, vr15 + vadd.h vr9, vr9, vr16 + vadd.h vr0, vr7, vr0 + vadd.h vr1, vr8, vr1 + vadd.h vr2, vr9, vr2 +.endr + + vhaddw.wu.hu vr0, vr0, vr0 + vhaddw.wu.hu vr1, vr1, vr1 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.du.wu vr0, vr0, vr0 + vhaddw.du.wu vr1, vr1, vr1 + vhaddw.du.wu vr2, vr2, vr2 + vhaddw.qu.du vr0, vr0, vr0 + vhaddw.qu.du vr1, vr1, vr1 + vhaddw.qu.du vr2, vr2, vr2 + + // Store data to p_sad_array + vstelm.w vr0, a5, 0, 0 + vstelm.w vr1, a5, 4, 0 + vstelm.w vr2, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_16x8_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr1, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr2, vr10 + vabsd.bu vr11, vr2, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr3, vr13 + vabsd.bu vr14, vr3, vr14 + vabsd.bu vr15, vr3, vr15 + vhaddw.hu.bu 
vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr0, vr7, vr4 + vadd.h vr1, vr13, vr10 + vadd.h vr16, vr1, vr0 + vadd.h vr0, vr8, vr5 + vadd.h vr1, vr14, vr11 + vadd.h vr17, vr1, vr0 + vadd.h vr0, vr9, vr6 + vadd.h vr1, vr15, vr12 + vadd.h vr18, vr1, vr0 + + // vr16, vr17, vr18 + alsl.d a1, a4, a1, 2 + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + vld vr0, a0, 64 + vld vr1, a0, 80 + vld vr2, a0, 96 + vld vr3, a0, 112 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr1, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr2, vr10 + vabsd.bu vr11, vr2, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr3, vr13 + vabsd.bu vr14, vr3, vr14 + vabsd.bu vr15, vr3, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr0, vr7, vr4 + vadd.h vr1, vr13, vr10 + vadd.h vr2, vr1, vr0 + vadd.h vr0, vr8, vr5 + vadd.h vr1, vr14, vr11 + vadd.h vr3, vr1, vr0 + vadd.h vr0, vr9, vr6 + vadd.h vr1, vr15, vr12 + vadd.h vr4, vr1, vr0 + + vadd.h vr0, vr16, vr2 + vadd.h vr1, vr17, vr3 + vadd.h vr2, vr18, vr4 + vhaddw.wu.hu vr0, vr0, vr0 + vhaddw.wu.hu vr1, vr1, vr1 + vhaddw.wu.hu vr2, vr2, vr2 + vhaddw.du.wu vr0, vr0, vr0 + vhaddw.du.wu vr1, vr1, vr1 + vhaddw.du.wu vr2, vr2, vr2 + vhaddw.qu.du vr0, vr0, vr0 + vhaddw.qu.du vr1, vr1, vr1 + vhaddw.qu.du vr2, vr2, vr2 + + // Store data to p_sad_array + vstelm.w vr0, a5, 0, 0 + vstelm.w vr1, a5, 4, 0 + vstelm.w vr2, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x3_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * intptr_t i_ref_stride, + * int32_t p_sad_array[3]) + */ +function_x264 pixel_sad_x3_16x16_lsx + slli.d t1, a4, 1 + add.d t2, a4, t1 + + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr1, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr2, vr10 + vabsd.bu vr11, vr2, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr3, vr13 + vabsd.bu vr14, vr3, vr14 + vabsd.bu vr15, vr3, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr0, vr7, vr4 + vadd.h vr1, vr13, vr10 + vadd.h vr16, vr1, vr0 + vadd.h vr0, vr8, vr5 + vadd.h vr1, vr14, vr11 + vadd.h vr17, vr1, vr0 + vadd.h vr0, vr9, vr6 
+ vadd.h vr1, vr15, vr12 + vadd.h vr18, vr1, vr0 + +.rept 3 + alsl.d a1, a4, a1, 2 + alsl.d a2, a4, a2, 2 + alsl.d a3, a4, a3, 2 + addi.d a0, a0, 64 + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a4, t1, t2, vr4, vr7, vr10, vr13 + LSX_LOADX_4 a2, a4, t1, t2, vr5, vr8, vr11, vr14 + LSX_LOADX_4 a3, a4, t1, t2, vr6, vr9, vr12, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr1, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr2, vr10 + vabsd.bu vr11, vr2, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr3, vr13 + vabsd.bu vr14, vr3, vr14 + vabsd.bu vr15, vr3, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr0, vr7, vr4 + vadd.h vr1, vr13, vr10 + vadd.h vr2, vr1, vr0 + vadd.h vr0, vr8, vr5 + vadd.h vr1, vr14, vr11 + vadd.h vr3, vr1, vr0 + vadd.h vr0, vr9, vr6 + vadd.h vr1, vr15, vr12 + vadd.h vr4, vr1, vr0 + + vadd.h vr16, vr16, vr2 + vadd.h vr17, vr17, vr3 + vadd.h vr18, vr18, vr4 +.endr + + vhaddw.wu.hu vr16, vr16, vr16 + vhaddw.wu.hu vr17, vr17, vr17 + vhaddw.wu.hu vr18, vr18, vr18 + vhaddw.du.wu vr16, vr16, vr16 + vhaddw.du.wu vr17, vr17, vr17 + vhaddw.du.wu vr18, vr18, vr18 + vhaddw.qu.du vr16, vr16, vr16 + vhaddw.qu.du vr17, vr17, vr17 + vhaddw.qu.du vr18, vr18, vr18 + + // Store data to p_sad_array + vstelm.w vr16, a5, 0, 0 + vstelm.w vr17, a5, 4, 0 + vstelm.w vr18, a5, 8, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_4x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_4x8_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + fld.s f0, a0, 0 + fld.s f1, a0, 16 + fld.s f2, a0, 32 + fld.s f3, a0, 48 + FLDS_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDS_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDS_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDS_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + + vilvl.w vr0, vr1, vr0 + vilvl.w vr2, vr3, vr2 + vilvl.d vr0, vr2, vr0 + vilvl.w vr4, vr8, vr4 + vilvl.w vr12, vr16, vr12 + vilvl.d vr1, vr12, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr13, vr17, vr13 + vilvl.d vr2, vr13, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr14, vr18, vr14 + vilvl.d vr3, vr14, vr6 + vilvl.w vr7, vr11, vr7 + vilvl.w vr15, vr19, vr15 + vilvl.d vr4, vr15, vr7 + vabsd.bu vr1, vr0, vr1 + vabsd.bu vr2, vr0, vr2 + vabsd.bu vr3, vr0, vr3 + vabsd.bu vr4, vr0, vr4 + vhaddw.hu.bu vr20, vr1, vr1 + vhaddw.hu.bu vr21, vr2, vr2 + vhaddw.hu.bu vr22, vr3, vr3 + vhaddw.hu.bu vr23, vr4, vr4 + + alsl.d a1, a5, a1, 2 + alsl.d a2, a5, a2, 2 + alsl.d a3, a5, a3, 2 + alsl.d a4, a5, a4, 2 + fld.s f0, a0, 64 + fld.s f1, a0, 80 + fld.s f2, a0, 96 + fld.s f3, a0, 112 + FLDS_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDS_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDS_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDS_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + + vilvl.w vr0, vr1, vr0 + vilvl.w vr2, vr3, vr2 + vilvl.d vr0, vr2, vr0 + vilvl.w vr4, vr8, vr4 + vilvl.w vr12, vr16, vr12 + vilvl.d vr1, vr12, vr4 + vilvl.w vr5, vr9, vr5 + vilvl.w vr13, vr17, vr13 + vilvl.d vr2, vr13, vr5 + vilvl.w vr6, vr10, vr6 + vilvl.w vr14, vr18, vr14 + vilvl.d vr3, 
vr14, vr6 + vilvl.w vr7, vr11, vr7 + vilvl.w vr15, vr19, vr15 + vilvl.d vr4, vr15, vr7 + vabsd.bu vr1, vr0, vr1 + vabsd.bu vr2, vr0, vr2 + vabsd.bu vr3, vr0, vr3 + vabsd.bu vr4, vr0, vr4 + vhaddw.hu.bu vr1, vr1, vr1 + vhaddw.hu.bu vr2, vr2, vr2 + vhaddw.hu.bu vr3, vr3, vr3 + vhaddw.hu.bu vr4, vr4, vr4 + vadd.h vr16, vr20, vr1 + vadd.h vr17, vr21, vr2 + vadd.h vr18, vr22, vr3 + vadd.h vr19, vr23, vr4 + + vhaddw.wu.hu vr16, vr16, vr16 + vhaddw.wu.hu vr17, vr17, vr17 + vhaddw.wu.hu vr18, vr18, vr18 + vhaddw.wu.hu vr19, vr19, vr19 + vhaddw.du.wu vr16, vr16, vr16 + vhaddw.du.wu vr17, vr17, vr17 + vhaddw.du.wu vr18, vr18, vr18 + vhaddw.du.wu vr19, vr19, vr19 + vhaddw.qu.du vr16, vr16, vr16 + vhaddw.qu.du vr17, vr17, vr17 + vhaddw.qu.du vr18, vr18, vr18 + vhaddw.qu.du vr19, vr19, vr19 + + // Store data to p_sad_array + vstelm.w vr16, a6, 0, 0 + vstelm.w vr17, a6, 4, 0 + vstelm.w vr18, a6, 8, 0 + vstelm.w vr19, a6, 12, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_8x4_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_8x4_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f0, a0, 0 + fld.d f1, a0, 16 + fld.d f2, a0, 32 + fld.d f3, a0, 48 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr12, vr16, vr12 + vilvl.d vr5, vr9, vr5 + vilvl.d vr13, vr17, vr13 + vilvl.d vr6, vr10, vr6 + vilvl.d vr14, vr18, vr14 + vilvl.d vr7, vr11, vr7 + vilvl.d vr15, vr19, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr16, vr4, vr12 + vadd.h vr17, vr5, vr13 + vadd.h vr18, vr6, vr14 + vadd.h vr19, vr7, vr15 + vhaddw.wu.hu vr16, vr16, vr16 + vhaddw.wu.hu vr17, vr17, vr17 + vhaddw.wu.hu vr18, vr18, vr18 + vhaddw.wu.hu vr19, vr19, vr19 + vhaddw.du.wu vr16, vr16, vr16 + vhaddw.du.wu vr17, vr17, vr17 + vhaddw.du.wu vr18, vr18, vr18 + vhaddw.du.wu vr19, vr19, vr19 + vhaddw.qu.du vr16, vr16, vr16 + vhaddw.qu.du vr17, vr17, vr17 + vhaddw.qu.du vr18, vr18, vr18 + vhaddw.qu.du vr19, vr19, vr19 + + // Store data to p_sad_array + vstelm.w vr16, a6, 0, 0 + vstelm.w vr17, a6, 4, 0 + vstelm.w vr18, a6, 8, 0 + vstelm.w vr19, a6, 12, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_8x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_8x8_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f0, a0, 0 + fld.d f1, a0, 16 + fld.d f2, a0, 32 + fld.d f3, a0, 48 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr12, vr16, 
vr12 + vilvl.d vr5, vr9, vr5 + vilvl.d vr13, vr17, vr13 + vilvl.d vr6, vr10, vr6 + vilvl.d vr14, vr18, vr14 + vilvl.d vr7, vr11, vr7 + vilvl.d vr15, vr19, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr20, vr4, vr12 + vadd.h vr21, vr5, vr13 + vadd.h vr22, vr6, vr14 + vadd.h vr23, vr7, vr15 + + alsl.d a1, a5, a1, 2 + alsl.d a2, a5, a2, 2 + alsl.d a3, a5, a3, 2 + alsl.d a4, a5, a4, 2 + fld.d f0, a0, 64 + fld.d f1, a0, 80 + fld.d f2, a0, 96 + fld.d f3, a0, 112 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr12, vr16, vr12 + vilvl.d vr5, vr9, vr5 + vilvl.d vr13, vr17, vr13 + vilvl.d vr6, vr10, vr6 + vilvl.d vr14, vr18, vr14 + vilvl.d vr7, vr11, vr7 + vilvl.d vr15, vr19, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr16, vr4, vr12 + vadd.h vr17, vr5, vr13 + vadd.h vr18, vr6, vr14 + vadd.h vr19, vr7, vr15 + + vadd.h vr16, vr16, vr20 + vadd.h vr17, vr17, vr21 + vadd.h vr18, vr18, vr22 + vadd.h vr19, vr19, vr23 + vhaddw.wu.hu vr16, vr16, vr16 + vhaddw.wu.hu vr17, vr17, vr17 + vhaddw.wu.hu vr18, vr18, vr18 + vhaddw.wu.hu vr19, vr19, vr19 + vhaddw.du.wu vr16, vr16, vr16 + vhaddw.du.wu vr17, vr17, vr17 + vhaddw.du.wu vr18, vr18, vr18 + vhaddw.du.wu vr19, vr19, vr19 + vhaddw.qu.du vr16, vr16, vr16 + vhaddw.qu.du vr17, vr17, vr17 + vhaddw.qu.du vr18, vr18, vr18 + vhaddw.qu.du vr19, vr19, vr19 + // Store data to p_sad_array + vstelm.w vr16, a6, 0, 0 + vstelm.w vr17, a6, 4, 0 + vstelm.w vr18, a6, 8, 0 + vstelm.w vr19, a6, 12, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_8x16_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_8x16_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + // Load data from p_src, p_ref0, p_ref1 and p_ref2 + fld.d f0, a0, 0 + fld.d f1, a0, 16 + fld.d f2, a0, 32 + fld.d f3, a0, 48 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr12, vr16, vr12 + vilvl.d vr5, vr9, vr5 + vilvl.d vr13, vr17, vr13 + vilvl.d vr6, vr10, vr6 + vilvl.d vr14, vr18, vr14 + vilvl.d vr7, vr11, vr7 + vilvl.d vr15, vr19, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + 
vabsd.bu vr15, vr2, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr20, vr4, vr12 + vadd.h vr21, vr5, vr13 + vadd.h vr22, vr6, vr14 + vadd.h vr23, vr7, vr15 + +.rept 3 + alsl.d a1, a5, a1, 2 + alsl.d a2, a5, a2, 2 + alsl.d a3, a5, a3, 2 + alsl.d a4, a5, a4, 2 + addi.d a0, a0, 64 + fld.d f0, a0, 0 + fld.d f1, a0, 16 + fld.d f2, a0, 32 + fld.d f3, a0, 48 + FLDD_LOADX_4 a1, a5, t1, t2, f4, f8, f12, f16 + FLDD_LOADX_4 a2, a5, t1, t2, f5, f9, f13, f17 + FLDD_LOADX_4 a3, a5, t1, t2, f6, f10, f14, f18 + FLDD_LOADX_4 a4, a5, t1, t2, f7, f11, f15, f19 + vilvl.d vr0, vr1, vr0 + vilvl.d vr2, vr3, vr2 + vilvl.d vr4, vr8, vr4 + vilvl.d vr12, vr16, vr12 + vilvl.d vr5, vr9, vr5 + vilvl.d vr13, vr17, vr13 + vilvl.d vr6, vr10, vr6 + vilvl.d vr14, vr18, vr14 + vilvl.d vr7, vr11, vr7 + vilvl.d vr15, vr19, vr15 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vadd.h vr16, vr4, vr12 + vadd.h vr17, vr5, vr13 + vadd.h vr18, vr6, vr14 + vadd.h vr19, vr7, vr15 + vadd.h vr20, vr16, vr20 + vadd.h vr21, vr17, vr21 + vadd.h vr22, vr18, vr22 + vadd.h vr23, vr19, vr23 +.endr + vhaddw.wu.hu vr20, vr20, vr20 + vhaddw.wu.hu vr21, vr21, vr21 + vhaddw.wu.hu vr22, vr22, vr22 + vhaddw.wu.hu vr23, vr23, vr23 + vhaddw.du.wu vr20, vr20, vr20 + vhaddw.du.wu vr21, vr21, vr21 + vhaddw.du.wu vr22, vr22, vr22 + vhaddw.du.wu vr23, vr23, vr23 + vhaddw.qu.du vr20, vr20, vr20 + vhaddw.qu.du vr21, vr21, vr21 + vhaddw.qu.du vr22, vr22, vr22 + vhaddw.qu.du vr23, vr23, vr23 + // Store data to p_sad_array + vstelm.w vr20, a6, 0, 0 + vstelm.w vr21, a6, 4, 0 + vstelm.w vr22, a6, 8, 0 + vstelm.w vr23, a6, 12, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_16x8_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_16x8_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 + LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 + LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 + LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr1, vr10 + vabsd.bu vr11, vr1, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vabsd.bu vr16, vr3, vr16 + vabsd.bu vr17, vr3, vr17 + vabsd.bu vr18, vr3, vr18 + vabsd.bu vr19, vr3, vr19 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + 
vhaddw.hu.bu vr16, vr16, vr16 + vhaddw.hu.bu vr17, vr17, vr17 + vhaddw.hu.bu vr18, vr18, vr18 + vhaddw.hu.bu vr19, vr19, vr19 + vadd.h vr0, vr4, vr8 + vadd.h vr1, vr12, vr16 + vadd.h vr20, vr0, vr1 + vadd.h vr0, vr5, vr9 + vadd.h vr1, vr13, vr17 + vadd.h vr21, vr0, vr1 + vadd.h vr0, vr6, vr10 + vadd.h vr1, vr14, vr18 + vadd.h vr22, vr0, vr1 + vadd.h vr0, vr7, vr11 + vadd.h vr1, vr15, vr19 + vadd.h vr23, vr0, vr1 + + alsl.d a1, a5, a1, 2 + alsl.d a2, a5, a2, 2 + alsl.d a3, a5, a3, 2 + alsl.d a4, a5, a4, 2 + vld vr0, a0, 64 + vld vr1, a0, 80 + vld vr2, a0, 96 + vld vr3, a0, 112 + LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 + LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 + LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 + LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr1, vr10 + vabsd.bu vr11, vr1, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vabsd.bu vr16, vr3, vr16 + vabsd.bu vr17, vr3, vr17 + vabsd.bu vr18, vr3, vr18 + vabsd.bu vr19, vr3, vr19 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vhaddw.hu.bu vr17, vr17, vr17 + vhaddw.hu.bu vr18, vr18, vr18 + vhaddw.hu.bu vr19, vr19, vr19 + vadd.h vr0, vr4, vr8 + vadd.h vr1, vr12, vr16 + vadd.h vr16, vr0, vr1 + vadd.h vr0, vr5, vr9 + vadd.h vr1, vr13, vr17 + vadd.h vr17, vr0, vr1 + vadd.h vr0, vr6, vr10 + vadd.h vr1, vr14, vr18 + vadd.h vr18, vr0, vr1 + vadd.h vr0, vr7, vr11 + vadd.h vr1, vr15, vr19 + vadd.h vr19, vr0, vr1 + + vadd.h vr20, vr16, vr20 + vadd.h vr21, vr17, vr21 + vadd.h vr22, vr18, vr22 + vadd.h vr23, vr19, vr23 + vhaddw.wu.hu vr20, vr20, vr20 + vhaddw.wu.hu vr21, vr21, vr21 + vhaddw.wu.hu vr22, vr22, vr22 + vhaddw.wu.hu vr23, vr23, vr23 + vhaddw.du.wu vr20, vr20, vr20 + vhaddw.du.wu vr21, vr21, vr21 + vhaddw.du.wu vr22, vr22, vr22 + vhaddw.du.wu vr23, vr23, vr23 + vhaddw.qu.du vr20, vr20, vr20 + vhaddw.qu.du vr21, vr21, vr21 + vhaddw.qu.du vr22, vr22, vr22 + vhaddw.qu.du vr23, vr23, vr23 + // Store data to p_sad_array + vstelm.w vr20, a6, 0, 0 + vstelm.w vr21, a6, 4, 0 + vstelm.w vr22, a6, 8, 0 + vstelm.w vr23, a6, 12, 0 +endfunc_x264 + +/* + * void x264_pixel_sad_x4_16x16_lsx(uint8_t *p_src, uint8_t *p_ref0, + * uint8_t *p_ref1, uint8_t *p_ref2, + * uint8_t *p_ref3, intptr_t i_ref_stride, + * int32_t p_sad_array[4]) + */ +function_x264 pixel_sad_x4_16x16_lsx + slli.d t1, a5, 1 + add.d t2, a5, t1 + + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 + LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 + LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 + LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr1, vr10 + vabsd.bu vr11, vr1, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vabsd.bu vr16, vr3, vr16 + vabsd.bu vr17, vr3, vr17 + 
vabsd.bu vr18, vr3, vr18 + vabsd.bu vr19, vr3, vr19 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vhaddw.hu.bu vr17, vr17, vr17 + vhaddw.hu.bu vr18, vr18, vr18 + vhaddw.hu.bu vr19, vr19, vr19 + vadd.h vr0, vr4, vr8 + vadd.h vr1, vr12, vr16 + vadd.h vr20, vr0, vr1 + vadd.h vr0, vr5, vr9 + vadd.h vr1, vr13, vr17 + vadd.h vr21, vr0, vr1 + vadd.h vr0, vr6, vr10 + vadd.h vr1, vr14, vr18 + vadd.h vr22, vr0, vr1 + vadd.h vr0, vr7, vr11 + vadd.h vr1, vr15, vr19 + vadd.h vr23, vr0, vr1 + +.rept 3 + alsl.d a1, a5, a1, 2 + alsl.d a2, a5, a2, 2 + alsl.d a3, a5, a3, 2 + alsl.d a4, a5, a4, 2 + addi.d a0, a0, 64 + vld vr0, a0, 0 + vld vr1, a0, 16 + vld vr2, a0, 32 + vld vr3, a0, 48 + LSX_LOADX_4 a1, a5, t1, t2, vr4, vr8, vr12, vr16 + LSX_LOADX_4 a2, a5, t1, t2, vr5, vr9, vr13, vr17 + LSX_LOADX_4 a3, a5, t1, t2, vr6, vr10, vr14, vr18 + LSX_LOADX_4 a4, a5, t1, t2, vr7, vr11, vr15, vr19 + vabsd.bu vr4, vr0, vr4 + vabsd.bu vr5, vr0, vr5 + vabsd.bu vr6, vr0, vr6 + vabsd.bu vr7, vr0, vr7 + vabsd.bu vr8, vr1, vr8 + vabsd.bu vr9, vr1, vr9 + vabsd.bu vr10, vr1, vr10 + vabsd.bu vr11, vr1, vr11 + vabsd.bu vr12, vr2, vr12 + vabsd.bu vr13, vr2, vr13 + vabsd.bu vr14, vr2, vr14 + vabsd.bu vr15, vr2, vr15 + vabsd.bu vr16, vr3, vr16 + vabsd.bu vr17, vr3, vr17 + vabsd.bu vr18, vr3, vr18 + vabsd.bu vr19, vr3, vr19 + vhaddw.hu.bu vr4, vr4, vr4 + vhaddw.hu.bu vr5, vr5, vr5 + vhaddw.hu.bu vr6, vr6, vr6 + vhaddw.hu.bu vr7, vr7, vr7 + vhaddw.hu.bu vr8, vr8, vr8 + vhaddw.hu.bu vr9, vr9, vr9 + vhaddw.hu.bu vr10, vr10, vr10 + vhaddw.hu.bu vr11, vr11, vr11 + vhaddw.hu.bu vr12, vr12, vr12 + vhaddw.hu.bu vr13, vr13, vr13 + vhaddw.hu.bu vr14, vr14, vr14 + vhaddw.hu.bu vr15, vr15, vr15 + vhaddw.hu.bu vr16, vr16, vr16 + vhaddw.hu.bu vr17, vr17, vr17 + vhaddw.hu.bu vr18, vr18, vr18 + vhaddw.hu.bu vr19, vr19, vr19 + vadd.h vr0, vr4, vr8 + vadd.h vr1, vr12, vr16 + vadd.h vr16, vr0, vr1 + vadd.h vr0, vr5, vr9 + vadd.h vr1, vr13, vr17 + vadd.h vr17, vr0, vr1 + vadd.h vr0, vr6, vr10 + vadd.h vr1, vr14, vr18 + vadd.h vr18, vr0, vr1 + vadd.h vr0, vr7, vr11 + vadd.h vr1, vr15, vr19 + vadd.h vr19, vr0, vr1 + vadd.h vr20, vr16, vr20 + vadd.h vr21, vr17, vr21 + vadd.h vr22, vr18, vr22 + vadd.h vr23, vr19, vr23 +.endr + vhaddw.wu.hu vr20, vr20, vr20 + vhaddw.wu.hu vr21, vr21, vr21 + vhaddw.wu.hu vr22, vr22, vr22 + vhaddw.wu.hu vr23, vr23, vr23 + vhaddw.du.wu vr20, vr20, vr20 + vhaddw.du.wu vr21, vr21, vr21 + vhaddw.du.wu vr22, vr22, vr22 + vhaddw.du.wu vr23, vr23, vr23 + vhaddw.qu.du vr20, vr20, vr20 + vhaddw.qu.du vr21, vr21, vr21 + vhaddw.qu.du vr22, vr22, vr22 + vhaddw.qu.du vr23, vr23, vr23 + // Store data to p_sad_array + vstelm.w vr20, a6, 0, 0 + vstelm.w vr21, a6, 4, 0 + vstelm.w vr22, a6, 8, 0 + vstelm.w vr23, a6, 12, 0 +endfunc_x264 +#endif /* !HIGH_BIT_DEPTH */ diff --git a/common/macroblock.c b/common/macroblock.c index 55a8baaa1..477a32136 100644 --- a/common/macroblock.c +++ b/common/macroblock.c @@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.c: macroblock common functions ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project 
* * Authors: Fiona Glaser * Laurent Aimar diff --git a/common/macroblock.h b/common/macroblock.h index c7b984f82..5e81db6a2 100644 --- a/common/macroblock.h +++ b/common/macroblock.h @@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.h: macroblock common functions ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/common/mc.c b/common/mc.c index 89b500c6a..b515607c9 100644 --- a/common/mc.c +++ b/common/mc.c @@ -1,7 +1,7 @@ /***************************************************************************** * mc.c: motion compensation ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt @@ -41,6 +41,9 @@ #if HAVE_MSA #include "mips/mc.h" #endif +#if HAVE_LSX +# include "loongarch/mc.h" +#endif static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride, @@ -687,6 +690,9 @@ void x264_mc_init( uint32_t cpu, x264_mc_functions_t *pf, int cpu_independent ) if( cpu&X264_CPU_MSA ) x264_mc_init_mips( cpu, pf ); #endif +#if HAVE_LSX + x264_mc_init_loongarch( cpu, pf ); +#endif if( cpu_independent ) { diff --git a/common/mc.h b/common/mc.h index 9e3401718..a36e0a0a7 100644 --- a/common/mc.h +++ b/common/mc.h @@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: motion compensation ***************************************************************************** - * Copyright (C) 2004-2023 x264 project + * Copyright (C) 2004-2024 x264 project * * Authors: Loren Merritt * diff --git a/common/mips/dct-c.c b/common/mips/dct-c.c index caecf661b..6ab8b96ef 100644 --- a/common/mips/dct-c.c +++ b/common/mips/dct-c.c @@ -1,7 +1,7 @@ /***************************************************************************** * dct-c.c: msa transform and zigzag ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Rishikesh More * diff --git a/common/mips/dct.h b/common/mips/dct.h index 247241d66..667fca17e 100644 --- a/common/mips/dct.h +++ b/common/mips/dct.h @@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: msa transform and zigzag ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Rishikesh More * diff --git a/common/mips/deblock-c.c b/common/mips/deblock-c.c index 000374707..ccfdd0dbe 100644 --- a/common/mips/deblock-c.c +++ b/common/mips/deblock-c.c @@ -1,7 +1,7 @@ /***************************************************************************** * deblock-c.c: msa deblocking ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Neha Rana * diff --git a/common/mips/deblock.h b/common/mips/deblock.h index 0dcbcf0f8..60e1a5090 100644 --- a/common/mips/deblock.h +++ b/common/mips/deblock.h @@ -1,7 +1,7 @@ /***************************************************************************** * deblock.h: msa deblocking ***************************************************************************** - * Copyright (C) 
2017-2023 x264 project + * Copyright (C) 2017-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/common/mips/macros.h b/common/mips/macros.h index e2cb9f474..02bf1ac01 100644 --- a/common/mips/macros.h +++ b/common/mips/macros.h @@ -1,7 +1,7 @@ /***************************************************************************** * macros.h: msa macros ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Rishikesh More * diff --git a/common/mips/mc-c.c b/common/mips/mc-c.c index 7fb9f3a0a..3bc8b4455 100644 --- a/common/mips/mc-c.c +++ b/common/mips/mc-c.c @@ -1,7 +1,7 @@ /***************************************************************************** * mc-c.c: msa motion compensation ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Neha Rana * diff --git a/common/mips/mc.h b/common/mips/mc.h index 84726daf7..6ed55807c 100644 --- a/common/mips/mc.h +++ b/common/mips/mc.h @@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: msa motion compensation ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Neha Rana * diff --git a/common/mips/pixel-c.c b/common/mips/pixel-c.c index 0490b4a3b..3307d95a6 100644 --- a/common/mips/pixel-c.c +++ b/common/mips/pixel-c.c @@ -1,7 +1,7 @@ /***************************************************************************** * pixel-c.c: msa pixel metrics ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Mandar Sahastrabuddhe * diff --git a/common/mips/pixel.h b/common/mips/pixel.h index c3e83e31f..dd19299b3 100644 --- a/common/mips/pixel.h +++ b/common/mips/pixel.h @@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: msa pixel metrics ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Mandar Sahastrabuddhe * diff --git a/common/mips/predict-c.c b/common/mips/predict-c.c index 816d031ae..ff22d0c94 100644 --- a/common/mips/predict-c.c +++ b/common/mips/predict-c.c @@ -1,7 +1,7 @@ /***************************************************************************** * predict-c.c: msa intra prediction ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Mandar Sahastrabuddhe * diff --git a/common/mips/predict.h b/common/mips/predict.h index 57e301b05..c72ca9375 100644 --- a/common/mips/predict.h +++ b/common/mips/predict.h @@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: msa intra prediction ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Rishikesh More * diff --git a/common/mips/quant-c.c b/common/mips/quant-c.c index 7a2fd0308..ad8fdbc7a 100644 --- a/common/mips/quant-c.c +++ b/common/mips/quant-c.c @@ -1,7 +1,7 @@ 
/***************************************************************************** * quant-c.c: msa quantization and level-run ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Rishikesh More * diff --git a/common/mips/quant.h b/common/mips/quant.h index 8bedff1c4..143f829f4 100644 --- a/common/mips/quant.h +++ b/common/mips/quant.h @@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: msa quantization and level-run ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Rishikesh More * diff --git a/common/mvpred.c b/common/mvpred.c index 1712fa538..951910c4b 100644 --- a/common/mvpred.c +++ b/common/mvpred.c @@ -1,7 +1,7 @@ /***************************************************************************** * mvpred.c: motion vector prediction ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Fiona Glaser diff --git a/common/opencl.c b/common/opencl.c index 1c3ac927d..dd04f2278 100644 --- a/common/opencl.c +++ b/common/opencl.c @@ -1,7 +1,7 @@ /***************************************************************************** * opencl.c: OpenCL initialization and kernel compilation ***************************************************************************** - * Copyright (C) 2012-2023 x264 project + * Copyright (C) 2012-2024 x264 project * * Authors: Steve Borho * Anton Mitrofanov diff --git a/common/opencl.h b/common/opencl.h index d156e7519..c8ade378a 100644 --- a/common/opencl.h +++ b/common/opencl.h @@ -1,7 +1,7 @@ /***************************************************************************** * opencl.h: OpenCL structures and defines ***************************************************************************** - * Copyright (C) 2012-2023 x264 project + * Copyright (C) 2012-2024 x264 project * * Authors: Steve Borho * Anton Mitrofanov diff --git a/common/osdep.c b/common/osdep.c index d22d190eb..fdcf4e112 100644 --- a/common/osdep.c +++ b/common/osdep.c @@ -1,7 +1,7 @@ /***************************************************************************** * osdep.c: platform-specific code ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Steven Walters * Laurent Aimar diff --git a/common/osdep.h b/common/osdep.h index 223012129..ca24455ed 100644 --- a/common/osdep.h +++ b/common/osdep.h @@ -1,7 +1,7 @@ /***************************************************************************** * osdep.h: platform-specific code ***************************************************************************** - * Copyright (C) 2007-2023 x264 project + * Copyright (C) 2007-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar @@ -314,7 +314,7 @@ static inline int x264_is_regular_file( FILE *filehandle ) #define EXPAND(x) x -#if ARCH_X86 || ARCH_X86_64 +#if ARCH_X86 || ARCH_X86_64 || ARCH_LOONGARCH #define NATIVE_ALIGN 64 #define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 ) #define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 ) diff --git a/common/pixel.c b/common/pixel.c index 29aac6dae..6e71ed9d5 100644 --- a/common/pixel.c +++ b/common/pixel.c @@ -1,7 +1,7 @@ 
/***************************************************************************** * pixel.c: pixel metrics ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar @@ -45,7 +45,9 @@ #if HAVE_MSA # include "mips/pixel.h" #endif - +#if HAVE_LSX +# include "loongarch/pixel.h" +#endif /**************************************************************************** * pixel_sad_WxH @@ -827,12 +829,32 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf ) #define INIT8_NAME( name1, name2, cpu ) \ INIT7_NAME( name1, name2, cpu ) \ pixf->name1[PIXEL_4x16] = x264_pixel_##name2##_4x16##cpu; +#if HAVE_SVE +#define INIT7_NAME_SVE_SSD_10BIT( ) \ + pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sve; \ + pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sve; +#endif +#if HAVE_SVE +#define INIT8_NAME_SVE_SSD( ) \ + pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_sve; \ + pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_sve; \ + pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sve; \ + pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sve; \ + pixf->ssd[PIXEL_4x16] = x264_pixel_ssd_4x16_sve; +#define INIT8_NAME_SVE_SSD_10BIT() \ + INIT7_NAME_SVE_SSD_10BIT() \ + pixf->ssd[PIXEL_4x16] = x264_pixel_ssd_4x16_sve; +#endif #define INIT2( name, cpu ) INIT2_NAME( name, name, cpu ) #define INIT4( name, cpu ) INIT4_NAME( name, name, cpu ) #define INIT5( name, cpu ) INIT5_NAME( name, name, cpu ) #define INIT6( name, cpu ) INIT6_NAME( name, name, cpu ) #define INIT7( name, cpu ) INIT7_NAME( name, name, cpu ) #define INIT8( name, cpu ) INIT8_NAME( name, name, cpu ) +#if HAVE_SVE +#define INIT8_SVE_SSD( ) INIT8_NAME_SVE_SSD( ) +#define INIT8_SVE_SSD_10BIT( ) INIT8_NAME_SVE_SSD_10BIT( ) +#endif #define INIT_ADS( cpu ) \ pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\ @@ -1058,7 +1080,38 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf ) if( cpu&X264_CPU_NEON ) { INIT8( sad, _neon ); + INIT7( sad_x3, _neon); + pixf->vsad = x264_pixel_vsad_neon; + pixf->asd8 = x264_pixel_asd8_neon; + INIT8(ssd, _neon); + pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_neon; + pixf->satd[PIXEL_4x8] = x264_pixel_satd_4x8_neon; + pixf->satd[PIXEL_4x4] = x264_pixel_satd_4x4_neon; + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_neon; + pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_neon; + pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_neon; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; + pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_neon; + pixf->satd[PIXEL_16x16] = x264_pixel_satd_16x16_neon; + INIT7(sad_x4, _neon); + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon; + INIT4(hadamard_ac, _neon); + pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; + pixf->ssim_end4 = x264_pixel_ssim_end4_neon; + } +#if HAVE_SVE + if( cpu&X264_CPU_SVE ) + { + INIT8_SVE_SSD_10BIT(); } +#endif #endif // HAVE_AARCH64 #else // !HIGH_BIT_DEPTH @@ -1472,6 +1525,18 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf ) pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon; } +#if HAVE_SVE + if( 
cpu&X264_CPU_SVE ) + { + INIT8_SVE_SSD( ); + INIT4( hadamard_ac, _sve ); + + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sve; + + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sve; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sve; + } +#endif #endif // HAVE_AARCH64 #if HAVE_MSA @@ -1506,6 +1571,67 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf ) } #endif // HAVE_MSA +#if HAVE_LSX + if( cpu&X264_CPU_LSX ) + { + INIT8( sad, _lsx ); + INIT8_NAME( sad_aligned, sad, _lsx ); + INIT8( ssd, _lsx ); + INIT7( sad_x3, _lsx ); + INIT7( sad_x4, _lsx ); + INIT8( satd, _lsx ); + INIT4( hadamard_ac, _lsx ); + + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_lsx; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_lsx; + + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_lsx; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_lsx; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_lsx; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_lsx; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_lsx; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_lsx; + + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_lsx; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_lsx; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_lsx; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_lsx; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_lsx; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_lsx; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_lsx; + } + + if( cpu&X264_CPU_LASX ) + { + INIT4( ssd, _lasx ); + INIT4( hadamard_ac, _lasx ); + + pixf->satd[PIXEL_16x16] = x264_pixel_satd_16x16_lasx; + pixf->satd[PIXEL_16x8] = x264_pixel_satd_16x8_lasx; + pixf->satd[PIXEL_8x16] = x264_pixel_satd_8x16_lasx; + pixf->satd[PIXEL_8x8] = x264_pixel_satd_8x8_lasx; + pixf->satd[PIXEL_8x4] = x264_pixel_satd_8x4_lasx; + pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_lasx; + pixf->satd[PIXEL_4x8] = x264_pixel_satd_4x8_lasx; + + pixf->sad[PIXEL_8x4] = x264_pixel_sad_8x4_lasx; + + pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_lasx; + pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_lasx; + pixf->sad_x4[PIXEL_8x8] = x264_pixel_sad_x4_8x8_lasx; + pixf->sad_x4[PIXEL_8x4] = x264_pixel_sad_x4_8x4_lasx; + pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_lasx; + pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_lasx; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_lasx; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_lasx; + + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_lasx; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_lasx; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_lasx; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_lasx; + } +#endif /* HAVE_LSX */ + #endif // HIGH_BIT_DEPTH #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) diff --git a/common/pixel.h b/common/pixel.h index 7acef88e1..28d35d13b 100644 --- a/common/pixel.h +++ b/common/pixel.h @@ -1,7 +1,7 @@ /***************************************************************************** * pixel.c: pixel metrics ***************************************************************************** - * Copyright (C) 2004-2023 x264 project + * Copyright (C) 2004-2024 x264 project * * Authors: Loren Merritt * Fiona Glaser diff --git a/common/ppc/dct.c b/common/ppc/dct.c index e4bd56648..1fea8bdad 100644 --- a/common/ppc/dct.c +++ b/common/ppc/dct.c @@ -1,7 +1,7 @@ /***************************************************************************** * dct.c: ppc transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2023 x264 
project + * Copyright (C) 2003-2024 x264 project * * Authors: Guillaume Poirier * Eric Petit diff --git a/common/ppc/dct.h b/common/ppc/dct.h index 3ac3b0258..a0646e9b7 100644 --- a/common/ppc/dct.h +++ b/common/ppc/dct.h @@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: ppc transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Eric Petit * Guillaume Poirier diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c index 3070e831a..7ea2926d9 100644 --- a/common/ppc/deblock.c +++ b/common/ppc/deblock.c @@ -1,7 +1,7 @@ /***************************************************************************** * deblock.c: ppc deblocking ***************************************************************************** - * Copyright (C) 2007-2023 x264 project + * Copyright (C) 2007-2024 x264 project * * Authors: Guillaume Poirier * diff --git a/common/ppc/deblock.h b/common/ppc/deblock.h index c36e2e470..34c8ac86d 100644 --- a/common/ppc/deblock.h +++ b/common/ppc/deblock.h @@ -1,7 +1,7 @@ /***************************************************************************** * deblock.h: ppc deblocking ***************************************************************************** - * Copyright (C) 2017-2023 x264 project + * Copyright (C) 2017-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/common/ppc/mc.c b/common/ppc/mc.c index e3b673aa9..cc0f2ceea 100644 --- a/common/ppc/mc.c +++ b/common/ppc/mc.c @@ -1,7 +1,7 @@ /***************************************************************************** * mc.c: ppc motion compensation ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Eric Petit * Guillaume Poirier diff --git a/common/ppc/mc.h b/common/ppc/mc.h index a59c73a34..eff75d941 100644 --- a/common/ppc/mc.h +++ b/common/ppc/mc.h @@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: ppc motion compensation ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Eric Petit * diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c index 8aebda2c5..574f75cf9 100644 --- a/common/ppc/pixel.c +++ b/common/ppc/pixel.c @@ -1,7 +1,7 @@ /***************************************************************************** * pixel.c: ppc pixel metrics ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Eric Petit * Guillaume Poirier @@ -854,43 +854,43 @@ static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pi sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); \ sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v ); -#define PIXEL_SAD_X3_ALTIVEC( name, ly ) \ -static int name( uint8_t *fenc, uint8_t *pix0, \ - uint8_t *pix1, uint8_t *pix2, \ - intptr_t i_stride, int scores[3] ) \ -{ \ - ALIGNED_16( int sum0 ); \ - ALIGNED_16( int sum1 ); \ - ALIGNED_16( int sum2 ); \ - \ - LOAD_ZERO; \ - vec_u8_t fencv, pix0v, pix1v, pix2v; \ - vec_s32_t sum0v, sum1v, sum2v; \ - \ - sum0v = vec_splat_s32( 0 ); \ - sum1v = vec_splat_s32( 0 ); \ - sum2v = vec_splat_s32( 0 ); \ - \ - 
for( int y = 0; y < ly; y++ ) \ - { \ - PROCESS_PIXS \ - } \ - \ - sum0v = vec_sums( sum0v, zero_s32v ); \ - sum1v = vec_sums( sum1v, zero_s32v ); \ - sum2v = vec_sums( sum2v, zero_s32v ); \ - \ - sum0v = vec_splat( sum0v, 3 ); \ - sum1v = vec_splat( sum1v, 3 ); \ - sum2v = vec_splat( sum2v, 3 ); \ - \ - vec_ste( sum0v, 0, &sum0 ); \ - vec_ste( sum1v, 0, &sum1 ); \ - vec_ste( sum2v, 0, &sum2 ); \ - \ - scores[0] = sum0; \ - scores[1] = sum1; \ - scores[2] = sum2; \ +#define PIXEL_SAD_X3_ALTIVEC( name, ly ) \ +static void name( uint8_t *fenc, uint8_t *pix0, \ + uint8_t *pix1, uint8_t *pix2, \ + intptr_t i_stride, int scores[3] ) \ +{ \ + ALIGNED_16( int sum0 ); \ + ALIGNED_16( int sum1 ); \ + ALIGNED_16( int sum2 ); \ + \ + LOAD_ZERO; \ + vec_u8_t fencv, pix0v, pix1v, pix2v; \ + vec_s32_t sum0v, sum1v, sum2v; \ + \ + sum0v = vec_splat_s32( 0 ); \ + sum1v = vec_splat_s32( 0 ); \ + sum2v = vec_splat_s32( 0 ); \ + \ + for( int y = 0; y < ly; y++ ) \ + { \ + PROCESS_PIXS \ + } \ + \ + sum0v = vec_sums( sum0v, zero_s32v ); \ + sum1v = vec_sums( sum1v, zero_s32v ); \ + sum2v = vec_sums( sum2v, zero_s32v ); \ + \ + sum0v = vec_splat( sum0v, 3 ); \ + sum1v = vec_splat( sum1v, 3 ); \ + sum2v = vec_splat( sum2v, 3 ); \ + \ + vec_ste( sum0v, 0, &sum0 ); \ + vec_ste( sum1v, 0, &sum1 ); \ + vec_ste( sum2v, 0, &sum2 ); \ + \ + scores[0] = sum0; \ + scores[1] = sum1; \ + scores[2] = sum2; \ } PIXEL_SAD_X3_ALTIVEC( pixel_sad_x3_8x8_altivec, 4 ) @@ -965,10 +965,10 @@ static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0, } #define PIXEL_SAD_X4_ALTIVEC( name, ly ) \ -static int name( uint8_t *fenc, \ - uint8_t *pix0, uint8_t *pix1, \ - uint8_t *pix2, uint8_t *pix3, \ - intptr_t i_stride, int scores[4] ) \ +static void name( uint8_t *fenc, \ + uint8_t *pix0, uint8_t *pix1, \ + uint8_t *pix2, uint8_t *pix3, \ + intptr_t i_stride, int scores[4] ) \ { \ ALIGNED_16( int sum0 ); \ ALIGNED_16( int sum1 ); \ diff --git a/common/ppc/pixel.h b/common/ppc/pixel.h index 552400f2f..7342fdfbd 100644 --- a/common/ppc/pixel.h +++ b/common/ppc/pixel.h @@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: ppc pixel metrics ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Eric Petit * diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h index d196bde10..2298f5e25 100644 --- a/common/ppc/ppccommon.h +++ b/common/ppc/ppccommon.h @@ -1,7 +1,7 @@ /***************************************************************************** * ppccommon.h: ppc utility macros ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Eric Petit * diff --git a/common/ppc/predict.c b/common/ppc/predict.c index 13c7c1d55..528fc05d4 100644 --- a/common/ppc/predict.c +++ b/common/ppc/predict.c @@ -1,7 +1,7 @@ /***************************************************************************** * predict.c: ppc intra prediction ***************************************************************************** - * Copyright (C) 2007-2023 x264 project + * Copyright (C) 2007-2024 x264 project * * Authors: Guillaume Poirier * diff --git a/common/ppc/predict.h b/common/ppc/predict.h index 78b98d75c..0713c086a 100644 --- a/common/ppc/predict.h +++ b/common/ppc/predict.h @@ -1,7 +1,7 @@ /***************************************************************************** * 
predict.h: ppc intra prediction ***************************************************************************** - * Copyright (C) 2007-2023 x264 project + * Copyright (C) 2007-2024 x264 project * * Authors: Guillaume Poirier * diff --git a/common/ppc/quant.c b/common/ppc/quant.c index 635ccb196..00c7a290b 100644 --- a/common/ppc/quant.c +++ b/common/ppc/quant.c @@ -1,7 +1,7 @@ /***************************************************************************** * quant.c: ppc quantization ***************************************************************************** - * Copyright (C) 2007-2023 x264 project + * Copyright (C) 2007-2024 x264 project * * Authors: Guillaume Poirier * diff --git a/common/ppc/quant.h b/common/ppc/quant.h index fdff4b663..6a1084e46 100644 --- a/common/ppc/quant.h +++ b/common/ppc/quant.h @@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: ppc quantization ***************************************************************************** - * Copyright (C) 2007-2023 x264 project + * Copyright (C) 2007-2024 x264 project * * Authors: Guillaume Poirier * diff --git a/common/predict.c b/common/predict.c index 935588810..8aa3b1f8d 100644 --- a/common/predict.c +++ b/common/predict.c @@ -1,7 +1,7 @@ /***************************************************************************** * predict.c: intra prediction ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt @@ -46,6 +46,9 @@ #if HAVE_MSA # include "mips/predict.h" #endif +#if HAVE_LSX +# include "loongarch/predict.h" +#endif /**************************************************************************** * 16x16 prediction for intra luma block @@ -924,6 +927,10 @@ void x264_predict_16x16_init( uint32_t cpu, x264_predict_t pf[7] ) } #endif #endif + +#if HAVE_LSX + x264_predict_16x16_init_loongarch( cpu, pf ); +#endif } void x264_predict_8x8c_init( uint32_t cpu, x264_predict_t pf[7] ) @@ -961,6 +968,10 @@ void x264_predict_8x8c_init( uint32_t cpu, x264_predict_t pf[7] ) } #endif #endif + +#if HAVE_LSX + x264_predict_8x8c_init_loongarch( cpu, pf ); +#endif } void x264_predict_8x16c_init( uint32_t cpu, x264_predict_t pf[7] ) @@ -1022,6 +1033,10 @@ void x264_predict_8x8_init( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict } #endif #endif + +#if HAVE_LSX + x264_predict_8x8_init_loongarch( cpu, pf, predict_filter ); +#endif } void x264_predict_4x4_init( uint32_t cpu, x264_predict_t pf[12] ) @@ -1050,5 +1065,9 @@ void x264_predict_4x4_init( uint32_t cpu, x264_predict_t pf[12] ) #if HAVE_AARCH64 x264_predict_4x4_init_aarch64( cpu, pf ); #endif + +#if HAVE_LSX + x264_predict_4x4_init_loongarch( cpu, pf ); +#endif } diff --git a/common/predict.h b/common/predict.h index 992674ae3..55afa50b8 100644 --- a/common/predict.h +++ b/common/predict.h @@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: intra prediction ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/common/quant.c b/common/quant.c index b5ea75646..262c5c527 100644 --- a/common/quant.c +++ b/common/quant.c @@ -1,7 +1,7 @@ /***************************************************************************** * quant.c: quantization and level-run 
***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Loren Merritt * Fiona Glaser @@ -43,6 +43,9 @@ #if HAVE_MSA # include "mips/quant.h" #endif +#if HAVE_LSX +# include "loongarch/quant.h" +#endif #define QUANT_ONE( coef, mf, f ) \ { \ @@ -557,6 +560,38 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf ) pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512; } #endif // HAVE_MMX +#if HAVE_AARCH64 + + if( cpu&X264_CPU_NEON ) + { + pf->quant_2x2_dc = x264_quant_2x2_dc_neon; + pf->quant_4x4_dc = x264_quant_4x4_dc_neon; + pf->quant_4x4 = x264_quant_4x4_neon; + pf->quant_4x4x4 = x264_quant_4x4x4_neon; + pf->quant_8x8 = x264_quant_8x8_neon; + + pf->dequant_4x4 = x264_dequant_4x4_neon; + pf->dequant_8x8 = x264_dequant_8x8_neon; + pf->dequant_4x4_dc = x264_dequant_4x4_dc_neon; + + pf->decimate_score15 = x264_decimate_score15_neon; + pf->decimate_score16 = x264_decimate_score16_neon; + pf->decimate_score64 = x264_decimate_score64_neon; + + pf->coeff_last4 = x264_coeff_last4_neon; + pf->coeff_last8 = x264_coeff_last8_neon; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon; + pf->coeff_level_run4 = x264_coeff_level_run4_neon; + pf->coeff_level_run8 = x264_coeff_level_run8_neon; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon; + pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon; + + pf->denoise_dct = x264_denoise_dct_neon; + } + +#endif // HAVE_AARCH64 #else // !HIGH_BIT_DEPTH #if HAVE_MMX INIT_TRELLIS( sse2 ); @@ -801,6 +836,44 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf ) pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_msa; } #endif + +#if HAVE_LSX + if( cpu&X264_CPU_LSX ) + { + pf->quant_4x4 = x264_quant_4x4_lsx; + pf->quant_4x4x4 = x264_quant_4x4x4_lsx; + pf->quant_8x8 = x264_quant_8x8_lsx; + pf->quant_4x4_dc = x264_quant_4x4_dc_lsx; + pf->quant_2x2_dc = x264_quant_2x2_dc_lsx; + pf->dequant_4x4 = x264_dequant_4x4_lsx; + pf->dequant_8x8 = x264_dequant_8x8_lsx; + pf->dequant_4x4_dc = x264_dequant_4x4_dc_lsx; + pf->decimate_score15 = x264_decimate_score15_lsx; + pf->decimate_score16 = x264_decimate_score16_lsx; + pf->decimate_score64 = x264_decimate_score64_lsx; + pf->coeff_last4 = x264_coeff_last4_lsx; + pf->coeff_last8 = x264_coeff_last8_lsx; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lsx; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lsx; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lsx; + pf->coeff_level_run8 = x264_coeff_level_run8_lsx; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lsx; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lsx; + } + if( cpu&X264_CPU_LASX ) + { + pf->quant_4x4x4 = x264_quant_4x4x4_lasx; + pf->dequant_4x4 = x264_dequant_4x4_lasx; + pf->dequant_8x8 = x264_dequant_8x8_lasx; + pf->dequant_4x4_dc = x264_dequant_4x4_dc_lasx; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lasx; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lasx; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lasx; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lasx; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lasx; + } +#endif + #endif // HIGH_BIT_DEPTH pf->coeff_last[DCT_LUMA_DC] = pf->coeff_last[DCT_CHROMAU_DC] = pf->coeff_last[DCT_CHROMAV_DC] = pf->coeff_last[DCT_CHROMAU_4x4] = 
pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4]; diff --git a/common/quant.h b/common/quant.h index 7a1ba48a7..bf48da95d 100644 --- a/common/quant.h +++ b/common/quant.h @@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Loren Merritt * Fiona Glaser diff --git a/common/rectangle.c b/common/rectangle.c index 470679b63..f4dc07d36 100644 --- a/common/rectangle.c +++ b/common/rectangle.c @@ -1,7 +1,7 @@ /***************************************************************************** * rectangle.c: rectangle filling ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Fiona Glaser * diff --git a/common/rectangle.h b/common/rectangle.h index 502a5b722..6a3201af2 100644 --- a/common/rectangle.h +++ b/common/rectangle.h @@ -1,7 +1,7 @@ /***************************************************************************** * rectangle.h: rectangle filling ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Fiona Glaser * Loren Merritt diff --git a/common/set.c b/common/set.c index 2d54763c7..57434ce0b 100644 --- a/common/set.c +++ b/common/set.c @@ -1,7 +1,7 @@ /***************************************************************************** * set.c: quantization init ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Loren Merritt * diff --git a/common/set.h b/common/set.h index 68cf6087c..d9c6641d5 100644 --- a/common/set.h +++ b/common/set.h @@ -1,7 +1,7 @@ /***************************************************************************** * set.h: quantization init ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/common/tables.c b/common/tables.c index 5eca4cffd..b71f23f7e 100644 --- a/common/tables.c +++ b/common/tables.c @@ -1,7 +1,7 @@ /***************************************************************************** * tables.c: const tables ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/common/tables.h b/common/tables.h index 7166c69fc..69ac21865 100644 --- a/common/tables.h +++ b/common/tables.h @@ -1,7 +1,7 @@ /***************************************************************************** * tables.h: const tables ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/common/threadpool.c b/common/threadpool.c index dfd814758..f53bba26c 100644 --- a/common/threadpool.c +++ b/common/threadpool.c @@ -1,7 +1,7 @@ /***************************************************************************** * threadpool.c: thread pooling ***************************************************************************** - * 
Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/common/threadpool.h b/common/threadpool.h index e0e94d3b6..d2d3a2383 100644 --- a/common/threadpool.h +++ b/common/threadpool.h @@ -1,7 +1,7 @@ /***************************************************************************** * threadpool.h: thread pooling ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/common/vlc.c b/common/vlc.c index 1ce10adc0..b118c8a63 100644 --- a/common/vlc.c +++ b/common/vlc.c @@ -1,7 +1,7 @@ /***************************************************************************** * vlc.c : vlc tables ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Fiona Glaser diff --git a/common/win32thread.c b/common/win32thread.c index 90a6c9acc..b1bc6f98e 100644 --- a/common/win32thread.c +++ b/common/win32thread.c @@ -1,7 +1,7 @@ /***************************************************************************** * win32thread.c: windows threading ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * Pegasys Inc. diff --git a/common/win32thread.h b/common/win32thread.h index 391f108fd..30f688d56 100644 --- a/common/win32thread.h +++ b/common/win32thread.h @@ -1,7 +1,7 @@ /***************************************************************************** * win32thread.h: windows threading ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/common/x86/bitstream-a.asm b/common/x86/bitstream-a.asm index 29f37ca23..50892627d 100644 --- a/common/x86/bitstream-a.asm +++ b/common/x86/bitstream-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* bitstream-a.asm: x86 bitstream functions ;***************************************************************************** -;* Copyright (C) 2010-2023 x264 project +;* Copyright (C) 2010-2024 x264 project ;* ;* Authors: Fiona Glaser ;* Henrik Gramner diff --git a/common/x86/bitstream.h b/common/x86/bitstream.h index 36a11a5ef..c9e798238 100644 --- a/common/x86/bitstream.h +++ b/common/x86/bitstream.h @@ -1,7 +1,7 @@ /***************************************************************************** * bitstream.h: x86 bitstream functions ***************************************************************************** - * Copyright (C) 2017-2023 x264 project + * Copyright (C) 2017-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm index 6dfab5f75..31e624b79 100644 --- a/common/x86/cabac-a.asm +++ b/common/x86/cabac-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* cabac-a.asm: x86 cabac ;***************************************************************************** -;* Copyright (C) 2008-2023 x264 project +;* Copyright (C) 2008-2024 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser diff --git a/common/x86/const-a.asm b/common/x86/const-a.asm index e45626261..a489f87aa 100644 --- a/common/x86/const-a.asm +++ 
b/common/x86/const-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* const-a.asm: x86 global constants ;***************************************************************************** -;* Copyright (C) 2010-2023 x264 project +;* Copyright (C) 2010-2024 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser diff --git a/common/x86/cpu-a.asm b/common/x86/cpu-a.asm index dd63fc1a1..bdcacc31c 100644 --- a/common/x86/cpu-a.asm +++ b/common/x86/cpu-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* cpu-a.asm: x86 cpu utilities ;***************************************************************************** -;* Copyright (C) 2003-2023 x264 project +;* Copyright (C) 2003-2024 x264 project ;* ;* Authors: Laurent Aimar ;* Loren Merritt diff --git a/common/x86/dct-32.asm b/common/x86/dct-32.asm index 54af62052..11e476788 100644 --- a/common/x86/dct-32.asm +++ b/common/x86/dct-32.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-32.asm: x86_32 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2023 x264 project +;* Copyright (C) 2003-2024 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm index ca34c6698..249c9e739 100644 --- a/common/x86/dct-64.asm +++ b/common/x86/dct-64.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-64.asm: x86_64 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2023 x264 project +;* Copyright (C) 2003-2024 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm index c4a29cf48..18e2d3e2b 100644 --- a/common/x86/dct-a.asm +++ b/common/x86/dct-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* dct-a.asm: x86 transform and zigzag ;***************************************************************************** -;* Copyright (C) 2003-2023 x264 project +;* Copyright (C) 2003-2024 x264 project ;* ;* Authors: Holger Lubitz ;* Loren Merritt diff --git a/common/x86/dct.h b/common/x86/dct.h index 33fe568a9..c03dc3cb3 100644 --- a/common/x86/dct.h +++ b/common/x86/dct.h @@ -1,7 +1,7 @@ /***************************************************************************** * dct.h: x86 transform and zigzag ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm index 646e528f5..c4880ad01 100644 --- a/common/x86/deblock-a.asm +++ b/common/x86/deblock-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* deblock-a.asm: x86 deblocking ;***************************************************************************** -;* Copyright (C) 2005-2023 x264 project +;* Copyright (C) 2005-2024 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser diff --git a/common/x86/deblock.h b/common/x86/deblock.h index fc92731eb..76a55fa9c 100644 --- a/common/x86/deblock.h +++ b/common/x86/deblock.h @@ -1,7 +1,7 @@ /***************************************************************************** * deblock.h: x86 deblocking 
***************************************************************************** - * Copyright (C) 2017-2023 x264 project + * Copyright (C) 2017-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm index 184e1b019..c077bf4c1 100644 --- a/common/x86/mc-a.asm +++ b/common/x86/mc-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2003-2023 x264 project +;* Copyright (C) 2003-2024 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm index 720cd35b7..b808fff63 100644 --- a/common/x86/mc-a2.asm +++ b/common/x86/mc-a2.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* mc-a2.asm: x86 motion compensation ;***************************************************************************** -;* Copyright (C) 2005-2023 x264 project +;* Copyright (C) 2005-2024 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index ad202b950..68d0e1dd0 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -1,7 +1,7 @@ /***************************************************************************** * mc-c.c: x86 motion compensation ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/common/x86/mc.h b/common/x86/mc.h index 4d65b17c2..9242dabb4 100644 --- a/common/x86/mc.h +++ b/common/x86/mc.h @@ -1,7 +1,7 @@ /***************************************************************************** * mc.h: x86 motion compensation ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/common/x86/pixel-32.asm b/common/x86/pixel-32.asm index 7ad5e3e39..f742bd55b 100644 --- a/common/x86/pixel-32.asm +++ b/common/x86/pixel-32.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* pixel-32.asm: x86_32 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2023 x264 project +;* Copyright (C) 2003-2024 x264 project ;* ;* Authors: Loren Merritt ;* Laurent Aimar diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm index 4d57d5b5f..deeffb7ca 100644 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* pixel.asm: x86 pixel metrics ;***************************************************************************** -;* Copyright (C) 2003-2023 x264 project +;* Copyright (C) 2003-2024 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz diff --git a/common/x86/pixel.h b/common/x86/pixel.h index 84eb3be86..d948baa01 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -1,7 +1,7 @@ /***************************************************************************** * pixel.h: x86 pixel metrics ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git 
a/common/x86/predict-a.asm b/common/x86/predict-a.asm index 812ae940d..d9f395253 100644 --- a/common/x86/predict-a.asm +++ b/common/x86/predict-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* predict-a.asm: x86 intra prediction ;***************************************************************************** -;* Copyright (C) 2005-2023 x264 project +;* Copyright (C) 2005-2024 x264 project ;* ;* Authors: Loren Merritt ;* Holger Lubitz diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c index 40946c1df..1845db1d1 100644 --- a/common/x86/predict-c.c +++ b/common/x86/predict-c.c @@ -1,7 +1,7 @@ /***************************************************************************** * predict-c.c: intra prediction ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/common/x86/predict.h b/common/x86/predict.h index 7571b7455..ee3bc84d4 100644 --- a/common/x86/predict.h +++ b/common/x86/predict.h @@ -1,7 +1,7 @@ /***************************************************************************** * predict.h: x86 intra prediction ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm index 11f0a088a..12dad607e 100644 --- a/common/x86/quant-a.asm +++ b/common/x86/quant-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* quant-a.asm: x86 quantization and level-run ;***************************************************************************** -;* Copyright (C) 2005-2023 x264 project +;* Copyright (C) 2005-2024 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser diff --git a/common/x86/quant.h b/common/x86/quant.h index 89560e154..7b609ae89 100644 --- a/common/x86/quant.h +++ b/common/x86/quant.h @@ -1,7 +1,7 @@ /***************************************************************************** * quant.h: x86 quantization and level-run ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Loren Merritt * Fiona Glaser diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm index 93cccc3b1..e2bd24b76 100644 --- a/common/x86/sad-a.asm +++ b/common/x86/sad-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* sad-a.asm: x86 sad functions ;***************************************************************************** -;* Copyright (C) 2003-2023 x264 project +;* Copyright (C) 2003-2024 x264 project ;* ;* Authors: Loren Merritt ;* Fiona Glaser diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm index a19e5e738..fd2d0c0bb 100644 --- a/common/x86/sad16-a.asm +++ b/common/x86/sad16-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* sad16-a.asm: x86 high depth sad functions ;***************************************************************************** -;* Copyright (C) 2010-2023 x264 project +;* Copyright (C) 2010-2024 x264 project ;* ;* Authors: Oskar Arvidsson ;* Henrik Gramner diff --git a/common/x86/trellis-64.asm b/common/x86/trellis-64.asm index 69b0a010a..a6cf26b3d 100644 --- a/common/x86/trellis-64.asm +++ 
b/common/x86/trellis-64.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* trellis-64.asm: x86_64 trellis quantization ;***************************************************************************** -;* Copyright (C) 2012-2023 x264 project +;* Copyright (C) 2012-2024 x264 project ;* ;* Authors: Loren Merritt ;* diff --git a/common/x86/util.h b/common/x86/util.h index aaa527784..b2c794167 100644 --- a/common/x86/util.h +++ b/common/x86/util.h @@ -1,7 +1,7 @@ /***************************************************************************** * util.h: x86 inline asm ***************************************************************************** - * Copyright (C) 2008-2023 x264 project + * Copyright (C) 2008-2024 x264 project * * Authors: Fiona Glaser * Loren Merritt diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm index 708f78bf3..486bad68d 100644 --- a/common/x86/x86inc.asm +++ b/common/x86/x86inc.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86inc.asm: x264asm abstraction layer ;***************************************************************************** -;* Copyright (C) 2005-2023 x264 project +;* Copyright (C) 2005-2024 x264 project ;* ;* Authors: Loren Merritt ;* Henrik Gramner @@ -111,7 +111,7 @@ %endif %define HAVE_PRIVATE_EXTERN 1 -%ifdef __NASM_VER__ +%ifdef __NASM_VERSION_ID__ %use smartalign %if __NASM_VERSION_ID__ < 0x020e0000 ; 2.14 %define HAVE_PRIVATE_EXTERN 0 @@ -239,7 +239,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %elif PIC call $+5 ; special-cased to not affect the RSB on most CPU:s pop %1 - add %1, (%2)-$+1 + add %1, -$+1+%2 %else mov %1, %2 %endif @@ -393,7 +393,24 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endif %endmacro -%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only) +%macro RESET_STACK_STATE 0 + %ifidn rstk, rsp + %assign stack_offset stack_offset - stack_size_padded + %else + %xdefine rstk rsp + %endif + %assign stack_size 0 + %assign stack_size_padded 0 + %assign xmm_regs_used 0 +%endmacro + +%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs + RESET_STACK_STATE + %ifnum %2 + %if mmsize != 8 + %assign xmm_regs_used %2 + %endif + %endif %ifnum %1 %if %1 != 0 %assign %%pad 0 @@ -403,11 +420,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endif %if WIN64 %assign %%pad %%pad + 32 ; shadow space - %if mmsize != 8 - %assign xmm_regs_used %2 - %if xmm_regs_used > 8 - %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers - %endif + %if xmm_regs_used > 8 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers %endif %endif %if required_stack_alignment <= STACK_ALIGNMENT @@ -503,35 +517,62 @@ DECLARE_REG 14, R13, 120 %endif %endmacro -%macro WIN64_PUSH_XMM 0 - ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. - %if xmm_regs_used > 6 + high_mm_regs - movaps [rstk + stack_offset + 8], xmm6 - %endif - %if xmm_regs_used > 7 + high_mm_regs - movaps [rstk + stack_offset + 24], xmm7 - %endif - %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 - %if %%xmm_regs_on_stack > 0 - %assign %%i 8 - %rep %%xmm_regs_on_stack - movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i - %assign %%i %%i+1 - %endrep +; Push XMM registers to the stack. If no argument is specified all used register +; will be pushed, otherwise only push previously unpushed registers. 
+%macro WIN64_PUSH_XMM 0-2 ; new_xmm_regs_used, xmm_regs_pushed + %if mmsize != 8 + %if %0 == 2 + %assign %%pushed %2 + %assign xmm_regs_used %1 + %elif %0 == 1 + %assign %%pushed xmm_regs_used + %assign xmm_regs_used %1 + %else + %assign %%pushed 0 + %endif + ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. + %if %%pushed <= 6 + high_mm_regs && xmm_regs_used > 6 + high_mm_regs + movaps [rstk + stack_offset + 8], xmm6 + %endif + %if %%pushed <= 7 + high_mm_regs && xmm_regs_used > 7 + high_mm_regs + movaps [rstk + stack_offset + 24], xmm7 + %endif + %assign %%pushed %%pushed - high_mm_regs - 8 + %if %%pushed < 0 + %assign %%pushed 0 + %endif + %assign %%regs_to_push xmm_regs_used - %%pushed - high_mm_regs - 8 + %if %%regs_to_push > 0 + ASSERT (%%regs_to_push + %%pushed) * 16 <= stack_size_padded - stack_size - 32 + %assign %%i %%pushed + 8 + %rep %%regs_to_push + movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i + %assign %%i %%i+1 + %endrep + %endif %endif %endmacro -%macro WIN64_SPILL_XMM 1 - %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 + high_mm_regs - %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 - %if %%xmm_regs_on_stack > 0 - ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. - %assign %%pad %%xmm_regs_on_stack*16 + 32 - %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) - SUB rsp, stack_size_padded +; Allocated stack space for XMM registers and push all, or a subset, of those +%macro WIN64_SPILL_XMM 1-2 ; xmm_regs_used, xmm_regs_reserved + RESET_STACK_STATE + %if mmsize != 8 + %assign xmm_regs_used %1 + ASSERT xmm_regs_used <= 16 + high_mm_regs + %if %0 == 2 + ASSERT %2 >= %1 + %assign %%xmm_regs_on_stack %2 - high_mm_regs - 8 + %else + %assign %%xmm_regs_on_stack %1 - high_mm_regs - 8 + %endif + %if %%xmm_regs_on_stack > 0 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. + %assign %%pad %%xmm_regs_on_stack*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) + SUB rsp, stack_size_padded + %endif + WIN64_PUSH_XMM %endif - WIN64_PUSH_XMM %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 0 @@ -562,9 +603,7 @@ DECLARE_REG 14, R13, 120 %macro WIN64_RESTORE_XMM 0 WIN64_RESTORE_XMM_INTERNAL - %assign stack_offset (stack_offset-stack_size_padded) - %assign stack_size_padded 0 - %assign xmm_regs_used 0 + RESET_STACK_STATE %endmacro %define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs @@ -599,12 +638,11 @@ DECLARE_REG 14, R13, 72 %macro PROLOGUE 2-5+ 0, 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 - %assign xmm_regs_used %3 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 PUSH_IF_USED 9, 10, 11, 12, 13, 14 - ALLOC_STACK %4 + ALLOC_STACK %4, %3 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 %if %0 > 4 %ifnum %4 @@ -668,7 +706,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 SETUP_STACK_POINTER %4 ASSERT regs_used <= 7 PUSH_IF_USED 3, 4, 5, 6 - ALLOC_STACK %4 + ALLOC_STACK %4, %3 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 %if %0 > 4 %ifnum %4 @@ -701,13 +739,21 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %endif ;====================================================================== %if WIN64 == 0 - %macro WIN64_SPILL_XMM 1 + %macro WIN64_SPILL_XMM 1-2 + RESET_STACK_STATE + %if mmsize != 8 + %assign xmm_regs_used %1 + %endif %endmacro %macro WIN64_RESTORE_XMM_INTERNAL 0 %endmacro %macro WIN64_RESTORE_XMM 0 + RESET_STACK_STATE %endmacro - %macro WIN64_PUSH_XMM 0 + %macro WIN64_PUSH_XMM 0-2 + %if mmsize != 8 && %0 >= 1 + %assign xmm_regs_used %1 + %endif %endmacro %endif @@ -827,16 +873,16 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %macro cextern 1 %xdefine %1 mangle(private_prefix %+ _ %+ %1) - CAT_XDEFINE cglobaled_, %1, 1 + CAT_XDEFINE cglobaled_, %1, 2 extern %1 %endmacro -; like cextern, but without the prefix +; Like cextern, but without the prefix. This should be used for symbols from external libraries. %macro cextern_naked 1 %ifdef PREFIX %xdefine %1 mangle(%1) %endif - CAT_XDEFINE cglobaled_, %1, 1 + CAT_XDEFINE cglobaled_, %1, 3 extern %1 %endmacro @@ -852,9 +898,26 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %1: %2 %endmacro -; This is needed for ELF, otherwise the GNU linker assumes the stack is executable by default. %if FORMAT_ELF + ; The GNU linker assumes the stack is executable by default. [SECTION .note.GNU-stack noalloc noexec nowrite progbits] + + %ifdef __NASM_VERSION_ID__ + %if __NASM_VERSION_ID__ >= 0x020e0300 ; 2.14.03 + %if ARCH_X86_64 + ; Control-flow Enforcement Technology (CET) properties. + [SECTION .note.gnu.property alloc noexec nowrite note align=gprsize] + dd 0x00000004 ; n_namesz + dd gprsize + 8 ; n_descsz + dd 0x00000005 ; n_type = NT_GNU_PROPERTY_TYPE_0 + db "GNU",0 ; n_name + dd 0xc0000002 ; pr_type = GNU_PROPERTY_X86_FEATURE_1_AND + dd 0x00000004 ; pr_datasz + dd 0x00000002 ; pr_data = GNU_PROPERTY_X86_FEATURE_1_SHSTK + dd 0x00000000 ; pr_padding + %endif + %endif + %endif %endif ; Tell debuggers how large the function was. 
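; [Editorial sketch, not part of the patch] The reworked spill macros above can
; now reserve stack space for more XMM registers than are pushed immediately,
; with the remainder pushed later; "foo" below is a hypothetical function used
; only to illustrate the intended call sequence:
;
;   cglobal foo, 4, 7, 0
;       WIN64_SPILL_XMM  8, 12   ; reserve space for 12 regs, push xmm6-xmm7 now
;       ...                      ; code clobbering only xmm0-xmm7
;       WIN64_PUSH_XMM   12, 8   ; also push xmm8-xmm11; xmm6/xmm7 are skipped
;       ...
;       WIN64_RESTORE_XMM        ; restore all pushed regs and reset stack state
;       RET
;
; On non-WIN64 targets these macros collapse to the RESET_STACK_STATE
; bookkeeping stubs defined above, so the same sequence assembles everywhere.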
@@ -877,33 +940,35 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, ; cpuflags -%assign cpuflags_mmx (1<<0) -%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx -%assign cpuflags_3dnow (1<<2) | cpuflags_mmx -%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow -%assign cpuflags_sse (1<<4) | cpuflags_mmx2 -%assign cpuflags_sse2 (1<<5) | cpuflags_sse -%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 -%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 -%assign cpuflags_aesni (1<<12)| cpuflags_sse42 -%assign cpuflags_gfni (1<<13)| cpuflags_sse42 -%assign cpuflags_avx (1<<14)| cpuflags_sse42 -%assign cpuflags_xop (1<<15)| cpuflags_avx -%assign cpuflags_fma4 (1<<16)| cpuflags_avx -%assign cpuflags_fma3 (1<<17)| cpuflags_avx -%assign cpuflags_bmi1 (1<<18)| cpuflags_avx|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<19)| cpuflags_bmi1 -%assign cpuflags_avx2 (1<<20)| cpuflags_fma3|cpuflags_bmi2 -%assign cpuflags_avx512 (1<<21)| cpuflags_avx2 ; F, CD, BW, DQ, VL - -%assign cpuflags_cache32 (1<<22) -%assign cpuflags_cache64 (1<<23) -%assign cpuflags_aligned (1<<24) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<25) +%assign cpuflags_mmx (1<<0) +%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx +%assign cpuflags_3dnow (1<<2) | cpuflags_mmx +%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow +%assign cpuflags_sse (1<<4) | cpuflags_mmx2 +%assign cpuflags_sse2 (1<<5) | cpuflags_sse +%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10) | cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11) | cpuflags_sse4 +%assign cpuflags_aesni (1<<12) | cpuflags_sse42 +%assign cpuflags_clmul (1<<13) | cpuflags_sse42 +%assign cpuflags_gfni (1<<14) | cpuflags_aesni|cpuflags_clmul +%assign cpuflags_avx (1<<15) | cpuflags_sse42 +%assign cpuflags_xop (1<<16) | cpuflags_avx +%assign cpuflags_fma4 (1<<17) | cpuflags_avx +%assign cpuflags_fma3 (1<<18) | cpuflags_avx +%assign cpuflags_bmi1 (1<<19) | cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<20) | cpuflags_bmi1 +%assign cpuflags_avx2 (1<<21) | cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<22) | cpuflags_avx2 ; F, CD, BW, DQ, VL +%assign cpuflags_avx512icl (1<<23) | cpuflags_avx512|cpuflags_gfni ; VNNI, IFMA, VBMI, VBMI2, VPOPCNTDQ, BITALG, VAES, VPCLMULQDQ + +%assign cpuflags_cache32 (1<<24) +%assign cpuflags_cache64 (1<<25) +%assign cpuflags_aligned (1<<26) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<27) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. 
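; [Editorial note, not part of the patch] The renumbered flags keep the
; implication chains intact, so feature tests compose as before; for example,
; a block gated on the new dedicated carry-less-multiply flag:
;
;   %if cpuflag(clmul)
;       pclmulqdq m0, m1, 0x00
;   %endif
;
; is also enabled for avx512icl functions, since cpuflags_avx512icl includes
; cpuflags_gfni, which in turn includes cpuflags_clmul and cpuflags_aesni.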
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) @@ -945,13 +1010,13 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %endif %if ARCH_X86_64 || cpuflag(sse2) - %ifdef __NASM_VER__ + %ifdef __NASM_VERSION_ID__ ALIGNMODE p6 %else CPU amdnop %endif %else - %ifdef __NASM_VER__ + %ifdef __NASM_VERSION_ID__ ALIGNMODE nop %else CPU basicnop @@ -1041,6 +1106,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %if WIN64 AVX512_MM_PERMUTATION 6 ; Swap callee-saved registers with volatile registers %endif + %xdefine bcstw 1to8 + %xdefine bcstd 1to4 + %xdefine bcstq 1to2 %endmacro %macro INIT_YMM 0-1+ @@ -1054,6 +1122,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, INIT_CPUFLAGS %1 DEFINE_MMREGS ymm AVX512_MM_PERMUTATION + %xdefine bcstw 1to16 + %xdefine bcstd 1to8 + %xdefine bcstq 1to4 %endmacro %macro INIT_ZMM 0-1+ @@ -1067,6 +1138,9 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, INIT_CPUFLAGS %1 DEFINE_MMREGS zmm AVX512_MM_PERMUTATION + %xdefine bcstw 1to32 + %xdefine bcstd 1to16 + %xdefine bcstq 1to8 %endmacro INIT_XMM @@ -1203,12 +1277,27 @@ INIT_XMM %endmacro %macro call_internal 2 %xdefine %%i %2 + %define %%j %%i %ifndef cglobaled_%2 %ifdef cglobaled_%1 %xdefine %%i %1 %endif + %elif FORMAT_ELF + %if ARCH_X86_64 + %if cglobaled_%2 >= 2 + ; Always emit PLT relocations when calling external functions, + ; the linker will eliminate unnecessary PLT indirections anyway. + %define %%j %%i wrt ..plt + %endif + %elif PIC && cglobaled_%2 == 3 + ; Go through the GOT for functions declared using cextern_naked with + ; PIC, as such functions presumably exists in external libraries. + extern _GLOBAL_OFFSET_TABLE_ + LEA eax, $$+_GLOBAL_OFFSET_TABLE_ wrt ..gotpc + %define %%j [eax+%%i wrt ..got] + %endif %endif - call %%i + call %%j LOAD_MM_PERMUTATION %%i %endmacro @@ -1588,18 +1677,18 @@ AVX_INSTR orps, sse, 1, 0, 1 AVX_INSTR pabsb, ssse3 AVX_INSTR pabsd, ssse3 AVX_INSTR pabsw, ssse3 -AVX_INSTR packsswb, mmx, 0, 0, 0 AVX_INSTR packssdw, mmx, 0, 0, 0 -AVX_INSTR packuswb, mmx, 0, 0, 0 +AVX_INSTR packsswb, mmx, 0, 0, 0 AVX_INSTR packusdw, sse4, 0, 0, 0 +AVX_INSTR packuswb, mmx, 0, 0, 0 AVX_INSTR paddb, mmx, 0, 0, 1 -AVX_INSTR paddw, mmx, 0, 0, 1 AVX_INSTR paddd, mmx, 0, 0, 1 AVX_INSTR paddq, sse2, 0, 0, 1 AVX_INSTR paddsb, mmx, 0, 0, 1 AVX_INSTR paddsw, mmx, 0, 0, 1 AVX_INSTR paddusb, mmx, 0, 0, 1 AVX_INSTR paddusw, mmx, 0, 0, 1 +AVX_INSTR paddw, mmx, 0, 0, 1 AVX_INSTR palignr, ssse3, 0, 1, 0 AVX_INSTR pand, mmx, 0, 0, 1 AVX_INSTR pandn, mmx, 0, 0, 0 @@ -1607,71 +1696,71 @@ AVX_INSTR pavgb, mmx2, 0, 0, 1 AVX_INSTR pavgw, mmx2, 0, 0, 1 AVX_INSTR pblendvb, sse4, 0, 1, 0 ; last operand must be xmm0 with legacy encoding AVX_INSTR pblendw, sse4, 0, 1, 0 -AVX_INSTR pclmulqdq, fnord, 0, 1, 0 -AVX_INSTR pclmulhqhqdq, fnord, 0, 0, 0 -AVX_INSTR pclmulhqlqdq, fnord, 0, 0, 0 -AVX_INSTR pclmullqhqdq, fnord, 0, 0, 0 -AVX_INSTR pclmullqlqdq, fnord, 0, 0, 0 -AVX_INSTR pcmpestri, sse42 -AVX_INSTR pcmpestrm, sse42 -AVX_INSTR pcmpistri, sse42 -AVX_INSTR pcmpistrm, sse42 +AVX_INSTR pclmulhqhqdq, clmul, 0, 0, 0 +AVX_INSTR pclmulhqlqdq, clmul, 0, 0, 0 +AVX_INSTR pclmullqhqdq, clmul, 0, 0, 0 +AVX_INSTR pclmullqlqdq, clmul, 0, 0, 0 +AVX_INSTR pclmulqdq, clmul, 0, 1, 0 AVX_INSTR pcmpeqb, mmx, 0, 0, 1 -AVX_INSTR pcmpeqw, mmx, 0, 0, 1 AVX_INSTR pcmpeqd, mmx, 0, 0, 1 AVX_INSTR pcmpeqq, sse4, 0, 0, 1 +AVX_INSTR pcmpeqw, mmx, 0, 0, 1 +AVX_INSTR pcmpestri, sse42 
+AVX_INSTR pcmpestrm, sse42 AVX_INSTR pcmpgtb, mmx, 0, 0, 0 -AVX_INSTR pcmpgtw, mmx, 0, 0, 0 AVX_INSTR pcmpgtd, mmx, 0, 0, 0 AVX_INSTR pcmpgtq, sse42, 0, 0, 0 +AVX_INSTR pcmpgtw, mmx, 0, 0, 0 +AVX_INSTR pcmpistri, sse42 +AVX_INSTR pcmpistrm, sse42 AVX_INSTR pextrb, sse4 AVX_INSTR pextrd, sse4 AVX_INSTR pextrq, sse4 AVX_INSTR pextrw, mmx2 -AVX_INSTR phaddw, ssse3, 0, 0, 0 AVX_INSTR phaddd, ssse3, 0, 0, 0 AVX_INSTR phaddsw, ssse3, 0, 0, 0 +AVX_INSTR phaddw, ssse3, 0, 0, 0 AVX_INSTR phminposuw, sse4 -AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR phsubd, ssse3, 0, 0, 0 AVX_INSTR phsubsw, ssse3, 0, 0, 0 +AVX_INSTR phsubw, ssse3, 0, 0, 0 AVX_INSTR pinsrb, sse4, 0, 1, 0 AVX_INSTR pinsrd, sse4, 0, 1, 0 AVX_INSTR pinsrq, sse4, 0, 1, 0 AVX_INSTR pinsrw, mmx2, 0, 1, 0 -AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaddubsw, ssse3, 0, 0, 0 +AVX_INSTR pmaddwd, mmx, 0, 0, 1 AVX_INSTR pmaxsb, sse4, 0, 0, 1 -AVX_INSTR pmaxsw, mmx2, 0, 0, 1 AVX_INSTR pmaxsd, sse4, 0, 0, 1 +AVX_INSTR pmaxsw, mmx2, 0, 0, 1 AVX_INSTR pmaxub, mmx2, 0, 0, 1 -AVX_INSTR pmaxuw, sse4, 0, 0, 1 AVX_INSTR pmaxud, sse4, 0, 0, 1 +AVX_INSTR pmaxuw, sse4, 0, 0, 1 AVX_INSTR pminsb, sse4, 0, 0, 1 -AVX_INSTR pminsw, mmx2, 0, 0, 1 AVX_INSTR pminsd, sse4, 0, 0, 1 +AVX_INSTR pminsw, mmx2, 0, 0, 1 AVX_INSTR pminub, mmx2, 0, 0, 1 -AVX_INSTR pminuw, sse4, 0, 0, 1 AVX_INSTR pminud, sse4, 0, 0, 1 +AVX_INSTR pminuw, sse4, 0, 0, 1 AVX_INSTR pmovmskb, mmx2 -AVX_INSTR pmovsxbw, sse4 AVX_INSTR pmovsxbd, sse4 AVX_INSTR pmovsxbq, sse4 +AVX_INSTR pmovsxbw, sse4 +AVX_INSTR pmovsxdq, sse4 AVX_INSTR pmovsxwd, sse4 AVX_INSTR pmovsxwq, sse4 -AVX_INSTR pmovsxdq, sse4 -AVX_INSTR pmovzxbw, sse4 AVX_INSTR pmovzxbd, sse4 AVX_INSTR pmovzxbq, sse4 +AVX_INSTR pmovzxbw, sse4 +AVX_INSTR pmovzxdq, sse4 AVX_INSTR pmovzxwd, sse4 AVX_INSTR pmovzxwq, sse4 -AVX_INSTR pmovzxdq, sse4 AVX_INSTR pmuldq, sse4, 0, 0, 1 AVX_INSTR pmulhrsw, ssse3, 0, 0, 1 AVX_INSTR pmulhuw, mmx2, 0, 0, 1 AVX_INSTR pmulhw, mmx, 0, 0, 1 -AVX_INSTR pmullw, mmx, 0, 0, 1 AVX_INSTR pmulld, sse4, 0, 0, 1 +AVX_INSTR pmullw, mmx, 0, 0, 1 AVX_INSTR pmuludq, sse2, 0, 0, 1 AVX_INSTR por, mmx, 0, 0, 1 AVX_INSTR psadbw, mmx2, 0, 0, 1 @@ -1680,35 +1769,35 @@ AVX_INSTR pshufd, sse2 AVX_INSTR pshufhw, sse2 AVX_INSTR pshuflw, sse2 AVX_INSTR psignb, ssse3, 0, 0, 0 -AVX_INSTR psignw, ssse3, 0, 0, 0 AVX_INSTR psignd, ssse3, 0, 0, 0 -AVX_INSTR psllw, mmx, 0, 0, 0 +AVX_INSTR psignw, ssse3, 0, 0, 0 AVX_INSTR pslld, mmx, 0, 0, 0 -AVX_INSTR psllq, mmx, 0, 0, 0 AVX_INSTR pslldq, sse2, 0, 0, 0 -AVX_INSTR psraw, mmx, 0, 0, 0 +AVX_INSTR psllq, mmx, 0, 0, 0 +AVX_INSTR psllw, mmx, 0, 0, 0 AVX_INSTR psrad, mmx, 0, 0, 0 -AVX_INSTR psrlw, mmx, 0, 0, 0 +AVX_INSTR psraw, mmx, 0, 0, 0 AVX_INSTR psrld, mmx, 0, 0, 0 -AVX_INSTR psrlq, mmx, 0, 0, 0 AVX_INSTR psrldq, sse2, 0, 0, 0 +AVX_INSTR psrlq, mmx, 0, 0, 0 +AVX_INSTR psrlw, mmx, 0, 0, 0 AVX_INSTR psubb, mmx, 0, 0, 0 -AVX_INSTR psubw, mmx, 0, 0, 0 AVX_INSTR psubd, mmx, 0, 0, 0 AVX_INSTR psubq, sse2, 0, 0, 0 AVX_INSTR psubsb, mmx, 0, 0, 0 AVX_INSTR psubsw, mmx, 0, 0, 0 AVX_INSTR psubusb, mmx, 0, 0, 0 AVX_INSTR psubusw, mmx, 0, 0, 0 +AVX_INSTR psubw, mmx, 0, 0, 0 AVX_INSTR ptest, sse4 AVX_INSTR punpckhbw, mmx, 0, 0, 0 -AVX_INSTR punpckhwd, mmx, 0, 0, 0 AVX_INSTR punpckhdq, mmx, 0, 0, 0 AVX_INSTR punpckhqdq, sse2, 0, 0, 0 +AVX_INSTR punpckhwd, mmx, 0, 0, 0 AVX_INSTR punpcklbw, mmx, 0, 0, 0 -AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR punpckldq, mmx, 0, 0, 0 AVX_INSTR punpcklqdq, sse2, 0, 0, 0 +AVX_INSTR punpcklwd, mmx, 0, 0, 0 AVX_INSTR pxor, mmx, 0, 0, 1 AVX_INSTR rcpps, sse, 1 AVX_INSTR rcpss, 
sse, 1, 0, 0 @@ -1740,8 +1829,8 @@ AVX_INSTR xorps, sse, 1, 0, 1 ; 3DNow instructions, for sharing code between AVX, SSE and 3DN AVX_INSTR pfadd, 3dnow, 1, 0, 1 -AVX_INSTR pfsub, 3dnow, 1, 0, 0 AVX_INSTR pfmul, 3dnow, 1, 0, 1 +AVX_INSTR pfsub, 3dnow, 1, 0, 0 ;%1 == instruction ;%2 == minimal instruction set @@ -1763,9 +1852,10 @@ AVX_INSTR pfmul, 3dnow, 1, 0, 1 GPR_INSTR andn, bmi1 GPR_INSTR bextr, bmi1 GPR_INSTR blsi, bmi1 -GPR_INSTR blsr, bmi1 GPR_INSTR blsmsk, bmi1 +GPR_INSTR blsr, bmi1 GPR_INSTR bzhi, bmi2 +GPR_INSTR crc32, sse42 GPR_INSTR mulx, bmi2 GPR_INSTR pdep, bmi2 GPR_INSTR pext, bmi2 @@ -1806,9 +1896,9 @@ GPR_INSTR shrx, bmi2 %endmacro %endmacro -FMA_INSTR pmacsww, pmullw, paddw -FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation -FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation +FMA_INSTR pmacsww, pmullw, paddw FMA_INSTR pmadcswd, pmaddwd, paddd ; Macros for consolidating FMA3 and FMA4 using 4-operand (dst, src1, src2, src3) syntax. diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm index d785a4f25..8226e7af7 100644 --- a/common/x86/x86util.asm +++ b/common/x86/x86util.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* x86util.asm: x86 utility macros ;***************************************************************************** -;* Copyright (C) 2008-2023 x264 project +;* Copyright (C) 2008-2024 x264 project ;* ;* Authors: Holger Lubitz ;* Loren Merritt diff --git a/config.guess b/config.guess index 14c129632..7eec710e2 100755 --- a/config.guess +++ b/config.guess @@ -934,6 +934,9 @@ EOF ia64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; + loongarch32:Linux:*:* | loongarch64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; m32r*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; diff --git a/config.sub b/config.sub index 72e9265be..3093784c0 100755 --- a/config.sub +++ b/config.sub @@ -274,6 +274,7 @@ case $basic_machine in | ip2k | iq2000 \ | le32 | le64 \ | lm32 \ + | loongarch32 | loongarch64 \ | m32c | m32r | m32rle | m68000 | m68k | m88k \ | maxq | mb | microblaze | microblazeel | mcore | mep | metag \ | mips | mipsbe | mipseb | mipsel | mipsle \ @@ -389,6 +390,7 @@ case $basic_machine in | ip2k-* | iq2000-* \ | le32-* | le64-* \ | lm32-* \ + | loongarch32-* | loongarch64-* \ | m32c-* | m32r-* | m32rle-* \ | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \ diff --git a/configure b/configure index e242e73cb..c1fb599f0 100755 --- a/configure +++ b/configure @@ -411,7 +411,8 @@ NL=" # list of all preprocessor HAVE values we can define CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON AARCH64 BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F SWSCALE \ LAVF FFMS GPAC AVS GPL VECTOREXT INTERLACED CPU_COUNT OPENCL THP LSMASH X86_INLINE_ASM AS_FUNC INTEL_DISPATCHER \ - MSA MMAP WINRT VSX ARM_INLINE_ASM STRTOK_R CLOCK_GETTIME BITDEPTH8 BITDEPTH10" + MSA LSX MMAP WINRT VSX ARM_INLINE_ASM STRTOK_R CLOCK_GETTIME BITDEPTH8 BITDEPTH10 \ + SVE SVE2" # parse options @@ -822,6 +823,12 @@ case $host_cpu in AS="${AS-${CC}}" AS_EXT=".c" ;; + loongarch*) + ARCH="LOONGARCH" + ASFLAGS="$ASFLAGS -c" + AS="${AS-${CC}}" + AS_EXT=".S" + ;; aarch64|arm64*) ARCH="AARCH64" stack_alignment=16 @@ -997,6 +1004,8 @@ if [ $asm = auto -a $ARCH = AARCH64 ] ; then elif cc_check '' '' '__asm__("cmeq v0.8h, v0.8h, #0");' ; then define 
HAVE_AARCH64 define HAVE_NEON + cc_check '' '' '__asm__(".arch armv8.2-a+sve \n ptrue p0.b, vl16");' && define HAVE_SVE + cc_check '' '' '__asm__(".arch armv8.2-a+sve2 \n smlalb z10.s, z2.h, z1.h");' && define HAVE_SVE2 ASFLAGS="$ASFLAGS -c" else echo "no NEON support, try adding -mfpu=neon to CFLAGS" @@ -1024,6 +1033,13 @@ if [ $asm = auto -a $ARCH = MIPS ] ; then fi fi +if [ $asm = auto -a $ARCH = LOONGARCH ] ; then + if cc_check '' '' '__asm__("xvadd.b $xr0, $xr1, $xr2");' ; then + # Use HAVE_LSX as the base flag, compiler support LA SIMD(LSX and LASX) + define HAVE_LSX + fi +fi + [ $asm = no ] && AS="" [ "x$AS" = x ] && asm="no" || asm="yes" @@ -1674,7 +1690,7 @@ cat conftest.log >> config.log cat conftest.log [ "$SRCPATH" != "." ] && ln -sf ${SRCPATH}/Makefile ./Makefile -mkdir -p common/{aarch64,arm,mips,ppc,x86} encoder extras filters/video input output tools +mkdir -p common/{aarch64,arm,mips,ppc,x86,loongarch} encoder extras filters/video input output tools echo echo "You can run 'make' or 'make fprofiled' now." diff --git a/encoder/analyse.c b/encoder/analyse.c index a615a89ce..2acc9d6c3 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -1,7 +1,7 @@ /***************************************************************************** * analyse.c: macroblock analysis ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/encoder/analyse.h b/encoder/analyse.h index b143523a6..44c0a70fc 100644 --- a/encoder/analyse.h +++ b/encoder/analyse.h @@ -1,7 +1,7 @@ /***************************************************************************** * analyse.h: macroblock analysis ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/encoder/api.c b/encoder/api.c index afc385373..20b14419f 100644 --- a/encoder/api.c +++ b/encoder/api.c @@ -1,7 +1,7 @@ /***************************************************************************** * api.c: bit depth independent interface ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Vittorio Giovara * Luca Barbato diff --git a/encoder/cabac.c b/encoder/cabac.c index 914387510..b96ee8122 100644 --- a/encoder/cabac.c +++ b/encoder/cabac.c @@ -1,7 +1,7 @@ /***************************************************************************** * cabac.c: cabac bitstream writing ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/encoder/cavlc.c b/encoder/cavlc.c index d893c4b26..74dbb75bd 100644 --- a/encoder/cavlc.c +++ b/encoder/cavlc.c @@ -1,7 +1,7 @@ /***************************************************************************** * cavlc.c: cavlc bitstream writing ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/encoder/encoder.c b/encoder/encoder.c index cf0da6801..a1d03f976 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -1,7 +1,7 @@ 
/***************************************************************************** * encoder.c: top-level encoder functions ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/encoder/lookahead.c b/encoder/lookahead.c index d49ba09e9..844cc6e1c 100644 --- a/encoder/lookahead.c +++ b/encoder/lookahead.c @@ -1,7 +1,7 @@ /***************************************************************************** * lookahead.c: high-level lookahead functions ***************************************************************************** - * Copyright (C) 2010-2023 Avail Media and x264 project + * Copyright (C) 2010-2024 Avail Media and x264 project * * Authors: Michael Kazmier * Alex Giladi diff --git a/encoder/macroblock.c b/encoder/macroblock.c index fffc9ad16..fcdd2bfc4 100644 --- a/encoder/macroblock.c +++ b/encoder/macroblock.c @@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.c: macroblock encoding ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/encoder/macroblock.h b/encoder/macroblock.h index 291d8eb35..453cc27fc 100644 --- a/encoder/macroblock.h +++ b/encoder/macroblock.h @@ -1,7 +1,7 @@ /***************************************************************************** * macroblock.h: macroblock encoding ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/encoder/me.c b/encoder/me.c index 748dcaac0..e715da080 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -1,7 +1,7 @@ /***************************************************************************** * me.c: motion estimation ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/encoder/me.h b/encoder/me.h index afe94ad57..34b312f06 100644 --- a/encoder/me.h +++ b/encoder/me.h @@ -1,7 +1,7 @@ /***************************************************************************** * me.h: motion estimation ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c index ff8f27f5a..109ba8ff9 100644 --- a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -1,7 +1,7 @@ /***************************************************************************** * ratecontrol.c: ratecontrol ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Loren Merritt * Michael Niedermayer @@ -156,6 +156,7 @@ struct x264_ratecontrol_t /* MBRC stuff */ volatile float frame_size_estimated; /* Access to this variable must be atomic: double is * not atomic on all arches we care about */ + volatile float bits_so_far; double frame_size_maximum; /* Maximum frame size due to MinCR */ double frame_size_planned; double slice_size_planned; @@ -1629,20 +1630,24 @@ int x264_ratecontrol_mb( 
x264_t *h, int bits ) float step_size = 0.5f; float slice_size_planned = h->param.b_sliced_threads ? rc->slice_size_planned : rc->frame_size_planned; float bits_so_far = row_bits_so_far( h, y ); + rc->bits_so_far = bits_so_far; float max_frame_error = x264_clip3f( 1.0 / h->mb.i_mb_height, 0.05, 0.25 ); float max_frame_size = rc->frame_size_maximum - rc->frame_size_maximum * max_frame_error; max_frame_size = X264_MIN( max_frame_size, rc->buffer_fill - rc->buffer_rate * max_frame_error ); float size_of_other_slices = 0; if( h->param.b_sliced_threads ) { - float size_of_other_slices_planned = 0; + float bits_so_far_of_other_slices = 0; for( int i = 0; i < h->param.i_threads; i++ ) if( h != h->thread[i] ) { size_of_other_slices += h->thread[i]->rc->frame_size_estimated; - size_of_other_slices_planned += h->thread[i]->rc->slice_size_planned; + bits_so_far_of_other_slices += h->thread[i]->rc->bits_so_far; } - float weight = rc->slice_size_planned / rc->frame_size_planned; + float weight = x264_clip3f( (bits_so_far_of_other_slices + rc->frame_size_estimated) / (size_of_other_slices + rc->frame_size_estimated), 0.0, 1.0 ); + float frame_size_planned = rc->frame_size_planned - rc->frame_size_planned * max_frame_error; + float size_of_other_slices_planned = X264_MIN( frame_size_planned, max_frame_size ) - rc->slice_size_planned; + size_of_other_slices_planned = X264_MAX( size_of_other_slices_planned, bits_so_far_of_other_slices ); size_of_other_slices = (size_of_other_slices - size_of_other_slices_planned) * weight + size_of_other_slices_planned; } if( y < h->i_threadslice_end-1 ) @@ -2233,7 +2238,7 @@ static void update_vbv_plan( x264_t *h, int overhead ) rcc->buffer_fill -= overhead; } -// apply VBV constraints and clip qscale to between lmin and lmax +// clip qscale to between lmin and lmax static double clip_qscale( x264_t *h, int pict_type, double q ) { x264_ratecontrol_t *rcc = h->rc; @@ -2241,13 +2246,32 @@ static double clip_qscale( x264_t *h, int pict_type, double q ) double lmax = rcc->lmax[pict_type]; if( rcc->rate_factor_max_increment ) lmax = X264_MIN( lmax, qp2qscale( rcc->qp_novbv + rcc->rate_factor_max_increment ) ); - double q0 = q; + if( lmin==lmax ) + return lmin; + else if( rcc->b_2pass ) + { + double min2 = log( lmin ); + double max2 = log( lmax ); + q = (log(q) - min2)/(max2-min2) - 0.5; + q = 1.0/(1.0 + exp( -4*q )); + q = q*(max2-min2) + min2; + return exp( q ); + } + else + return x264_clip3f( q, lmin, lmax ); +} + +// apply VBV constraints +static double vbv_pass1( x264_t *h, int pict_type, double q ) +{ + x264_ratecontrol_t *rcc = h->rc; /* B-frames are not directly subject to VBV, * since they are controlled by the P-frames' QPs. 
*/ if( rcc->b_vbv && rcc->last_satd > 0 ) { + double q0 = q; double fenc_cpb_duration = (double)h->fenc->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale; /* Lookahead VBV: raise the quantizer as necessary such that no frames in @@ -2365,29 +2389,11 @@ static double clip_qscale( x264_t *h, int pict_type, double q ) q = X264_MAX( q0/2, q ); } - /* Apply MinCR and buffer fill restrictions */ - double bits = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd ); - double frame_size_maximum = X264_MIN( rcc->frame_size_maximum, X264_MAX( rcc->buffer_fill, 0.001 ) ); - if( bits > frame_size_maximum ) - q *= bits / frame_size_maximum; - if( !rcc->b_vbv_min_rate ) q = X264_MAX( q0, q ); } - if( lmin==lmax ) - return lmin; - else if( rcc->b_2pass ) - { - double min2 = log( lmin ); - double max2 = log( lmax ); - q = (log(q) - min2)/(max2-min2) - 0.5; - q = 1.0/(1.0 + exp( -4*q )); - q = q*(max2-min2) + min2; - return exp( q ); - } - else - return x264_clip3f( q, lmin, lmax ); + return clip_qscale( h, pict_type, q ); } // update qscale for 1 frame based on actual bits used so far @@ -2449,9 +2455,18 @@ static float rate_estimate_qscale( x264_t *h ) rcc->frame_size_planned = qscale2bits( &rce, q ); else rcc->frame_size_planned = predict_size( rcc->pred_b_from_p, q, h->fref[1][h->i_ref[1]-1]->i_satd ); - /* Limit planned size by MinCR */ + + /* Apply MinCR and buffer fill restrictions */ if( rcc->b_vbv ) - rcc->frame_size_planned = X264_MIN( rcc->frame_size_planned, rcc->frame_size_maximum ); + { + double frame_size_maximum = X264_MIN( rcc->frame_size_maximum, X264_MAX( rcc->buffer_fill, 0.001 ) ); + if( rcc->frame_size_planned > frame_size_maximum ) + { + q *= rcc->frame_size_planned / frame_size_maximum; + rcc->frame_size_planned = frame_size_maximum; + } + } + rcc->frame_size_estimated = rcc->frame_size_planned; /* For row SATDs */ @@ -2613,8 +2628,7 @@ static float rate_estimate_qscale( x264_t *h ) } rcc->qp_novbv = qscale2qp( q ); - //FIXME use get_diff_limited_q() ? - q = clip_qscale( h, pict_type, q ); + q = vbv_pass1( h, pict_type, q ); } rcc->last_qscale_for[pict_type] = @@ -2628,12 +2642,21 @@ static float rate_estimate_qscale( x264_t *h ) else rcc->frame_size_planned = predict_size( &rcc->pred[h->sh.i_type], q, rcc->last_satd ); - /* Always use up the whole VBV in this case. */ - if( rcc->single_frame_vbv ) - rcc->frame_size_planned = rcc->buffer_rate; - /* Limit planned size by MinCR */ + /* Apply MinCR and buffer fill restrictions */ if( rcc->b_vbv ) - rcc->frame_size_planned = X264_MIN( rcc->frame_size_planned, rcc->frame_size_maximum ); + { + double frame_size_maximum = X264_MIN( rcc->frame_size_maximum, X264_MAX( rcc->buffer_fill, 0.001 ) ); + if( rcc->frame_size_planned > frame_size_maximum ) + { + q *= rcc->frame_size_planned / frame_size_maximum; + rcc->frame_size_planned = frame_size_maximum; + } + + /* Always use up the whole VBV in this case. 
*/ + if( rcc->single_frame_vbv ) + rcc->frame_size_planned = X264_MIN( rcc->buffer_rate, frame_size_maximum ); + } + rcc->frame_size_estimated = rcc->frame_size_planned; return q; } diff --git a/encoder/ratecontrol.h b/encoder/ratecontrol.h index 8691c8180..5a942fae2 100644 --- a/encoder/ratecontrol.h +++ b/encoder/ratecontrol.h @@ -1,7 +1,7 @@ /***************************************************************************** * ratecontrol.h: ratecontrol ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/encoder/rdo.c b/encoder/rdo.c index 5940459b5..25c1bd059 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -1,7 +1,7 @@ /***************************************************************************** * rdo.c: rate-distortion optimization ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Loren Merritt * Fiona Glaser diff --git a/encoder/set.c b/encoder/set.c index 1bd610121..70c67aeee 100644 --- a/encoder/set.c +++ b/encoder/set.c @@ -1,7 +1,7 @@ /***************************************************************************** * set: header writing ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt @@ -608,7 +608,7 @@ int x264_sei_version_write( x264_t *h, bs_t *s ) memcpy( payload, uuid, 16 ); sprintf( payload+16, "x264 - core %d%s - H.264/MPEG-4 AVC codec - " - "Copy%s 2003-2023 - http://www.videolan.org/x264.html - options: %s", + "Copy%s 2003-2024 - http://www.videolan.org/x264.html - options: %s", X264_BUILD, X264_VERSION, HAVE_GPL?"left":"right", opts ); length = strlen(payload)+1; diff --git a/encoder/set.h b/encoder/set.h index 8ac1a2760..d9c06f767 100644 --- a/encoder/set.h +++ b/encoder/set.h @@ -1,7 +1,7 @@ /***************************************************************************** * set.h: header writing ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/encoder/slicetype-cl.c b/encoder/slicetype-cl.c index 0d86fd742..d9dbababb 100644 --- a/encoder/slicetype-cl.c +++ b/encoder/slicetype-cl.c @@ -1,7 +1,7 @@ /***************************************************************************** * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead) ***************************************************************************** - * Copyright (C) 2012-2023 x264 project + * Copyright (C) 2012-2024 x264 project * * Authors: Steve Borho * diff --git a/encoder/slicetype-cl.h b/encoder/slicetype-cl.h index bfb89cbd3..9c8ea843a 100644 --- a/encoder/slicetype-cl.h +++ b/encoder/slicetype-cl.h @@ -1,7 +1,7 @@ /***************************************************************************** * slicetype-cl.h: OpenCL slicetype decision code (lowres lookahead) ***************************************************************************** - * Copyright (C) 2017-2023 x264 project + * Copyright (C) 2017-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/encoder/slicetype.c b/encoder/slicetype.c index 4df66c711..278a8fded 100644 --- a/encoder/slicetype.c +++ b/encoder/slicetype.c @@ -1,7 +1,7 @@ 
/***************************************************************************** * slicetype.c: lookahead analysis ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Fiona Glaser * Loren Merritt diff --git a/example.c b/example.c index 3e2e563fa..530f0549a 100644 --- a/example.c +++ b/example.c @@ -1,7 +1,7 @@ /***************************************************************************** * example.c: libx264 API usage example ***************************************************************************** - * Copyright (C) 2014-2023 x264 project + * Copyright (C) 2014-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/extras/intel_dispatcher.h b/extras/intel_dispatcher.h index d6d44913b..f7cb2a012 100644 --- a/extras/intel_dispatcher.h +++ b/extras/intel_dispatcher.h @@ -1,7 +1,7 @@ /***************************************************************************** * intel_dispatcher.h: intel compiler cpu dispatcher override ***************************************************************************** - * Copyright (C) 2014-2023 x264 project + * Copyright (C) 2014-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/filters/filters.c b/filters/filters.c index d02e44959..79c513c3c 100644 --- a/filters/filters.c +++ b/filters/filters.c @@ -1,7 +1,7 @@ /***************************************************************************** * filters.c: common filter functions ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Diogo Franco * Steven Walters diff --git a/filters/filters.h b/filters/filters.h index cadbe6d9d..6af397084 100644 --- a/filters/filters.h +++ b/filters/filters.h @@ -1,7 +1,7 @@ /***************************************************************************** * filters.h: common filter functions ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Diogo Franco * Steven Walters diff --git a/filters/video/cache.c b/filters/video/cache.c index d67780e04..be86e4f1f 100644 --- a/filters/video/cache.c +++ b/filters/video/cache.c @@ -1,7 +1,7 @@ /***************************************************************************** * cache.c: cache video filter ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/filters/video/crop.c b/filters/video/crop.c index 0111fdac9..8df49a8ce 100644 --- a/filters/video/crop.c +++ b/filters/video/crop.c @@ -1,7 +1,7 @@ /***************************************************************************** * crop.c: crop video filter ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * James Darnley diff --git a/filters/video/depth.c b/filters/video/depth.c index 19f69201e..3060a48d0 100644 --- a/filters/video/depth.c +++ b/filters/video/depth.c @@ -1,7 +1,7 @@ /***************************************************************************** * depth.c: bit-depth conversion video filter ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 
2010-2024 x264 project * * Authors: Oskar Arvidsson * diff --git a/filters/video/fix_vfr_pts.c b/filters/video/fix_vfr_pts.c index f6cc0be0f..46262aae3 100644 --- a/filters/video/fix_vfr_pts.c +++ b/filters/video/fix_vfr_pts.c @@ -1,7 +1,7 @@ /***************************************************************************** * fix_vfr_pts.c: vfr pts fixing video filter ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/filters/video/internal.c b/filters/video/internal.c index 24b5a45df..a70834c3a 100644 --- a/filters/video/internal.c +++ b/filters/video/internal.c @@ -1,7 +1,7 @@ /***************************************************************************** * internal.c: video filter utilities ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/filters/video/internal.h b/filters/video/internal.h index 38876857d..fca8072b4 100644 --- a/filters/video/internal.h +++ b/filters/video/internal.h @@ -1,7 +1,7 @@ /***************************************************************************** * internal.h: video filter utilities ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/filters/video/resize.c b/filters/video/resize.c index 86ba96322..25dc95e01 100644 --- a/filters/video/resize.c +++ b/filters/video/resize.c @@ -1,7 +1,7 @@ /***************************************************************************** * resize.c: resize video filter ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/filters/video/select_every.c b/filters/video/select_every.c index 03aaf7dd0..deaa0e4fc 100644 --- a/filters/video/select_every.c +++ b/filters/video/select_every.c @@ -1,7 +1,7 @@ /***************************************************************************** * select_every.c: select-every video filter ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/filters/video/source.c b/filters/video/source.c index ce91c9317..f7c6b8701 100644 --- a/filters/video/source.c +++ b/filters/video/source.c @@ -1,7 +1,7 @@ /***************************************************************************** * source.c: source video filter ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/filters/video/video.c b/filters/video/video.c index 0fc93cead..29031efdb 100644 --- a/filters/video/video.c +++ b/filters/video/video.c @@ -1,7 +1,7 @@ /***************************************************************************** * video.c: video filters ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/filters/video/video.h b/filters/video/video.h index 215a91ff5..491d14e6c 100644 --- a/filters/video/video.h +++ 
b/filters/video/video.h @@ -1,7 +1,7 @@ /***************************************************************************** * video.h: video filters ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * diff --git a/input/avs.c b/input/avs.c index 1e77f484b..cbd221bd1 100644 --- a/input/avs.c +++ b/input/avs.c @@ -1,7 +1,7 @@ /***************************************************************************** * avs.c: avisynth input ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: Steven Walters * Anton Mitrofanov diff --git a/input/ffms.c b/input/ffms.c index 6853c322c..91d362cd9 100644 --- a/input/ffms.c +++ b/input/ffms.c @@ -1,7 +1,7 @@ /***************************************************************************** * ffms.c: ffmpegsource input ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: Mike Gurlitz * Steven Walters diff --git a/input/input.c b/input/input.c index b82c73a50..677d762ff 100644 --- a/input/input.c +++ b/input/input.c @@ -1,7 +1,7 @@ /***************************************************************************** * input.c: common input functions ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Steven Walters * Henrik Gramner diff --git a/input/input.h b/input/input.h index 8ed03eecf..341e92b69 100644 --- a/input/input.h +++ b/input/input.h @@ -1,7 +1,7 @@ /***************************************************************************** * input.h: file input ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/input/lavf.c b/input/lavf.c index f5ba6e41e..cdc0ed189 100644 --- a/input/lavf.c +++ b/input/lavf.c @@ -1,7 +1,7 @@ /***************************************************************************** * lavf.c: libavformat input ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: Mike Gurlitz * Steven Walters diff --git a/input/raw.c b/input/raw.c index 6f25d7cf2..72185ce1b 100644 --- a/input/raw.c +++ b/input/raw.c @@ -1,7 +1,7 @@ /***************************************************************************** * raw.c: raw input ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/input/thread.c b/input/thread.c index 763ca0296..7713bc5d8 100644 --- a/input/thread.c +++ b/input/thread.c @@ -1,7 +1,7 @@ /***************************************************************************** * thread.c: threaded input ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/input/timecode.c b/input/timecode.c index e647bee58..6d106392a 100644 --- a/input/timecode.c +++ b/input/timecode.c @@ -1,7 +1,7 @@ 
/***************************************************************************** * timecode.c: timecode file input ***************************************************************************** - * Copyright (C) 2010-2023 x264 project + * Copyright (C) 2010-2024 x264 project * * Authors: Yusuke Nakamura * diff --git a/input/y4m.c b/input/y4m.c index ab6550167..5764d6ac8 100644 --- a/input/y4m.c +++ b/input/y4m.c @@ -1,7 +1,7 @@ /***************************************************************************** * y4m.c: y4m input ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/output/flv.c b/output/flv.c index 7a418f451..6a7d97f60 100644 --- a/output/flv.c +++ b/output/flv.c @@ -1,7 +1,7 @@ /***************************************************************************** * flv.c: flv muxer ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: Kieran Kunhya * diff --git a/output/flv_bytestream.c b/output/flv_bytestream.c index 77f3a4a43..995ef6ea5 100644 --- a/output/flv_bytestream.c +++ b/output/flv_bytestream.c @@ -1,7 +1,7 @@ /***************************************************************************** * flv_bytestream.c: flv muxer utilities ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: Kieran Kunhya * diff --git a/output/flv_bytestream.h b/output/flv_bytestream.h index 1f039eaec..f0f369c48 100644 --- a/output/flv_bytestream.h +++ b/output/flv_bytestream.h @@ -1,7 +1,7 @@ /***************************************************************************** * flv_bytestream.h: flv muxer utilities ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: Kieran Kunhya * diff --git a/output/matroska.c b/output/matroska.c index 5bea27eda..18d09293d 100644 --- a/output/matroska.c +++ b/output/matroska.c @@ -1,7 +1,7 @@ /***************************************************************************** * matroska.c: matroska muxer ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Mike Matsnev * diff --git a/output/matroska_ebml.c b/output/matroska_ebml.c index 3f0423201..b1aca8c32 100644 --- a/output/matroska_ebml.c +++ b/output/matroska_ebml.c @@ -1,7 +1,7 @@ /***************************************************************************** * matroska_ebml.c: matroska muxer utilities ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Mike Matsnev * diff --git a/output/matroska_ebml.h b/output/matroska_ebml.h index fc8cf765d..90b5315a2 100644 --- a/output/matroska_ebml.h +++ b/output/matroska_ebml.h @@ -1,7 +1,7 @@ /***************************************************************************** * matroska_ebml.h: matroska muxer utilities ***************************************************************************** - * Copyright (C) 2005-2023 x264 project + * Copyright (C) 2005-2024 x264 project * * Authors: Mike Matsnev * diff --git a/output/mp4.c b/output/mp4.c 
index 8e665ff12..bb575946e 100644 --- a/output/mp4.c +++ b/output/mp4.c @@ -1,7 +1,7 @@ /***************************************************************************** * mp4.c: mp4 muxer ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/output/mp4_lsmash.c b/output/mp4_lsmash.c index 2b65a55d9..72314dbe2 100644 --- a/output/mp4_lsmash.c +++ b/output/mp4_lsmash.c @@ -1,7 +1,7 @@ /***************************************************************************** * mp4_lsmash.c: mp4 muxer using L-SMASH ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/output/output.h b/output/output.h index 8bdae2176..433d700a4 100644 --- a/output/output.h +++ b/output/output.h @@ -1,7 +1,7 @@ /***************************************************************************** * output.h: x264 file output modules ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/output/raw.c b/output/raw.c index bcdd66d08..149cd5a8b 100644 --- a/output/raw.c +++ b/output/raw.c @@ -1,7 +1,7 @@ /***************************************************************************** * raw.c: raw muxer ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/tools/checkasm-a.asm b/tools/checkasm-a.asm index 9cb678321..4d8f3c647 100644 --- a/tools/checkasm-a.asm +++ b/tools/checkasm-a.asm @@ -1,7 +1,7 @@ ;***************************************************************************** ;* checkasm-a.asm: assembly check tool ;***************************************************************************** -;* Copyright (C) 2008-2023 x264 project +;* Copyright (C) 2008-2024 x264 project ;* ;* Authors: Loren Merritt ;* Henrik Gramner @@ -152,11 +152,7 @@ cglobal checkasm_call, 2,15,16,-1*(((max_args+1)*8+STACK_ALIGNMENT-1) & ~(STACK_ mov r9, rax mov r10, rdx lea r0, [error_message] -%if FORMAT_ELF - call puts wrt ..plt -%else call puts -%endif mov r1, [rsp+max_args*8] mov dword [r1], 0 mov rdx, r10 diff --git a/tools/checkasm-aarch64.S b/tools/checkasm-aarch64.S index d303f9ab0..b19f2f761 100644 --- a/tools/checkasm-aarch64.S +++ b/tools/checkasm-aarch64.S @@ -1,7 +1,7 @@ /**************************************************************************** * checkasm-aarch64.S: assembly check tool ***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Martin Storsjo * @@ -164,3 +164,13 @@ function checkasm_call, export=1 ldp x29, x30, [sp], #16 ret endfunc + +#if HAVE_SVE +.arch armv8-a+sve + +function checkasm_sve_length, export=1 + cntb x0 + lsl x0, x0, #3 + ret +endfunc +#endif diff --git a/tools/checkasm-arm.S b/tools/checkasm-arm.S index 1c4186e37..d185aa9ae 100644 --- a/tools/checkasm-arm.S +++ b/tools/checkasm-arm.S @@ -1,7 +1,7 @@ /**************************************************************************** * checkasm-arm.S: assembly check tool 
***************************************************************************** - * Copyright (C) 2015-2023 x264 project + * Copyright (C) 2015-2024 x264 project * * Authors: Martin Storsjo * diff --git a/tools/checkasm-loongarch.S b/tools/checkasm-loongarch.S new file mode 100644 index 000000000..716974e6e --- /dev/null +++ b/tools/checkasm-loongarch.S @@ -0,0 +1,210 @@ +/**************************************************************************** + * checkasm-loongarch.S: assembly check tool + ***************************************************************************** + * Copyright (C) 2024 x264 project + * + * Authors: Xiwei Gu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at licensing@x264.com. + *****************************************************************************/ + +#include "../common/loongarch/loongson_asm.S" + +const register_init, align=3 +.quad 0x21f86d66c8ca00ce +.quad 0x75b6ba21077c48ad +.quad 0xed56bb2dcb3c7736 +.quad 0x8bda43d3fd1a7e06 +.quad 0xb64a9c9e5d318408 +.quad 0xdf9a54b303f1d3a3 +.quad 0x4a75479abd64e097 +.quad 0x249214109d5d1c88 +.quad 0x1a1b2550a612b48c +.quad 0x79445c159ce79064 +.quad 0x2eed899d5a28ddcd +.quad 0x86b2536fcd8cf636 +.quad 0xb0856806085e7943 +.quad 0x3f2bf84fc0fcca4e +.quad 0xacbd382dcf5b8de2 +.quad 0xd229e1f5b281303f +.quad 0x71aeaff20b095fd9 +endconst + +const error_message +.asciz "failed to preserve register" +endconst + +.text + +// max number of args used by any x264 asm function. 
+#define MAX_ARGS 15 + +#define CLOBBER_STACK ((8*MAX_ARGS + 15) & ~15) + +// Fill dirty data at stack space +function x264_checkasm_stack_clobber + move t0, sp + addi.d t1, zero, CLOBBER_STACK +1: + st.d a0, sp, 0x00 + st.d a1, sp, -0x08 + addi.d sp, sp, -0x10 + addi.d t1, t1, -0x10 + blt zero,t1, 1b + move sp, t0 +endfunc + +#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15) + +function x264_checkasm_call + // Saved s0 - s8, fs0 - fs7 + move t4, sp + addi.d sp, sp, -136 + st.d s0, sp, 0 + st.d s1, sp, 8 + st.d s2, sp, 16 + st.d s3, sp, 24 + st.d s4, sp, 32 + st.d s5, sp, 40 + st.d s6, sp, 48 + st.d s7, sp, 56 + st.d s8, sp, 64 + fst.d fs0, sp, 72 + fst.d fs1, sp, 80 + fst.d fs2, sp, 88 + fst.d fs3, sp, 96 + fst.d fs4, sp, 104 + fst.d fs5, sp, 112 + fst.d fs6, sp, 120 + fst.d fs7, sp, 128 + + la.local t1, register_init + ld.d s0, t1, 0 + ld.d s1, t1, 8 + ld.d s2, t1, 16 + ld.d s3, t1, 24 + ld.d s4, t1, 32 + ld.d s5, t1, 40 + ld.d s6, t1, 48 + ld.d s7, t1, 56 + ld.d s8, t1, 64 + fld.d fs0, t1, 72 + fld.d fs1, t1, 80 + fld.d fs2, t1, 88 + fld.d fs3, t1, 96 + fld.d fs4, t1, 104 + fld.d fs5, t1, 112 + fld.d fs6, t1, 120 + fld.d fs7, t1, 128 + + addi.d sp, sp, -16 + st.d a1, sp, 0 // ok + st.d ra, sp, 8 // Ret address + + addi.d sp, sp, -ARG_STACK + + addi.d t0, zero, 8*8 + xor t1, t1, t1 +.rept MAX_ARGS - 8 + // Skip the first 8 args, that are loaded into registers + ldx.d t2, t4, t0 + stx.d t2, sp, t1 + addi.d t0, t0, 8 + addi.d t1, t1, 8 +.endr + move t3, a0 // Func + ld.d a0, t4, 0 + ld.d a1, t4, 8 + ld.d a2, t4, 16 + ld.d a3, t4, 24 + ld.d a4, t4, 32 + ld.d a5, t4, 40 + ld.d a6, t4, 48 + ld.d a7, t4, 56 + + jirl ra, t3, 0 + + addi.d sp, sp, ARG_STACK + ld.d t2, sp, 0 // ok + ld.d ra, sp, 8 // Ret address + addi.d sp, sp, 16 + + la.local t1, register_init + xor t3, t3, t3 + +.macro check_reg_gr reg1 + ld.d t0, t1, 0 + xor t0, $s\reg1, t0 + or t3, t3, t0 + addi.d t1, t1, 8 +.endm + check_reg_gr 0 + check_reg_gr 1 + check_reg_gr 2 + check_reg_gr 3 + check_reg_gr 4 + check_reg_gr 5 + check_reg_gr 6 + check_reg_gr 7 + check_reg_gr 8 + +.macro check_reg_fr reg1 + ld.d t0, t1, 0 + movfr2gr.d t4,$fs\reg1 + xor t0, t0, t4 + or t3, t3, t0 + addi.d t1, t1, 8 +.endm + check_reg_fr 0 + check_reg_fr 1 + check_reg_fr 2 + check_reg_fr 3 + check_reg_fr 4 + check_reg_fr 5 + check_reg_fr 6 + check_reg_fr 7 + + beqz t3, 0f + + st.d zero,t2, 0x00 // Set OK to 0 + la.local a0, error_message + addi.d sp, sp, -8 + st.d ra, sp, 0 + bl puts + ld.d ra, sp, 0 + addi.d sp, sp, 8 +0: + ld.d s0, sp, 0 + ld.d s1, sp, 8 + ld.d s2, sp, 16 + ld.d s3, sp, 24 + ld.d s4, sp, 32 + ld.d s5, sp, 40 + ld.d s6, sp, 48 + ld.d s7, sp, 56 + ld.d s8, sp, 64 + fld.d fs0, sp, 72 + fld.d fs1, sp, 80 + fld.d fs2, sp, 88 + fld.d fs3, sp, 96 + fld.d fs4, sp, 104 + fld.d fs5, sp, 112 + fld.d fs6, sp, 120 + fld.d fs7, sp, 128 + addi.d sp, sp, 136 +endfunc diff --git a/tools/checkasm.c b/tools/checkasm.c index 20775714e..6ff088f18 100644 --- a/tools/checkasm.c +++ b/tools/checkasm.c @@ -1,7 +1,7 @@ /***************************************************************************** * checkasm.c: assembly check tool ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar @@ -117,6 +117,9 @@ static inline uint32_t read_time(void) a = b; #elif ARCH_MIPS asm volatile( "rdhwr %0, $2" : "=r"(a) :: "memory" ); +#elif ARCH_LOONGARCH + uint32_t id = 0; + asm volatile( "rdtimel.w %0, %1" : "=r"(a), "=r"(id) :: 
"memory" ); #endif return a; } @@ -211,10 +214,15 @@ static void print_bench(void) b->cpu&X264_CPU_NEON ? "neon" : b->cpu&X264_CPU_ARMV6 ? "armv6" : #elif ARCH_AARCH64 + b->cpu&X264_CPU_SVE2 ? "sve2" : + b->cpu&X264_CPU_SVE ? "sve" : b->cpu&X264_CPU_NEON ? "neon" : b->cpu&X264_CPU_ARMV8 ? "armv8" : #elif ARCH_MIPS b->cpu&X264_CPU_MSA ? "msa" : +#elif ARCH_LOONGARCH + b->cpu&X264_CPU_LASX ? "lasx" : + b->cpu&X264_CPU_LSX ? "lsx" : #endif "c", #if ARCH_X86 || ARCH_X86_64 @@ -254,6 +262,10 @@ intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ); #if HAVE_AARCH64 intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ); + +#if HAVE_SVE +int x264_checkasm_sve_length( void ); +#endif #endif #if HAVE_ARMV6 @@ -262,6 +274,10 @@ intptr_t x264_checkasm_call_noneon( intptr_t (*func)(), int *ok, ... ); intptr_t (*x264_checkasm_call)( intptr_t (*func)(), int *ok, ... ) = x264_checkasm_call_noneon; #endif +#if ARCH_LOONGARCH +intptr_t x264_checkasm_call( intptr_t (*func)(), int *ok, ... ); +#endif + #define call_c1(func,...) func(__VA_ARGS__) #if HAVE_MMX && ARCH_X86_64 @@ -288,6 +304,12 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... ); x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, 0, 0, __VA_ARGS__ ); }) #elif HAVE_MMX || HAVE_ARMV6 #define call_a1(func,...) x264_checkasm_call( (intptr_t(*)())func, &ok, __VA_ARGS__ ) +#elif ARCH_LOONGARCH && HAVE_LSX +void x264_checkasm_stack_clobber( uint64_t clobber, ... ); +#define call_a1(func,...) ({ \ + uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \ + x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+8 */ \ + x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, 0, 0, __VA_ARGS__ ); }) #else #define call_a1 call_c1 #endif @@ -2880,6 +2902,9 @@ static int check_all_flags( void ) simd_warmup_func = x264_checkasm_warmup_avx; #endif simd_warmup(); +#if ARCH_AARCH64 && HAVE_SVE + char buf[20]; +#endif #if ARCH_X86 || ARCH_X86_64 if( cpu_detect & X264_CPU_MMX2 ) @@ -2973,9 +2998,24 @@ static int check_all_flags( void ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_ARMV8, "ARMv8" ); if( cpu_detect & X264_CPU_NEON ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_NEON, "NEON" ); +#if HAVE_SVE + if( cpu_detect & X264_CPU_SVE ) { + snprintf( buf, sizeof( buf ), "SVE (%d bits)", x264_checkasm_sve_length() ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SVE, buf ); + } + if( cpu_detect & X264_CPU_SVE2 ) { + snprintf( buf, sizeof( buf ), "SVE2 (%d bits)", x264_checkasm_sve_length() ); + ret |= add_flags( &cpu0, &cpu1, X264_CPU_SVE2, buf ); + } +#endif #elif ARCH_MIPS if( cpu_detect & X264_CPU_MSA ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_MSA, "MSA" ); +#elif ARCH_LOONGARCH + if( cpu_detect & X264_CPU_LSX ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LSX, "LSX" ); + if( cpu_detect & X264_CPU_LASX ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LASX, "LASX" ); #endif return ret; } @@ -2989,7 +3029,7 @@ REALIGN_STACK int main( int argc, char **argv ) if( argc > 1 && !strncmp( argv[1], "--bench", 7 ) ) { -#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS +#if !ARCH_X86 && !ARCH_X86_64 && !ARCH_PPC && !ARCH_ARM && !ARCH_AARCH64 && !ARCH_MIPS && !ARCH_LOONGARCH fprintf( stderr, "no --bench for your cpu until you port rdtsc\n" ); return 1; #endif diff --git a/x264.c b/x264.c index 5f05b5541..6c9bb62d3 100644 --- a/x264.c +++ b/x264.c @@ -1,7 +1,7 @@ /***************************************************************************** * x264: top-level x264cli functions 
***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Loren Merritt * Laurent Aimar diff --git a/x264.h b/x264.h index b8619d4be..2cd5df525 100644 --- a/x264.h +++ b/x264.h @@ -1,7 +1,7 @@ /***************************************************************************** * x264.h: x264 public header ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt @@ -177,10 +177,16 @@ typedef struct x264_nal_t #define X264_CPU_NEON 0x0000002U /* ARM NEON */ #define X264_CPU_FAST_NEON_MRC 0x0000004U /* Transfer from NEON to ARM register is fast (Cortex-A9) */ #define X264_CPU_ARMV8 0x0000008U +#define X264_CPU_SVE 0x0000010U /* AArch64 SVE */ +#define X264_CPU_SVE2 0x0000020U /* AArch64 SVE2 */ /* MIPS */ #define X264_CPU_MSA 0x0000001U /* MIPS MSA */ +/* LOONGARCH */ +#define X264_CPU_LSX 0x0000001U /* LOONGARCH LSX */ +#define X264_CPU_LASX 0x0000002U /* LOONGARCH LASX */ + /* Analyse flags */ #define X264_ANALYSE_I4x4 0x0001U /* Analyse i4x4 */ #define X264_ANALYSE_I8x8 0x0002U /* Analyse i8x8 (requires 8x8 transform) */ diff --git a/x264cli.h b/x264cli.h index 161f6c896..41ff3193e 100644 --- a/x264cli.h +++ b/x264cli.h @@ -1,7 +1,7 @@ /***************************************************************************** * x264cli.h: x264cli common ***************************************************************************** - * Copyright (C) 2003-2023 x264 project + * Copyright (C) 2003-2024 x264 project * * Authors: Laurent Aimar * Loren Merritt diff --git a/x264dll.c b/x264dll.c index b816aac43..8e150308e 100644 --- a/x264dll.c +++ b/x264dll.c @@ -1,7 +1,7 @@ /***************************************************************************** * x264dll: x264 DLLMain for win32 ***************************************************************************** - * Copyright (C) 2009-2023 x264 project + * Copyright (C) 2009-2024 x264 project * * Authors: Anton Mitrofanov * diff --git a/x264res.rc b/x264res.rc index 9204ab4cf..10a994270 100644 --- a/x264res.rc +++ b/x264res.rc @@ -1,7 +1,7 @@ /***************************************************************************** * x264res.rc: windows resource file ***************************************************************************** - * Copyright (C) 2012-2023 x264 project + * Copyright (C) 2012-2024 x264 project * * Authors: Henrik Gramner * @@ -68,7 +68,7 @@ BEGIN #endif VALUE "FileVersion", X264_POINTVER VALUE "InternalName", "x264" - VALUE "LegalCopyright", "Copyright (C) 2003-2023 x264 project" + VALUE "LegalCopyright", "Copyright (C) 2003-2024 x264 project" #ifdef DLL VALUE "OriginalFilename", "libx264-" xstr(X264_BUILD) ".dll" #else
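
For reference, the register-preservation check that the new tools/checkasm-loongarch.S performs (seed s0-s8/fs0-fs7 from register_init, call the function under test, then XOR each register against its seed and clear *ok on any mismatch) can be sketched in portable C roughly as follows. This is a conceptual model only: the names call_and_check, NUM_SAVED and clobbers_one are illustrative, and a real harness has to do the work in assembly because C cannot inspect callee-saved registers directly.

    #include <stdint.h>
    #include <stdio.h>

    #define NUM_SAVED 17   /* s0-s8 plus fs0-fs7 on LoongArch64 */

    /* Known 64-bit patterns, in the spirit of the register_init table. */
    static const uint64_t seed[NUM_SAVED] = {
        0x21f86d66c8ca00ceULL, 0x75b6ba21077c48adULL, /* ... */
    };

    typedef void (*test_fn)( uint64_t regs[NUM_SAVED] );

    /* Returns 1 if every simulated register still holds its seed. */
    static int call_and_check( test_fn fn, uint64_t regs[NUM_SAVED] )
    {
        uint64_t diff = 0;
        for( int i = 0; i < NUM_SAVED; i++ )
            regs[i] = seed[i];          /* load the known patterns   */
        fn( regs );                     /* run the code under test   */
        for( int i = 0; i < NUM_SAVED; i++ )
            diff |= regs[i] ^ seed[i];  /* accumulate any corruption */
        if( diff )
            puts( "failed to preserve register" );
        return diff == 0;
    }

    /* Example offender: clobbers one simulated callee-saved register. */
    static void clobbers_one( uint64_t regs[NUM_SAVED] ) { regs[3] = 0; }

    int main( void )
    {
        uint64_t regs[NUM_SAVED];
        printf( "ok = %d\n", call_and_check( clobbers_one, regs ) ); /* ok = 0 */
        return 0;
    }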
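
The two stack-size constants in checkasm-loongarch.S are byte counts rounded up to the 16-byte stack alignment required by the LoongArch64 ABI: CLOBBER_STACK reserves 8 bytes for each of the MAX_ARGS (15) possible arguments, while ARG_STACK only covers the arguments beyond the 8 that travel in registers. A minimal C check of the arithmetic, using the same expressions as the patch:

    #include <stdio.h>

    #define MAX_ARGS      15
    #define CLOBBER_STACK ((8 * MAX_ARGS + 15) & ~15)        /* 120 -> 128 */
    #define ARG_STACK     ((8 * (MAX_ARGS - 8) + 15) & ~15)  /*  56 ->  64 */

    int main( void )
    {
        printf( "CLOBBER_STACK = %d bytes\n", CLOBBER_STACK ); /* 128 */
        printf( "ARG_STACK     = %d bytes\n", ARG_STACK );     /*  64 */
        return 0;
    }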
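
Likewise, the new x264_checkasm_sve_length helper in tools/checkasm-aarch64.S reports the SVE vector length in bits: cntb yields the length in bytes and the lsl #3 multiplies it by 8, which checkasm.c then formats as "SVE (%d bits)". A small stand-in showing the intended output (sve_length_bits is a hypothetical name, not part of the patch):

    #include <stdio.h>

    /* cntb -> bytes per SVE vector; shifting left by 3 converts to bits. */
    static int sve_length_bits( int cntb_bytes ) { return cntb_bytes << 3; }

    int main( void )
    {
        char buf[20];
        /* A 256-bit SVE implementation reports cntb = 32. */
        snprintf( buf, sizeof(buf), "SVE (%d bits)", sve_length_bits( 32 ) );
        puts( buf ); /* SVE (256 bits) */
        return 0;
    }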